[hg] galaxy 3764: Add 'filter transcripts via tracking' tool to RNA-seq tools

11 May 2010

details:   http://www.bx.psu.edu/hg/galaxy/rev/4e5ca6c44804
changeset: 3764:4e5ca6c44804
user:      jeremy goecks <jeremy.goecks@emory.edu>
date:      Mon May 10 12:42:05 2010 -0400
description:
Add 'filter transcripts via tracking' tool to RNA-seq tools. This tool is useful for operating on output files from cufftools suite.

diffstat:

 tool_conf.xml.sample                              |   1 +
 tools/ngs_rna/filter_transcripts_via_tracking.py  |  70 +++++++++++++++++++++++
 tools/ngs_rna/filter_transcripts_via_tracking.xml |  32 ++++++++++
 3 files changed, 103 insertions(+), 0 deletions(-)

diffs (121 lines):

diff -r 1110a91888e2 -r 4e5ca6c44804 tool_conf.xml.sample

--- a/tool_conf.xml.sample	Mon May 10 11:55:08 2010 -0400
+++ b/tool_conf.xml.sample	Mon May 10 12:42:05 2010 -0400
@@ -239,6 +239,7 @@
    <tool file="ngs_rna/cufflinks_wrapper.xml" />
    <tool file="ngs_rna/cuffcompare_wrapper.xml" />
    <tool file="ngs_rna/cuffdiff_wrapper.xml" />
+   <tool file="ngs_rna/filter_transcripts_via_tracking.xml" />
   </section>
   <section name="NGS: SAM Tools" id="samtools">
    <tool file="samtools/sam_bitwise_flag_filter.xml" />
diff -r 1110a91888e2 -r 4e5ca6c44804 tools/ngs_rna/filter_transcripts_via_tracking.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/ngs_rna/filter_transcripts_via_tracking.py	Mon May 10 12:42:05 2010 -0400
@@ -0,0 +1,70 @@
+#!/usr/bin/env python
+import os, sys, tempfile
+
+assert sys.version_info[:2] >= ( 2, 4 )  # Refuse to run on interpreters older than Python 2.4.
+
+def __main__():
+    """
+    Utility script for analyzing Cufflinks data: uses a tracking file (produced by cuffcompare) to filter a GTF file of transcripts (usually the transcripts
+    produced by cufflinks). Filtering is done by extracting transcript IDs from tracking file and then filtering the GTF so that the output GTF contains only
+    transcript found in the tracking file. Because a tracking file has multiple samples, a sample number is used to filter transcripts for
+    a particular sample.
+    """
+    # Read positional parameters: tracking file, transcripts (GTF) file, output file, sample number.
+    tracking_file_name = sys.argv[1]
+    transcripts_file_name = sys.argv[2]
+    output_file_name = sys.argv[3]
+    sample_number = int ( sys.argv[4] )
+
+    # Open files.
+    transcripts_file = open( transcripts_file_name, 'r' )  # NOTE(review): never read or closed below; the GTF is re-opened by name in the filter loop.
+    output_file = open( output_file_name, 'w' )
+    
+    # Collect the transcript IDs listed for the chosen sample in the tracking file.
+    transcript_ids = {}  # Used as a set: only the keys matter, every value is "".
+    for i, line in enumerate( file( tracking_file_name ) ) :  # NOTE(review): file() is a Python 2 builtin; i is unused.
+        # Split line into elements. Line format is
+        # [Transfrag ID] [Locus ID] [Ref Gene ID] [Ref Transcript ID] [Class code] [qJ:<gene_id>|<transcript_id>|<FMI>|<FPKM>|<conf_lo>|<conf_hi>]
+        line = line.rstrip( '\r\n' )
+        elems = line.split( '\t' )
+        # NOTE(review): the list above shows five fields before the first qJ entry, yet index 4 is read as sample 1 - confirm the column layout.
+        # Get transcript info: fifth tab-separated field for sample 1, sixth for sample 2.
+        if sample_number == 1:
+            transcript_info = elems[4]
+        elif sample_number == 2:  # NOTE(review): any other sample number leaves transcript_info unassigned on the first line.
+            transcript_info = elems[5]
+        if not transcript_info.startswith('q'):
+            # Entry does not start with 'q': no transcript for this sample, so skip the line.
+            continue
+        
+        # The transcript id is the second '|'-separated piece; strip surrounding double quotes and store it.
+        transcript_id = transcript_info.split('|')[1]
+        transcript_id = transcript_id.strip('"')
+        transcript_ids[transcript_id] = ""
+        
+    # Copy to the output only those GTF lines whose transcript_id was collected above.
+    for i, line in enumerate( file( transcripts_file_name ) ):
+        # GTF format: chrom, source, name, chromStart, chromEnd, score, strand, frame, attributes.
+        elems = line.split( '\t' )
+        
+        # Parse the attributes column (ninth field): ';'-separated pairs of a name and a value split on a space.
+        attributes_list = elems[8].split(";")
+        attributes = {}
+        for name_value_pair in attributes_list:
+            pair = name_value_pair.strip().split(" ")
+            name = pair[0].strip()
+            if name == '':  # Empty piece (e.g. whatever follows the last ';'): nothing to record.
+                continue
+            # Need to strip spaces and double quotes from values
+            value = pair[1].strip(" \"")
+            attributes[name] = value
+            
+        # Get element's transcript id (KeyError if the attribute is absent); the line is written unchanged when the id was selected.
+        transcript_id = attributes['transcript_id']
+        if transcript_id in transcript_ids:
+            output_file.write(line)
+        
+    # Clean up.
+    output_file.close()
+    
+if __name__ == "__main__": __main__()
diff -r 1110a91888e2 -r 4e5ca6c44804 tools/ngs_rna/filter_transcripts_via_tracking.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/ngs_rna/filter_transcripts_via_tracking.xml	Mon May 10 12:42:05 2010 -0400
@@ -0,0 +1,32 @@
+<tool id="filter_combined_via_tracking" name="Filter Combined Transcripts" version="0.1">
+    <description>using tracking file</description>
+    <command interpreter="python">
+        filter_transcripts_via_tracking.py 
+            $tracking_file
+            $transcripts_file
+            $filtered_transcripts
+            $sample_num
+    </command>
+    <inputs>
+        <param format="gtf" name="transcripts_file" type="data" label="Cufflinks assembled transcripts" help=""/>
+        <param format="tabular" name="tracking_file" type="data" label="Cuffcompare tracking file" help=""/>
+        <param name="sample_num" type="select" label="Sample Number">
+            <option value="1">1</option>
+            <option value="2">2</option>
+        </param>
+    </inputs>
+
+    <outputs>
+        <data format="gtf" name="filtered_transcripts"/>
+    </outputs>
+
+    <tests>
+    </tests>
+
+    <help>
+        Uses a tracking file (produced by cuffcompare) to filter a GTF file of transcripts (usually the transcripts produced by 
+        cufflinks). Filtering is done by extracting transcript IDs from tracking file and then filtering the 
+        GTF so that the output GTF contains only transcripts found in the tracking file. Because a tracking file has multiple 
+        samples, a sample number is used to filter transcripts for a particular sample.
+    </help>
+</tool>

    

Nate Coraor

tags

participants (1)