details: http://www.bx.psu.edu/hg/galaxy/rev/4e5ca6c44804 changeset: 3764:4e5ca6c44804 user: jeremy goecks <jeremy.goecks@emory.edu> date: Mon May 10 12:42:05 2010 -0400 description: Add 'filter transcripts via tracking' tool to RNA-seq tools. This tool is useful for operating on output files from the cufftools suite. diffstat: tool_conf.xml.sample | 1 + tools/ngs_rna/filter_transcripts_via_tracking.py | 70 +++++++++++++++++++++++ tools/ngs_rna/filter_transcripts_via_tracking.xml | 32 ++++++++++ 3 files changed, 103 insertions(+), 0 deletions(-) diffs (121 lines): diff -r 1110a91888e2 -r 4e5ca6c44804 tool_conf.xml.sample --- a/tool_conf.xml.sample Mon May 10 11:55:08 2010 -0400 +++ b/tool_conf.xml.sample Mon May 10 12:42:05 2010 -0400 @@ -239,6 +239,7 @@ <tool file="ngs_rna/cufflinks_wrapper.xml" /> <tool file="ngs_rna/cuffcompare_wrapper.xml" /> <tool file="ngs_rna/cuffdiff_wrapper.xml" /> + <tool file="ngs_rna/filter_transcripts_via_tracking.xml" /> </section> <section name="NGS: SAM Tools" id="samtools"> <tool file="samtools/sam_bitwise_flag_filter.xml" /> diff -r 1110a91888e2 -r 4e5ca6c44804 tools/ngs_rna/filter_transcripts_via_tracking.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/ngs_rna/filter_transcripts_via_tracking.py Mon May 10 12:42:05 2010 -0400 @@ -0,0 +1,70 @@ +#!/usr/bin/env python +import os, sys, tempfile + +assert sys.version_info[:2] >= ( 2, 4 ) + +def __main__(): + """ + Utility script for analyzing Cufflinks data: uses a tracking file (produced by cuffcompare) to filter a GTF file of transcripts (usually the transcripts + produced by cufflinks). Filtering is done by extracting transcript IDs from tracking file and then filtering the GTF so that the output GTF contains only + transcripts found in the tracking file. Because a tracking file has multiple samples, a sample number is used to filter transcripts for + a particular sample. + """ + # Read parms. 
+ tracking_file_name = sys.argv[1] + transcripts_file_name = sys.argv[2] + output_file_name = sys.argv[3] + sample_number = int ( sys.argv[4] ) + + # Open files. + transcripts_file = open( transcripts_file_name, 'r' ) + output_file = open( output_file_name, 'w' ) + + # Read transcript IDs from tracking file. + transcript_ids = {} + for i, line in enumerate( file( tracking_file_name ) ) : + # Split line into elements. Line format is + # [Transfrag ID] [Locus ID] [Ref Gene ID] [Ref Transcript ID] [Class code] [qJ:<gene_id>|<transcript_id>|<FMI>|<FPKM>|<conf_lo>|<conf_hi>] + line = line.rstrip( '\r\n' ) + elems = line.split( '\t' ) + + # Get transcript info. + if sample_number == 1: + transcript_info = elems[4] + elif sample_number == 2: + transcript_info = elems[5] + if not transcript_info.startswith('q'): + # No transcript for this sample. + continue + + # Get and store transcript id. + transcript_id = transcript_info.split('|')[1] + transcript_id = transcript_id.strip('"') + transcript_ids[transcript_id] = "" + + # Filter transcripts file using transcript_ids + for i, line in enumerate( file( transcripts_file_name ) ): + # GTF format: chrom, source, name, chromStart, chromEnd, score, strand, frame, attributes. + elems = line.split( '\t' ) + + # Get attributes. + attributes_list = elems[8].split(";") + attributes = {} + for name_value_pair in attributes_list: + pair = name_value_pair.strip().split(" ") + name = pair[0].strip() + if name == '': + continue + # Need to strip double quote from values + value = pair[1].strip(" \"") + attributes[name] = value + + # Get element's transcript id. + transcript_id = attributes['transcript_id'] + if transcript_id in transcript_ids: + output_file.write(line) + + # Clean up. 
+ output_file.close() + +if __name__ == "__main__": __main__() diff -r 1110a91888e2 -r 4e5ca6c44804 tools/ngs_rna/filter_transcripts_via_tracking.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/ngs_rna/filter_transcripts_via_tracking.xml Mon May 10 12:42:05 2010 -0400 @@ -0,0 +1,32 @@ +<tool id="filter_combined_via_tracking" name="Filter Combined Transcripts" version="0.1"> + <description>using tracking file</description> + <command interpreter="python"> + filter_transcripts_via_tracking.py + $tracking_file + $transcripts_file + $filtered_transcripts + $sample_num + </command> + <inputs> + <param format="gtf" name="transcripts_file" type="data" label="Cufflinks assembled transcripts" help=""/> + <param format="tabular" name="tracking_file" type="data" label="Cuffcompare tracking file" help=""/> + <param name="sample_num" type="select" label="Sample Number"> + <option value="1">1</option> + <option value="2">2</option> + </param> + </inputs> + + <outputs> + <data format="gtf" name="filtered_transcripts"/> + </outputs> + + <tests> + </tests> + + <help> + Uses a tracking file (produced by cuffcompare) to filter a GTF file of transcripts (usually the transcripts produced by + cufflinks). Filtering is done by extracting transcript IDs from tracking file and then filtering the + GTF so that the output GTF contains only transcripts found in the tracking file. Because a tracking file has multiple + samples, a sample number is used to filter transcripts for a particular sample. + </help> +</tool>