commit/galaxy-central: jgoecks: Full and proper sorting for GTF datasets: sort by, in order, transcript_id, chrom, and start.
1 new commit in galaxy-central: https://bitbucket.org/galaxy/galaxy-central/changeset/95c05fcbbceb/ changeset: 95c05fcbbceb user: jgoecks date: 2012-05-24 19:26:48 summary: Full and proper sorting for GTF datasets: sort by, in order, transcript_id, chrom, and start. affected #: 2 files diff -r a0d6e382f2e198e46613705ad8a3962d0bc58fca -r 95c05fcbbceb69e9100d8689694c22cc13d16b23 lib/galaxy/datatypes/util/gff_util.py --- a/lib/galaxy/datatypes/util/gff_util.py +++ b/lib/galaxy/datatypes/util/gff_util.py @@ -1,9 +1,12 @@ """ Provides utilities for working with GFF files. """ + +import copy import pkg_resources; pkg_resources.require( "bx-python" ) from bx.intervals.io import * from bx.tabular.io import Header, Comment +from galaxy.util.odict import odict class GFFInterval( GenomicInterval ): """ @@ -48,7 +51,8 @@ def __init__( self, reader, chrom_col=0, feature_col=2, start_col=3, end_col=4, \ strand_col=6, score_col=5, default_strand='.', fix_strand=False, intervals=[], \ raw_size=0 ): - GFFInterval.__init__( self, reader, intervals[0].fields, chrom_col, feature_col, \ + # Use copy so that first interval and feature do not share fields. + GFFInterval.__init__( self, reader, copy.deepcopy( intervals[0].fields ), chrom_col, feature_col, \ start_col, end_col, strand_col, score_col, default_strand, \ fix_strand=fix_strand ) self.intervals = intervals @@ -356,4 +360,46 @@ for name, value in attrs.items(): attrs_strs.append( format_string % ( name, value ) ) return " ; ".join( attrs_strs ) - \ No newline at end of file + +def read_unordered_gtf( iterator ): + """ + Returns GTF features found in an iterator. GTF lines need not be ordered + or clustered for reader to work. Reader returns GFFFeature objects sorted + by transcript_id, chrom, and start position. + """ + + # Aggregate intervals by transcript_id. + feature_intervals = odict() + for count, line in enumerate( iterator ): + line_attrs = parse_gff_attributes( line.split('\t')[8] ) + transcript_id = line_attrs[ 'transcript_id' ] + if transcript_id in feature_intervals: + feature = feature_intervals[ transcript_id ] + else: + feature = [] + feature_intervals[ transcript_id ] = feature + feature.append( GFFInterval( None, line.split( '\t' ) ) ) + + # Create features. + chroms_features = {} + for count, intervals in enumerate( feature_intervals.values() ): + # Sort intervals by start position. + intervals.sort( lambda a,b: cmp( a.start, b.start ) ) + feature = GFFFeature( None, intervals=intervals ) + if feature.chrom not in chroms_features: + chroms_features[ feature.chrom ] = [] + chroms_features[ feature.chrom ].append( feature ) + + # Sort features by chrom, start position. + chroms_features_sorted = [] + for chrom_features in chroms_features.values(): + chroms_features_sorted.append( chrom_features ) + chroms_features_sorted.sort( lambda a,b: cmp( a[0].chrom, b[0].chrom ) ) + for features in chroms_features_sorted: + features.sort( lambda a,b: cmp( a.start, b.start ) ) + + # Yield. + for chrom_features in chroms_features_sorted: + for feature in chrom_features: + yield feature + \ No newline at end of file diff -r a0d6e382f2e198e46613705ad8a3962d0bc58fca -r 95c05fcbbceb69e9100d8689694c22cc13d16b23 tools/filters/gff/sort_gtf.py --- /dev/null +++ b/tools/filters/gff/sort_gtf.py @@ -0,0 +1,28 @@ +#!/usr/bin/env python + +import sys +from galaxy import eggs +from galaxy.datatypes.util.gff_util import read_unordered_gtf + +# Older py compatibility +try: + set() +except: + from sets import Set as set + +assert sys.version_info[:2] >= ( 2, 4 ) + +# +# Process inputs. +# + +in_fname = sys.argv[1] +out_fname = sys.argv[2] + +out = open( out_fname, 'w' ) +for feature in read_unordered_gtf( open( in_fname, 'r' ) ): + # Print feature. + for interval in feature.intervals: + out.write( "\t".join(interval.fields) ) + +# TODO: print status information: how many lines processed and features found. \ No newline at end of file Repository URL: https://bitbucket.org/galaxy/galaxy-central/ -- This is a commit notification from bitbucket.org. You are receiving this because you have the service enabled, addressing the recipient of this email.
participants (1)
-
Bitbucket