commit/galaxy-central: jgoecks: Use GFFReaderWrapper in gff_filter_by_feature_count tool in order to leverage support for GTF, GFF, and GFF3 found in reader wrapper. This also simplifies the tool considerably.

9 Mar 2011

1 new changeset in galaxy-central:

http://bitbucket.org/galaxy/galaxy-central/changeset/c9e6bc81817a/
changeset:   r5202:c9e6bc81817a
user:        jgoecks
date:        2011-03-09 01:01:59
summary:     Use GFFReaderWrapper in gff_filter_by_feature_count tool in order to leverage support for GTF, GFF, and GFF3 found in reader wrapper. This also simplifies the tool considerably.
affected #:  2 files (2.7 KB)

--- a/lib/galaxy/datatypes/util/gff_util.py	Tue Mar 08 16:17:09 2011 -0500
+++ b/lib/galaxy/datatypes/util/gff_util.py	Tue Mar 08 19:01:59 2011 -0500
@@ -10,14 +10,18 @@
     A GFF interval, including attributes. If file is strictly a GFF file,
     only attribute is 'group.'
     """
-    def __init__( self, reader, fields, chrom_col, start_col, end_col, strand_col, \
-                  score_col, default_strand, fix_strand=False, raw_line='' ):
+    def __init__( self, reader, fields, chrom_col, feature_col, start_col, end_col, \
+                  strand_col, score_col, default_strand, fix_strand=False, raw_line='' ):
         GenomicInterval.__init__( self, reader, fields, chrom_col, start_col, end_col, strand_col, \
                                   default_strand, fix_strand=fix_strand )
-        # Handle score column.
+        # Handle feature, score column.
+        self.feature_col = feature_col
+        if self.feature_col >= self.nfields:
+            raise MissingFieldError( "No field for feature_col (%d)" % feature_col )
+        self.feature = self.fields[ self.feature_col ]
         self.score_col = score_col
         if self.score_col >= self.nfields:
-          raise MissingFieldError( "No field for score_col (%d)" % score_col )
+            raise MissingFieldError( "No field for score_col (%d)" % score_col )
         self.score = self.fields[ self.score_col ]
         
         # Attributes specific to GFF.
@@ -28,10 +32,11 @@
     """
     A GFF feature, which can include multiple intervals.
     """
-    def __init__( self, reader, chrom_col, start_col, end_col, strand_col, score_col, default_strand, \
-                  fix_strand=False, intervals=[] ):
-        GFFInterval.__init__( self, reader, intervals[0].fields, chrom_col, start_col, end_col, \
-                                strand_col, score_col, default_strand, fix_strand=fix_strand )
+    def __init__( self, reader, chrom_col, feature_col, start_col, end_col, \
+                  strand_col, score_col, default_strand, fix_strand=False, intervals=[] ):
+        GFFInterval.__init__( self, reader, intervals[0].fields, chrom_col, feature_col, \
+                              start_col, end_col, strand_col, score_col, default_strand, \
+                              fix_strand=fix_strand )
         self.intervals = intervals
         # Use intervals to set feature attributes.
         for interval in self.intervals:
@@ -99,20 +104,20 @@
         expect traditional interval format.
     """
     
-    def __init__( self, reader, chrom_col=0, start_col=3, end_col=4, strand_col=6, score_col=5, \
-                  fix_strand=False, **kwargs ):
+    def __init__( self, reader, chrom_col=0, feature_col=2, start_col=3, \
+                  end_col=4, strand_col=6, score_col=5, fix_strand=False, **kwargs ):
         NiceReaderWrapper.__init__( self, reader, chrom_col=chrom_col, start_col=start_col, end_col=end_col, \
                                     strand_col=strand_col, fix_strand=fix_strand, **kwargs )
-        # HACK: NiceReaderWrapper (bx-python) does not handle score_col yet, so store ourselves.
+        self.feature_col = feature_col
         self.score_col = score_col
         self.last_line = None
         self.cur_offset = 0
         self.seed_interval = None
     
     def parse_row( self, line ):
-        interval = GFFInterval( self, line.split( "\t" ), self.chrom_col, self.start_col, \
-                                self.end_col, self.strand_col, self.score_col, self.default_strand, \
-                                fix_strand=self.fix_strand, raw_line=line )
+        interval = GFFInterval( self, line.split( "\t" ), self.chrom_col, self.feature_col, \
+                                self.start_col, self.end_col, self.strand_col, self.score_col, \
+                                self.default_strand, fix_strand=self.fix_strand, raw_line=line )
         return interval
         
     def next( self ):
@@ -196,8 +201,9 @@
         self.seed_interval = interval
     
         # Return GFF feature with all intervals.    
-        return GFFFeature( self, self.chrom_col, self.start_col, self.end_col, self.strand_col, \
-                           self.score_col, self.default_strand, fix_strand=self.fix_strand, \
+        return GFFFeature( self, self.chrom_col, self.feature_col, self.start_col, \
+                           self.end_col, self.strand_col, self.score_col, \
+                           self.default_strand, fix_strand=self.fix_strand, \
                            intervals=feature_intervals )
         
 


--- a/tools/filters/gff/gff_filter_by_feature_count.py	Tue Mar 08 16:17:09 2011 -0500
+++ b/tools/filters/gff/gff_filter_by_feature_count.py	Tue Mar 08 19:01:59 2011 -0500
@@ -7,7 +7,7 @@
 """
 import sys
 from galaxy import eggs
-from galaxy.datatypes.util.gff_util import parse_gff_attributes
+from galaxy.datatypes.util.gff_util import GFFReaderWrapper
 
 assert sys.version_info[:2] >= ( 2, 4 )
 
@@ -58,77 +58,25 @@
             break
 
     # Do filtering.
-    kept_lines = 0
+    kept_features = 0
     skipped_lines = 0
     first_skipped_line = 0
     out = open( output_name, 'w' )
-    i = 0
-    cur_transcript_id = None
-    cur_transcript_lines = []
-    cur_transcript_feature_counts = {} # Key is feature name, value is feature count.
-    for i, line in enumerate( file( input_name ) ):
-        line = line.rstrip( '\r\n' )
-        if line and not line.startswith( '#' ):
-            try:
-                # GFF format: chrom, source, feature, chromStart, chromEnd, score, strand, attributes
-                elems = line.split( '\t' )
-                feature = elems[2]
-                start = str( long( elems[3] ) - 1 )
-                coords = [ long( start ), long( elems[4] ) ]
-                strand = elems[6]
-                attributes = parse_gff_attributes( elems[8] )
-                t_id = attributes.get( "transcript_id", None )
-                    
-                if not t_id:
-                    # No transcript id, so pass line to output.
-                    out.write( line )
-                    kept_lines += 1
-                    continue
-                
-                # There is a transcript ID, so process line at transcript level.
-                if t_id == cur_transcript_id:
-                    # Line is element of transcript; increment feature count.
-                    if not feature in cur_transcript_feature_counts:
-                        cur_transcript_feature_counts[feature] = 0
-                    cur_transcript_feature_counts[feature] += 1
-                    cur_transcript_lines.append( line )
-                    continue
-                    
-                #
-                # Line is part of new transcript; filter previous transcript.
-                #
-                
-                # Filter/write previous transcript.
-                result = eval( '%s %s' % ( cur_transcript_feature_counts.get( feature_name, 0 ), condition ) )
-                if cur_transcript_id and result:
-                    # Transcript passes filter; write transcript line to file."
-                    out.write( "\n".join( cur_transcript_lines ) + "\n" )
-                    kept_lines += len( cur_transcript_lines )
-
-                # Start new transcript.
-                cur_transcript_id = t_id
-                cur_transcript_feature_counts = {}
-                cur_transcript_feature_counts[feature] = 1
-                cur_transcript_lines = [ line ]
-            except Exception, e:
-                print e
-                skipped_lines += 1
-                if not first_skipped_line:
-                    first_skipped_line = i + 1
-        else:
-            skipped_lines += 1
-            if not first_skipped_line:
-                first_skipped_line = i + 1
-    
-    # Write last transcript.
-    if cur_transcript_id and eval( '%s %s' % ( cur_transcript_feature_counts[feature_name], condition ) ):
-        # Transcript passes filter; write transcript lints to file.
-        out.write( "\n".join( cur_transcript_lines ) + "\n" )
-        kept_lines += len( cur_transcript_lines )
+    for i, feature in enumerate( GFFReaderWrapper( open( input_name ), fix_strand=True ) ):
+        count = 0
+        for interval in feature.intervals:
+            if interval.feature == feature_name:
+                count += 1
+        if eval( '%s %s' % ( count, condition ) ):
+            # Keep feature.
+            for interval in feature.intervals:
+                out.write( "\t".join(interval.fields) + '\n' )
+            kept_features += 1
 
     # Clean up.
     out.close()
-    info_msg = "%i lines kept (%.2f%%) using condition %s.  " % ( kept_lines, float(kept_lines)/i * 100.0, feature_name + condition )
+    info_msg = "%i of %i features kept (%.2f%%) using condition %s.  " % \
+        ( kept_features, i, float(kept_features)/i * 100.0, feature_name + condition )
     if skipped_lines > 0:
         info_msg += "Skipped %d blank/comment/invalid lines starting with line #%d." %( skipped_lines, first_skipped_line )
     print info_msg

Repository URL: https://bitbucket.org/galaxy/galaxy-central/

--

This is a commit notification from bitbucket.org. You are receiving
this because you have the notification service enabled with this
email address as the recipient.

    

Bitbucket

tags

participants (1)