commit/galaxy-central: jgoecks: Use GFFReaderWrapper in gff_filter_by_feature_count tool in order to leverage support for GTF, GFF, and GFF3 found in reader wrapper. This also simplifies the tool considerably.
1 new changeset in galaxy-central: http://bitbucket.org/galaxy/galaxy-central/changeset/c9e6bc81817a/ changeset: r5202:c9e6bc81817a user: jgoecks date: 2011-03-09 01:01:59 summary: Use GFFReaderWrapper in gff_filter_by_feature_count tool in order to leverage support for GTF, GFF, and GFF3 found in reader wrapper. This also simplifies the tool considerably. affected #: 2 files (2.7 KB) --- a/lib/galaxy/datatypes/util/gff_util.py Tue Mar 08 16:17:09 2011 -0500 +++ b/lib/galaxy/datatypes/util/gff_util.py Tue Mar 08 19:01:59 2011 -0500 @@ -10,14 +10,18 @@ A GFF interval, including attributes. If file is strictly a GFF file, only attribute is 'group.' """ - def __init__( self, reader, fields, chrom_col, start_col, end_col, strand_col, \ - score_col, default_strand, fix_strand=False, raw_line='' ): + def __init__( self, reader, fields, chrom_col, feature_col, start_col, end_col, \ + strand_col, score_col, default_strand, fix_strand=False, raw_line='' ): GenomicInterval.__init__( self, reader, fields, chrom_col, start_col, end_col, strand_col, \ default_strand, fix_strand=fix_strand ) - # Handle score column. + # Handle feature, score column. + self.feature_col = feature_col + if self.feature_col >= self.nfields: + raise MissingFieldError( "No field for feature_col (%d)" % feature_col ) + self.feature = self.fields[ self.feature_col ] self.score_col = score_col if self.score_col >= self.nfields: - raise MissingFieldError( "No field for score_col (%d)" % score_col ) + raise MissingFieldError( "No field for score_col (%d)" % score_col ) self.score = self.fields[ self.score_col ] # Attributes specific to GFF. @@ -28,10 +32,11 @@ """ A GFF feature, which can include multiple intervals. 
""" - def __init__( self, reader, chrom_col, start_col, end_col, strand_col, score_col, default_strand, \ - fix_strand=False, intervals=[] ): - GFFInterval.__init__( self, reader, intervals[0].fields, chrom_col, start_col, end_col, \ - strand_col, score_col, default_strand, fix_strand=fix_strand ) + def __init__( self, reader, chrom_col, feature_col, start_col, end_col, \ + strand_col, score_col, default_strand, fix_strand=False, intervals=[] ): + GFFInterval.__init__( self, reader, intervals[0].fields, chrom_col, feature_col, \ + start_col, end_col, strand_col, score_col, default_strand, \ + fix_strand=fix_strand ) self.intervals = intervals # Use intervals to set feature attributes. for interval in self.intervals: @@ -99,20 +104,20 @@ expect traditional interval format. """ - def __init__( self, reader, chrom_col=0, start_col=3, end_col=4, strand_col=6, score_col=5, \ - fix_strand=False, **kwargs ): + def __init__( self, reader, chrom_col=0, feature_col=2, start_col=3, \ + end_col=4, strand_col=6, score_col=5, fix_strand=False, **kwargs ): NiceReaderWrapper.__init__( self, reader, chrom_col=chrom_col, start_col=start_col, end_col=end_col, \ strand_col=strand_col, fix_strand=fix_strand, **kwargs ) - # HACK: NiceReaderWrapper (bx-python) does not handle score_col yet, so store ourselves. 
+ self.feature_col = feature_col self.score_col = score_col self.last_line = None self.cur_offset = 0 self.seed_interval = None def parse_row( self, line ): - interval = GFFInterval( self, line.split( "\t" ), self.chrom_col, self.start_col, \ - self.end_col, self.strand_col, self.score_col, self.default_strand, \ - fix_strand=self.fix_strand, raw_line=line ) + interval = GFFInterval( self, line.split( "\t" ), self.chrom_col, self.feature_col, \ + self.start_col, self.end_col, self.strand_col, self.score_col, \ + self.default_strand, fix_strand=self.fix_strand, raw_line=line ) return interval def next( self ): @@ -196,8 +201,9 @@ self.seed_interval = interval # Return GFF feature with all intervals. - return GFFFeature( self, self.chrom_col, self.start_col, self.end_col, self.strand_col, \ - self.score_col, self.default_strand, fix_strand=self.fix_strand, \ + return GFFFeature( self, self.chrom_col, self.feature_col, self.start_col, \ + self.end_col, self.strand_col, self.score_col, \ + self.default_strand, fix_strand=self.fix_strand, \ intervals=feature_intervals ) --- a/tools/filters/gff/gff_filter_by_feature_count.py Tue Mar 08 16:17:09 2011 -0500 +++ b/tools/filters/gff/gff_filter_by_feature_count.py Tue Mar 08 19:01:59 2011 -0500 @@ -7,7 +7,7 @@ """ import sys from galaxy import eggs -from galaxy.datatypes.util.gff_util import parse_gff_attributes +from galaxy.datatypes.util.gff_util import GFFReaderWrapper assert sys.version_info[:2] >= ( 2, 4 ) @@ -58,77 +58,25 @@ break # Do filtering. - kept_lines = 0 + kept_features = 0 skipped_lines = 0 first_skipped_line = 0 out = open( output_name, 'w' ) - i = 0 - cur_transcript_id = None - cur_transcript_lines = [] - cur_transcript_feature_counts = {} # Key is feature name, value is feature count. 
- for i, line in enumerate( file( input_name ) ): - line = line.rstrip( '\r\n' ) - if line and not line.startswith( '#' ): - try: - # GFF format: chrom, source, feature, chromStart, chromEnd, score, strand, attributes - elems = line.split( '\t' ) - feature = elems[2] - start = str( long( elems[3] ) - 1 ) - coords = [ long( start ), long( elems[4] ) ] - strand = elems[6] - attributes = parse_gff_attributes( elems[8] ) - t_id = attributes.get( "transcript_id", None ) - - if not t_id: - # No transcript id, so pass line to output. - out.write( line ) - kept_lines += 1 - continue - - # There is a transcript ID, so process line at transcript level. - if t_id == cur_transcript_id: - # Line is element of transcript; increment feature count. - if not feature in cur_transcript_feature_counts: - cur_transcript_feature_counts[feature] = 0 - cur_transcript_feature_counts[feature] += 1 - cur_transcript_lines.append( line ) - continue - - # - # Line is part of new transcript; filter previous transcript. - # - - # Filter/write previous transcript. - result = eval( '%s %s' % ( cur_transcript_feature_counts.get( feature_name, 0 ), condition ) ) - if cur_transcript_id and result: - # Transcript passes filter; write transcript line to file." - out.write( "\n".join( cur_transcript_lines ) + "\n" ) - kept_lines += len( cur_transcript_lines ) - - # Start new transcript. - cur_transcript_id = t_id - cur_transcript_feature_counts = {} - cur_transcript_feature_counts[feature] = 1 - cur_transcript_lines = [ line ] - except Exception, e: - print e - skipped_lines += 1 - if not first_skipped_line: - first_skipped_line = i + 1 - else: - skipped_lines += 1 - if not first_skipped_line: - first_skipped_line = i + 1 - - # Write last transcript. - if cur_transcript_id and eval( '%s %s' % ( cur_transcript_feature_counts[feature_name], condition ) ): - # Transcript passes filter; write transcript lints to file. 
- out.write( "\n".join( cur_transcript_lines ) + "\n" ) - kept_lines += len( cur_transcript_lines ) + for i, feature in enumerate( GFFReaderWrapper( open( input_name ), fix_strand=True ) ): + count = 0 + for interval in feature.intervals: + if interval.feature == feature_name: + count += 1 + if eval( '%s %s' % ( count, condition ) ): + # Keep feature. + for interval in feature.intervals: + out.write( "\t".join(interval.fields) + '\n' ) + kept_features += 1 # Clean up. out.close() - info_msg = "%i lines kept (%.2f%%) using condition %s. " % ( kept_lines, float(kept_lines)/i * 100.0, feature_name + condition ) + info_msg = "%i of %i features kept (%.2f%%) using condition %s. " % \ + ( kept_features, i, float(kept_features)/i * 100.0, feature_name + condition ) if skipped_lines > 0: info_msg += "Skipped %d blank/comment/invalid lines starting with line #%d." %( skipped_lines, first_skipped_line ) print info_msg Repository URL: https://bitbucket.org/galaxy/galaxy-central/ -- This is a commit notification from bitbucket.org. You are receiving this because you have the service enabled, addressing the recipient of this email.
participants (1)
- Bitbucket