1 new changeset in galaxy-central: http://bitbucket.org/galaxy/galaxy-central/changeset/917fe1e0356e/ changeset: r5203:917fe1e0356e user: jgoecks date: 2011-03-09 20:22:10 summary: Fix bugs in GFFReaderWrapper so that GFF3 files are read properly. Add GFF3 test to gff_filter_by_feature_count. affected #: 4 files (903 bytes) --- a/lib/galaxy/datatypes/util/gff_util.py Tue Mar 08 19:01:59 2011 -0500 +++ b/lib/galaxy/datatypes/util/gff_util.py Wed Mar 09 14:22:10 2011 -0500 @@ -12,8 +12,18 @@ """ def __init__( self, reader, fields, chrom_col, feature_col, start_col, end_col, \ strand_col, score_col, default_strand, fix_strand=False, raw_line='' ): + # HACK: GFF format allows '.' for strand but GenomicInterval does not. To get around this, + # temporarily set strand and then unset after initing GenomicInterval. + unknown_strand = False + if not fix_strand and fields[ strand_col ] == '.': + unknown_strand = True + fields[ strand_col ] = '+' GenomicInterval.__init__( self, reader, fields, chrom_col, start_col, end_col, strand_col, \ default_strand, fix_strand=fix_strand ) + if unknown_strand: + self.strand = '.' + self.fields[ strand_col ] = '.' + # Handle feature, score column. self.feature_col = feature_col if self.feature_col >= self.nfields: @@ -40,13 +50,10 @@ self.intervals = intervals # Use intervals to set feature attributes. for interval in self.intervals: - # Error checking. + # Error checking. NOTE: intervals need not share the same strand. if interval.chrom != self.chrom: - raise ValueError( "interval chrom does not match self chrom: %i != %i" % \ + raise ValueError( "interval chrom does not match self chrom: %s != %s" % \ ( interval.chrom, self.chrom ) ) - if interval.strand != self.strand: - raise ValueError( "interval strand does not match self strand: %s != %s" % \ - ( interval.strand, self.strand ) ) # Set start, end of interval. 
if interval.start < self.start: self.start = interval.start @@ -140,7 +147,7 @@ # For debugging, uncomment this to propagate parsing exceptions up. # I.e. the underlying reason for an unexpected StopIteration exception # can be found by uncommenting this. - # raise e + #raise e # # Get next GFFFeature @@ -163,7 +170,7 @@ # Initialize feature name from seed. feature_group = self.seed_interval.attributes.get( 'group', None ) # For GFF - feature_id = self.seed_interval.attributes.get( 'id', None ) # For GFF3 + feature_id = self.seed_interval.attributes.get( 'ID', None ) # For GFF3 feature_gene_id = self.seed_interval.attributes.get( 'gene_id', None ) # For GTF feature_transcript_id = self.seed_interval.attributes.get( 'transcript_id', None ) # For GTF @@ -183,11 +190,14 @@ # If interval not associated with feature, break. group = interval.attributes.get( 'group', None ) + # GFF test: if group and feature_group != group: break - id = interval.attributes.get( 'id', None ) - if id and feature_id != id: + # GFF3 test: + parent = interval.attributes.get( 'Parent', None ) + if feature_id and feature_id != parent: break + # GTF test: gene_id = interval.attributes.get( 'gene_id', None ) transcript_id = interval.attributes.get( 'transcript_id', None ) if ( transcript_id and transcript_id != feature_transcript_id ) or \ --- a/tools/filters/gff/gff_filter_by_feature_count.py Tue Mar 08 19:01:59 2011 -0500 +++ b/tools/filters/gff/gff_filter_by_feature_count.py Wed Mar 09 14:22:10 2011 -0500 @@ -8,8 +8,7 @@ import sys from galaxy import eggs from galaxy.datatypes.util.gff_util import GFFReaderWrapper - -assert sys.version_info[:2] >= ( 2, 4 ) +from bx.intervals.io import GenomicInterval # Valid operators, ordered so that complex operators (e.g. 
'>') @@ -62,7 +61,9 @@ skipped_lines = 0 first_skipped_line = 0 out = open( output_name, 'w' ) - for i, feature in enumerate( GFFReaderWrapper( open( input_name ), fix_strand=True ) ): + for i, feature in enumerate( GFFReaderWrapper( open( input_name ) ) ): + if not isinstance( feature, GenomicInterval ): + continue count = 0 for interval in feature.intervals: if interval.feature == feature_name: @@ -73,6 +74,9 @@ out.write( "\t".join(interval.fields) + '\n' ) kept_features += 1 + # Needed because i is 0-based but want to display stats using 1-based. + i += 1 + # Clean up. out.close() info_msg = "%i of %i features kept (%.2f%%) using condition %s. " % \ --- a/tools/filters/gff/gff_filter_by_feature_count.xml Tue Mar 08 19:01:59 2011 -0500 +++ b/tools/filters/gff/gff_filter_by_feature_count.xml Wed Mar 09 14:22:10 2011 -0500 @@ -20,12 +20,20 @@ <data format="input" name="out_file1" metadata_source="input_file1"/></outputs><tests> - <test> - <param name="input_file1" value="gops_subtract_in1.gff"/> - <param name="feature_name" value="exon"/> - <param name="cond" value=">1"/> - <output name="out_file1" file="gff_filter_by_feature_count_out1.gff"/> - </test> + <!-- Test GTF filtering. --> + <test> + <param name="input_file1" value="gops_subtract_in1.gff"/> + <param name="feature_name" value="exon"/> + <param name="cond" value=">1"/> + <output name="out_file1" file="gff_filter_by_feature_count_out1.gff"/> + </test> + <!-- Test GFF3 filtering. --> + <test> + <param name="input_file1" value="5.gff3"/> + <param name="feature_name" value="HSP"/> + <param name="cond" value=">=5"/> + <output name="out_file1" file="gff_filter_by_feature_count_out2.gff"/> + </test></tests><help> Repository URL: https://bitbucket.org/galaxy/galaxy-central/ -- This is a commit notification from bitbucket.org. You are receiving this because you have the service enabled, addressing the recipient of this email.