galaxy-dist commit 2b8161827380: Move gff_util.py from galaxy/tools/util to galaxy/datatypes/util because gff_utils are now used by the framework as well as tools.
# HG changeset patch -- Bitbucket.org # Project galaxy-dist # URL http://bitbucket.org/galaxy/galaxy-dist/overview # User jeremy goecks <jeremy.goecks@emory.edu> # Date 1289752184 18000 # Node ID 2b81618273808135efd9409f953161481944fec2 # Parent 98009db17cb2b21db626fc9c185a8fd22109a4cd Move gff_util.py from galaxy/tools/util to galaxy/datatypes/util because gff_utils are now used by the framework as well as tools. --- a/tools/new_operations/gops_subtract.py +++ b/tools/new_operations/gops_subtract.py @@ -21,7 +21,7 @@ from bx.intervals.io import * from bx.intervals.operations.subtract import * from bx.cookbook import doc_optparse from galaxy.tools.util.galaxyops import * -from galaxy.tools.util.gff_util import * +from galaxy.datatypes.util.gff_util import * assert sys.version_info[:2] >= ( 2, 4 ) --- a/tools/new_operations/gops_intersect.py +++ b/tools/new_operations/gops_intersect.py @@ -21,7 +21,7 @@ from bx.intervals.io import * from bx.intervals.operations.intersect import * from bx.cookbook import doc_optparse from galaxy.tools.util.galaxyops import * -from galaxy.tools.util.gff_util import * +from galaxy.datatypes.util.gff_util import * assert sys.version_info[:2] >= ( 2, 4 ) --- a/tools/extract/extract_genomic_dna.py +++ b/tools/extract/extract_genomic_dna.py @@ -15,7 +15,7 @@ from bx.cookbook import doc_optparse import bx.seq.nib import bx.seq.twobit from galaxy.tools.util.galaxyops import * -from galaxy.tools.util.gff_util import * +from galaxy.datatypes.util.gff_util import * assert sys.version_info[:2] >= ( 2, 4 ) --- a/lib/galaxy/datatypes/converters/interval_to_interval_index_converter.py +++ b/lib/galaxy/datatypes/converters/interval_to_interval_index_converter.py @@ -14,7 +14,7 @@ from galaxy import eggs import pkg_resources; pkg_resources.require( "bx-python" ) from galaxy.visualization.tracks.summary import * from bx.cookbook import doc_optparse -from galaxy.tools.util.gff_util import convert_gff_coords_to_bed +from galaxy.datatypes.util.gff_util import convert_gff_coords_to_bed from bx.interval_index_file import Indexes def main(): --- a/tools/filters/gff/gff_filter_by_feature_count.py +++ b/tools/filters/gff/gff_filter_by_feature_count.py @@ -7,7 +7,7 @@ Usage: """ import sys from galaxy import eggs -from galaxy.tools.util.gff_util import parse_gff_attributes +from galaxy.datatypes.util.gff_util import parse_gff_attributes assert sys.version_info[:2] >= ( 2, 4 ) --- a/tools/filters/gff_to_bed_converter.py +++ b/tools/filters/gff_to_bed_converter.py @@ -1,7 +1,7 @@ #!/usr/bin/env python import sys from galaxy import eggs -from galaxy.tools.util.gff_util import parse_gff_attributes +from galaxy.datatypes.util.gff_util import parse_gff_attributes assert sys.version_info[:2] >= ( 2, 4 ) --- /dev/null +++ b/lib/galaxy/datatypes/util/gff_util.py @@ -0,0 +1,226 @@ +""" +Provides utilities for working with GFF files. +""" + +from bx.intervals.io import * + +class GFFInterval( GenomicInterval ): + """ + A GFF interval, including attributes. If file is strictly a GFF file, + only attribute is 'group.' + """ + def __init__( self, reader, fields, chrom_col, start_col, end_col, strand_col, default_strand, \ + fix_strand=False, raw_line='' ): + GenomicInterval.__init__( self, reader, fields, chrom_col, start_col, end_col, strand_col, \ + default_strand, fix_strand=fix_strand ) + self.raw_line = raw_line + self.attributes = parse_gff_attributes( fields[8] ) + +class GFFFeature( GenomicInterval ): + """ + A GFF feature, which can include multiple intervals. + """ + def __init__( self, reader, chrom_col, start_col, end_col, strand_col, default_strand, \ + fix_strand=False, intervals=[] ): + GenomicInterval.__init__( self, reader, intervals[0].fields, chrom_col, start_col, end_col, \ + strand_col, default_strand, fix_strand=fix_strand ) + self.intervals = intervals + # Use intervals to set feature attributes. + for interval in self.intervals: + # Error checking. + if interval.chrom != self.chrom: + raise ValueError( "interval chrom does not match self chrom: %i != %i" % \ + ( interval.chrom, self.chrom ) ) + if interval.strand != self.strand: + raise ValueError( "interval strand does not match self strand: %s != %s" % \ + ( interval.strand, self.strand ) ) + # Set start, end of interval. + if interval.start < self.start: + self.start = interval.start + if interval.end > self.end: + self.end = interval.end + +class GFFIntervalToBEDReaderWrapper( NiceReaderWrapper ): + """ + Reader wrapper that reads GFF intervals/lines and automatically converts + them to BED format. + """ + + def parse_row( self, line ): + # HACK: this should return a GFF interval, but bx-python operations + # require GenomicInterval objects and subclasses will not work. + interval = GenomicInterval( self, line.split( "\t" ), self.chrom_col, self.start_col, \ + self.end_col, self.strand_col, self.default_strand, \ + fix_strand=self.fix_strand ) + interval = convert_gff_coords_to_bed( interval ) + return interval + +class GFFReaderWrapper( NiceReaderWrapper ): + """ + Reader wrapper for GFF files. + + Wrapper has two major functions: + (1) group entries for GFF file (via group column), GFF3 (via id attribute ), + or GTF (via gene_id/transcript id); + (2) convert coordinates from GFF format--starting and ending coordinates + are 1-based, closed--to the 'traditional'/BED interval format--0 based, + half-open. This is useful when using GFF files as inputs to tools that + expect traditional interval format. + """ + + def __init__( self, reader, **kwargs ): + """ + Create wrapper. Defaults are group_entries=False and + convert_coords_to_bed=True to support backward compatibility. + """ + NiceReaderWrapper.__init__( self, reader, **kwargs ) + self.group_entries = kwargs.get( 'group_entries', False ) + self.convert_coords_to_bed = kwargs.get( 'convert_coords_to_bed', True ) + self.last_line = None + self.cur_offset = 0 + self.seed_interval = None + + def parse_row( self, line ): + interval = GFFInterval( self, line.split( "\t" ), self.chrom_col, self.start_col, \ + self.end_col, self.strand_col, self.default_strand, \ + fix_strand=self.fix_strand, raw_line=line ) + if self.convert_coords_to_bed: + interval = convert_gff_coords_to_bed( interval ) + return interval + + def next( self ): + """ Returns next GFFFeature. """ + + # + # Helper function. + # + + def handle_parse_error( parse_error ): + """ Actions to take when ParseError found. """ + if self.outstream: + if self.print_delegate and hasattr(self.print_delegate,"__call__"): + self.print_delegate( self.outstream, e, self ) + self.skipped += 1 + # no reason to stuff an entire bad file into memmory + if self.skipped < 10: + self.skipped_lines.append( ( self.linenum, self.current_line, str( e ) ) ) + + # + # Get next GFFFeature + # + + # If there is no seed interval, set one. Also, if there are no more + # intervals to read, this is where iterator dies. + if not self.seed_interval: + while not self.seed_interval: + try: + self.seed_interval = GenomicIntervalReader.next( self ) + except ParseError, e: + handle_parse_error( e ) + + # Initialize feature name from seed. + feature_group = self.seed_interval.attributes.get( 'group', None ) # For GFF + feature_id = self.seed_interval.attributes.get( 'id', None ) # For GFF3 + feature_gene_id = self.seed_interval.attributes.get( 'gene_id', None ) # For GTF + feature_transcript_id = self.seed_interval.attributes.get( 'transcript_id', None ) # For GTF + + # Read all intervals associated with seed. + feature_intervals = [] + feature_intervals.append( self.seed_interval ) + while True: + try: + interval = GenomicIntervalReader.next( self ) + except StopIteration, e: + # No more intervals to read, but last feature needs to be + # returned. + interval = None + break + except ParseError, e: + handle_parse_error( e ) + + # If interval not associated with feature, break. + group = interval.attributes.get( 'group', None ) + if group and feature_group != group: + break + id = interval.attributes.get( 'id', None ) + if id and feature_id != id: + break + gene_id = interval.attributes.get( 'gene_id', None ) + transcript_id = interval.attributes.get( 'transcript_id', None ) + if transcript_id and transcript_id != feature_transcript_id and gene_id and \ + gene_id != feature_gene_id: + break + + # Interval associated with feature. + feature_intervals.append( interval ) + + # Last interval read is the seed for the next interval. + self.seed_interval = interval + + # Return GFF feature with all intervals. + return GFFFeature( self, self.chrom_col, self.start_col, self.end_col, self.strand_col, \ + self.default_strand, fix_strand=self.fix_strand, \ + intervals=feature_intervals ) + + +def convert_bed_coords_to_gff( interval ): + """ + Converts an interval object's coordinates from BED format to GFF format. + Accepted object types include GenomicInterval and list (where the first + element in the list is the interval's start, and the second element is + the interval's end). + """ + if type( interval ) is GenomicInterval: + interval.start += 1 + elif type ( interval ) is list: + interval[ 0 ] += 1 + return interval + +def convert_gff_coords_to_bed( interval ): + """ + Converts an interval object's coordinates from GFF format to BED format. + Accepted object types include GenomicInterval and list (where the first + element in the list is the interval's start, and the second element is + the interval's end). + """ + if type( interval ) is GenomicInterval: + interval.start -= 1 + elif type ( interval ) is list: + interval[ 0 ] -= 1 + return interval + +def parse_gff_attributes( attr_str ): + """ + Parses a GFF/GTF attribute string and returns a dictionary of name-value + pairs. The general format for a GFF3 attributes string is + name1=value1;name2=value2 + The general format for a GTF attribute string is + name1 "value1" ; name2 "value2" + The general format for a GFF attribute string is a single string that + denotes the interval's group; in this case, method returns a dictionary + with a single key-value pair, and key name is 'group' + """ + attributes_list = attr_str.split(";") + attributes = {} + for name_value_pair in attributes_list: + # Try splitting by space and, if necessary, by '=' sign. + pair = name_value_pair.strip().split(" ") + if len( pair ) == 1: + pair = name_value_pair.strip().split("=") + if len( pair ) == 1: + # Could not split for some reason -- raise exception? + continue + if pair == '': + continue + name = pair[0].strip() + if name == '': + continue + # Need to strip double quote from values + value = pair[1].strip(" \"") + attributes[ name ] = value + + if len( attributes ) == 0: + # Could not split attributes string, so entire string must be + # 'group' attribute. This is the case for strictly GFF files. + attributes['group'] = attr_str + return attributes --- a/lib/galaxy/datatypes/converters/gff_to_interval_index_converter.py +++ b/lib/galaxy/datatypes/converters/gff_to_interval_index_converter.py @@ -12,7 +12,7 @@ from __future__ import division import sys, fileinput from galaxy import eggs import pkg_resources; pkg_resources.require( "bx-python" ) -from galaxy.tools.util.gff_util import * +from galaxy.datatypes.util.gff_util import * from bx.interval_index_file import Indexes def main(): --- a/lib/galaxy/datatypes/converters/interval_to_summary_tree_converter.py +++ b/lib/galaxy/datatypes/converters/interval_to_summary_tree_converter.py @@ -14,7 +14,7 @@ import pkg_resources; pkg_resources.requ from galaxy.visualization.tracks.summary import * from bx.intervals.io import * from bx.cookbook import doc_optparse -from galaxy.tools.util.gff_util import * +from galaxy.datatypes.util.gff_util import * def main(): # Read options, args. --- a/tools/new_operations/flanking_features.py +++ b/tools/new_operations/flanking_features.py @@ -18,7 +18,7 @@ from bx.cookbook import doc_optparse from galaxy.tools.util.galaxyops import * from bx.intervals.io import * from bx.intervals.operations import quicksect -from galaxy.tools.util.gff_util import * +from galaxy.datatypes.util.gff_util import * assert sys.version_info[:2] >= ( 2, 4 ) --- a/lib/galaxy/tools/util/gff_util.py +++ /dev/null @@ -1,226 +0,0 @@ -""" -Provides utilities for working with GFF files. -""" - -from bx.intervals.io import * - -class GFFInterval( GenomicInterval ): - """ - A GFF interval, including attributes. If file is strictly a GFF file, - only attribute is 'group.' - """ - def __init__( self, reader, fields, chrom_col, start_col, end_col, strand_col, default_strand, \ - fix_strand=False, raw_line='' ): - GenomicInterval.__init__( self, reader, fields, chrom_col, start_col, end_col, strand_col, \ - default_strand, fix_strand=fix_strand ) - self.raw_line = raw_line - self.attributes = parse_gff_attributes( fields[8] ) - -class GFFFeature( GenomicInterval ): - """ - A GFF feature, which can include multiple intervals. - """ - def __init__( self, reader, chrom_col, start_col, end_col, strand_col, default_strand, \ - fix_strand=False, intervals=[] ): - GenomicInterval.__init__( self, reader, intervals[0].fields, chrom_col, start_col, end_col, \ - strand_col, default_strand, fix_strand=fix_strand ) - self.intervals = intervals - # Use intervals to set feature attributes. - for interval in self.intervals: - # Error checking. - if interval.chrom != self.chrom: - raise ValueError( "interval chrom does not match self chrom: %i != %i" % \ - ( interval.chrom, self.chrom ) ) - if interval.strand != self.strand: - raise ValueError( "interval strand does not match self strand: %s != %s" % \ - ( interval.strand, self.strand ) ) - # Set start, end of interval. - if interval.start < self.start: - self.start = interval.start - if interval.end > self.end: - self.end = interval.end - -class GFFIntervalToBEDReaderWrapper( NiceReaderWrapper ): - """ - Reader wrapper that reads GFF intervals/lines and automatically converts - them to BED format. - """ - - def parse_row( self, line ): - # HACK: this should return a GFF interval, but bx-python operations - # require GenomicInterval objects and subclasses will not work. - interval = GenomicInterval( self, line.split( "\t" ), self.chrom_col, self.start_col, \ - self.end_col, self.strand_col, self.default_strand, \ - fix_strand=self.fix_strand ) - interval = convert_gff_coords_to_bed( interval ) - return interval - -class GFFReaderWrapper( NiceReaderWrapper ): - """ - Reader wrapper for GFF files. - - Wrapper has two major functions: - (1) group entries for GFF file (via group column), GFF3 (via id attribute ), - or GTF (via gene_id/transcript id); - (2) convert coordinates from GFF format--starting and ending coordinates - are 1-based, closed--to the 'traditional'/BED interval format--0 based, - half-open. This is useful when using GFF files as inputs to tools that - expect traditional interval format. - """ - - def __init__( self, reader, **kwargs ): - """ - Create wrapper. Defaults are group_entries=False and - convert_coords_to_bed=True to support backward compatibility. - """ - NiceReaderWrapper.__init__( self, reader, **kwargs ) - self.group_entries = kwargs.get( 'group_entries', False ) - self.convert_coords_to_bed = kwargs.get( 'convert_coords_to_bed', True ) - self.last_line = None - self.cur_offset = 0 - self.seed_interval = None - - def parse_row( self, line ): - interval = GFFInterval( self, line.split( "\t" ), self.chrom_col, self.start_col, \ - self.end_col, self.strand_col, self.default_strand, \ - fix_strand=self.fix_strand, raw_line=line ) - if self.convert_coords_to_bed: - interval = convert_gff_coords_to_bed( interval ) - return interval - - def next( self ): - """ Returns next GFFFeature. """ - - # - # Helper function. - # - - def handle_parse_error( parse_error ): - """ Actions to take when ParseError found. """ - if self.outstream: - if self.print_delegate and hasattr(self.print_delegate,"__call__"): - self.print_delegate( self.outstream, e, self ) - self.skipped += 1 - # no reason to stuff an entire bad file into memmory - if self.skipped < 10: - self.skipped_lines.append( ( self.linenum, self.current_line, str( e ) ) ) - - # - # Get next GFFFeature - # - - # If there is no seed interval, set one. Also, if there are no more - # intervals to read, this is where iterator dies. - if not self.seed_interval: - while not self.seed_interval: - try: - self.seed_interval = GenomicIntervalReader.next( self ) - except ParseError, e: - handle_parse_error( e ) - - # Initialize feature name from seed. - feature_group = self.seed_interval.attributes.get( 'group', None ) # For GFF - feature_id = self.seed_interval.attributes.get( 'id', None ) # For GFF3 - feature_gene_id = self.seed_interval.attributes.get( 'gene_id', None ) # For GTF - feature_transcript_id = self.seed_interval.attributes.get( 'transcript_id', None ) # For GTF - - # Read all intervals associated with seed. - feature_intervals = [] - feature_intervals.append( self.seed_interval ) - while True: - try: - interval = GenomicIntervalReader.next( self ) - except StopIteration, e: - # No more intervals to read, but last feature needs to be - # returned. - interval = None - break - except ParseError, e: - handle_parse_error( e ) - - # If interval not associated with feature, break. - group = interval.attributes.get( 'group', None ) - if group and feature_group != group: - break - id = interval.attributes.get( 'id', None ) - if id and feature_id != id: - break - gene_id = interval.attributes.get( 'gene_id', None ) - transcript_id = interval.attributes.get( 'transcript_id', None ) - if transcript_id and transcript_id != feature_transcript_id and gene_id and \ - gene_id != feature_gene_id: - break - - # Interval associated with feature. - feature_intervals.append( interval ) - - # Last interval read is the seed for the next interval. - self.seed_interval = interval - - # Return GFF feature with all intervals. - return GFFFeature( self, self.chrom_col, self.start_col, self.end_col, self.strand_col, \ - self.default_strand, fix_strand=self.fix_strand, \ - intervals=feature_intervals ) - - -def convert_bed_coords_to_gff( interval ): - """ - Converts an interval object's coordinates from BED format to GFF format. - Accepted object types include GenomicInterval and list (where the first - element in the list is the interval's start, and the second element is - the interval's end). - """ - if type( interval ) is GenomicInterval: - interval.start += 1 - elif type ( interval ) is list: - interval[ 0 ] += 1 - return interval - -def convert_gff_coords_to_bed( interval ): - """ - Converts an interval object's coordinates from GFF format to BED format. - Accepted object types include GenomicInterval and list (where the first - element in the list is the interval's start, and the second element is - the interval's end). - """ - if type( interval ) is GenomicInterval: - interval.start -= 1 - elif type ( interval ) is list: - interval[ 0 ] -= 1 - return interval - -def parse_gff_attributes( attr_str ): - """ - Parses a GFF/GTF attribute string and returns a dictionary of name-value - pairs. The general format for a GFF3 attributes string is - name1=value1;name2=value2 - The general format for a GTF attribute string is - name1 "value1" ; name2 "value2" - The general format for a GFF attribute string is a single string that - denotes the interval's group; in this case, method returns a dictionary - with a single key-value pair, and key name is 'group' - """ - attributes_list = attr_str.split(";") - attributes = {} - for name_value_pair in attributes_list: - # Try splitting by space and, if necessary, by '=' sign. - pair = name_value_pair.strip().split(" ") - if len( pair ) == 1: - pair = name_value_pair.strip().split("=") - if len( pair ) == 1: - # Could not split for some reason -- raise exception? - continue - if pair == '': - continue - name = pair[0].strip() - if name == '': - continue - # Need to strip double quote from values - value = pair[1].strip(" \"") - attributes[ name ] = value - - if len( attributes ) == 0: - # Could not split attributes string, so entire string must be - # 'group' attribute. This is the case for strictly GFF files. - attributes['group'] = attr_str - return attributes
participants (1)
-
commits-noreply@bitbucket.org