# HG changeset patch -- Bitbucket.org # Project galaxy-dist # URL http://bitbucket.org/galaxy/galaxy-dist/overview # User jeremy goecks <jeremy.goecks@emory.edu> # Date 1289591537 18000 # Node ID ecc1cefbccba6135f824f9573f06c130fd6d08b4 # Parent b124f54952dee106f4ca87ea89a7f828c41562c3 Enhance GFFReader to read and return complete features (genes/transcripts), which are composed of multiple intervals/blocks. This required hacking around a couple limitations of bx-python, but these can be easily fixed when bx-python is updated. Use enhanced functionality to correctly create converted datasets for visualizing GFF files. Also updated GOPS tools to use a simpler GFF reader wrapper to read in intervals and convert to BED coordinates. --- a/tools/new_operations/gops_subtract.py +++ b/tools/new_operations/gops_subtract.py @@ -44,11 +44,11 @@ def main(): # Set readers to handle either GFF or default format. if in1_gff_format: - in1_reader_wrapper = GFFReaderWrapper + in1_reader_wrapper = GFFIntervalToBEDReaderWrapper else: in1_reader_wrapper = NiceReaderWrapper if in2_gff_format: - in2_reader_wrapper = GFFReaderWrapper + in2_reader_wrapper = GFFIntervalToBEDReaderWrapper else: in2_reader_wrapper = NiceReaderWrapper --- a/tools/new_operations/gops_intersect.py +++ b/tools/new_operations/gops_intersect.py @@ -44,11 +44,11 @@ def main(): # Set readers to handle either GFF or default format. if in1_gff_format: - in1_reader_wrapper = GFFReaderWrapper + in1_reader_wrapper = GFFIntervalToBEDReaderWrapper else: in1_reader_wrapper = NiceReaderWrapper if in2_gff_format: - in2_reader_wrapper = GFFReaderWrapper + in2_reader_wrapper = GFFIntervalToBEDReaderWrapper else: in2_reader_wrapper = NiceReaderWrapper @@ -66,10 +66,10 @@ def main(): fix_strand=True ) out_file = open( out_fname, "w" ) - + try: for line in intersect( [g1,g2], pieces=pieces, mincols=mincols ): - if type( line ) == GenomicInterval: + if isinstance( line, GenomicInterval ): if in1_gff_format: line = convert_bed_coords_to_gff( line ) out_file.write( "%s\n" % "\t".join( line.fields ) ) --- /dev/null +++ b/lib/galaxy/datatypes/converters/gff_to_interval_index_converter.py @@ -0,0 +1,55 @@ +#!/usr/bin/env python + +""" +Convert from GFF file to interval index file. + +usage: + python gff_to_interval_index_converter.py [input] [output] +""" + +from __future__ import division + +import sys, fileinput +from galaxy import eggs +import pkg_resources; pkg_resources.require( "bx-python" ) +from galaxy.tools.util.gff_util import * +from bx.interval_index_file import Indexes + +def main(): + # Arguments + input_fname, out_fname = sys.argv[1:] + + # Do conversion. + chr_col, start_col, end_col, strand_col = ( 0, 3, 4, 6 ) + index = Indexes() + offset = 0 + reader_wrapper = GFFReaderWrapper( fileinput.FileInput( input_fname ), + chrom_col=chr_col, + start_col=start_col, + end_col=end_col, + strand_col=strand_col, + fix_strand=True ) + for feature in list( reader_wrapper ): + # TODO: need to address comments: + # if comment: + # increment_offset. + + # Add feature; index expects BED coordinates. + convert_gff_coords_to_bed( feature ) + index.add( feature.chrom, feature.start, feature.end, offset ) + + # Increment offset by feature length; feature length is all + # intervals/lines that comprise feature. + feature_len = 0 + for interval in feature.intervals: + # HACK: +1 for EOL char. Need bx-python to provide raw_line itself + # b/c TableReader strips EOL characters, thus changing the line + # length. + feature_len += len( interval.raw_line ) + 1 + offset += feature_len + + index.write( open(out_fname, "w") ) + +if __name__ == "__main__": + main() + --- a/lib/galaxy/datatypes/converters/interval_to_summary_tree_converter.py +++ b/lib/galaxy/datatypes/converters/interval_to_summary_tree_converter.py @@ -14,7 +14,7 @@ import pkg_resources; pkg_resources.requ from galaxy.visualization.tracks.summary import * from bx.intervals.io import * from bx.cookbook import doc_optparse -from galaxy.tools.util.gff_util import GFFReaderWrapper +from galaxy.tools.util.gff_util import * def main(): # Read options, args. @@ -40,9 +40,12 @@ def main(): strand_col=strand_col, fix_strand=True ) st = SummaryTree(block_size=25, levels=6, draw_cutoff=150, detail_cutoff=30) - for line in list( reader_wrapper ): - if type( line ) is GenomicInterval: - st.insert_range( line.chrom, long( line.start ), long( line.end ) ) + for feature in list( reader_wrapper ): + if isinstance( feature, GenomicInterval ): + # Tree expects BED coordinates. + if type( feature ) is GFFFeature: + convert_gff_coords_to_bed( feature ) + st.insert_range( feature.chrom, long( feature.start ), long( feature.end ) ) st.write(out_fname) --- a/lib/galaxy/datatypes/converters/gff_to_interval_index_converter.xml +++ b/lib/galaxy/datatypes/converters/gff_to_interval_index_converter.xml @@ -1,6 +1,6 @@ -<tool id="CONVERTER_gff_to_interval_index_0" name="Convert BED to Interval Index" version="1.0.0" hidden="true"> +<tool id="CONVERTER_gff_to_interval_index_0" name="Convert GFF to Interval Index" version="1.0.0" hidden="true"><!-- <description>__NOT_USED_CURRENTLY_FOR_CONVERTERS__</description> --> - <command interpreter="python">interval_to_interval_index_converter.py $input1 $output1 --gff</command> + <command interpreter="python">gff_to_interval_index_converter.py $input1 $output1</command><inputs><page><param format="gff" name="input1" type="data" label="Choose GFF file"/> --- a/tools/new_operations/flanking_features.py +++ b/tools/new_operations/flanking_features.py @@ -153,11 +153,11 @@ def main(): # Set readers to handle either GFF or default format. if in1_gff_format: - in1_reader_wrapper = GFFReaderWrapper + in1_reader_wrapper = GFFIntervalToBEDReaderWrapper else: in1_reader_wrapper = NiceReaderWrapper if in2_gff_format: - in2_reader_wrapper = GFFReaderWrapper + in2_reader_wrapper = GFFIntervalToBEDReaderWrapper else: in2_reader_wrapper = NiceReaderWrapper --- a/lib/galaxy/tools/util/gff_util.py +++ b/lib/galaxy/tools/util/gff_util.py @@ -2,25 +2,173 @@ Provides utilities for working with GFF files. """ -from bx.intervals.io import NiceReaderWrapper, GenomicInterval +from bx.intervals.io import * + +class GFFInterval( GenomicInterval ): + """ + A GFF interval, including attributes. If file is strictly a GFF file, + only attribute is 'group.' + """ + def __init__( self, reader, fields, chrom_col, start_col, end_col, strand_col, default_strand, \ + fix_strand=False, raw_line='' ): + GenomicInterval.__init__( self, reader, fields, chrom_col, start_col, end_col, strand_col, \ + default_strand, fix_strand=fix_strand ) + self.raw_line = raw_line + self.attributes = parse_gff_attributes( fields[8] ) + +class GFFFeature( GenomicInterval ): + """ + A GFF feature, which can include multiple intervals. + """ + def __init__( self, reader, chrom_col, start_col, end_col, strand_col, default_strand, \ + fix_strand=False, intervals=[] ): + GenomicInterval.__init__( self, reader, intervals[0].fields, chrom_col, start_col, end_col, \ + strand_col, default_strand, fix_strand=fix_strand ) + self.intervals = intervals + # Use intervals to set feature attributes. + for interval in self.intervals: + # Error checking. + if interval.chrom != self.chrom: + raise ValueError( "interval chrom does not match self chrom: %i != %i" % \ + ( interval.chrom, self.chrom ) ) + if interval.strand != self.strand: + raise ValueError( "interval strand does not match self strand: %s != %s" % \ + ( interval.strand, self.strand ) ) + # Set start, end of interval. + if interval.start < self.start: + self.start = interval.start + if interval.end > self.end: + self.end = interval.end + +class GFFIntervalToBEDReaderWrapper( NiceReaderWrapper ): + """ + Reader wrapper that reads GFF intervals/lines and automatically converts + them to BED format. + """ + + def parse_row( self, line ): + # HACK: this should return a GFF interval, but bx-python operations + # require GenomicInterval objects and subclasses will not work. + interval = GenomicInterval( self, line.split( "\t" ), self.chrom_col, self.start_col, \ + self.end_col, self.strand_col, self.default_strand, \ + fix_strand=self.fix_strand ) + interval = convert_gff_coords_to_bed( interval ) + return interval class GFFReaderWrapper( NiceReaderWrapper ): """ - Reader wrapper converts GFF format--starting and ending coordinates are 1-based, closed--to the - 'traditional'/BED interval format--0 based, half-open. This is useful when using GFF files as inputs - to tools that expect traditional interval format. + Reader wrapper for GFF files. + + Wrapper has two major functions: + (1) group entries for GFF file (via group column), GFF3 (via id attribute ), + or GTF (via gene_id/transcript id); + (2) convert coordinates from GFF format--starting and ending coordinates + are 1-based, closed--to the 'traditional'/BED interval format--0 based, + half-open. This is useful when using GFF files as inputs to tools that + expect traditional interval format. """ + + def __init__( self, reader, **kwargs ): + """ + Create wrapper. Defaults are group_entries=False and + convert_coords_to_bed=True to support backward compatibility. + """ + NiceReaderWrapper.__init__( self, reader, **kwargs ) + self.group_entries = kwargs.get( 'group_entries', False ) + self.convert_coords_to_bed = kwargs.get( 'convert_coords_to_bed', True ) + self.last_line = None + self.cur_offset = 0 + self.seed_interval = None + def parse_row( self, line ): - interval = GenomicInterval( self, line.split( "\t" ), self.chrom_col, self.start_col, self.end_col, \ - self.strand_col, self.default_strand, fix_strand=self.fix_strand ) - interval = convert_gff_coords_to_bed( interval ) + interval = GFFInterval( self, line.split( "\t" ), self.chrom_col, self.start_col, \ + self.end_col, self.strand_col, self.default_strand, \ + fix_strand=self.fix_strand, raw_line=line ) + if self.convert_coords_to_bed: + interval = convert_gff_coords_to_bed( interval ) return interval + def next( self ): + """ Returns next GFFFeature. """ + + # + # Helper function. + # + + def handle_parse_error( parse_error ): + """ Actions to take when ParseError found. """ + if self.outstream: + if self.print_delegate and hasattr(self.print_delegate,"__call__"): + self.print_delegate( self.outstream, e, self ) + self.skipped += 1 + # no reason to stuff an entire bad file into memmory + if self.skipped < 10: + self.skipped_lines.append( ( self.linenum, self.current_line, str( e ) ) ) + + # + # Get next GFFFeature + # + + # If there is no seed interval, set one. Also, if there are no more + # intervals to read, this is where iterator dies. + if not self.seed_interval: + while not self.seed_interval: + try: + self.seed_interval = GenomicIntervalReader.next( self ) + except ParseError, e: + handle_parse_error( e ) + + # Initialize feature name from seed. + feature_group = self.seed_interval.attributes.get( 'group', None ) # For GFF + feature_id = self.seed_interval.attributes.get( 'id', None ) # For GFF3 + feature_gene_id = self.seed_interval.attributes.get( 'gene_id', None ) # For GTF + feature_transcript_id = self.seed_interval.attributes.get( 'transcript_id', None ) # For GTF + + # Read all intervals associated with seed. + feature_intervals = [] + feature_intervals.append( self.seed_interval ) + while True: + try: + interval = GenomicIntervalReader.next( self ) + except StopIteration, e: + # No more intervals to read, but last feature needs to be + # returned. + interval = None + break + except ParseError, e: + handle_parse_error( e ) + + # If interval not associated with feature, break. + group = interval.attributes.get( 'group', None ) + if group and feature_group != group: + break + id = interval.attributes.get( 'id', None ) + if id and feature_id != id: + break + gene_id = interval.attributes.get( 'gene_id', None ) + transcript_id = interval.attributes.get( 'transcript_id', None ) + if transcript_id and transcript_id != feature_transcript_id and gene_id and \ + gene_id != feature_gene_id: + break + + # Interval associated with feature. + feature_intervals.append( interval ) + + # Last interval read is the seed for the next interval. + self.seed_interval = interval + + # Return GFF feature with all intervals. + return GFFFeature( self, self.chrom_col, self.start_col, self.end_col, self.strand_col, \ + self.default_strand, fix_strand=self.fix_strand, \ + intervals=feature_intervals ) + + def convert_bed_coords_to_gff( interval ): """ - Converts an interval object's coordinates from BED format to GFF format. Accepted object types include - GenomicInterval and list (where the first element in the list is the interval's start, and the second - element is the interval's end). + Converts an interval object's coordinates from BED format to GFF format. + Accepted object types include GenomicInterval and list (where the first + element in the list is the interval's start, and the second element is + the interval's end). """ if type( interval ) is GenomicInterval: interval.start += 1 @@ -30,9 +178,10 @@ def convert_bed_coords_to_gff( interval def convert_gff_coords_to_bed( interval ): """ - Converts an interval object's coordinates from GFF format to BED format. Accepted object types include - GenomicInterval and list (where the first element in the list is the interval's start, and the second - element is the interval's end). + Converts an interval object's coordinates from GFF format to BED format. + Accepted object types include GenomicInterval and list (where the first + element in the list is the interval's start, and the second element is + the interval's end). """ if type( interval ) is GenomicInterval: interval.start -= 1 @@ -42,10 +191,15 @@ def convert_gff_coords_to_bed( interval def parse_gff_attributes( attr_str ): """ - Parses a GFF/GTF attribute string and returns a dictionary of name-value pairs. - The general format for a GFF3 attributes string is name1=value1;name2=value2 - The general format for a GTF attribute string is name1 "value1" ; name2 "value2" - """ + Parses a GFF/GTF attribute string and returns a dictionary of name-value + pairs. The general format for a GFF3 attributes string is + name1=value1;name2=value2 + The general format for a GTF attribute string is + name1 "value1" ; name2 "value2" + The general format for a GFF attribute string is a single string that + denotes the interval's group; in this case, method returns a dictionary + with a single key-value pair, and key name is 'group' + """ attributes_list = attr_str.split(";") attributes = {} for name_value_pair in attributes_list: @@ -53,6 +207,9 @@ def parse_gff_attributes( attr_str ): pair = name_value_pair.strip().split(" ") if len( pair ) == 1: pair = name_value_pair.strip().split("=") + if len( pair ) == 1: + # Could not split for some reason -- raise exception? + continue if pair == '': continue name = pair[0].strip() @@ -61,4 +218,9 @@ def parse_gff_attributes( attr_str ): # Need to strip double quote from values value = pair[1].strip(" \"") attributes[ name ] = value + + if len( attributes ) == 0: + # Could not split attributes string, so entire string must be + # 'group' attribute. This is the case for strictly GFF files. + attributes['group'] = attr_str return attributes