[galaxy-commits] galaxy-dist commit ecc1cefbccba: Enhance GFFReader to read and return complete features (genes/transcripts), which are composed of multiple intervals/blocks. This required hacking around a couple limitations of bx-python, but these can be easily fixed when bx-python is updated. Use enhanced functionality to correctly create converted datasets for visualizing GFF files.

19 Nov 2010

# HG changeset patch -- Bitbucket.org
# Project galaxy-dist
# URL http://bitbucket.org/galaxy/galaxy-dist/overview
# User jeremy goecks <jeremy.goecks@emory.edu>
# Date 1289591537 18000
# Node ID ecc1cefbccba6135f824f9573f06c130fd6d08b4
# Parent  b124f54952dee106f4ca87ea89a7f828c41562c3
Enhance GFFReader to read and return complete features (genes/transcripts), which are composed of multiple intervals/blocks. This required hacking around a couple limitations of bx-python, but these can be easily fixed when bx-python is updated. Use enhanced functionality to correctly create converted datasets for visualizing GFF files.

Also updated GOPS tools to use a simpler GFF reader wrapper to read in intervals and convert to BED coordinates.

--- a/tools/new_operations/gops_subtract.py
+++ b/tools/new_operations/gops_subtract.py
@@ -44,11 +44,11 @@ def main():
 
     # Set readers to handle either GFF or default format.
     if in1_gff_format:
-        in1_reader_wrapper = GFFReaderWrapper
+        in1_reader_wrapper = GFFIntervalToBEDReaderWrapper
     else:
         in1_reader_wrapper = NiceReaderWrapper
     if in2_gff_format:
-        in2_reader_wrapper = GFFReaderWrapper
+        in2_reader_wrapper = GFFIntervalToBEDReaderWrapper
     else:
         in2_reader_wrapper = NiceReaderWrapper
         

--- a/tools/new_operations/gops_intersect.py
+++ b/tools/new_operations/gops_intersect.py
@@ -44,11 +44,11 @@ def main():
         
     # Set readers to handle either GFF or default format.
     if in1_gff_format:
-        in1_reader_wrapper = GFFReaderWrapper
+        in1_reader_wrapper = GFFIntervalToBEDReaderWrapper
     else:
         in1_reader_wrapper = NiceReaderWrapper
     if in2_gff_format:
-        in2_reader_wrapper = GFFReaderWrapper
+        in2_reader_wrapper = GFFIntervalToBEDReaderWrapper
     else:
         in2_reader_wrapper = NiceReaderWrapper
         
@@ -66,10 +66,10 @@ def main():
                             fix_strand=True )
 
     out_file = open( out_fname, "w" )
-
+    
     try:
         for line in intersect( [g1,g2], pieces=pieces, mincols=mincols ):
-            if type( line ) == GenomicInterval:
+            if isinstance( line, GenomicInterval ):
                 if in1_gff_format:
                     line = convert_bed_coords_to_gff( line )
                 out_file.write( "%s\n" % "\t".join( line.fields ) )

--- /dev/null
+++ b/lib/galaxy/datatypes/converters/gff_to_interval_index_converter.py
@@ -0,0 +1,55 @@
+#!/usr/bin/env python
+
+"""
+Convert from GFF file to interval index file.
+
+usage:
+    python gff_to_interval_index_converter.py [input] [output]
+"""
+
+from __future__ import division
+
+import sys, fileinput
+from galaxy import eggs
+import pkg_resources; pkg_resources.require( "bx-python" )
+from galaxy.tools.util.gff_util import *
+from bx.interval_index_file import Indexes
+
+def main():
+    # Arguments
+    input_fname, out_fname = sys.argv[1:]
+        
+    # Do conversion.
+    chr_col, start_col, end_col, strand_col = ( 0, 3, 4, 6 )
+    index = Indexes()
+    offset = 0
+    reader_wrapper = GFFReaderWrapper( fileinput.FileInput( input_fname ),
+                                        chrom_col=chr_col,
+                                        start_col=start_col,
+                                        end_col=end_col,
+                                        strand_col=strand_col,
+                                        fix_strand=True )
+    for feature in list( reader_wrapper ):
+        # TODO: need to address comments:
+        # if comment:
+        #   increment_offset.
+        
+        # Add feature; index expects BED coordinates.
+        convert_gff_coords_to_bed( feature )
+        index.add( feature.chrom, feature.start, feature.end, offset )
+        
+        # Increment offset by feature length; feature length is all 
+        # intervals/lines that comprise feature.
+        feature_len = 0
+        for interval in feature.intervals:
+            # HACK: +1 for EOL char. Need bx-python to provide raw_line itself 
+            # b/c TableReader strips EOL characters, thus changing the line
+            # length.
+            feature_len += len( interval.raw_line ) + 1
+        offset += feature_len
+            
+    index.write( open(out_fname, "w") )
+
+if __name__ == "__main__": 
+    main()
+    

--- a/lib/galaxy/datatypes/converters/interval_to_summary_tree_converter.py
+++ b/lib/galaxy/datatypes/converters/interval_to_summary_tree_converter.py
@@ -14,7 +14,7 @@ import pkg_resources; pkg_resources.requ
 from galaxy.visualization.tracks.summary import *
 from bx.intervals.io import *
 from bx.cookbook import doc_optparse
-from galaxy.tools.util.gff_util import GFFReaderWrapper
+from galaxy.tools.util.gff_util import *
 
 def main():
     # Read options, args.
@@ -40,9 +40,12 @@ def main():
                                             strand_col=strand_col,
                                             fix_strand=True )
     st = SummaryTree(block_size=25, levels=6, draw_cutoff=150, detail_cutoff=30)
-    for line in list( reader_wrapper ):
-        if type( line ) is GenomicInterval:
-            st.insert_range( line.chrom, long( line.start ), long( line.end ) )
+    for feature in list( reader_wrapper ):
+        if isinstance( feature, GenomicInterval ):
+            # Tree expects BED coordinates.
+            if type( feature ) is GFFFeature:
+                convert_gff_coords_to_bed( feature )
+            st.insert_range( feature.chrom, long( feature.start ), long( feature.end ) )
     
     st.write(out_fname)
 

--- a/lib/galaxy/datatypes/converters/gff_to_interval_index_converter.xml
+++ b/lib/galaxy/datatypes/converters/gff_to_interval_index_converter.xml
@@ -1,6 +1,6 @@
-<tool id="CONVERTER_gff_to_interval_index_0" name="Convert BED to Interval Index" version="1.0.0" hidden="true">
+<tool id="CONVERTER_gff_to_interval_index_0" name="Convert GFF to Interval Index" version="1.0.0" hidden="true"><!--  <description>__NOT_USED_CURRENTLY_FOR_CONVERTERS__</description> -->
-  <command interpreter="python">interval_to_interval_index_converter.py $input1 $output1 --gff</command>
+  <command interpreter="python">gff_to_interval_index_converter.py $input1 $output1</command><inputs><page><param format="gff" name="input1" type="data" label="Choose GFF file"/>

--- a/tools/new_operations/flanking_features.py
+++ b/tools/new_operations/flanking_features.py
@@ -153,11 +153,11 @@ def main():
         
     # Set readers to handle either GFF or default format.
     if in1_gff_format:
-        in1_reader_wrapper = GFFReaderWrapper
+        in1_reader_wrapper = GFFIntervalToBEDReaderWrapper
     else:
         in1_reader_wrapper = NiceReaderWrapper
     if in2_gff_format:
-        in2_reader_wrapper = GFFReaderWrapper
+        in2_reader_wrapper = GFFIntervalToBEDReaderWrapper
     else:
         in2_reader_wrapper = NiceReaderWrapper
 

--- a/lib/galaxy/tools/util/gff_util.py
+++ b/lib/galaxy/tools/util/gff_util.py
@@ -2,25 +2,173 @@
 Provides utilities for working with GFF files.
 """
 
-from bx.intervals.io import NiceReaderWrapper, GenomicInterval
+from bx.intervals.io import *
+
+class GFFInterval( GenomicInterval ):
+    """ 
+    A GFF interval, including attributes. If file is strictly a GFF file,
+    only attribute is 'group.'
+    """
+    def __init__( self, reader, fields, chrom_col, start_col, end_col, strand_col, default_strand, \
+                  fix_strand=False, raw_line='' ):
+        GenomicInterval.__init__( self, reader, fields, chrom_col, start_col, end_col, strand_col, \
+                                  default_strand, fix_strand=fix_strand )
+        self.raw_line = raw_line
+        self.attributes = parse_gff_attributes( fields[8] )
+                
+class GFFFeature( GenomicInterval ):
+    """
+    A GFF feature, which can include multiple intervals.
+    """
+    def __init__( self, reader, chrom_col, start_col, end_col, strand_col, default_strand, \
+                  fix_strand=False, intervals=[] ):
+        GenomicInterval.__init__( self, reader, intervals[0].fields, chrom_col, start_col, end_col, \
+                                  strand_col, default_strand, fix_strand=fix_strand )
+        self.intervals = intervals
+        # Use intervals to set feature attributes.
+        for interval in self.intervals:
+            # Error checking.
+            if interval.chrom != self.chrom:
+                raise ValueError( "interval chrom does not match self chrom: %i != %i" % \
+                                  ( interval.chrom, self.chrom ) )
+            if interval.strand != self.strand:
+                raise ValueError( "interval strand does not match self strand: %s != %s" % \
+                                  ( interval.strand, self.strand ) )
+            # Set start, end of interval.
+            if interval.start < self.start:
+                self.start = interval.start
+            if interval.end > self.end:
+                self.end = interval.end
+                
+class GFFIntervalToBEDReaderWrapper( NiceReaderWrapper ):
+    """ 
+    Reader wrapper that reads GFF intervals/lines and automatically converts
+    them to BED format. 
+    """
+    
+    def parse_row( self, line ):
+        # HACK: this should return a GFF interval, but bx-python operations 
+        # require GenomicInterval objects and subclasses will not work.
+        interval = GenomicInterval( self, line.split( "\t" ), self.chrom_col, self.start_col, \
+                                    self.end_col, self.strand_col, self.default_strand, \
+                                    fix_strand=self.fix_strand )
+        interval = convert_gff_coords_to_bed( interval )
+        return interval
 
 class GFFReaderWrapper( NiceReaderWrapper ):
     """
-    Reader wrapper converts GFF format--starting and ending coordinates are 1-based, closed--to the 
-    'traditional'/BED interval format--0 based, half-open. This is useful when using GFF files as inputs 
-    to tools that expect traditional interval format.
+    Reader wrapper for GFF files.
+    
+    Wrapper has two major functions:
+    (1) group entries for GFF file (via group column), GFF3 (via id attribute ), 
+        or GTF (via gene_id/transcript id);
+    (2) convert coordinates from GFF format--starting and ending coordinates 
+        are 1-based, closed--to the 'traditional'/BED interval format--0 based, 
+        half-open. This is useful when using GFF files as inputs to tools that 
+        expect traditional interval format.
     """
+    
+    def __init__( self, reader, **kwargs ):
+        """
+        Create wrapper. Defaults are group_entries=False and 
+        convert_coords_to_bed=True to support backward compatibility.
+        """
+        NiceReaderWrapper.__init__( self, reader, **kwargs )
+        self.group_entries = kwargs.get( 'group_entries', False )
+        self.convert_coords_to_bed = kwargs.get( 'convert_coords_to_bed', True )
+        self.last_line = None
+        self.cur_offset = 0
+        self.seed_interval = None
+    
     def parse_row( self, line ):
-        interval = GenomicInterval( self, line.split( "\t" ), self.chrom_col, self.start_col, self.end_col, \
-                                    self.strand_col, self.default_strand, fix_strand=self.fix_strand )
-        interval = convert_gff_coords_to_bed( interval )
+        interval = GFFInterval( self, line.split( "\t" ), self.chrom_col, self.start_col, \
+                                self.end_col, self.strand_col, self.default_strand, \
+                                fix_strand=self.fix_strand, raw_line=line )
+        if self.convert_coords_to_bed:
+            interval = convert_gff_coords_to_bed( interval )
         return interval
         
+    def next( self ):
+        """ Returns next GFFFeature. """
+        
+        #
+        # Helper function.
+        #
+        
+        def handle_parse_error( parse_error ):
+            """ Actions to take when ParseError found. """
+            if self.outstream:
+               if self.print_delegate and hasattr(self.print_delegate,"__call__"):
+                   self.print_delegate( self.outstream, e, self )
+            self.skipped += 1
+            # no reason to stuff an entire bad file into memmory
+            if self.skipped < 10:
+               self.skipped_lines.append( ( self.linenum, self.current_line, str( e ) ) )
+               
+        #
+        # Get next GFFFeature
+        # 
+
+        # If there is no seed interval, set one. Also, if there are no more 
+        # intervals to read, this is where iterator dies.
+        if not self.seed_interval:
+            while not self.seed_interval:
+                try:
+                    self.seed_interval = GenomicIntervalReader.next( self )
+                except ParseError, e:
+                    handle_parse_error( e )
+    
+        # Initialize feature name from seed.
+        feature_group = self.seed_interval.attributes.get( 'group', None ) # For GFF
+        feature_id = self.seed_interval.attributes.get( 'id', None ) # For GFF3
+        feature_gene_id = self.seed_interval.attributes.get( 'gene_id', None ) # For GTF
+        feature_transcript_id = self.seed_interval.attributes.get( 'transcript_id', None ) # For GTF
+
+        # Read all intervals associated with seed.
+        feature_intervals = []
+        feature_intervals.append( self.seed_interval )
+        while True:
+            try:
+                interval = GenomicIntervalReader.next( self )
+            except StopIteration, e:
+                # No more intervals to read, but last feature needs to be 
+                # returned.
+                interval = None
+                break
+            except ParseError, e:
+                handle_parse_error( e )
+            
+            # If interval not associated with feature, break.
+            group = interval.attributes.get( 'group', None )
+            if group and feature_group != group:
+                break
+            id = interval.attributes.get( 'id', None )
+            if id and feature_id != id:
+                break
+            gene_id = interval.attributes.get( 'gene_id', None )
+            transcript_id = interval.attributes.get( 'transcript_id', None )
+            if transcript_id and transcript_id != feature_transcript_id and gene_id and \
+               gene_id != feature_gene_id:
+                break
+    
+            # Interval associated with feature.
+            feature_intervals.append( interval )
+   
+        # Last interval read is the seed for the next interval.
+        self.seed_interval = interval
+    
+        # Return GFF feature with all intervals.    
+        return GFFFeature( self, self.chrom_col, self.start_col, self.end_col, self.strand_col, \
+                           self.default_strand, fix_strand=self.fix_strand, \
+                           intervals=feature_intervals )
+        
+
 def convert_bed_coords_to_gff( interval ):
     """
-    Converts an interval object's coordinates from BED format to GFF format. Accepted object types include 
-    GenomicInterval and list (where the first element in the list is the interval's start, and the second 
-    element is the interval's end).
+    Converts an interval object's coordinates from BED format to GFF format. 
+    Accepted object types include GenomicInterval and list (where the first 
+    element in the list is the interval's start, and the second element is 
+    the interval's end).
     """
     if type( interval ) is GenomicInterval:
         interval.start += 1
@@ -30,9 +178,10 @@ def convert_bed_coords_to_gff( interval 
     
 def convert_gff_coords_to_bed( interval ):
     """
-    Converts an interval object's coordinates from GFF format to BED format. Accepted object types include 
-    GenomicInterval and list (where the first element in the list is the interval's start, and the second 
-    element is the interval's end).
+    Converts an interval object's coordinates from GFF format to BED format. 
+    Accepted object types include GenomicInterval and list (where the first
+    element in the list is the interval's start, and the second element is 
+    the interval's end).
     """
     if type( interval ) is GenomicInterval:
         interval.start -= 1
@@ -42,10 +191,15 @@ def convert_gff_coords_to_bed( interval 
     
 def parse_gff_attributes( attr_str ):
     """
-    Parses a GFF/GTF attribute string and returns a dictionary of name-value pairs.
-    The general format for a GFF3 attributes string is name1=value1;name2=value2
-    The general format for a GTF attribute string is name1 "value1" ; name2 "value2"
-    """
+    Parses a GFF/GTF attribute string and returns a dictionary of name-value 
+    pairs. The general format for a GFF3 attributes string is 
+        name1=value1;name2=value2
+    The general format for a GTF attribute string is 
+        name1 "value1" ; name2 "value2"
+    The general format for a GFF attribute string is a single string that
+    denotes the interval's group; in this case, method returns a dictionary 
+    with a single key-value pair, and key name is 'group'
+    """    
     attributes_list = attr_str.split(";")
     attributes = {}
     for name_value_pair in attributes_list:
@@ -53,6 +207,9 @@ def parse_gff_attributes( attr_str ):
         pair = name_value_pair.strip().split(" ")
         if len( pair ) == 1:
             pair = name_value_pair.strip().split("=")
+        if len( pair ) == 1:
+            # Could not split for some reason -- raise exception?
+            continue
         if pair == '':
             continue
         name = pair[0].strip()
@@ -61,4 +218,9 @@ def parse_gff_attributes( attr_str ):
         # Need to strip double quote from values
         value = pair[1].strip(" \"")
         attributes[ name ] = value
+        
+    if len( attributes ) == 0:
+        # Could not split attributes string, so entire string must be 
+        # 'group' attribute. This is the case for strictly GFF files.
+        attributes['group'] = attr_str
     return attributes

    

commits-noreply＠bitbucket.org