[galaxy-commits] galaxy-dist commit 9b2bc4d4d6ca: Add full GFF support to trackster: GFF features blocks are now displayed correctly, along with name, strand, and score information. Added score column to GFFReaderWrapper as well.

20 Nov 2010

# HG changeset patch -- Bitbucket.org
# Project galaxy-dist
# URL http://bitbucket.org/galaxy/galaxy-dist/overview
# User jeremy goecks <jeremy.goecks@emory.edu>
# Date 1289766276 18000
# Node ID 9b2bc4d4d6ca9b1f4275cb335460c8d570c7eeaf
# Parent  1449307bdaf7671bea8a879a6c0990bc0c3b7aa7
Add full GFF support to trackster: GFF features blocks are now displayed correctly, along with name, strand, and score information. Added score column to GFFReaderWrapper as well.

--- a/lib/galaxy/visualization/tracks/data_providers.py
+++ b/lib/galaxy/visualization/tracks/data_providers.py
@@ -5,6 +5,7 @@ Data providers for tracks visualizations
 from math import floor, ceil, log, pow
 import pkg_resources
 pkg_resources.require( "bx-python" ); pkg_resources.require( "pysam" ); pkg_resources.require( "numpy" )
+from galaxy.datatypes.util.gff_util import *
 from bx.interval_index_file import Indexes
 from bx.arrays.array_tree import FileArrayTreeDict
 from galaxy.util.lrucache import LRUCache
@@ -330,25 +331,43 @@ class IntervalIndexDataProvider( TracksD
                 break
             count += 1
             source.seek(offset)
-            feature = source.readline().split()
             payload = [ offset, start, end ]
             # TODO: can we use column metadata to fill out payload?
             # TODO: use function to set payload data
             if "no_detail" not in kwargs:
-                length = len(feature)
                 if isinstance( self.original_dataset.datatype, Gff ):
                     # GFF dataset.
-                    if length >= 3:
-                        payload.append( feature[2] ) # name
-                    if length >= 7:
-                        payload.append( feature[6] ) # strand
+                    reader = GFFReaderWrapper( source )
+                    feature = reader.next()
+                    
+                    payload.append( feature.name() )
+                    # Strand:
+                    payload.append( feature.strand )
+                    
+                    # No notion of thick start, end in GFF, so make everything
+                    # thick.
+                    payload.append( start )
+                    payload.append( end )
+                    
+                    # Add blocks.
+                    feature = convert_gff_coords_to_bed( feature )
+                    block_sizes = [ (interval.end - interval.start ) for interval in feature.intervals ]
+                    block_starts = [ ( interval.start - feature.start ) for interval in feature.intervals ]
+                    blocks = zip( block_sizes, block_starts )
+                    payload.append( [ ( start + block[1], start + block[1] + block[0] ) for block in blocks ] )
+                    
+                    # Score.
+                    payload.append( feature.score )
                 elif isinstance( self.original_dataset.datatype, Bed ):
                     # BED dataset.
+                    feature = source.readline().split()
+                    length = len(feature)
                     if length >= 4:
                         payload.append(feature[3]) # name
                     if length >= 6: # strand
                         payload.append(feature[5])
 
+                    # Thick start, end.
                     if length >= 8:
                         payload.append(int(feature[6]))
                         payload.append(int(feature[7]))
@@ -356,8 +375,8 @@ class IntervalIndexDataProvider( TracksD
                     if length >= 12:
                         block_sizes = [ int(n) for n in feature[10].split(',') if n != '']
                         block_starts = [ int(n) for n in feature[11].split(',') if n != '' ]
-                        blocks = zip(block_sizes, block_starts)
-                        payload.append( [ (start + block[1], start + block[1] + block[0]) for block in blocks] )
+                        blocks = zip( block_sizes, block_starts )
+                        payload.append( [ ( start + block[1], start + block[1] + block[0] ) for block in blocks ] )
                         
                     if length >= 5:
                         payload.append( int(feature[4]) ) # score

--- a/lib/galaxy/datatypes/util/gff_util.py
+++ b/lib/galaxy/datatypes/util/gff_util.py
@@ -9,21 +9,28 @@ class GFFInterval( GenomicInterval ):
     A GFF interval, including attributes. If file is strictly a GFF file,
     only attribute is 'group.'
     """
-    def __init__( self, reader, fields, chrom_col, start_col, end_col, strand_col, default_strand, \
-                  fix_strand=False, raw_line='' ):
+    def __init__( self, reader, fields, chrom_col, start_col, end_col, strand_col, \
+                  score_col, default_strand, fix_strand=False, raw_line='' ):
         GenomicInterval.__init__( self, reader, fields, chrom_col, start_col, end_col, strand_col, \
                                   default_strand, fix_strand=fix_strand )
+        # Handle score column.
+        self.score_col = score_col
+        if self.score_col >= self.nfields:
+          raise MissingFieldError( "No field for score_col (%d)" % score_col )
+        self.score = self.fields[ self.score_col ]
+        
+        # Attributes specific to GFF.
         self.raw_line = raw_line
         self.attributes = parse_gff_attributes( fields[8] )
                 
-class GFFFeature( GenomicInterval ):
+class GFFFeature( GFFInterval ):
     """
     A GFF feature, which can include multiple intervals.
     """
-    def __init__( self, reader, chrom_col, start_col, end_col, strand_col, default_strand, \
+    def __init__( self, reader, chrom_col, start_col, end_col, strand_col, score_col, default_strand, \
                   fix_strand=False, intervals=[] ):
-        GenomicInterval.__init__( self, reader, intervals[0].fields, chrom_col, start_col, end_col, \
-                                  strand_col, default_strand, fix_strand=fix_strand )
+        GFFInterval.__init__( self, reader, intervals[0].fields, chrom_col, start_col, end_col, \
+                                strand_col, score_col, default_strand, fix_strand=fix_strand )
         self.intervals = intervals
         # Use intervals to set feature attributes.
         for interval in self.intervals:
@@ -40,6 +47,15 @@ class GFFFeature( GenomicInterval ):
             if interval.end > self.end:
                 self.end = interval.end
                 
+    def name( self ):
+        """ Returns feature's name. """
+        name = self.attributes.get( 'transcript_id', None )
+        if not name:
+            name = self.attributes.get( 'id', None )
+        if not name:
+            name = self.attributes.get( 'group', None )
+        return name
+                
 class GFFIntervalToBEDReaderWrapper( NiceReaderWrapper ):
     """ 
     Reader wrapper that reads GFF intervals/lines and automatically converts
@@ -60,7 +76,7 @@ class GFFReaderWrapper( NiceReaderWrappe
     Reader wrapper for GFF files.
     
     Wrapper has two major functions:
-    (1) group entries for GFF file (via group column), GFF3 (via id attribute ), 
+    (1) group entries for GFF file (via group column), GFF3 (via id attribute), 
         or GTF (via gene_id/transcript id);
     (2) convert coordinates from GFF format--starting and ending coordinates 
         are 1-based, closed--to the 'traditional'/BED interval format--0 based, 
@@ -68,24 +84,29 @@ class GFFReaderWrapper( NiceReaderWrappe
         expect traditional interval format.
     """
     
-    def __init__( self, reader, **kwargs ):
+    def __init__( self, reader, chrom_col=0, start_col=3, end_col=4, strand_col=6, score_col=5, **kwargs ):
         """
         Create wrapper. Defaults are group_entries=False and 
         convert_coords_to_bed=True to support backward compatibility.
         """
+        
+        # Add columns to kwargs here so that defaults can be used rather than 
+        # requiring them to be passed in.
+        kwargs[ 'chrom_col' ] = chrom_col
+        kwargs[ 'start_col' ] = start_col
+        kwargs[ 'end_col' ] = end_col
+        kwargs[ 'strand_col' ] = strand_col
         NiceReaderWrapper.__init__( self, reader, **kwargs )
-        self.group_entries = kwargs.get( 'group_entries', False )
-        self.convert_coords_to_bed = kwargs.get( 'convert_coords_to_bed', True )
+        # HACK: NiceReaderWrapper (bx-python) does not handle score_col yet, so store ourselves.
+        self.score_col = score_col
         self.last_line = None
         self.cur_offset = 0
         self.seed_interval = None
     
     def parse_row( self, line ):
         interval = GFFInterval( self, line.split( "\t" ), self.chrom_col, self.start_col, \
-                                self.end_col, self.strand_col, self.default_strand, \
+                                self.end_col, self.strand_col, self.score_col, self.default_strand, \
                                 fix_strand=self.fix_strand, raw_line=line )
-        if self.convert_coords_to_bed:
-            interval = convert_gff_coords_to_bed( interval )
         return interval
         
     def next( self ):
@@ -105,6 +126,9 @@ class GFFReaderWrapper( NiceReaderWrappe
             if self.skipped < 10:
                self.skipped_lines.append( ( self.linenum, self.current_line, str( e ) ) )
                
+            # For debugging, uncomment this to propagate parsing exceptions up.
+            # raise e
+               
         #
         # Get next GFFFeature
         # 
@@ -159,7 +183,7 @@ class GFFReaderWrapper( NiceReaderWrappe
     
         # Return GFF feature with all intervals.    
         return GFFFeature( self, self.chrom_col, self.start_col, self.end_col, self.strand_col, \
-                           self.default_strand, fix_strand=self.fix_strand, \
+                           self.score_col, self.default_strand, fix_strand=self.fix_strand, \
                            intervals=feature_intervals )
         
 
@@ -179,12 +203,15 @@ def convert_bed_coords_to_gff( interval 
 def convert_gff_coords_to_bed( interval ):
     """
     Converts an interval object's coordinates from GFF format to BED format. 
-    Accepted object types include GenomicInterval and list (where the first
-    element in the list is the interval's start, and the second element is 
-    the interval's end).
+    Accepted object types include GFFFeature, GenomicInterval, and list (where
+    the first element in the list is the interval's start, and the second 
+    element is the interval's end).
     """
-    if type( interval ) is GenomicInterval:
+    if isinstance( interval, GenomicInterval ):
         interval.start -= 1
+        if isinstance( interval, GFFFeature ):
+            for subinterval in interval:
+                convert_gff_coords_to_bed( subinterval )
     elif type ( interval ) is list:
         interval[ 0 ] -= 1
     return interval

--- a/lib/galaxy/datatypes/converters/gff_to_interval_index_converter.py
+++ b/lib/galaxy/datatypes/converters/gff_to_interval_index_converter.py
@@ -20,15 +20,9 @@ def main():
     input_fname, out_fname = sys.argv[1:]
         
     # Do conversion.
-    chr_col, start_col, end_col, strand_col = ( 0, 3, 4, 6 )
     index = Indexes()
     offset = 0
-    reader_wrapper = GFFReaderWrapper( fileinput.FileInput( input_fname ),
-                                        chrom_col=chr_col,
-                                        start_col=start_col,
-                                        end_col=end_col,
-                                        strand_col=strand_col,
-                                        fix_strand=True )
+    reader_wrapper = GFFReaderWrapper( fileinput.FileInput( input_fname ), fix_strand=True )
     for feature in list( reader_wrapper ):
         # TODO: need to address comments:
         # if comment:
@@ -49,7 +43,7 @@ def main():
         offset += feature_len
             
     index.write( open(out_fname, "w") )
-
+    
 if __name__ == "__main__": 
     main()

    

[galaxy-commits] galaxy-dist commit 9b2bc4d4d6ca: Add full GFF support to trackster: GFF features blocks are now displayed correctly, along with name, strand, and score information. Added score column to GFFReaderWrapper as well.

commits-noreply＠bitbucket.org