# HG changeset patch -- Bitbucket.org # Project galaxy-dist # URL http://bitbucket.org/galaxy/galaxy-dist/overview # User jeremy goecks <jeremy.goecks@emory.edu> # Date 1289766276 18000 # Node ID 9b2bc4d4d6ca9b1f4275cb335460c8d570c7eeaf # Parent 1449307bdaf7671bea8a879a6c0990bc0c3b7aa7 Add full GFF support to trackster: GFF features blocks are now displayed correctly, along with name, strand, and score information. Added score column to GFFReaderWrapper as well. --- a/lib/galaxy/visualization/tracks/data_providers.py +++ b/lib/galaxy/visualization/tracks/data_providers.py @@ -5,6 +5,7 @@ Data providers for tracks visualizations from math import floor, ceil, log, pow import pkg_resources pkg_resources.require( "bx-python" ); pkg_resources.require( "pysam" ); pkg_resources.require( "numpy" ) +from galaxy.datatypes.util.gff_util import * from bx.interval_index_file import Indexes from bx.arrays.array_tree import FileArrayTreeDict from galaxy.util.lrucache import LRUCache @@ -330,25 +331,43 @@ class IntervalIndexDataProvider( TracksD break count += 1 source.seek(offset) - feature = source.readline().split() payload = [ offset, start, end ] # TODO: can we use column metadata to fill out payload? # TODO: use function to set payload data if "no_detail" not in kwargs: - length = len(feature) if isinstance( self.original_dataset.datatype, Gff ): # GFF dataset. - if length >= 3: - payload.append( feature[2] ) # name - if length >= 7: - payload.append( feature[6] ) # strand + reader = GFFReaderWrapper( source ) + feature = reader.next() + + payload.append( feature.name() ) + # Strand: + payload.append( feature.strand ) + + # No notion of thick start, end in GFF, so make everything + # thick. + payload.append( start ) + payload.append( end ) + + # Add blocks. 
+ feature = convert_gff_coords_to_bed( feature ) + block_sizes = [ (interval.end - interval.start ) for interval in feature.intervals ] + block_starts = [ ( interval.start - feature.start ) for interval in feature.intervals ] + blocks = zip( block_sizes, block_starts ) + payload.append( [ ( start + block[1], start + block[1] + block[0] ) for block in blocks ] ) + + # Score. + payload.append( feature.score ) elif isinstance( self.original_dataset.datatype, Bed ): # BED dataset. + feature = source.readline().split() + length = len(feature) if length >= 4: payload.append(feature[3]) # name if length >= 6: # strand payload.append(feature[5]) + # Thick start, end. if length >= 8: payload.append(int(feature[6])) payload.append(int(feature[7])) @@ -356,8 +375,8 @@ class IntervalIndexDataProvider( TracksD if length >= 12: block_sizes = [ int(n) for n in feature[10].split(',') if n != ''] block_starts = [ int(n) for n in feature[11].split(',') if n != '' ] - blocks = zip(block_sizes, block_starts) - payload.append( [ (start + block[1], start + block[1] + block[0]) for block in blocks] ) + blocks = zip( block_sizes, block_starts ) + payload.append( [ ( start + block[1], start + block[1] + block[0] ) for block in blocks ] ) if length >= 5: payload.append( int(feature[4]) ) # score --- a/lib/galaxy/datatypes/util/gff_util.py +++ b/lib/galaxy/datatypes/util/gff_util.py @@ -9,21 +9,28 @@ class GFFInterval( GenomicInterval ): A GFF interval, including attributes. If file is strictly a GFF file, only attribute is 'group.' """ - def __init__( self, reader, fields, chrom_col, start_col, end_col, strand_col, default_strand, \ - fix_strand=False, raw_line='' ): + def __init__( self, reader, fields, chrom_col, start_col, end_col, strand_col, \ + score_col, default_strand, fix_strand=False, raw_line='' ): GenomicInterval.__init__( self, reader, fields, chrom_col, start_col, end_col, strand_col, \ default_strand, fix_strand=fix_strand ) + # Handle score column. 
+ self.score_col = score_col + if self.score_col >= self.nfields: + raise MissingFieldError( "No field for score_col (%d)" % score_col ) + self.score = self.fields[ self.score_col ] + + # Attributes specific to GFF. self.raw_line = raw_line self.attributes = parse_gff_attributes( fields[8] ) -class GFFFeature( GenomicInterval ): +class GFFFeature( GFFInterval ): """ A GFF feature, which can include multiple intervals. """ - def __init__( self, reader, chrom_col, start_col, end_col, strand_col, default_strand, \ + def __init__( self, reader, chrom_col, start_col, end_col, strand_col, score_col, default_strand, \ fix_strand=False, intervals=[] ): - GenomicInterval.__init__( self, reader, intervals[0].fields, chrom_col, start_col, end_col, \ - strand_col, default_strand, fix_strand=fix_strand ) + GFFInterval.__init__( self, reader, intervals[0].fields, chrom_col, start_col, end_col, \ + strand_col, score_col, default_strand, fix_strand=fix_strand ) self.intervals = intervals # Use intervals to set feature attributes. for interval in self.intervals: @@ -40,6 +47,15 @@ class GFFFeature( GenomicInterval ): if interval.end > self.end: self.end = interval.end + def name( self ): + """ Returns feature's name. """ + name = self.attributes.get( 'transcript_id', None ) + if not name: + name = self.attributes.get( 'id', None ) + if not name: + name = self.attributes.get( 'group', None ) + return name + class GFFIntervalToBEDReaderWrapper( NiceReaderWrapper ): """ Reader wrapper that reads GFF intervals/lines and automatically converts @@ -60,7 +76,7 @@ class GFFReaderWrapper( NiceReaderWrappe Reader wrapper for GFF files. 
Wrapper has two major functions: - (1) group entries for GFF file (via group column), GFF3 (via id attribute ), + (1) group entries for GFF file (via group column), GFF3 (via id attribute), or GTF (via gene_id/transcript id); (2) convert coordinates from GFF format--starting and ending coordinates are 1-based, closed--to the 'traditional'/BED interval format--0 based, @@ -68,24 +84,29 @@ class GFFReaderWrapper( NiceReaderWrappe expect traditional interval format. """ - def __init__( self, reader, **kwargs ): + def __init__( self, reader, chrom_col=0, start_col=3, end_col=4, strand_col=6, score_col=5, **kwargs ): """ Create wrapper. Defaults are group_entries=False and convert_coords_to_bed=True to support backward compatibility. """ + + # Add columns to kwargs here so that defaults can be used rather than + # requiring them to be passed in. + kwargs[ 'chrom_col' ] = chrom_col + kwargs[ 'start_col' ] = start_col + kwargs[ 'end_col' ] = end_col + kwargs[ 'strand_col' ] = strand_col NiceReaderWrapper.__init__( self, reader, **kwargs ) - self.group_entries = kwargs.get( 'group_entries', False ) - self.convert_coords_to_bed = kwargs.get( 'convert_coords_to_bed', True ) + # HACK: NiceReaderWrapper (bx-python) does not handle score_col yet, so store ourselves. 
+ self.score_col = score_col self.last_line = None self.cur_offset = 0 self.seed_interval = None def parse_row( self, line ): interval = GFFInterval( self, line.split( "\t" ), self.chrom_col, self.start_col, \ - self.end_col, self.strand_col, self.default_strand, \ + self.end_col, self.strand_col, self.score_col, self.default_strand, \ fix_strand=self.fix_strand, raw_line=line ) - if self.convert_coords_to_bed: - interval = convert_gff_coords_to_bed( interval ) return interval def next( self ): @@ -105,6 +126,9 @@ class GFFReaderWrapper( NiceReaderWrappe if self.skipped < 10: self.skipped_lines.append( ( self.linenum, self.current_line, str( e ) ) ) + # For debugging, uncomment this to propagate parsing exceptions up. + # raise e + # # Get next GFFFeature # @@ -159,7 +183,7 @@ class GFFReaderWrapper( NiceReaderWrappe # Return GFF feature with all intervals. return GFFFeature( self, self.chrom_col, self.start_col, self.end_col, self.strand_col, \ - self.default_strand, fix_strand=self.fix_strand, \ + self.score_col, self.default_strand, fix_strand=self.fix_strand, \ intervals=feature_intervals ) @@ -179,12 +203,15 @@ def convert_bed_coords_to_gff( interval def convert_gff_coords_to_bed( interval ): """ Converts an interval object's coordinates from GFF format to BED format. - Accepted object types include GenomicInterval and list (where the first - element in the list is the interval's start, and the second element is - the interval's end). + Accepted object types include GFFFeature, GenomicInterval, and list (where + the first element in the list is the interval's start, and the second + element is the interval's end). 
""" - if type( interval ) is GenomicInterval: + if isinstance( interval, GenomicInterval ): interval.start -= 1 + if isinstance( interval, GFFFeature ): + for subinterval in interval: + convert_gff_coords_to_bed( subinterval ) elif type ( interval ) is list: interval[ 0 ] -= 1 return interval --- a/lib/galaxy/datatypes/converters/gff_to_interval_index_converter.py +++ b/lib/galaxy/datatypes/converters/gff_to_interval_index_converter.py @@ -20,15 +20,9 @@ def main(): input_fname, out_fname = sys.argv[1:] # Do conversion. - chr_col, start_col, end_col, strand_col = ( 0, 3, 4, 6 ) index = Indexes() offset = 0 - reader_wrapper = GFFReaderWrapper( fileinput.FileInput( input_fname ), - chrom_col=chr_col, - start_col=start_col, - end_col=end_col, - strand_col=strand_col, - fix_strand=True ) + reader_wrapper = GFFReaderWrapper( fileinput.FileInput( input_fname ), fix_strand=True ) for feature in list( reader_wrapper ): # TODO: need to address comments: # if comment: @@ -49,7 +43,7 @@ def main(): offset += feature_len index.write( open(out_fname, "w") ) - + if __name__ == "__main__": main()