commit/galaxy-central: jgoecks: Add feature/attribute name indexing framework to converters. Provide full text indexing for GFF attributes.
1 new commit in galaxy-central: https://bitbucket.org/galaxy/galaxy-central/changeset/9e9e104ad5c9/ changeset: 9e9e104ad5c9 user: jgoecks date: 2012-08-16 22:32:00 summary: Add feature/attribute name indexing framework to converters. Provide full text indexing for GFF attributes. affected #: 7 files diff -r 0bfd0f25956e3be0f04f68c9e05c1d07f2948f98 -r 9e9e104ad5c93a2268bb00cfae09b57a31ece13c datatypes_conf.xml.sample --- a/datatypes_conf.xml.sample +++ b/datatypes_conf.xml.sample @@ -4,6 +4,7 @@ <datatype extension="ab1" type="galaxy.datatypes.binary:Ab1" mimetype="application/octet-stream" display_in_upload="true"/><datatype extension="afg" type="galaxy.datatypes.assembly:Amos" display_in_upload="false"/><datatype extension="axt" type="galaxy.datatypes.sequence:Axt" display_in_upload="true"/> + <datatype extension="fli" type="galaxy.datatypes.tabular:FeatureLocationIndex" display_in_upload="false"/><datatype extension="bam" type="galaxy.datatypes.binary:Bam" mimetype="application/octet-stream" display_in_upload="true"><converter file="bam_to_bai.xml" target_datatype="bai"/><converter file="bam_to_summary_tree_converter.xml" target_datatype="summary_tree" depends_on="bai"/> @@ -79,6 +80,7 @@ <converter file="gff_to_bed_converter.xml" target_datatype="bed"/><converter file="gff_to_interval_index_converter.xml" target_datatype="interval_index"/><converter file="gff_to_summary_tree_converter.xml" target_datatype="summary_tree"/> + <converter file="gff_to_fli_converter.xml" target_datatype="fli"/><display file="ensembl/ensembl_gff.xml" inherit="True"/><!-- <display file="gbrowse/gbrowse_gff.xml" inherit="True" /> --></datatype> diff -r 0bfd0f25956e3be0f04f68c9e05c1d07f2948f98 -r 9e9e104ad5c93a2268bb00cfae09b57a31ece13c lib/galaxy/datatypes/converters/gff_to_fli.py --- /dev/null +++ b/lib/galaxy/datatypes/converters/gff_to_fli.py @@ -0,0 +1,53 @@ +''' +Creates a feature location index for a given GFF file. +''' + +import sys +from galaxy import eggs +from galaxy.datatypes.util.gff_util import read_unordered_gtf, convert_gff_coords_to_bed + +# Process arguments. +in_fname = sys.argv[1] +out_fname = sys.argv[2] + +# Create dict of name-location pairings. +name_loc_dict = {} +for feature in read_unordered_gtf( open( in_fname, 'r' ) ): + for name in feature.attributes: + val = feature.attributes[ name ] + try: + float( val ) + continue + except: + convert_gff_coords_to_bed( feature ) + # Value is not a number, so it can be indexed. + if val not in name_loc_dict: + # Value is not in dictionary. + name_loc_dict[ val ] = { + 'contig': feature.chrom, + 'start': feature.start, + 'end': feature.end + } + else: + # Value already in dictionary, so update dictionary. + loc = name_loc_dict[ val ] + if feature.start < loc[ 'start' ]: + loc[ 'start' ] = feature.start + if feature.end > loc[ 'end' ]: + loc[ 'end' ] = feature.end + +# Print name, loc in sorted order. +out = open( out_fname, 'w' ) +max_len = 0 +entries = [] +for name in sorted( name_loc_dict.iterkeys() ): + loc = name_loc_dict[ name ] + entry = '%s\t%s' % ( name, '%s:%i-%i' % ( loc[ 'contig' ], loc[ 'start' ], loc[ 'end' ] ) ) + if len( entry ) > max_len: + max_len = len( entry ) + entries.append( entry ) + +out.write( str( max_len + 1 ).ljust( max_len ) + '\n' ) +for entry in entries: + out.write( entry.ljust( max_len ) + '\n' ) +out.close() \ No newline at end of file diff -r 0bfd0f25956e3be0f04f68c9e05c1d07f2948f98 -r 9e9e104ad5c93a2268bb00cfae09b57a31ece13c lib/galaxy/datatypes/converters/gff_to_fli_converter.xml --- /dev/null +++ b/lib/galaxy/datatypes/converters/gff_to_fli_converter.xml @@ -0,0 +1,13 @@ +<tool id="CONVERTER_gff_to_fli_0" name="Convert GFF to Feature Location Index"> + <!-- <description>__NOT_USED_CURRENTLY_FOR_CONVERTERS__</description> --> + <!-- Used on the metadata edit page. --> + <command interpreter="python">gff_to_fli.py $input1 $output1</command> + <inputs> + <param format="gff" name="input1" type="data" label="Choose GFF file"/> + </inputs> + <outputs> + <data format="fli" name="output1"/> + </outputs> + <help> + </help> +</tool> diff -r 0bfd0f25956e3be0f04f68c9e05c1d07f2948f98 -r 9e9e104ad5c93a2268bb00cfae09b57a31ece13c lib/galaxy/datatypes/tabular.py --- a/lib/galaxy/datatypes/tabular.py +++ b/lib/galaxy/datatypes/tabular.py @@ -638,3 +638,10 @@ dataset.metadata.reads = reads.keys() +class FeatureLocationIndex( Tabular ): + """ + An index that stores feature locations in tabular format. + """ + file_ext='fli' + MetadataElement( name="columns", default=2, desc="Number of columns", readonly=True, visible=False ) + MetadataElement( name="column_types", default=['str', 'str'], param=metadata.ColumnTypesParameter, desc="Column types", readonly=True, visible=False, no_value=[] ) \ No newline at end of file diff -r 0bfd0f25956e3be0f04f68c9e05c1d07f2948f98 -r 9e9e104ad5c93a2268bb00cfae09b57a31ece13c lib/galaxy/visualization/tracks/data_providers.py --- a/lib/galaxy/visualization/tracks/data_providers.py +++ b/lib/galaxy/visualization/tracks/data_providers.py @@ -2,7 +2,7 @@ Data providers for tracks visualizations. """ -import sys +import os, sys from math import ceil, log import pkg_resources pkg_resources.require( "bx-python" ) @@ -59,6 +59,51 @@ def _chrom_naming_matches( chrom1, chrom2 ): return ( chrom1.startswith( 'chr' ) and chrom2.startswith( 'chr' ) ) or ( not chrom1.startswith( 'chr' ) and not chrom2.startswith( 'chr' ) ) + +class FeatureLocationIndexDataProvider( object ): + ''' + + ''' + + def __init__( self, converted_dataset ): + self.converted_dataset = converted_dataset + + def get_data( self, query ): + # Init. + textloc_file = open( self.converted_dataset.file_name, 'r' ) + line_len = int( textloc_file.readline() ) + file_len = os.path.getsize( self.converted_dataset.file_name ) + + # Find query in file using binary search. + low = 0 + high = file_len / line_len + while low < high: + mid = ( low + high ) // 2 + position = mid * line_len + textloc_file.seek( position ) + + # Compare line with query and update low, high. + line = textloc_file.readline() + print '--', mid, line + if line < query: + low = mid + 1 + else: + high = mid + + position = low * line_len + + # At right point in file, generate hits. + result = [ ] + while True: + line = textloc_file.readline() + if not line.startswith( query ): + break + if line[ -1: ] == '\n': + line = line[ :-1 ] + result.append( line.split() ) + + textloc_file.close() + return result class TracksDataProvider( object ): """ Base class for tracks data providers. """ diff -r 0bfd0f25956e3be0f04f68c9e05c1d07f2948f98 -r 9e9e104ad5c93a2268bb00cfae09b57a31ece13c lib/galaxy/web/controllers/tracks.py --- a/lib/galaxy/web/controllers/tracks.py +++ b/lib/galaxy/web/controllers/tracks.py @@ -345,6 +345,20 @@ # Have data if we get here return { "status": messages.DATA, "valid_chroms": valid_chroms } + + @web.json + def feature_loc( self, trans, hda_ldda, dataset_id, query ): + """ + Returns features, locations in dataset that match query. Format is a + list of features; each feature is a list itself: [name, location] + """ + dataset = self.get_hda_or_ldda( trans, hda_ldda, dataset_id ) + converted_dataset = dataset.get_converted_dataset( trans, "fli" ) + data_provider = FeatureLocationIndexDataProvider( converted_dataset=converted_dataset ) + if data_provider: + return data_provider.get_data( query ) + else: + return 'None' @web.json def data( self, trans, hda_ldda, dataset_id, chrom, low, high, start_val=0, max_vals=None, **kwargs ): diff -r 0bfd0f25956e3be0f04f68c9e05c1d07f2948f98 -r 9e9e104ad5c93a2268bb00cfae09b57a31ece13c tools/filters/gff/sort_gtf.py --- a/tools/filters/gff/sort_gtf.py +++ b/tools/filters/gff/sort_gtf.py @@ -24,5 +24,6 @@ # Print feature. for interval in feature.intervals: out.write( "\t".join(interval.fields) ) +out.close() # TODO: print status information: how many lines processed and features found. \ No newline at end of file Repository URL: https://bitbucket.org/galaxy/galaxy-central/ -- This is a commit notification from bitbucket.org. You are receiving this because you have the service enabled, addressing the recipient of this email.
participants (1)
-
Bitbucket