commit/galaxy-central: jgoecks: Viz framework: (a) push data provider creation to registry to simplify provider creation; (b) fix bugs in filters module naming; (c) enable deeper sampling in BBI data provider.
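The headline change is (a): the registry's get_data_provider() now takes trans and a source ('data', 'index', or 'data_standalone') and returns a fully constructed provider, instead of returning a class that every caller had to instantiate with converted datasets and dependencies. A minimal, self-contained sketch of that pattern — the names ExampleRegistry, TabixProvider, and 'my_dataset' are illustrative, not Galaxy's actual API:

    # Illustrative sketch only; not Galaxy code.
    class TabixProvider( object ):
        # Mirrors the new per-class 'data_type' attribute, which lets the API
        # layer report 'dataset_type' without threading names through callers.
        data_type = 'tabix'

        def __init__( self, original_dataset, converted_dataset=None, dependencies=None ):
            self.original_dataset = original_dataset
            self.converted_dataset = converted_dataset
            self.dependencies = dependencies

    class ExampleRegistry( object ):
        providers = { 'tabix': TabixProvider }

        def get_data_provider( self, name, original_dataset, converted_dataset=None ):
            # Instantiate here, once, instead of in every caller.
            return self.providers[ name ]( original_dataset, converted_dataset )

    provider = ExampleRegistry().get_data_provider( 'tabix', original_dataset='my_dataset' )
    print provider.data_type    # -> 'tabix'

As the datasets.py, tools.py, and controller.py hunks below show, call sites collapse to a single registry call once instantiation moves into the registry.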
1 new commit in galaxy-central:

https://bitbucket.org/galaxy/galaxy-central/changeset/bdd35af2a18a/
changeset: bdd35af2a18a
user:      jgoecks
date:      2012-09-21 20:00:09
summary:   Viz framework: (a) push data provider creation to registry to simplify provider creation; (b) fix bugs in filters module naming; (c) enable deeper sampling in BBI data provider.
affected #: 8 files

diff -r 8a8dcc0d36687fcfaaa1ae26e136e76b4a0d7bb0 -r bdd35af2a18afde11396fcf919b5fb4cacc986ff lib/galaxy/visualization/data_providers/basic.py
--- a/lib/galaxy/visualization/data_providers/basic.py
+++ b/lib/galaxy/visualization/data_providers/basic.py
@@ -100,62 +100,3 @@
         f.close()
 
         return data
-
-class DataProviderRegistry( object ):
-    """
-    Registry for data providers that enables listing and lookup.
-    """
-
-    def __init__( self ):
-        # Mapping from dataset type name to a class that can fetch data from a file of that
-        # type. First key is converted dataset type; if result is another dict, second key
-        # is original dataset type. TODO: This needs to be more flexible.
-        self.dataset_type_name_to_data_provider = {
-            "tabix": {
-                Vcf: VcfTabixDataProvider,
-                Bed: BedTabixDataProvider,
-                Gtf: GtfTabixDataProvider,
-                ENCODEPeak: ENCODEPeakTabixDataProvider,
-                Interval: IntervalTabixDataProvider,
-                ChromatinInteractions: ChromatinInteractionsTabixDataProvider,
-                "default" : TabixDataProvider
-            },
-            "interval_index": IntervalIndexDataProvider,
-            "bai": BamDataProvider,
-            "bam": SamDataProvider,
-            "summary_tree": SummaryTreeDataProvider,
-            "bigwig": BigWigDataProvider,
-            "bigbed": BigBedDataProvider
-        }
-
-    def get_data_provider( name=None, original_dataset=None ):
-        """
-        Returns data provider class by name and/or original dataset.
-        """
-        data_provider = None
-        if name:
-            value = dataset_type_name_to_data_provider[ name ]
-            if isinstance( value, dict ):
-                # Get converter by dataset extension; if there is no data provider,
-                # get the default.
-                data_provider = value.get( original_dataset.datatype.__class__, value.get( "default" ) )
-            else:
-                data_provider = value
-        elif original_dataset:
-            # Look up data provider from datatype's informaton.
-            try:
-                # Get data provider mapping and data provider for 'data'. If
-                # provider available, use it; otherwise use generic provider.
-                _ , data_provider_mapping = original_dataset.datatype.get_track_type()
-                if 'data_standalone' in data_provider_mapping:
-                    data_provider_name = data_provider_mapping[ 'data_standalone' ]
-                else:
-                    data_provider_name = data_provider_mapping[ 'data' ]
-                if data_provider_name:
-                    data_provider = self.get_data_provider( name=data_provider_name, original_dataset=original_dataset )
-                else:
-                    data_provider = GenomeDataProvider
-            except:
-                pass
-        return data_provider
-
\ No newline at end of file
diff -r 8a8dcc0d36687fcfaaa1ae26e136e76b4a0d7bb0 -r bdd35af2a18afde11396fcf919b5fb4cacc986ff lib/galaxy/visualization/data_providers/genome.py
--- a/lib/galaxy/visualization/data_providers/genome.py
+++ b/lib/galaxy/visualization/data_providers/genome.py
@@ -112,6 +112,8 @@
 
 class GenomeDataProvider( BaseDataProvider ):
     """ Base class for genome data providers. """
+
+    data_type = None
 
     """
     Mapping from column name to payload data; this mapping is used to create
@@ -314,6 +316,8 @@
 
 class TabixDataProvider( FilterableMixin, GenomeDataProvider ):
+    data_type = 'tabix'
+
     """
     Tabix index data provider for the Galaxy track browser.
""" @@ -354,8 +358,10 @@ # class IntervalDataProvider( GenomeDataProvider ): + data_type = 'interval_index' + """ - Processes BED data from native format to payload format. + Processes interval data from native format to payload format. Payload format: [ uid (offset), start, end, name, strand, thick_start, thick_end, blocks ] """ @@ -437,6 +443,8 @@ Payload format: [ uid (offset), start, end, name, strand, thick_start, thick_end, blocks ] """ + + data_type = 'interval_index' def get_iterator( self, chrom, start, end ): raise Exception( "Unimplemented Method" ) @@ -533,6 +541,8 @@ for large datasets. """ + data_type = 'interval_index' + def get_iterator( self, chrom=None, start=None, end=None ): # Read first line in order to match chrom naming format. line = source.readline() @@ -570,6 +580,8 @@ """ col_name_data_attr_mapping = { 'Qual' : { 'index': 6 , 'name' : 'Qual' } } + + data_type = 'bai' def process_data( self, iterator, start_val=0, max_vals=None, **kwargs ): """ @@ -675,6 +687,8 @@ for large datasets. """ + data_type = 'tabix' + def get_iterator( self, chrom, start, end ): # Read first line in order to match chrom naming format. line = source.readline() @@ -706,6 +720,8 @@ """ Summary tree data provider for the Galaxy track browser. """ + + data_type = 'summary_tree' CACHE = LRUCache( 20 ) # Store 20 recently accessed indices for performance @@ -771,6 +787,8 @@ Provides access to intervals from a sorted indexed BAM file. Position data is reported in 1-based, closed format, i.e. SAM/BAM format. """ + + data_type = 'bai' def get_filters( self ): """ @@ -951,6 +969,8 @@ return { 'data': results, 'message': message, 'max_low': max_low, 'max_high': max_high } class SamDataProvider( BamDataProvider ): + + data_type = 'bai' def __init__( self, converted_dataset=None, original_dataset=None, dependencies=None ): """ Create SamDataProvider. """ @@ -966,6 +986,9 @@ """ BBI data provider for the Galaxy track browser. """ + + data_type = 'bigwig' + def valid_chroms( self ): # No way to return this info as of now return None @@ -976,7 +999,7 @@ f.close() return all_dat is not None - def get_data( self, chrom, start, end, start_val=0, max_vals=None, **kwargs ): + def get_data( self, chrom, start, end, start_val=0, max_vals=None, num_samples=1000, **kwargs ): # Bigwig can be a standalone bigwig file, in which case we use # original_dataset, or coming from wig->bigwig conversion in # which we use converted_dataset @@ -1009,14 +1032,11 @@ return dict( data=dict( min=min, max=max, mean=mean, sd=sd ) ) - # Sample from region using approximately this many samples. - N = 1000 - def summarize_region( bbi, chrom, start, end, num_points ): ''' Returns results from summarizing a region using num_points. NOTE: num_points cannot be greater than end - start or BBI - will return None for all positions.s + will return None for all positions. ''' result = [] @@ -1042,7 +1062,8 @@ return result # Approach is different depending on region size. - if end - start < N: + num_samples = int( num_samples ) + if end - start < num_samples: # Get values for individual bases in region, including start and end. # To do this, need to increase end to next base and request number of points. num_points = end - start + 1 @@ -1050,10 +1071,10 @@ else: # # The goal is to sample the region between start and end uniformly - # using ~N data points. 
The challenge is that the size of sampled - # intervals rarely is full bases, so sampling using N points will - # leave the end of the region unsampled due to remainders for each - # interval. To recitify this, a new N is calculated based on the + # using ~N (num_samples) data points. The challenge is that the size of + # sampled intervals rarely is full bases, so sampling using N points + # will leave the end of the region unsampled due to remainders for + # each interval. To recitify this, a new N is calculated based on the # step size that covers as much of the region as possible. # # However, this still leaves some of the region unsampled. This @@ -1063,7 +1084,7 @@ # # Start with N samples. - num_points = N + num_points = num_samples step_size = ( end - start ) / num_points # Add additional points to sample in the remainder not covered by # the initial N samples. @@ -1100,6 +1121,8 @@ Interval index files used only for GFF files. """ col_name_data_attr_mapping = { 4 : { 'index': 4 , 'name' : 'Score' } } + + data_type = 'interval_index' def write_data_to_file( self, regions, filename ): source = open( self.original_dataset.file_name ) @@ -1177,6 +1200,8 @@ NOTE: this data provider does not use indices, and hence will be very slow for large datasets. """ + + data_type = 'interval_index' def get_iterator( self, chrom, start, end ): """ diff -r 8a8dcc0d36687fcfaaa1ae26e136e76b4a0d7bb0 -r bdd35af2a18afde11396fcf919b5fb4cacc986ff lib/galaxy/visualization/data_providers/registry.py --- a/lib/galaxy/visualization/data_providers/registry.py +++ b/lib/galaxy/visualization/data_providers/registry.py @@ -28,46 +28,63 @@ "bigbed": BigBedDataProvider } - def get_data_provider( self, name=None, raw=False, original_dataset=None ): + def get_data_provider( self, trans, name=None, source='data', raw=False, original_dataset=None ): """ Returns data provider class by name and/or original dataset. """ - # If getting raw data, use original dataset type to get data provider. + data_provider = None if raw: + # Working with raw data. if isinstance( original_dataset.datatype, Gff ): - return RawGFFDataProvider + data_provider_class = RawGFFDataProvider elif isinstance( original_dataset.datatype, Bed ): - return RawBedDataProvider + data_provider_class = RawBedDataProvider elif isinstance( original_dataset.datatype, Vcf ): - return RawVcfDataProvider + data_provider_class = RawVcfDataProvider elif isinstance( original_dataset.datatype, Tabular ): - return ColumnDataProvider + data_provider_class = ColumnDataProvider - # Using converted dataset, so get corrsponding data provider. - data_provider = None - if name: - value = self.dataset_type_name_to_data_provider[ name ] - if isinstance( value, dict ): - # Get converter by dataset extension; if there is no data provider, - # get the default. - data_provider = value.get( original_dataset.datatype.__class__, value.get( "default" ) ) - else: - data_provider = value - elif original_dataset: - # Look up data provider from datatype's informaton. - try: - # Get data provider mapping and data provider for 'data'. If - # provider available, use it; otherwise use generic provider. + data_provider = data_provider_class( original_dataset=original_dataset ) + + else: + # Working with converted or standalone dataset. + + if name: + # Provider requested by name; get from mappings. + value = self.dataset_type_name_to_data_provider[ name ] + if isinstance( value, dict ): + # Get converter by dataset extension; if there is no data provider, + # get the default. 
+                    data_provider_class = value.get( original_dataset.datatype.__class__, value.get( "default" ) )
+                else:
+                    data_provider_class = value
+
+                # If name is the same as original dataset's type, dataset is standalone.
+                # Otherwise, a converted dataset is being used.
+                if name == original_dataset.ext:
+                    data_provider = data_provider_class( original_dataset=original_dataset )
+                else:
+                    converted_dataset = original_dataset.get_converted_dataset( trans, name )
+                    deps = original_dataset.get_converted_dataset_deps( trans, name )
+                    data_provider = data_provider_class( original_dataset=original_dataset,
+                                                         converted_dataset=converted_dataset,
+                                                         dependencies=deps )
+
+            elif original_dataset:
+                # No name, so look up a provider name from datatype's information.
+
+                # Dataset must have get_track_type function to get data.
+                if not hasattr( original_dataset.datatype, 'get_track_type'):
+                    return None
+
+                # Get data provider mapping and data provider.
                _ , data_provider_mapping = original_dataset.datatype.get_track_type()
                if 'data_standalone' in data_provider_mapping:
                    data_provider_name = data_provider_mapping[ 'data_standalone' ]
                else:
-                    data_provider_name = data_provider_mapping[ 'data' ]
-                if data_provider_name:
-                    data_provider = self.get_data_provider( name=data_provider_name, original_dataset=original_dataset )
-                else:
-                    data_provider = GenomeDataProvider
-            except:
-                pass
+                    data_provider_name = data_provider_mapping[ source ]
+
+                data_provider = self.get_data_provider( trans, name=data_provider_name, original_dataset=original_dataset )
+
        return data_provider
\ No newline at end of file
diff -r 8a8dcc0d36687fcfaaa1ae26e136e76b4a0d7bb0 -r bdd35af2a18afde11396fcf919b5fb4cacc986ff lib/galaxy/web/api/datasets.py
--- a/lib/galaxy/web/api/datasets.py
+++ b/lib/galaxy/web/api/datasets.py
@@ -86,28 +86,14 @@
         if msg:
             return msg
 
-        # NOTE: finding valid chroms is prohibitive for large summary trees and is not currently used by
-        # the client.
-        valid_chroms = None
-
         # Check for data in the genome window.
         data_provider_registry = trans.app.data_provider_registry
-        if data_sources.get( 'index' ):
-            tracks_dataset_type = data_sources['index']['name']
-            converted_dataset = dataset.get_converted_dataset( trans, tracks_dataset_type )
-            indexer = data_provider_registry.get_data_provider( tracks_dataset_type )( converted_dataset, dataset )
-            if not indexer.has_data( chrom ):
-                return messages.NO_DATA
-            #valid_chroms = indexer.valid_chroms()
-        else:
-            # Standalone data provider
-            standalone_provider = data_provider_registry.get_data_provider( data_sources['data_standalone']['name'] )( dataset )
-            kwargs = {"stats": True}
-            if not standalone_provider.has_data( chrom ):
-                return messages.NO_DATA
-            #valid_chroms = standalone_provider.valid_chroms()
+        data_provider = trans.app.data_provider_registry.get_data_provider( trans, original_dataset= dataset, source='index' )
+        if not data_provider.has_data( chrom ):
+            return messages.NO_DATA
 
         # Have data if we get here
-        return { "status": messages.DATA, "valid_chroms": valid_chroms }
+        return { "status": messages.DATA, "valid_chroms": None }
 
     def _search_features( self, trans, dataset, query ):
         """
@@ -151,45 +137,32 @@
         data_provider_registry = trans.app.data_provider_registry
         if mode == "Coverage":
             # Get summary using minimal cutoffs.
-            tracks_dataset_type = data_sources['index']['name']
-            converted_dataset = dataset.get_converted_dataset( trans, tracks_dataset_type )
-            indexer = data_provider_registry.get_data_provider( tracks_dataset_type )( converted_dataset, dataset )
+            indexer = data_provider_registry.get_data_provider( trans, original_dataset=dataset, source='index' )
             summary = indexer.get_data( chrom, low, high, resolution=kwargs[ 'resolution' ], detail_cutoff=0, draw_cutoff=0 )
             if summary == "detail":
                 # Use maximum level of detail--2--to get summary data no matter the resolution.
                 summary = indexer.get_data( chrom, low, high, resolution=kwargs[ 'resolution' ], level=2, detail_cutoff=0, draw_cutoff=0 )
             frequencies, max_v, avg_v, delta = summary
-            return { 'dataset_type': tracks_dataset_type, 'data': frequencies, 'max': max_v, 'avg': avg_v, 'delta': delta }
+            return { 'dataset_type': indexer.data_type, 'data': frequencies, 'max': max_v, 'avg': avg_v, 'delta': delta }
 
         if 'index' in data_sources and data_sources['index']['name'] == "summary_tree" and mode == "Auto":
             # Only check for summary_tree if it's Auto mode (which is the default)
            #
            # Have to choose between indexer and data provider
-            tracks_dataset_type = data_sources['index']['name']
-            converted_dataset = dataset.get_converted_dataset( trans, tracks_dataset_type )
-            indexer = data_provider_registry.get_data_provider( tracks_dataset_type )( converted_dataset, dataset )
+            indexer = data_provider_registry.get_data_provider( trans, original_dataset=dataset, source='index' )
             summary = indexer.get_data( chrom, low, high, resolution=kwargs[ 'resolution' ] )
             if summary is None:
-                return { 'dataset_type': tracks_dataset_type, 'data': None }
+                return { 'dataset_type': indexer.data_type, 'data': None }
 
             if summary == "draw":
                 kwargs["no_detail"] = True # meh
                 extra_info = "no_detail"
             elif summary != "detail":
                 frequencies, max_v, avg_v, delta = summary
-                return { 'dataset_type': tracks_dataset_type, 'data': frequencies, 'max': max_v, 'avg': avg_v, 'delta': delta }
+                return { 'dataset_type': indexer.data_type, 'data': frequencies, 'max': max_v, 'avg': avg_v, 'delta': delta }
 
         # Get data provider.
-        if "data_standalone" in data_sources:
-            tracks_dataset_type = data_sources['data_standalone']['name']
-            data_provider_class = data_provider_registry.get_data_provider( name=tracks_dataset_type, original_dataset=dataset )
-            data_provider = data_provider_class( original_dataset=dataset )
-        else:
-            tracks_dataset_type = data_sources['data']['name']
-            data_provider_class = data_provider_registry.get_data_provider( name=tracks_dataset_type, original_dataset=dataset )
-            converted_dataset = dataset.get_converted_dataset( trans, tracks_dataset_type )
-            deps = dataset.get_converted_dataset_deps( trans, tracks_dataset_type )
-            data_provider = data_provider_class( converted_dataset=converted_dataset, original_dataset=dataset, dependencies=deps )
+        data_provider = data_provider_registry.get_data_provider( trans, original_dataset=dataset, source='data' )
 
         # Allow max_vals top be data provider set if not passed
         if max_vals is None:
@@ -197,7 +170,7 @@
 
         # Get and return data from data_provider.
         result = data_provider.get_data( chrom, int( low ), int( high ), int( start_val ), int( max_vals ), **kwargs )
-        result.update( { 'dataset_type': tracks_dataset_type, 'extra_info': extra_info } )
+        result.update( { 'dataset_type': data_provider.data_type, 'extra_info': extra_info } )
         return result
 
     def _raw_data( self, trans, dataset, **kwargs ):
@@ -214,7 +187,7 @@
 
         # Return data.
         data = None
-        data_provider = trans.app.data_provider_registry.get_data_provider( raw=True, original_dataset=dataset )
+        data_provider = trans.app.data_provider_registry.get_data_provider( trans, raw=True, original_dataset=dataset )
 
         if data_provider == ColumnDataProvider:
             #pre: should have column kwargs
@@ -222,13 +195,13 @@
             #TODO??: could default to first two here
             assert 'cols' in kwargs, (
                 "ColumnDataProvider needs a 'cols' parameter in the query string" )
-            data = data_provider( original_dataset=dataset ).get_data( **kwargs )
+            data = data_provider.get_data( **kwargs )
 
         else:
             # Default to genomic data.
             # FIXME: need better way to set dataset_type.
             low, high = int( kwargs.get( 'low' ) ), int( kwargs.get( 'high' ) )
-            data = data_provider( original_dataset=dataset ).get_data( start=low, end=high, **kwargs )
+            data = data_provider.get_data( start=low, end=high, **kwargs )
             data[ 'dataset_type' ] = 'interval_index'
             data[ 'extra_info' ] = None
             if isinstance( dataset.datatype, Vcf ):
diff -r 8a8dcc0d36687fcfaaa1ae26e136e76b4a0d7bb0 -r bdd35af2a18afde11396fcf919b5fb4cacc986ff lib/galaxy/web/api/tools.py
--- a/lib/galaxy/web/api/tools.py
+++ b/lib/galaxy/web/api/tools.py
@@ -204,15 +204,12 @@
         if run_on_regions:
             for jida in original_job.input_datasets:
                 input_dataset = jida.dataset
-                if data_provider_registry.get_data_provider( original_dataset=input_dataset ):
-                    # Can index dataset.
-                    track_type, data_sources = input_dataset.datatype.get_track_type()
-                    # Convert to datasource that provides 'data' because we need to
-                    # extract the original data.
-                    data_source = data_sources[ 'data' ]
-                    msg = self.convert_dataset( trans, input_dataset, data_source )
-                    if msg is not None:
-                        messages_list.append( msg )
+                data_provider = data_provider_registry.get_data_provider( trans, original_dataset=input_dataset, source='data' )
+                if data_provider:
+                    if not data_provider.converted_dataset:
+                        msg = self.convert_dataset( trans, input_dataset, data_source )
+                        if msg is not None:
+                            messages_list.append( msg )
 
         # Return any messages generated during conversions.
         return_message = get_highest_priority_msg( messages_list )
@@ -326,10 +323,7 @@
             trans.app.security_agent.set_all_dataset_permissions( new_dataset.dataset, hda_permissions )
 
             # Write subset of data to new dataset
-            data_provider_class = data_provider_registry.get_data_provider( original_dataset=input_dataset )
-            data_provider = data_provider_class( original_dataset=input_dataset,
-                                                 converted_dataset=converted_dataset,
-                                                 dependencies=deps )
+            data_provider = data_provider_registry.get_data_provider( trans, original_dataset=input_dataset, source='data' )
 
             trans.app.object_store.create( new_dataset.dataset )
             data_provider.write_data_to_file( regions, new_dataset.file_name )
diff -r 8a8dcc0d36687fcfaaa1ae26e136e76b4a0d7bb0 -r bdd35af2a18afde11396fcf919b5fb4cacc986ff lib/galaxy/web/base/controller.py
--- a/lib/galaxy/web/base/controller.py
+++ b/lib/galaxy/web/base/controller.py
@@ -460,8 +460,9 @@
         prefs = {}
 
         track_type, _ = dataset.datatype.get_track_type()
-        track_data_provider_class = trans.app.data_provider_registry.get_data_provider( original_dataset=dataset )
-        track_data_provider = track_data_provider_class( original_dataset=dataset )
+        track_data_provider = trans.app.data_provider_registry.get_data_provider( trans,
+                                                                                  original_dataset=dataset,
+                                                                                  source='data' )
 
         return {
             "track_type": track_type,
@@ -536,8 +537,8 @@
         """
         # Get data provider.
         track_type, _ = dataset.datatype.get_track_type()
-        track_data_provider_class = trans.app.data_provider_registry.get_data_provider( original_dataset=dataset )
-        track_data_provider = track_data_provider_class( original_dataset=dataset )
+        track_data_provider = trans.app.data_provider_registry.get_data_provider( trans, original_dataset=dataset )
+
         if isinstance( dataset, trans.app.model.HistoryDatasetAssociation ):
             hda_ldda = "hda"
diff -r 8a8dcc0d36687fcfaaa1ae26e136e76b4a0d7bb0 -r bdd35af2a18afde11396fcf919b5fb4cacc986ff static/scripts/viz/trackster/filters.js
--- a/static/scripts/viz/trackster/filters.js
+++ b/static/scripts/viz/trackster/filters.js
@@ -617,7 +617,8 @@
 });
 
 return {
-    FiltersManager: FiltersManager
+    FiltersManager: FiltersManager,
+    NumberFilter: NumberFilter
 };
 
 });
diff -r 8a8dcc0d36687fcfaaa1ae26e136e76b4a0d7bb0 -r bdd35af2a18afde11396fcf919b5fb4cacc986ff static/scripts/viz/trackster/tracks.js
--- a/static/scripts/viz/trackster/tracks.js
+++ b/static/scripts/viz/trackster/tracks.js
@@ -1,7 +1,7 @@
 define( ["libs/underscore", "viz/visualization", "viz/trackster/util",
          "viz/trackster/slotting", "viz/trackster/painters", "mvc/data",
          "viz/trackster/filters" ],
-        function( _, visualization, util, slotting, painters, data, filters ) {
+        function( _, visualization, util, slotting, painters, data, filters_mod ) {
 
 var extend = _.extend;
 var get_random_color = util.get_random_color;
@@ -587,7 +587,7 @@
         moveable(this.container_div, this.drag_handle_class, ".group", this);
 
         // Set up filters.
-        this.filters_manager = new filters.FiltersManager(this);
+        this.filters_manager = new filters_mod.FiltersManager(this);
         this.header_div.after(this.filters_manager.parent_div);
         // For saving drawables' filter managers when group-level filtering is done:
         this.saved_filters_managers = [];
@@ -601,7 +601,7 @@
         if ('filters' in obj_dict) {
             // FIXME: Pass collection_dict to DrawableCollection/Drawable will make this easier.
             var old_manager = this.filters_manager;
-            this.filters_manager = new filters.FiltersManager(this, obj_dict.filters);
+            this.filters_manager = new filters_mod.FiltersManager(this, obj_dict.filters);
             old_manager.parent_div.replaceWith(this.filters_manager.parent_div);
 
             if (obj_dict.filters.visible) {
@@ -761,7 +761,7 @@
             if (filters.length === num_feature_tracks) {
                 // Add new filter.
                 // FIXME: can filter.copy() be used?
-                new_filter = new NumberFilter( {
+                new_filter = new filters_mod.NumberFilter( {
                     name: filters[0].name,
                     index: filters[0].index
                 } );
@@ -2612,7 +2612,7 @@
         moveable(track.container_div, track.drag_handle_class, ".group", track);
 
         // Attribute init.
-        this.filters_manager = new filters.FiltersManager(this, ('filters' in obj_dict ? obj_dict.filters : null));
+        this.filters_manager = new filters_mod.FiltersManager(this, ('filters' in obj_dict ? obj_dict.filters : null));
         // HACK: set filters manager for data manager.
         // FIXME: prolly need function to set filters and update data_manager reference.
         this.data_manager.set('filters_manager', this.filters_manager);
@@ -4188,7 +4188,6 @@
     }
 };
 
-// Exports
 return {
     View: View,
     DrawableGroup: DrawableGroup,

Repository URL: https://bitbucket.org/galaxy/galaxy-central/

--

This is a commit notification from bitbucket.org. You are receiving
this because you have the service enabled, addressing the recipient of
this email.
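A note on change (c): BigWigDataProvider.get_data() now accepts a num_samples argument (default 1000) in place of the hard-coded N, so clients can request deeper sampling of a region. A self-contained sketch of the resulting sampling-point arithmetic, assuming integer genome coordinates; sampling_plan is a hypothetical helper for illustration, not Galaxy code:

    # Hypothetical helper, not Galaxy code: mirrors how num_samples drives
    # the point count and step size in the BBI provider's get_data().
    def sampling_plan( start, end, num_samples=1000 ):
        num_samples = int( num_samples )
        if end - start < num_samples:
            # Small region: sample every base, inclusive of start and end.
            return end - start + 1, 1
        # Large region: ~num_samples uniform points. The integer step size
        # leaves a remainder at the region's end; the real provider covers it
        # by summarizing extra points over the leftover interval.
        num_points = num_samples
        step_size = ( end - start ) // num_points
        return num_points, step_size

    print sampling_plan( 0, 250 )             # (251, 1): per-base sampling
    print sampling_plan( 0, 10000000 )        # (1000, 10000): default depth
    print sampling_plan( 0, 10000000, 5000 )  # (5000, 2000): deeper sampling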