1 new changeset in galaxy-central: http://bitbucket.org/galaxy/galaxy-central/changeset/fd714d78e05e/

changeset: r5413:fd714d78e05e
user:      kanwei
date:      2011-04-19 00:07:52
summary:   trackster:
           - Use tabix (through pysam) as the indexer for feature formats. Bed support included in this commit
           - Add bigBed format support (same interface as bigWig)
           - Improve implicit converter error handling
affected #: 11 files (3.2 KB)

--- a/datatypes_conf.xml.sample Mon Apr 18 17:02:32 2011 -0400
+++ b/datatypes_conf.xml.sample Mon Apr 18 18:07:52 2011 -0400
@@ -14,7 +14,8 @@
         <datatype extension="bed" type="galaxy.datatypes.interval:Bed" display_in_upload="true">
             <converter file="bed_to_gff_converter.xml" target_datatype="gff"/>
             <converter file="interval_to_coverage.xml" target_datatype="coverage"/>
-            <converter file="bed_to_interval_index_converter.xml" target_datatype="interval_index"/>
+            <converter file="bed_to_bgzip_converter.xml" target_datatype="bgzip"/>
+            <converter file="bed_to_tabix_converter.xml" target_datatype="tabix" depends_on="bgzip"/>
             <converter file="bed_to_summary_tree_converter.xml" target_datatype="summary_tree"/>
             <!-- <display file="ucsc/interval_as_bed.xml" /> -->
             <display file="genetrack.xml" />
@@ -131,6 +132,7 @@
         <datatype extension="summary_tree" type="galaxy.datatypes.data:Data" />
         <datatype extension="interval_index" type="galaxy.datatypes.data:Data" />
         <datatype extension="tabix" type="galaxy.datatypes.data:Data" />
+        <datatype extension="bgzip" type="galaxy.datatypes.data:Data" />
         <!-- Start EMBOSS tools -->
         <datatype extension="acedb" type="galaxy.datatypes.data:Text"/>
         <datatype extension="asn1" type="galaxy.datatypes.data:Text"/>
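The two converters registered above split Bed indexing into a bgzip compression step and a tabix indexing step, with the latter declaring depends_on="bgzip". The converter tool wrappers themselves (bed_to_bgzip_converter.xml, bed_to_tabix_converter.xml) are not part of this diff; purely as an illustrative sketch, the equivalent two-step conversion with pysam's tabix helpers looks roughly like this (file names are hypothetical):

    # Illustrative sketch of a bgzip + tabix conversion for a BED dataset using
    # pysam's tabix utilities; not the actual Galaxy converter tools.
    import pysam

    bed_path = "features.bed"        # hypothetical input dataset
    bgzip_path = "features.bed.gz"   # result of the "bgzip" conversion step

    # Step 1 ("bgzip" datatype): block-gzip the BED file so it can be indexed.
    pysam.tabix_compress( bed_path, bgzip_path, force=True )

    # Step 2 ("tabix" datatype, depends_on="bgzip"): build a tabix index over the
    # compressed copy; preset="bed" tells tabix which columns hold chrom/start/end.
    # This writes features.bed.gz.tbi alongside the compressed data.
    pysam.tabix_index( bgzip_path, preset="bed", force=True )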
--- a/eggs.ini Mon Apr 18 17:02:32 2011 -0400
+++ b/eggs.ini Mon Apr 18 18:07:52 2011 -0400
@@ -69,7 +69,7 @@
 bx_python = _494c2d1d68b3
 GeneTrack = _dev_48da9e998f0caf01c5be731e926f4b0481f658f0
 SQLAlchemy = _dev_r6498
-pysam = _kanwei_ae2bd50d9945
+pysam = _kanwei_595e4f94f935
 
 ; dependency source urls, necessary for scrambling. for an explanation, see
 ; the wiki page above

--- a/lib/galaxy/datatypes/binary.py Mon Apr 18 17:02:32 2011 -0400
+++ b/lib/galaxy/datatypes/binary.py Mon Apr 18 18:07:52 2011 -0400
@@ -288,3 +288,7 @@
         Binary.__init__( self, **kwd )
         self._magic = 0x8789F2EB
         self._name = "BigBed"
+
+    def get_track_type( self ):
+        return "LineTrack", {"data_standalone": "bigbed"}
+

--- a/lib/galaxy/datatypes/interval.py Mon Apr 18 17:02:32 2011 -0400
+++ b/lib/galaxy/datatypes/interval.py Mon Apr 18 18:07:52 2011 -0400
@@ -533,7 +533,7 @@
         except:
             return False
     def get_track_type( self ):
-        return "FeatureTrack", {"data": "interval_index", "index": "summary_tree"}
+        return "FeatureTrack", {"data": "tabix", "index": "summary_tree"}
 
 class BedStrict( Bed ):
     """Tab delimited data in strict BED format - no non-standard columns allowed"""
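Read together with the data_providers.py changes further down, these get_track_type() values are what route a track to the new providers: a "data" entry names a converted datatype served by a matching data provider, while "data_standalone" means the original file is read directly with no conversion. A condensed restatement (not an actual Galaxy structure) of the routing this changeset sets up:

    # Illustrative-only summary of the routing implied by this changeset.
    track_data_sources = {
        # datatype : (track type, data sources -> datatype used to serve them)
        "bed":    ( "FeatureTrack", {"data": "tabix", "index": "summary_tree"} ),
        "bigbed": ( "LineTrack",    {"data_standalone": "bigbed"} ),
    }
    # "tabix"  -> TabixDataProvider  (new, pysam/ctabix based)
    # "bigbed" -> BigBedDataProvider (new, shares the BBIDataProvider code with bigWig)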
--- a/lib/galaxy/model/__init__.py Mon Apr 18 17:02:32 2011 -0400
+++ b/lib/galaxy/model/__init__.py Mon Apr 18 18:07:52 2011 -0400
@@ -21,6 +21,18 @@
 
 datatypes_registry = galaxy.datatypes.registry.Registry() #Default Value Required for unit tests
 
+class NoConverterException(Exception):
+    def __init__(self, value):
+        self.value = value
+    def __str__(self):
+        return repr(self.value)
+
+class ConverterDependencyException(Exception):
+    def __init__(self, value):
+        self.value = value
+    def __str__(self):
+        return repr(self.value)
+
 def set_datatypes_registry( d_registry ):
     """
     Set up datatypes_registry
@@ -709,15 +721,33 @@
                 if not assoc.deleted and assoc.type == file_type:
                     return assoc.dataset
         return None
+    def get_converted_dataset_deps(self, trans, target_ext):
+        """
+        Returns dict of { "dependency" => HDA }
+        """
+        converted_dataset = self.get_converted_files_by_type( target_ext )
+        # List of string of dependencies
+        try:
+            depends_list = trans.app.datatypes_registry.converter_deps[self.extension][target_ext]
+        except KeyError:
+            depends_list = []
+        return dict([ (dep, self.get_converted_dataset(trans, dep)) for dep in depends_list ])
     def get_converted_dataset(self, trans, target_ext):
         """
-        Return converted dataset(s) if they exist. If not converted yet, do so and return None (the first time).
-        If unconvertible, raise exception.
+        Return converted dataset(s) if they exist, along with a dict of dependencies.
+        If not converted yet, do so and return None (the first time). If unconvertible, raise exception.
         """
         # See if we can convert the dataset
         if target_ext not in self.get_converter_types():
            raise ValueError("Conversion from '%s' to '%s' not possible", self.extension, target_ext)
+        deps = {}
+        # List of string of dependencies
+        try:
+            depends_list = trans.app.datatypes_registry.converter_deps[self.extension][target_ext]
+        except KeyError:
+            depends_list = []
+
         # See if converted dataset already exists
         converted_dataset = self.get_converted_files_by_type( target_ext )
         if converted_dataset:
@@ -725,20 +755,22 @@
 
         # Conversion is possible but hasn't been done yet, run converter.
         # Check if we have dependencies
-        deps = {}
+
         try:
-            fail_dependencies = False
-            depends_on = trans.app.datatypes_registry.converter_deps[self.extension][target_ext]
-            for dependency in depends_on:
+            for dependency in depends_list:
                 dep_dataset = self.get_converted_dataset(trans, dependency)
-                if dep_dataset is None or dep_dataset.state != trans.app.model.Job.states.OK:
-                    fail_dependencies = True
-                else:
-                    deps[dependency] = dep_dataset
-            if fail_dependencies:
-                return None
+                if dep_dataset is None:
+                    # None means converter is running first time
+                    return None
+                elif dep_dataset.state == trans.app.model.Job.states.ERROR:
+                    raise ConverterDependencyException("A dependency (%s) was in an error state." % dependency)
+                elif dep_dataset.state != trans.app.model.Job.states.OK:
+                    # Pending
+                    return None
+
+                deps[dependency] = dep_dataset
         except ValueError:
-            raise ValueError("A dependency could not be converted.")
+            raise NoConverterException("A dependency (%s) is missing a converter." % dependency)
         except KeyError:
             pass # No deps
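The reworked dependency handling above distinguishes three outcomes instead of collapsing them into a single failure flag: a dependency that is still being generated keeps returning None, a dependency whose conversion job failed raises ConverterDependencyException, and a missing converter surfaces as NoConverterException from the except ValueError path. A self-contained sketch of that contract, using simplified stand-ins rather than Galaxy's Job/HDA classes:

    # Simplified sketch of the dependency-resolution contract introduced above;
    # the states and accessor below are stand-ins, not Galaxy's model classes.
    OK, ERROR, RUNNING = "ok", "error", "running"

    class ConverterDependencyException(Exception):
        pass

    def resolve_deps( depends_list, get_converted ):
        """get_converted(ext) -> dataset-like object with .state, or None if conversion not started."""
        deps = {}
        for dependency in depends_list:
            dep_dataset = get_converted( dependency )
            if dep_dataset is None or dep_dataset.state == RUNNING:
                return None   # still converting; caller treats the track as pending
            if dep_dataset.state == ERROR:
                raise ConverterDependencyException( "A dependency (%s) was in an error state." % dependency )
            deps[dependency] = dep_dataset
        return deps           # e.g. {"bgzip": <bgzip-converted dataset>} for bed -> tabix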
""" - def _get_dataset( self ): - if self.converted_dataset is not None: - f = open( self.converted_dataset.file_name ) - else: - f = open( self.original_dataset.file_name ) - return f - def valid_chroms( self ): # No way to return this info as of now return None def has_data( self, chrom ): - f = self._get_dataset() - bw = BigWigFile(file=f) - all_dat = bw.query(chrom, 0, 2147483647, 1) + f, bbi = self._get_dataset() + all_dat = bbi.query(chrom, 0, 2147483647, 1) f.close() return all_dat is not None def get_data( self, chrom, start, end, **kwargs ): # Bigwig has the possibility of it being a standalone bigwig file, in which case we use # original_dataset, or coming from wig->bigwig conversion in which we use converted_dataset - f = self._get_dataset() - bw = BigWigFile(file=f) + f, bbi = self._get_dataset() if 'stats' in kwargs: - all_dat = bw.query(chrom, 0, 2147483647, 1) + all_dat = bbi.query(chrom, 0, 2147483647, 1) f.close() if all_dat is None: return None @@ -464,7 +456,7 @@ if (end - start) < num_points: num_points = end - start - data = bw.query(chrom, start, end, num_points) + data = bbi.query(chrom, start, end, num_points) f.close() pos = start @@ -477,6 +469,20 @@ return result +class BigBedDataProvider( BBIDataProvider ): + def _get_dataset( self ): + # Nothing converts to bigBed so we don't consider converted dataset + f = open( self.original_dataset.file_name ) + return f, BigBedFile(file=f) + +class BigWigDataProvider (BBIDataProvider ): + def _get_dataset( self ): + if self.converted_dataset is not None: + f = open( self.converted_dataset.file_name ) + else: + f = open( self.original_dataset.file_name ) + return f, BigWigFile(file=f) + class IntervalIndexDataProvider( TracksDataProvider ): """ Interval index data provider for the Galaxy track browser. @@ -557,10 +563,25 @@ return filters +class TabixDataProvider( IntervalIndexDataProvider ): + """ + Tabix index data provider for the Galaxy track browser. + + Payload format: [ uid (offset), start, end, name, strand, thick_start, thick_end, blocks ] + """ + def get_data( self, chrom, start, end, **kwargs ): + if end >= 2<<29: + end = (2<<29 - 1) # Tabix-enforced maximum start, end = int(start), int(end) - source = open( self.original_dataset.file_name ) - index = Indexes( self.converted_dataset.file_name ) + + # {'bgzip': (<galaxy.model.HistoryDatasetAssociation object at 0x85fbe90>, {})} + bgzip_fname = self.dependencies['bgzip'].file_name + + # if os.path.getsize(self.converted_dataset.file_name) == 0: + # return { 'kind': messages.ERROR, 'message': "Tabix converted size was 0, meaning the input file had invalid values." } + tabix = ctabix.Tabixfile(bgzip_fname, index_filename=self.converted_dataset.file_name) + results = [] count = 0 message = None @@ -569,7 +590,7 @@ # characters (e.g. 'chr') and see if that works. This enables the # provider to handle chrome names defined as chrXXX and as XXX. chrom = str(chrom) - if chrom not in index.indexes and chrom[3:] in index.indexes: + if chrom not in tabix.contigs and ("chr" + chrom[3:]) in tabix.contigs: chrom = chrom[3:] # @@ -581,12 +602,12 @@ # filter_cols = from_json_string( kwargs.get( "filter_cols", "[]" ) ) no_detail = ( "no_detail" in kwargs ) - for start, end, offset in index.find(chrom, start, end): + + for line in tabix.fetch(reference=chrom, start=start, end=end): if count >= MAX_VALS: message = ERROR_MAX_VALS % "features" break count += 1 - source.seek( offset ) # TODO: can we use column metadata to fill out payload? 
@@ -557,10 +563,25 @@
 
         return filters
 
+class TabixDataProvider( IntervalIndexDataProvider ):
+    """
+    Tabix index data provider for the Galaxy track browser.
+
+    Payload format: [ uid (offset), start, end, name, strand, thick_start, thick_end, blocks ]
+    """
+    def get_data( self, chrom, start, end, **kwargs ):
+        if end >= 2<<29:
+            end = (2<<29 - 1) # Tabix-enforced maximum
+
         start, end = int(start), int(end)
-        source = open( self.original_dataset.file_name )
-        index = Indexes( self.converted_dataset.file_name )
+
+        # {'bgzip': (<galaxy.model.HistoryDatasetAssociation object at 0x85fbe90>, {})}
+        bgzip_fname = self.dependencies['bgzip'].file_name
+
+        # if os.path.getsize(self.converted_dataset.file_name) == 0:
+        #     return { 'kind': messages.ERROR, 'message': "Tabix converted size was 0, meaning the input file had invalid values." }
+        tabix = ctabix.Tabixfile(bgzip_fname, index_filename=self.converted_dataset.file_name)
+
         results = []
         count = 0
         message = None
@@ -581,12 +602,12 @@
         #
         filter_cols = from_json_string( kwargs.get( "filter_cols", "[]" ) )
         no_detail = ( "no_detail" in kwargs )
-        for start, end, offset in index.find(chrom, start, end):
+
+        for line in tabix.fetch(reference=chrom, start=start, end=end):
             if count >= MAX_VALS:
                 message = ERROR_MAX_VALS % "features"
                 break
             count += 1
-            source.seek( offset )
             # TODO: can we use column metadata to fill out payload?
             # TODO: use function to set payload data
             if isinstance( self.original_dataset.datatype, Gff ):
@@ -597,35 +618,38 @@
                 payload.insert( 0, offset )
             elif isinstance( self.original_dataset.datatype, Bed ):
                 # BED dataset.
-                payload = [ offset, start, end ]
-                if not no_detail:
-                    feature = source.readline().split()
-                    length = len(feature)
-
-                    # Simpler way to add stuff, but type casting is not done.
-                    # Name, score, strand, thick start, thick end.
-                    #end = min( len( feature ), 8 )
-                    #payload.extend( feature[ 3:end ] )
-
-                    # Name, strand, thick start, thick end.
-                    if length >= 4:
-                        payload.append(feature[3])
-                    if length >= 6:
-                        payload.append(feature[5])
-                    if length >= 8:
-                        payload.append(int(feature[6]))
-                        payload.append(int(feature[7]))
+                feature = line.split()
+                length = len(feature)
+                payload = [ feature[1]+"-"+feature[2]+":"+str(count), int(feature[1]), int(feature[2]) ]
+
+                if no_detail:
+                    results.append( payload )
+                    continue
+
+                # Simpler way to add stuff, but type casting is not done.
+                # Name, score, strand, thick start, thick end.
+                #end = min( len( feature ), 8 )
+                #payload.extend( feature[ 3:end ] )
+
+                # Name, strand, thick start, thick end.
+                if length >= 4:
+                    payload.append(feature[3])
+                if length >= 6:
+                    payload.append(feature[5])
+                if length >= 8:
+                    payload.append(int(feature[6]))
+                    payload.append(int(feature[7]))
 
-                    # Blocks.
-                    if length >= 12:
-                        block_sizes = [ int(n) for n in feature[10].split(',') if n != '']
-                        block_starts = [ int(n) for n in feature[11].split(',') if n != '' ]
-                        blocks = zip( block_sizes, block_starts )
-                        payload.append( [ ( start + block[1], start + block[1] + block[0] ) for block in blocks ] )
-
-                    # Score (filter data)
-                    if length >= 5 and filter_cols and filter_cols[0] == "Score":
-                        payload.append( float(feature[4]) )
+                # Blocks.
+                if length >= 12:
+                    block_sizes = [ int(n) for n in feature[10].split(',') if n != '']
+                    block_starts = [ int(n) for n in feature[11].split(',') if n != '' ]
+                    blocks = zip( block_sizes, block_starts )
+                    payload.append( [ ( int(feature[1]) + block[1], int(feature[1]) + block[1] + block[0] ) for block in blocks ] )
+
+                # Score (filter data)
+                if length >= 5 and filter_cols and filter_cols[0] == "Score":
+                    payload.append( float(feature[4]) )
 
             results.append( payload )
@@ -671,10 +695,12 @@
 # is original dataset type. TODO: This needs to be more flexible.
 dataset_type_name_to_data_provider = {
     "array_tree": ArrayTreeDataProvider,
+    "tabix": TabixDataProvider,
     "interval_index": { "vcf": VcfDataProvider, "default" : IntervalIndexDataProvider },
     "bai": BamDataProvider,
     "summary_tree": SummaryTreeDataProvider,
-    "bigwig": BigWigDataProvider
+    "bigwig": BigWigDataProvider,
+    "bigbed": BigBedDataProvider
 }
 
 dataset_type_to_data_provider = {
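TabixDataProvider reads features through pysam's ctabix interface: the bgzip-compressed BED (the 'bgzip' dependency) holds the data, while the converted dataset holds the .tbi index, which is why an explicit index_filename is passed above. A rough standalone sketch of the same read path, assuming the conventional layout where the index sits next to the compressed file (file names and region are hypothetical):

    # Standalone sketch of the tabix read path wrapped by TabixDataProvider;
    # assumes features.bed.gz.tbi sits next to features.bed.gz, unlike Galaxy,
    # which keeps the data and index as two separate converted datasets.
    from pysam import ctabix

    tabix = ctabix.Tabixfile( "features.bed.gz" )

    chrom, start, end = "chr1", 100000, 200000
    # Tolerate datasets that name chromosomes without the "chr" prefix.
    if chrom not in tabix.contigs and chrom[3:] in tabix.contigs:
        chrom = chrom[3:]

    results = []
    for line in tabix.fetch( reference=chrom, start=start, end=end ):
        feature = line.split()   # BED columns: chrom, start, end, name, score, strand, ...
        payload = [ int(feature[1]), int(feature[2]) ]
        if len(feature) >= 4:
            payload.append( feature[3] )
        results.append( payload )

    tabix.close()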
--- a/lib/galaxy/web/controllers/tracks.py Mon Apr 18 17:02:32 2011 -0400
+++ b/lib/galaxy/web/controllers/tracks.py Mon Apr 18 18:07:52 2011 -0400
@@ -14,7 +14,7 @@
 from galaxy.web.framework.helpers import time_ago, grids
 from galaxy.util.bunch import Bunch
 from galaxy.datatypes.interval import Gff
-
+from galaxy.model import NoConverterException, ConverterDependencyException
 from galaxy.visualization.tracks.data_providers import *
 from galaxy.visualization.tracks.visual_analytics import get_tool_def, get_dataset_job
 
@@ -458,7 +458,8 @@
         # Check for data in the genome window.
         if data_sources.get( 'index' ):
             tracks_dataset_type = data_sources['index']['name']
-            indexer = get_data_provider( tracks_dataset_type )( dataset.get_converted_dataset( trans, tracks_dataset_type ), dataset )
+            converted_dataset = dataset.get_converted_dataset( trans, tracks_dataset_type )
+            indexer = get_data_provider( tracks_dataset_type )( converted_dataset, dataset )
             if not indexer.has_data( chrom ):
                 return messages.NO_DATA
         valid_chroms = indexer.valid_chroms()
@@ -505,7 +506,8 @@
         #
         # Have to choose between indexer and data provider
         tracks_dataset_type = data_sources['index']['name']
-        indexer = get_data_provider( tracks_dataset_type )( dataset.get_converted_dataset( trans, tracks_dataset_type ), dataset )
+        converted_dataset = dataset.get_converted_dataset( trans, tracks_dataset_type )
+        indexer = get_data_provider( tracks_dataset_type )( converted_dataset, dataset )
         summary = indexer.get_summary( chrom, low, high, **kwargs )
         if summary is None:
             return { 'dataset_type': tracks_dataset_type, 'data': None }
@@ -525,7 +527,9 @@
         else:
             tracks_dataset_type = data_sources['data']['name']
         data_provider_class = get_data_provider( name=tracks_dataset_type, original_dataset=dataset )
-        data_provider = data_provider_class( dataset.get_converted_dataset(trans, tracks_dataset_type), dataset )
+        converted_dataset = dataset.get_converted_dataset( trans, tracks_dataset_type )
+        deps = dataset.get_converted_dataset_deps( trans, tracks_dataset_type )
+        data_provider = data_provider_class( converted_dataset=converted_dataset, original_dataset=dataset, dependencies=deps )
 
         # Get and return data from data_provider.
         data = data_provider.get_data( chrom, low, high, **kwargs )
@@ -724,6 +728,7 @@
         track_type, data_sources = input_dataset.datatype.get_track_type()
         data_source = data_sources[ 'data' ]
         converted_dataset = input_dataset.get_converted_dataset( trans, data_source )
+        deps = input_dataset.get_converted_dataset_deps( trans, data_source )
 
         #
         # Create new HDA for input dataset's subset.
@@ -742,7 +747,8 @@
         # Write subset of data to new dataset
         data_provider_class = get_data_provider( original_dataset=input_dataset )
         data_provider = data_provider_class( original_dataset=input_dataset,
-                                             converted_dataset=converted_dataset )
+                                             converted_dataset=converted_dataset,
+                                             deps=deps )
         data_provider.write_data_to_file( chrom, low, high, new_dataset.file_name )
 
         # TODO: size not working.
@@ -828,9 +834,11 @@
         # necessary.
         try:
             converted_dataset = dataset.get_converted_dataset( trans, target_type )
-        except ValueError:
+        except NoConverterException:
            return messages.NO_CONVERTER
-
+        except ConverterDependencyException, dep_error:
+            return { 'kind': messages.ERROR, 'message': dep_error.value }
+
         # Check dataset state and return any messages.
         msg = None
         if converted_dataset and converted_dataset.state == model.Dataset.states.ERROR:

Repository URL: https://bitbucket.org/galaxy/galaxy-central/

--
This is a commit notification from bitbucket.org. You are receiving this because you have the service enabled, addressing the recipient of this email.