commit/galaxy-central: Jeremy Goecks: Genome data providers: (a) organize imports and (b) implement random and nth read iterators to downsample deep coverage regions. Use nth read iterator by default because it is the fastest.
1 new commit in galaxy-central:

https://bitbucket.org/galaxy/galaxy-central/commits/615d5040d90b/
Changeset:   615d5040d90b
User:        Jeremy Goecks
Date:        2014-04-01 16:30:12
Summary:     Genome data providers: (a) organize imports and (b) implement random and nth read iterators to downsample deep coverage regions. Use nth read iterator by default because it is the fastest.
Affected #:  3 files

diff -r abe1482b550884d58cda3ca1e46200afa6c09ee9 -r 615d5040d90b23a3476ff2e8c388c338fb031c09 lib/galaxy/visualization/data_providers/genome.py
--- a/lib/galaxy/visualization/data_providers/genome.py
+++ b/lib/galaxy/visualization/data_providers/genome.py
@@ -4,21 +4,24 @@
 import os, sys, re
 import pkg_resources
+import itertools
+import random
+
 pkg_resources.require( "bx-python" )
 pkg_resources.require( "pysam" )
 pkg_resources.require( "numpy" )
 import numpy
-from galaxy.datatypes.util.gff_util import convert_gff_coords_to_bed, GFFFeature, GFFInterval, GFFReaderWrapper, parse_gff_attributes
-from galaxy.util.json import from_json_string
 from bx.interval_index_file import Indexes
 from bx.bbi.bigwig_file import BigWigFile
 from bx.bbi.bigbed_file import BigBedFile
+from pysam import csamtools, ctabix
+
+from galaxy.datatypes.util.gff_util import convert_gff_coords_to_bed, GFFFeature, GFFInterval, GFFReaderWrapper, parse_gff_attributes
+from galaxy.util.json import from_json_string
 from galaxy.visualization.data_providers.basic import BaseDataProvider
 from galaxy.visualization.data_providers.cigar import get_ref_based_read_seq_and_cigar
 from galaxy.datatypes.interval import Bed, Gff, Gtf
-from pysam import csamtools, ctabix
-
 #
 # Utility functions.
 #
@@ -887,8 +890,10 @@
         except ValueError:
             return None

         return data
+
-    def process_data( self, iterator, start_val=0, max_vals=None, ref_seq=None, start=0, **kwargs ):
+    def process_data( self, iterator, start_val=0, max_vals=None, ref_seq=None,
+                      iterator_type='nth', mean_depth=None, start=0, end=0, **kwargs ):
         """
         Returns a dict with the following attributes::
@@ -911,7 +916,6 @@
             max_low - lowest coordinate for the returned reads
             max_high - highest coordinate for the returned reads
             message - error/informative message
-
         """

         # No iterator indicates no reads.
         if iterator is None:
@@ -921,14 +925,59 @@
         # Helper functions.
         #

-        # Decode strand from read flag.
         def decode_strand( read_flag, mask ):
+            """ Decode strand from read flag. """
+
             strand_flag = ( read_flag & mask == 0 )
             if strand_flag:
                 return "+"
             else:
                 return "-"

+        def _random_read_iterator( read_iterator, threshold ):
+            """
+            An iterator that returns a random stream of reads from the read_iterator
+            as well as corresponding pairs for returned reads.
+            threshold is a value in [0,1] that denotes the percentage of reads
+            to return.
+            """
+            for e in read_iterator:
+                if e.qname in paired_pending or random.uniform( 0, 1 ) <= threshold:
+                    yield e
+
+        def _nth_read_iterator( read_iterator, threshold ):
+            """
+            An iterator that returns every nth read.
+            """
+
+            # Convert threshold to N for stepping through iterator.
+            n = int( 1/threshold )
+            for e in itertools.islice( read_iterator, None, None, n ):
+                yield e
+
+            # Alternatate and much slower implementation that looks for pending pairs.
+            '''
+            for i, e in enumerate( read_iterator ):
+                if e.qname in paired_pending or ( i % n ) == 0:
+                    yield e
+            '''
+
+        # -- Choose iterator. --
+
+        # Calculate threshold for non-sequential iterators based on mean_depth and read length.
+        first_read = next( iterator )
+        read_len = len( first_read.seq )
+        num_reads = ( end - start ) * mean_depth / float ( read_len )
+        threshold = float( max_vals )/ num_reads
+        iterator = itertools.chain( iter( [ first_read ] ), iterator )
+
+        if iterator_type == 'sequential':
+            read_iterator = iterator
+        elif iterator_type == 'random':
+            read_iterator = _random_read_iterator( iterator, threshold )
+        elif iterator_type == 'nth':
+            read_iterator = _nth_read_iterator( iterator, threshold )
+
         #
         # Encode reads as list of lists.
         #
@@ -936,7 +985,8 @@
         paired_pending = {}
         unmapped = 0
         message = None
-        for count, read in enumerate( iterator ):
+        count = 0
+        for read in read_iterator:
             if count < start_val:
                 continue
             if ( count - start_val - unmapped ) >= max_vals:
@@ -958,7 +1008,8 @@
                 read_len = len(seq) # If no cigar, just use sequence length

             if read.is_proper_pair:
-                if qname in paired_pending: # one in dict is always first
+                if qname in paired_pending:
+                    # Found pair.
                     pair = paired_pending[qname]
                     results.append( [ "%i_%s" % ( pair['start'], qname ),
                                       pair['start'],
@@ -970,15 +1021,17 @@
                                       ] )
                     del paired_pending[qname]
                 else:
+                    # Insert first of pair.
                     paired_pending[qname] = { 'start': read.pos, 'end': read.pos + read_len, 'seq': seq, 'mate_start': read.mpos,
                                               'rlen': read_len, 'strand': strand, 'cigar': read.cigar, 'mapq': read.mapq }
+                    count += 1
             else:
                 results.append( [ "%i_%s" % ( read.pos, qname ), read.pos, read.pos + read_len, qname,
                                   read.cigar, strand, read.seq, read.mapq ] )
-
+                count += 1
+
         # Take care of reads whose mates are out of range.
-        # TODO: count paired reads when adhering to max_vals?
         for qname, read in paired_pending.iteritems():
             if read['mate_start'] < read['start']:
                 # Mate is before read.

diff -r abe1482b550884d58cda3ca1e46200afa6c09ee9 -r 615d5040d90b23a3476ff2e8c388c338fb031c09 lib/galaxy/webapps/galaxy/api/datasets.py
--- a/lib/galaxy/webapps/galaxy/api/datasets.py
+++ b/lib/galaxy/webapps/galaxy/api/datasets.py
@@ -2,7 +2,7 @@
 API operations on the contents of a history dataset.
 """
 from galaxy import web
-from galaxy.visualization.data_providers.genome import FeatureLocationIndexDataProvider
+from galaxy.visualization.data_providers.genome import FeatureLocationIndexDataProvider, SamDataProvider, BamDataProvider
 from galaxy.web.base.controller import BaseAPIController, UsesVisualizationMixin, UsesHistoryDatasetAssociationMixin
 from galaxy.web.base.controller import UsesHistoryMixin
 from galaxy.web.framework.helpers import is_true
@@ -146,6 +146,7 @@
         extra_info = None
         mode = kwargs.get( "mode", "Auto" )
         data_provider_registry = trans.app.data_provider_registry
+        indexer = None

         # Coverage mode uses index data.
         if mode == "Coverage":
@@ -192,16 +193,26 @@
         if max_vals is None:
             max_vals = data_provider.get_default_max_vals()

-        # Get reference sequence for region; this is used by providers for aligned reads.
+        # Get reference sequence and mean depth for region; these is used by providers for aligned reads.
         ref_seq = None
-        if dataset.dbkey:
-            data_dict = self.app.genomes.reference( trans, dbkey=dataset.dbkey, chrom=chrom, low=low, high=high )
-            if data_dict:
-                ref_seq = data_dict[ 'data' ]
+        mean_depth = None
+        if isinstance( data_provider, (SamDataProvider, BamDataProvider ) ):
+            # Get reference sequence.
+            if dataset.dbkey:
+                data_dict = self.app.genomes.reference( trans, dbkey=dataset.dbkey, chrom=chrom, low=low, high=high )
+                if data_dict:
+                    ref_seq = data_dict[ 'data' ]
+
+            # Get mean depth.
+            if not indexer:
+                indexer = data_provider_registry.get_data_provider( trans, original_dataset=dataset, source='index' )
+            stats = indexer.get_data( chrom, low, high, stats=True )
+            mean_depth = stats[ 'data' ][ 'mean' ]
+
         # Get and return data from data_provider.
         result = data_provider.get_data( chrom, int( low ), int( high ), int( start_val ), int( max_vals ),
-                                         ref_seq=ref_seq, **kwargs )
+                                         ref_seq=ref_seq, mean_depth=mean_depth, **kwargs )
         result.update( { 'dataset_type': data_provider.dataset_type, 'extra_info': extra_info } )
         return result

diff -r abe1482b550884d58cda3ca1e46200afa6c09ee9 -r 615d5040d90b23a3476ff2e8c388c338fb031c09 static/scripts/viz/trackster/tracks.js
--- a/static/scripts/viz/trackster/tracks.js
+++ b/static/scripts/viz/trackster/tracks.js
@@ -1952,6 +1952,9 @@
         this.has_icons = false;

         // Add message + action icons to tile's html.
+        /*
+        This does not work right now because a random set of reads is returned by the server.
+        When the server can respond with more data systematically, renable these icons.
         if (message) {
             this.has_icons = true;

@@ -1999,6 +2002,7 @@
                 e.stopPropagation();
             });
         }
+        */
     };
     extend(FeatureTrackTile.prototype, Tile.prototype);

Repository URL: https://bitbucket.org/galaxy/galaxy-central/
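For reference, below is a minimal, self-contained sketch of the two downsampling strategies this changeset adds, together with the threshold arithmetic worked out on illustrative numbers. The function names, the assumption that reads expose pysam-style qname attributes, and every number in the worked example are hypothetical and are not part of the commit.

import itertools
import random

def nth_read_iterator( read_iterator, threshold ):
    # Keep every nth read, where n = 1 / threshold.
    n = int( 1 / threshold )
    return itertools.islice( read_iterator, None, None, n )

def random_read_iterator( read_iterator, threshold, paired_pending=None ):
    # Keep each read with probability `threshold`, but always keep a read whose
    # mate has already been kept (is pending) so that pairs stay together.
    paired_pending = paired_pending if paired_pending is not None else {}
    for read in read_iterator:
        if read.qname in paired_pending or random.uniform( 0, 1 ) <= threshold:
            yield read

# Worked example of the threshold calculation (hypothetical numbers): a 10 kb window
# (end - start = 10000) at 500x mean depth with 100 bp reads and max_vals = 5000 gives
#   num_reads = ( end - start ) * mean_depth / read_len = 10000 * 500 / 100 = 50000
#   threshold = max_vals / num_reads = 5000 / 50000 = 0.1
# so the nth iterator keeps every int( 1 / 0.1 ) = 10th read.

The changeset makes the nth iterator the default because it is the fastest; a plausible reason is that itertools.islice skips the unwanted reads without a per-read random draw or qname lookup in Python, whereas the random iterator pays for both on every read.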