commit/galaxy-central: jgoecks: SAM/BAM data providers: add support for modifying CIGAR string so that CIGAR can be used, along with reference sequence, to infer read sequence. Parameter clean up as well.
1 new commit in galaxy-central: https://bitbucket.org/galaxy/galaxy-central/commits/4317a158fe8c/ Changeset: 4317a158fe8c User: jgoecks Date: 2013-04-04 16:16:53 Summary: SAM/BAM data providers: add support for modifying CIGAR string so that CIGAR can be used, along with reference sequence, to infer read sequence. Parameter clean up as well. Affected #: 4 files diff -r f3f8ae33a02a8eef02c5ecb876bfbcd67060fdc7 -r 4317a158fe8c90b01155e2a6e98f2c9ec44b29b1 lib/galaxy/visualization/data_providers/genome.py --- a/lib/galaxy/visualization/data_providers/genome.py +++ b/lib/galaxy/visualization/data_providers/genome.py @@ -2,7 +2,7 @@ Data providers for genome visualizations. """ -import os, sys +import os, sys, operator from math import ceil, log import pkg_resources pkg_resources.require( "bx-python" ) @@ -179,7 +179,7 @@ """ start, end = int( low ), int( high ) iterator = self.get_iterator( chrom, start, end, **kwargs ) - return self.process_data( iterator, start_val, max_vals, **kwargs ) + return self.process_data( iterator, start_val, max_vals, start=start, end=end, **kwargs ) def get_genome_data( self, chroms_info, **kwargs ): """ @@ -897,7 +897,7 @@ return None return data - def process_data( self, iterator, start_val=0, max_vals=None, **kwargs ): + def process_data( self, iterator, start_val=0, max_vals=None, ref_seq=None, start=0, **kwargs ): """ Returns a dict with the following attributes:: @@ -925,7 +925,86 @@ # No iterator indicates no reads. if iterator is None: return { 'data': [], 'message': None } - + + # + # Helper functions: + # + + def counter( s1, p1, s2, p2 ): + ''' + Count consecutive matches/mismatches between strings s1 and s2 + starting at p1 and p2, respectively. + ''' + + # Do initial comparison to determine whether to count matches or + # mismatches. + if s1[ p1 ] == s2[ p2 ]: + cmp_fn = operator.eq + match = True + else: + cmp_fn = operator.ne + match = False + + # Increment counts to move to next characters. + count = 1 + p1 += 1 + p2 += 1 + + # Count matches/mismatches. + while p1 < len( s1 ) and p2 < len( s2 ) and cmp_fn( s1[ p1 ], s2[ p2 ] ): + count += 1 + p1 += 1 + p2 += 1 + + return match, count + + def transform_cigar( read_seq, read_start, ref_seq, cigar ): + ''' + Returns a new cigar with =s and Xs replacing Ms; M operation is + imprecise because it can denote a sequence match or mismatch. + New cigar can be used with reference to reconstruct read sequence. + ''' + + if not ref_seq: + return read_seq, cigar + + # Set up position for reference, read. + ref_seq_pos = read_start - start + read_pos = 0 + + # Create new cigar. + new_cigar = [] + for op_tuple in cigar: + op, op_len = op_tuple + + # Op is index into string 'MIDNSHP=X' + if op == 0: # Match + # Transform Ms to =s and Xs. + new_op = [] + while read_pos < op_len: + match, count = counter( read_seq, read_pos, ref_seq, ref_seq_pos ) + if match: + new_op = 7 + else: + new_op = 8 + read_pos += count + ref_seq_pos += count + new_cigar.append( ( new_op, count ) ) + elif op == 1: # Insertion + new_cigar.append( op_tuple ) + elif op in [ 2, 3, 5, 6 ]: # Deletion, Skip, Soft Clipping or Padding + ref_seq_pos += op_len + new_cigar.append( op_tuple ) + elif op == 4: # Soft clipping + read_pos += op_len + new_cigar.append( op_tuple ) + elif op in [ 7, 8 ]: # Match or mismatch + read_pos += op_len + ref_seq_pos += op_len + new_cigar.append( op_tuple ) + + return new_cigar + # Decode strand from read flag. def decode_strand( read_flag, mask ): strand_flag = ( read_flag & mask == 0 ) @@ -934,7 +1013,11 @@ else: return "-" + # # Encode reads as list of lists. + # + if ref_seq: + ref_seq = ref_seq.upper() results = [] paired_pending = {} unmapped = 0 diff -r f3f8ae33a02a8eef02c5ecb876bfbcd67060fdc7 -r 4317a158fe8c90b01155e2a6e98f2c9ec44b29b1 lib/galaxy/visualization/genomes.py --- a/lib/galaxy/visualization/genomes.py +++ b/lib/galaxy/visualization/genomes.py @@ -296,7 +296,7 @@ return False - def reference( self, trans, dbkey, chrom, low, high, **kwargs ): + def reference( self, trans, dbkey, chrom, low, high ): """ Return reference data for a build. """ diff -r f3f8ae33a02a8eef02c5ecb876bfbcd67060fdc7 -r 4317a158fe8c90b01155e2a6e98f2c9ec44b29b1 lib/galaxy/webapps/galaxy/api/datasets.py --- a/lib/galaxy/webapps/galaxy/api/datasets.py +++ b/lib/galaxy/webapps/galaxy/api/datasets.py @@ -176,8 +176,16 @@ if max_vals is None: max_vals = data_provider.get_default_max_vals() + # Get reference sequence for region; this is used by providers for aligned reads. + ref_seq = None + if dataset.dbkey: + data_dict = self.app.genomes.reference( trans, dbkey=dataset.dbkey, chrom=chrom, low=low, high=high ) + if data_dict: + ref_seq = data_dict[ 'data' ] + # Get and return data from data_provider. - result = data_provider.get_data( chrom, int( low ), int( high ), int( start_val ), int( max_vals ), **kwargs ) + result = data_provider.get_data( chrom, int( low ), int( high ), int( start_val ), int( max_vals ), + ref_seq=ref_seq, **kwargs ) result.update( { 'dataset_type': data_provider.dataset_type, 'extra_info': extra_info } ) return result diff -r f3f8ae33a02a8eef02c5ecb876bfbcd67060fdc7 -r 4317a158fe8c90b01155e2a6e98f2c9ec44b29b1 lib/galaxy/webapps/galaxy/api/genomes.py --- a/lib/galaxy/webapps/galaxy/api/genomes.py +++ b/lib/galaxy/webapps/galaxy/api/genomes.py @@ -37,7 +37,7 @@ # Return info. rval = None if reference: - rval = self.app.genomes.reference( trans, dbkey=id, chrom=chrom, low=low, high=high, **kwd ) + rval = self.app.genomes.reference( trans, dbkey=id, chrom=chrom, low=low, high=high ) else: rval = self.app.genomes.chroms( trans, dbkey=id, num=num, chrom=chrom, low=low ) return rval Repository URL: https://bitbucket.org/galaxy/galaxy-central/ -- This is a commit notification from bitbucket.org. You are receiving this because you have the service enabled, addressing the recipient of this email.
participants (1)
-
commits-noreply@bitbucket.org