1 new commit in galaxy-central: https://bitbucket.org/galaxy/galaxy-central/commits/ce3143a06f21/ Changeset: ce3143a06f21 User: jgoecks Date: 2013-04-05 15:39:05 Summary: Implement a reference-based compression approach for encoding aligned reads' sequence and cigar. Move cigar functions into own file. Affected #: 2 files diff -r 1829e8c8f8bb632b8fd1c9224dd3da6849246568 -r ce3143a06f21f636145a03bf1d62c4f8a6ccaa6b lib/galaxy/visualization/data_providers/cigar.py --- /dev/null +++ b/lib/galaxy/visualization/data_providers/cigar.py @@ -0,0 +1,103 @@ +''' +Functions for working with SAM/BAM CIGAR representation. +''' + +import operator + +def get_ref_based_read_seq_and_cigar( read_seq, read_start, ref_seq, ref_seq_start, cigar ): + ''' + Returns a ( new_read_seq, new_cigar ) that can be used with reference + sequence to reconstruct the read. The new read sequence includes only + bases that cannot be recovered from the reference: mismatches and + insertions (soft clipped bases are not included). The new cigar replaces + Ms with =s and Xs because the M operation can denote a sequence match or + mismatch. + ''' + + if not ref_seq: + return read_seq, cigar + + # Set up position for reference, read. + ref_seq_pos = read_start - ref_seq_start + read_pos = 0 + + # Create new read sequence, cigar. + new_read_seq = '' + new_cigar = [] + for op_tuple in cigar: + op, op_len = op_tuple + + # Op is index into string 'MIDNSHP=X' + if op == 0: # Match + # Transform Ms to =s and Xs. + new_op = [] + total_count = 0 + while total_count < op_len and ref_seq_pos < len( ref_seq ): + match, count = _match_mismatch_counter( read_seq, read_pos, ref_seq, ref_seq_pos ) + # Use min because count cannot exceed remainder of operation. + count = min( count, op_len - total_count ) + if match: + new_op = 7 + else: + new_op = 8 + # Include mismatched bases in new read sequence. + new_read_seq += read_seq[ read_pos:read_pos + count ] + new_cigar.append( ( new_op, count ) ) + total_count += count + read_pos += count + ref_seq_pos += count + + # If part of read falls outside of ref_seq dat, then leave + # part as M. + if total_count < op_len: + new_cigar.append( ( 0, op_len - total_count ) ) + elif op == 1: # Insertion + new_cigar.append( op_tuple ) + # Include insertion bases in new read sequence. + new_read_seq += read_seq[ read_pos:read_pos + op_len ] + read_pos += op_len + elif op in [ 2, 3, 6 ]: # Deletion, Skip, or Padding + ref_seq_pos += op_len + new_cigar.append( op_tuple ) + elif op == 4: # Soft clipping + read_pos += op_len + new_cigar.append( op_tuple ) + elif op == 5: # Hard clipping + new_cigar.append( op_tuple ) + elif op in [ 7, 8 ]: # Match or mismatch + if op == 8: + # Include mismatched bases in new read sequence. + new_read_seq += read_seq[ read_pos:read_pos + op_len ] + read_pos += op_len + ref_seq_pos += op_len + new_cigar.append( op_tuple ) + + return ( new_read_seq, new_cigar ) + +def _match_mismatch_counter( s1, p1, s2, p2 ): + ''' + Count consecutive matches/mismatches between strings s1 and s2 + starting at p1 and p2, respectively. + ''' + + # Do initial comparison to determine whether to count matches or + # mismatches. + if s1[ p1 ] == s2[ p2 ]: + cmp_fn = operator.eq + match = True + else: + cmp_fn = operator.ne + match = False + + # Increment counts to move to next characters. + count = 1 + p1 += 1 + p2 += 1 + + # Count matches/mismatches. + while p1 < len( s1 ) and p2 < len( s2 ) and cmp_fn( s1[ p1 ], s2[ p2 ] ): + count += 1 + p1 += 1 + p2 += 1 + + return match, count diff -r 1829e8c8f8bb632b8fd1c9224dd3da6849246568 -r ce3143a06f21f636145a03bf1d62c4f8a6ccaa6b lib/galaxy/visualization/data_providers/genome.py --- a/lib/galaxy/visualization/data_providers/genome.py +++ b/lib/galaxy/visualization/data_providers/genome.py @@ -927,93 +927,9 @@ return { 'data': [], 'message': None } # - # Helper functions: + # Helper functions. # - def counter( s1, p1, s2, p2 ): - ''' - Count consecutive matches/mismatches between strings s1 and s2 - starting at p1 and p2, respectively. - ''' - - # Do initial comparison to determine whether to count matches or - # mismatches. - if s1[ p1 ] == s2[ p2 ]: - cmp_fn = operator.eq - match = True - else: - cmp_fn = operator.ne - match = False - - # Increment counts to move to next characters. - count = 1 - p1 += 1 - p2 += 1 - - # Count matches/mismatches. - while p1 < len( s1 ) and p2 < len( s2 ) and cmp_fn( s1[ p1 ], s2[ p2 ] ): - count += 1 - p1 += 1 - p2 += 1 - - return match, count - - def transform_cigar( read_seq, read_start, ref_seq, cigar ): - ''' - Returns a new cigar with =s and Xs replacing Ms; M operation is - imprecise because it can denote a sequence match or mismatch. - New cigar can be used with reference to reconstruct read sequence. - ''' - - if not ref_seq: - return read_seq, cigar - - # Set up position for reference, read. - ref_seq_pos = read_start - start - read_pos = 0 - - # Create new cigar. - new_cigar = [] - for op_tuple in cigar: - op, op_len = op_tuple - - # Op is index into string 'MIDNSHP=X' - if op == 0: # Match - # Transform Ms to =s and Xs. - new_op = [] - total_count = 0 - while total_count < op_len and ref_seq_pos < len( ref_seq ): - match, count = counter( read_seq, read_pos, ref_seq, ref_seq_pos ) - # Use min because count cannot exceed remainder of operation. - count = min( count, op_len - total_count ) - if match: - new_op = 7 - else: - new_op = 8 - new_cigar.append( ( new_op, count ) ) - total_count += count - read_pos += count - ref_seq_pos += count - - # If part of read falls outside of ref_seq dat, then leave - # part as M. - if total_count < op_len: - new_cigar.append( ( 0, op_len - total_count ) ) - elif op == 1: # Insertion - new_cigar.append( op_tuple ) - elif op in [ 2, 3, 5, 6 ]: # Deletion, Skip, Soft Clipping or Padding - ref_seq_pos += op_len - new_cigar.append( op_tuple ) - elif op == 4: # Soft clipping - read_pos += op_len - new_cigar.append( op_tuple ) - elif op in [ 7, 8 ]: # Match or mismatch - read_pos += op_len - ref_seq_pos += op_len - new_cigar.append( op_tuple ) - - return new_cigar - # Decode strand from read flag. def decode_strand( read_flag, mask ): strand_flag = ( read_flag & mask == 0 ) @@ -1022,7 +938,7 @@ else: return "-" - # + # # Encode reads as list of lists. # if ref_seq: Repository URL: https://bitbucket.org/galaxy/galaxy-central/ -- This is a commit notification from bitbucket.org. You are receiving this because you have the service enabled, addressing the recipient of this email.