# HG changeset patch -- Bitbucket.org # Project galaxy-dist # URL http://bitbucket.org/galaxy/galaxy-dist/overview # User Kelly Vincent <kpvincent@bx.psu.edu> # Date 1277401418 14400 # Node ID df5010a5df581674896beec9b35ca50bf477f380 # Parent 42a4c30c7486b95ff8ce9c5fec116e76855996c7 Changed the way sam_indel_filter expects quality score cutoffs and fixed bug so it checks qualities for correct bases in all cases --- a/tools/samtools/sam_indel_filter.py +++ b/tools/samtools/sam_indel_filter.py @@ -26,14 +26,26 @@ def __main__(): # prep output file output = open( options.output, 'wb' ) # patterns - pat_indel = re.compile( '(?P<before_match>(\d+[MIDNSHP])*)(?P<lmatch>\d+)M(?P<ins_del_width>\d+)(?P<ins_del>[ID])(?P<rmatch>\d+)M' ) + pat_indel = re.compile( '(?P<before_match>(\d+[MNSHP])*)(?P<lmatch>\d+)M(?P<ins_del_width>\d+)(?P<ins_del>[ID])(?P<rmatch>\d+)M(?P<after_match>(\d+[MNSHP])*)' ) pat_matches = re.compile( '(\d+[MIDNSHP])+' ) - qual_thresh = int( options.quality_threshold ) - adj_bases = int( options.adjacent_bases ) + try: + qual_thresh = int( options.quality_threshold ) + 33 + if qual_thresh < 33 or qual_thresh > 126: + raise ValueError + except ValueError: + stop_err( 'Your quality threshold should be an integer between 0 and 93, inclusive.' ) + try: + adj_bases = int( options.adjacent_bases ) + if adj_bases < 1: + raise ValueError + except ValueError: + stop_err( 'The number of adjacent bases should be an integer greater than 1.' ) # record lines skipped because of more than one indel multi_indel_lines = 0 # go through all lines in input file - for line in open( options.input, 'rb' ): + for i,line in enumerate(open( options.input, 'rb' )): + if i > 1000: + break if line and not line.startswith( '#' ) and not line.startswith( '@' ) : split_line = line.split( '\t' ) cigar = split_line[5] @@ -52,15 +64,15 @@ def __main__(): if pre_left_groups: for pl in pre_left_groups.groups(): if pl.endswith( 'M' ) or pl.endswith( 'S' ) or pl.endswith( 'P' ): - pre_left += int( pl[:-1] ) + pre_left += pl[:-1] parts[ 'pre_left' ] = pre_left matches.append( parts ) - cigar_copy = cigar_copy[ len( parts[ 'lmatch' ] ) : ] + cigar_copy = cigar_copy[ len( parts[ 'lmatch' ] ) + 1 : ] # see if matches meet filter requirements if len( matches ) > 1: multi_indel_lines += 1 elif len( matches ) == 1: - pre_left = matches[0][ 'pre_left' ] + pre_left = int( matches[0][ 'pre_left' ] ) left = int( matches[0][ 'lmatch' ] ) right = int( matches[0][ 'rmatch' ] ) if matches[0][ 'ins_del' ] == 'D': @@ -69,25 +81,23 @@ def __main__(): middle = 0 # if there are enough adjacent bases to check, then do so if left >= adj_bases and right >= adj_bases: - qual = split_line[10] - left_bases = qual[ pre_left : pre_left + left ][ -adj_bases : ] - right_bases = qual[ pre_left + left + middle - 1 : pre_left + left + middle + right ][ : adj_bases ] + quals = split_line[10] + left_quals = quals[ pre_left : pre_left + left ][ -adj_bases : ] + middle_quals = quals[ pre_left + left : pre_left + left + middle ] + right_quals = quals[ pre_left + left + middle : pre_left + left + middle + right ][ : adj_bases ] qual_thresh_met = True - for l in left_bases: + for l in left_quals: if ord( l ) < qual_thresh: qual_thresh_met = False break if qual_thresh_met: - for r in right_bases: + for r in right_quals: if ord( r ) < qual_thresh: qual_thresh_met = False break # if filter reqs met, output line if qual_thresh_met: output.write( line ) - # error if there are multiple indels - elif len( matches ) > 1: - stop_err( 'There is more than one indel present in the alignment:\n%s' % line ) # close out file output.close() # if skipped lines because of more than one indel, output message --- a/tools/samtools/sam_indel_filter.xml +++ b/tools/samtools/sam_indel_filter.xml @@ -9,7 +9,7 @@ </command><inputs><param format="sam" name="input1" type="data" label="Select dataset to filter" /> - <param name="quality_threshold" type="integer" value="40" label="Quality threshold for adjacent bases" help="You need to give the true value, not offset (see chart below), regardless of source FASTQ type" /> + <param name="quality_threshold" type="integer" value="40" label="Quality threshold for adjacent bases" help="Takes Phred value assuming Sanger scale; usually between 0 and 40, but up to 93" /><param name="adjacent_bases" type="integer" value="1" label="The number of adjacent bases to match on either side of the indel" help="If one side is shorter than this width, it will still match if the long-enough side matches" /></inputs><outputs> @@ -18,19 +18,19 @@ <tests><test><param name="input1" value="sam_indel_filter_in1.sam" ftype="sam"/> - <param name="quality_threshold" value="47"/> + <param name="quality_threshold" value="14"/><param name="adjacent_bases" value="2"/><output name="out_file1" file="sam_indel_filter_out1.sam" ftype="sam"/></test><test><param name="input1" value="sam_indel_filter_in1.sam" ftype="sam"/> - <param name="quality_threshold" value="62"/> + <param name="quality_threshold" value="29"/><param name="adjacent_bases" value="5"/><output name="out_file1" file="sam_indel_filter_out2.sam" ftype="sam"/></test><test><param name="input1" value="sam_indel_filter_in2.sam" ftype="sam"/> - <param name="quality_threshold" value="40"/> + <param name="quality_threshold" value="7"/><param name="adjacent_bases" value="1"/><output name="out_file1" file="sam_indel_filter_out3.sam" ftype="sam"/></test> @@ -39,7 +39,7 @@ **What it does** -Allows extracting indels from SAM. Currently it can handle SAM with alignments with only one insertion or one deletion, and will throw an error if it encounters an alignment with more than one. It matches CIGAR strings (column 6 in the SAM file) like 5M3I5M or 4M2D10M, so there must be a match or mismatch of sufficient length on either side of the indel. +Allows extracting indels from SAM. Currently it can handle SAM with alignments with only one insertion or one deletion, and will skip that alignment if it encounters one with more than one indel. It matches CIGAR strings (column 6 in the SAM file) like 5M3I5M or 4M2D10M, so there must be a match or mismatch of sufficient length on either side of the indel. ----- @@ -49,10 +49,7 @@ Suppose you have the following:: r770 89 ref 116 37 17M1I5M = 72131356 0 CACACTGTGACAGACAGCGCAGC 00/02!!0//1200210AA44/1 XT:A:U CM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:1 MD:Z:22 r770 181 ref 116 0 24M = 72131356 0 TTGGTGCGCGCGGTTGAGGGTTGG $$(#%%#$%#%####$%%##$### - r1945 113 ref 181247988 0 23M 41710908 41710908 0 GAGAGAGAGAGAGAGAGAGAGAG PQRVUMNXYRPUXYXWXSOSZ]M XT:A:R CM:i:0 SM:i:0 AM:i:0 X0:i:163148 XM:i:0 XO:i:0 XG:i:0 MD:Z:23 r1945 177 ref 41710908 0 23M 190342418 181247988 0 AGAGAGAGAGAGAGAGAGAGAGA SQQWZYURVYWX]]YXTSY]]ZM XT:A:R CM:i:0 SM:i:0 AM:i:0 X0:i:163148 XM:i:0 XO:i:0 XG:i:0 MD:Z:23 - r2363 115 ref 19671878 0 23M = 19671877 -1 AGAGAGAGAGAGAGAGAGAGTCT 77543:<55#"4!&=964518A> XT:A:R CM:i:2 SM:i:0 AM:i:0 X0:i:4 X1:i:137 XM:i:2 XO:i:0 XG:i:0 MD:Z:23 - r2363 179 ref 19671877 0 23M = 19671878 1 GAGAGAGAGAGAGAGAGAGAGTC LE7402DD34FL:27AKE>;432 XT:A:R CM:i:0 SM:i:0 AM:i:0 X0:i:265 XM:i:0 XO:i:0 XG:i:0 MD:Z:23 r3671 117 ref 190342418 0 24M = 190342418 0 CTGGCGTTCTCGGCGTGGATGGGT #####$$##$#%#%%###%$#$## r3671 153 ref 190342418 37 16M1I6M = 190342418 0 TCTAACTTAGCCTCATAATAGCT /<<!"0///////00/!!0121/ XT:A:U CM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:1 MD:Z:22 r3824 117 ref 80324999 0 24M = 80324999 0 TCCAGTCGCGTTGTTAGGTTCGGA #$#$$$#####%##%%###**#+/