# HG changeset patch --
Bitbucket.org
# Project galaxy-dist
# URL
http://bitbucket.org/galaxy/galaxy-dist/overview
# User Kelly Vincent <kpvincent(a)bx.psu.edu>
# Date 1277401418 14400
# Node ID df5010a5df581674896beec9b35ca50bf477f380
# Parent 42a4c30c7486b95ff8ce9c5fec116e76855996c7
Changed the way sam_indel_filter expects quality score cutoffs and fixed bug so it checks
qualities for correct bases in all cases
--- a/tools/samtools/sam_indel_filter.py
+++ b/tools/samtools/sam_indel_filter.py
@@ -26,14 +26,26 @@ def __main__():
# prep output file
output = open( options.output, 'wb' )
# patterns
- pat_indel = re.compile(
'(?P<before_match>(\d+[MIDNSHP])*)(?P<lmatch>\d+)M(?P<ins_del_width>\d+)(?P<ins_del>[ID])(?P<rmatch>\d+)M'
)
+ pat_indel = re.compile(
'(?P<before_match>(\d+[MNSHP])*)(?P<lmatch>\d+)M(?P<ins_del_width>\d+)(?P<ins_del>[ID])(?P<rmatch>\d+)M(?P<after_match>(\d+[MNSHP])*)'
)
pat_matches = re.compile( '(\d+[MIDNSHP])+' )
- qual_thresh = int( options.quality_threshold )
- adj_bases = int( options.adjacent_bases )
+ try:
+ qual_thresh = int( options.quality_threshold ) + 33
+ if qual_thresh < 33 or qual_thresh > 126:
+ raise ValueError
+ except ValueError:
+ stop_err( 'Your quality threshold should be an integer between 0 and 93,
inclusive.' )
+ try:
+ adj_bases = int( options.adjacent_bases )
+ if adj_bases < 1:
+ raise ValueError
+ except ValueError:
+ stop_err( 'The number of adjacent bases should be an integer greater than
1.' )
# record lines skipped because of more than one indel
multi_indel_lines = 0
# go through all lines in input file
- for line in open( options.input, 'rb' ):
+ for i,line in enumerate(open( options.input, 'rb' )):
+ if i > 1000:
+ break
if line and not line.startswith( '#' ) and not line.startswith(
'@' ) :
split_line = line.split( '\t' )
cigar = split_line[5]
@@ -52,15 +64,15 @@ def __main__():
if pre_left_groups:
for pl in pre_left_groups.groups():
if pl.endswith( 'M' ) or pl.endswith( 'S'
) or pl.endswith( 'P' ):
- pre_left += int( pl[:-1] )
+ pre_left += pl[:-1]
parts[ 'pre_left' ] = pre_left
matches.append( parts )
- cigar_copy = cigar_copy[ len( parts[ 'lmatch' ] ) : ]
+ cigar_copy = cigar_copy[ len( parts[ 'lmatch' ] ) + 1 : ]
# see if matches meet filter requirements
if len( matches ) > 1:
multi_indel_lines += 1
elif len( matches ) == 1:
- pre_left = matches[0][ 'pre_left' ]
+ pre_left = int( matches[0][ 'pre_left' ] )
left = int( matches[0][ 'lmatch' ] )
right = int( matches[0][ 'rmatch' ] )
if matches[0][ 'ins_del' ] == 'D':
@@ -69,25 +81,23 @@ def __main__():
middle = 0
# if there are enough adjacent bases to check, then do so
if left >= adj_bases and right >= adj_bases:
- qual = split_line[10]
- left_bases = qual[ pre_left : pre_left + left ][ -adj_bases : ]
- right_bases = qual[ pre_left + left + middle - 1 : pre_left + left +
middle + right ][ : adj_bases ]
+ quals = split_line[10]
+ left_quals = quals[ pre_left : pre_left + left ][ -adj_bases : ]
+ middle_quals = quals[ pre_left + left : pre_left + left + middle ]
+ right_quals = quals[ pre_left + left + middle : pre_left + left +
middle + right ][ : adj_bases ]
qual_thresh_met = True
- for l in left_bases:
+ for l in left_quals:
if ord( l ) < qual_thresh:
qual_thresh_met = False
break
if qual_thresh_met:
- for r in right_bases:
+ for r in right_quals:
if ord( r ) < qual_thresh:
qual_thresh_met = False
break
# if filter reqs met, output line
if qual_thresh_met:
output.write( line )
- # error if there are multiple indels
- elif len( matches ) > 1:
- stop_err( 'There is more than one indel present in the
alignment:\n%s' % line )
# close out file
output.close()
# if skipped lines because of more than one indel, output message
--- a/tools/samtools/sam_indel_filter.xml
+++ b/tools/samtools/sam_indel_filter.xml
@@ -9,7 +9,7 @@
</command><inputs><param format="sam" name="input1"
type="data" label="Select dataset to filter" />
- <param name="quality_threshold" type="integer"
value="40" label="Quality threshold for adjacent bases" help="You
need to give the true value, not offset (see chart below), regardless of source FASTQ
type" />
+ <param name="quality_threshold" type="integer"
value="40" label="Quality threshold for adjacent bases"
help="Takes Phred value assuming Sanger scale; usually between 0 and 40, but up to
93" /><param name="adjacent_bases" type="integer"
value="1" label="The number of adjacent bases to match on either side of
the indel" help="If one side is shorter than this width, it will still match if
the long-enough side matches" /></inputs><outputs>
@@ -18,19 +18,19 @@
<tests><test><param name="input1"
value="sam_indel_filter_in1.sam" ftype="sam"/>
- <param name="quality_threshold" value="47"/>
+ <param name="quality_threshold" value="14"/><param
name="adjacent_bases" value="2"/><output
name="out_file1" file="sam_indel_filter_out1.sam"
ftype="sam"/></test><test><param name="input1"
value="sam_indel_filter_in1.sam" ftype="sam"/>
- <param name="quality_threshold" value="62"/>
+ <param name="quality_threshold" value="29"/><param
name="adjacent_bases" value="5"/><output
name="out_file1" file="sam_indel_filter_out2.sam"
ftype="sam"/></test><test><param name="input1"
value="sam_indel_filter_in2.sam" ftype="sam"/>
- <param name="quality_threshold" value="40"/>
+ <param name="quality_threshold" value="7"/><param
name="adjacent_bases" value="1"/><output
name="out_file1" file="sam_indel_filter_out3.sam"
ftype="sam"/></test>
@@ -39,7 +39,7 @@
**What it does**
-Allows extracting indels from SAM. Currently it can handle SAM with alignments with only
one insertion or one deletion, and will throw an error if it encounters an alignment with
more than one. It matches CIGAR strings (column 6 in the SAM file) like 5M3I5M or 4M2D10M,
so there must be a match or mismatch of sufficient length on either side of the indel.
+Allows extracting indels from SAM. Currently it can handle SAM with alignments with only
one insertion or one deletion, and will skip that alignment if it encounters one with more
than one indel. It matches CIGAR strings (column 6 in the SAM file) like 5M3I5M or
4M2D10M, so there must be a match or mismatch of sufficient length on either side of the
indel.
-----
@@ -49,10 +49,7 @@ Suppose you have the following::
r770 89 ref 116 37 17M1I5M = 72131356 0
CACACTGTGACAGACAGCGCAGC 00/02!!0//1200210AA44/1 XT:A:U CM:i:2 SM:i:37 AM:i:0
X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:1 MD:Z:22
r770 181 ref 116 0 24M = 72131356 0
TTGGTGCGCGCGGTTGAGGGTTGG $$(#%%#$%#%####$%%##$###
- r1945 113 ref 181247988 0 23M 41710908 41710908 0
GAGAGAGAGAGAGAGAGAGAGAG PQRVUMNXYRPUXYXWXSOSZ]M XT:A:R CM:i:0 SM:i:0 AM:i:0
X0:i:163148 XM:i:0 XO:i:0 XG:i:0 MD:Z:23
r1945 177 ref 41710908 0 23M 190342418 181247988 0
AGAGAGAGAGAGAGAGAGAGAGA SQQWZYURVYWX]]YXTSY]]ZM XT:A:R CM:i:0 SM:i:0 AM:i:0
X0:i:163148 XM:i:0 XO:i:0 XG:i:0 MD:Z:23
- r2363 115 ref 19671878 0 23M = 19671877 -1
AGAGAGAGAGAGAGAGAGAGTCT 77543:<55#"4!&=964518A> XT:A:R CM:i:2
SM:i:0 AM:i:0 X0:i:4 X1:i:137 XM:i:2 XO:i:0 XG:i:0 MD:Z:23
- r2363 179 ref 19671877 0 23M = 19671878 1
GAGAGAGAGAGAGAGAGAGAGTC LE7402DD34FL:27AKE>;432 XT:A:R CM:i:0 SM:i:0 AM:i:0
X0:i:265 XM:i:0 XO:i:0 XG:i:0 MD:Z:23
r3671 117 ref 190342418 0 24M = 190342418 0
CTGGCGTTCTCGGCGTGGATGGGT #####$$##$#%#%%###%$#$##
r3671 153 ref 190342418 37 16M1I6M = 190342418 0
TCTAACTTAGCCTCATAATAGCT /<<!"0///////00/!!0121/ XT:A:U CM:i:2
SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:1 MD:Z:22
r3824 117 ref 80324999 0 24M = 80324999 0
TCCAGTCGCGTTGTTAGGTTCGGA #$#$$$#####%##%%###**#+/