# HG changeset patch -- Bitbucket.org # Project galaxy-dist # URL http://bitbucket.org/galaxy/galaxy-dist/overview # User Kelly Vincent kpvincent@bx.psu.edu # Date 1275938612 14400 # Node ID c3ced3ed4fee6e0bdb5acbc7bdc2518995f97e7a # Parent f7cf88978f1f848b97c5da1bfe6e48aeb1930378 Adding SAM indel filter tool with test files
--- /dev/null +++ b/test-data/sam_indel_filter_out1.sam @@ -0,0 +1,3 @@ +1378_28_770 89 chr11.nib:1-134452384 72131356 37 17M1I5M = 72131356 0 CACACTGTGACAGACAGCGCAGC 00/02!!0//1200210AA44/1 XT:A:U CM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:1 MD:Z:22 +1378_69_800 16 chr11.nib:1-125234658 241 255 15M1D8M * 0 0 CGTGGCCGGCGGGCCGAAGGCAT IIIIIIIIIICCCCIII?IIIII +1378_72_1612 151 chrY.nib:1-124295114 190342418 37 19M1I3M = 190342418 0 TCTAACTTAGCCTCATAATAGCT /<<!"0/4//7//00/BC0121/ XT:A:U CM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:1 MD:Z:22
--- /dev/null +++ b/tools/samtools/sam_indel_filter.xml @@ -0,0 +1,74 @@ +<tool id="sam_indel_filter" name="Filter SAM" version="1.0.0"> + <description>for indels</description> + <command interpreter="python"> + sam_indel_filter.py + --input=$input1 + --quality_threshold=$quality_threshold + --adjacent_bases=$adjacent_bases + --output=$out_file1 + </command> + <inputs> + <param format="sam" name="input1" type="data" label="Select dataset to filter" /> + <param name="quality_threshold" type="integer" value="40" label="Quality threshold for adjacent bases" help="You need to give the true value, not offset (see chart below), regardless of source FASTQ type" /> + <param name="adjacent_bases" type="integer" value="5" label="The number of adjacent bases to match on either side of the indel" help="If one side is shorter than this width, it will still match if the long-enough side matches" /> + </inputs> + <outputs> + <data format="sam" name="out_file1" /> + </outputs> + <tests> + <test> + <param name="input1" value="sam_indel_filter_in1.sam" ftype="sam"/> + <param name="quality_threshold" value="47"/> + <param name="adjacent_bases" value="2"/> + <output name="out_file1" file="sam_indel_filter_out1.sam" ftype="sam"/> + </test> + <test> + <param name="input1" value="sam_indel_filter_in1.sam" ftype="sam"/> + <param name="quality_threshold" value="62"/> + <param name="adjacent_bases" value="5"/> + <output name="out_file1" file="sam_indel_filter_out2.sam" ftype="sam"/> + </test> + </tests> + <help> + +**What it does** + +Allows extracting indels from SAM. Currently it can handle SAM with alignments with only one insertion or one deletion, and will throw an error if it encounters an alignment with more than one. It matches CIGAR strings (column 6 in the SAM file) like 5M3I5M or 4M2D10M, so there must be a match or mismatch of sufficient length on either side of the indel. + +----- + +**Example** + +Suppose you have the following:: + + r770 89 ref 116 37 17M1I5M = 72131356 0 CACACTGTGACAGACAGCGCAGC 00/02!!0//1200210AA44/1 XT:A:U CM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:1 MD:Z:22 + r770 181 ref 116 0 24M = 72131356 0 TTGGTGCGCGCGGTTGAGGGTTGG $$(#%%#$%#%####$%%##$### + r1945 113 ref 181247988 0 23M 41710908 41710908 0 GAGAGAGAGAGAGAGAGAGAGAG PQRVUMNXYRPUXYXWXSOSZ]M XT:A:R CM:i:0 SM:i:0 AM:i:0 X0:i:163148 XM:i:0 XO:i:0 XG:i:0 MD:Z:23 + r1945 177 ref 41710908 0 23M 190342418 181247988 0 AGAGAGAGAGAGAGAGAGAGAGA SQQWZYURVYWX]]YXTSY]]ZM XT:A:R CM:i:0 SM:i:0 AM:i:0 X0:i:163148 XM:i:0 XO:i:0 XG:i:0 MD:Z:23 + r2363 115 ref 19671878 0 23M = 19671877 -1 AGAGAGAGAGAGAGAGAGAGTCT 77543:<55#"4!&=964518A> XT:A:R CM:i:2 SM:i:0 AM:i:0 X0:i:4 X1:i:137 XM:i:2 XO:i:0 XG:i:0 MD:Z:23 + r2363 179 ref 19671877 0 23M = 19671878 1 GAGAGAGAGAGAGAGAGAGAGTC LE7402DD34FL:27AKE>;432 XT:A:R CM:i:0 SM:i:0 AM:i:0 X0:i:265 XM:i:0 XO:i:0 XG:i:0 MD:Z:23 + r3671 117 ref 190342418 0 24M = 190342418 0 CTGGCGTTCTCGGCGTGGATGGGT #####$$##$#%#%%###%$#$## + r3671 153 ref 190342418 37 16M1I6M = 190342418 0 TCTAACTTAGCCTCATAATAGCT /<<!"0///////00/!!0121/ XT:A:U CM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:1 MD:Z:22 + r3824 117 ref 80324999 0 24M = 80324999 0 TCCAGTCGCGTTGTTAGGTTCGGA #$#$$$#####%##%%###**#+/ + r3824 153 ref 80324999 37 8M1I14M = 80324999 0 TTTAGCCCGAAATGCCTAGAGCA 4;6//11!"11100110////00 XT:A:U CM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:1 MD:Z:22 + r4795 81 ref 26739130 0 23M 57401793 57401793 0 TGGCATTCCTGTAGGCAGAGAGG AZWWZS]!"QNXZ]VQ]]]/2]] XT:A:R CM:i:2 SM:i:0 AM:i:0 X0:i:3 X1:i:0 XM:i:2 XO:i:0 XG:i:0 MD:Z:23 + r4795 161 ref 57401793 37 23M 26739130 26739130 0 GATCACCCAGGTGATGTAACTCC ]WV]]]]WW]]]]]]]]]]PU]] XT:A:U CM:i:0 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:23 + r4800 16 ref 241 255 15M1D8M * 0 0 CGTGGCCGGCGGGCCGAAGGCAT IIIIIIIIIICCCCIII?IIIII + r5377 170 ref 59090793 37 23M 26739130 26739130 0 TATCAATAAGGTGATGTAACTCG ]WV]ABAWW]]]]]P]P//GU]] XT:A:U CM:i:0 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:23 + r5612 151 ref 190342418 37 19M1I3M = 190342418 0 TCTAACTTAGCCTCATAATAGCT /<<!"0/4//7//00/BC0121/ XT:A:U CM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:1 MD:Z:22 + + +To select only alignments with indels, you need to determine the minimum quality you want the adjacent bases to have, as well as the number of adjacent bases to check. If you set the quality threshold to 47 and the number of bases to check to 2, you will get the following output:: + + r770 89 ref 116 37 17M1I5M = 72131356 0 CACACTGTGACAGACAGCGCAGC 00/02!!0//1200210AA44/1 XT:A:U CM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:1 MD:Z:22 + r4800 16 ref 241 255 15M1D8M * 0 0 CGTGGCCGGCGGGCCGAAGGCAT IIIIIIIIIICCCCIII?IIIII + r5612 151 ref 190342418 37 19M1I3M = 190342418 0 TCTAACTTAGCCTCATAATAGCT /<<!"0/4//7//00/BC0121/ XT:A:U CM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:1 MD:Z:22 + + +For more information on SAM, please consult the `SAM format description`__. + +.. __: http://www.ncbi.nlm.nih.gov/pubmed/19505943 + + + </help> +</tool>
--- /dev/null +++ b/tools/samtools/sam_indel_filter.py @@ -0,0 +1,83 @@ +#!/usr/bin/env python + +""" +Allows user to filter out non-indels from SAM. + +usage: %prog [options] + -i, --input=i: Input SAM file to be filtered + -q, --quality_threshold=q: Minimum quality value for adjacent bases + -a, --adjacent_bases=a: Number of adjacent bases on each size to check qualities + -o, --output=o: Filtered output SAM file +""" + +import re, sys +from galaxy import eggs +import pkg_resources; pkg_resources.require( "bx-python" ) +from bx.cookbook import doc_optparse + + +def stop_err( msg ): + sys.stderr.write( '%s\n' % msg ) + sys.exit() + +def __main__(): + #Parse Command Line + options, args = doc_optparse.parse( __doc__ ) + # prep output file + output = open( options.output, 'wb' ) + # patterns + pat_indel = re.compile( '(?P<before_match>(\d+[MIDNSHP])*)(?P<lmatch>\d+)M(?P<ins_del_width>\d+)(?P<ins_del>[ID])(?P<rmatch>\d+)M' ) + pat_matches = re.compile( '\d+[MIDNSHP]' ) + qual_thresh = int( options.quality_threshold ) + adj_bases = int( options.adjacent_bases ) + # go through all lines in input file + for line in open( options.input, 'rb' ): + if line and not line.startswith( '#' ) and not line.startswith( '@' ) : + split_line = line.split( '\t' ) + cigar = split_line[5] + # find all possible matches, like 3M2D7M and 7M3I10M in 3M2D7M3I10M + cigar_copy = cigar[:] + matches = [] + while len( cigar_copy ) >= 6: # nMnInM or nMnDnM + m = pat_indel.match( cigar_copy ) + if not m: + break + else: + parts = m.groupdict() + parts[ 'start' ] = m.start() + matches.append( parts ) + cigar_copy = cigar_copy[ len( parts[ 'lmatch' ] ) : ] + # see if matches meet filter requirements + if len( matches ) == 1: + start = int( matches[0][ 'start' ] ) + left = int( matches[0][ 'lmatch' ] ) + right = int( matches[0][ 'rmatch' ] ) + if matches[0][ 'ins_del' ] == 'D': + middle = int( matches[0][ 'ins_del_width' ] ) + else: + middle = 0 + # if there are enough adjacent bases to check, then do so + if left >= adj_bases and right >= adj_bases: + qual = split_line[10] + left_bases = qual[ start : start + left + 1 ][ -adj_bases : ] + right_bases = qual[ start + left + middle : start + left + middle + right + 1 ][ : adj_bases ] + qual_thresh_met = True + for l in left_bases: + if ord( l ) < qual_thresh: + qual_thresh_met = False + break + if qual_thresh_met: + for r in right_bases: + if ord( r ) < qual_thresh: + qual_thresh_met = False + break + # if filter reqs met, output line + if qual_thresh_met: + output.write( line ) + # error if there are multiple indels + elif len( matches ) > 1: + stop_err( 'There is more than one indel present in the alignment:\n%s' % line ) + # close out file + output.close() + +if __name__=="__main__": __main__()
--- /dev/null +++ b/test-data/sam_indel_filter_out2.sam @@ -0,0 +1,1 @@ +1378_69_800 16 chr11.nib:1-125234658 241 255 15M1D8M * 0 0 CGTGGCCGGCGGGCCGAAGGCAT IIIIIIIIIICCCCIII?IIIII
--- /dev/null +++ b/test-data/sam_indel_filter_in1.sam @@ -0,0 +1,15 @@ +1378_28_770 89 chr11.nib:1-134452384 72131356 37 17M1I5M = 72131356 0 CACACTGTGACAGACAGCGCAGC 00/02!!0//1200210AA44/1 XT:A:U CM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:1 MD:Z:22 +1378_28_770 181 chr11.nib:1-134452384 72131356 0 24M = 72131356 0 TTGGTGCGCGCGGTTGAGGGTTGG $$(#%%#$%#%####$%%##$### +1378_33_1945 113 chr2.nib:1-242951149 181247988 0 23M chr12.nib:1-132349534 41710908 0 GAGAGAGAGAGAGAGAGAGAGAG PQRVUMNXYRPUXYXWXSOSZ]M XT:A:R CM:i:0 SM:i:0 AM:i:0 X0:i:163148 XM:i:0 XO:i:0 XG:i:0 MD:Z:23 +1378_33_1945 177 chr12.nib:1-132349534 41710908 0 23M chr2.nib:1-242951149 181247988 0 AGAGAGAGAGAGAGAGAGAGAGA SQQWZYURVYWX]]YXTSY]]ZM XT:A:R CM:i:0 SM:i:0 AM:i:0 X0:i:163148 XM:i:0 XO:i:0 XG:i:0 MD:Z:23 +1378_35_263 115 chr16.nib:1-88827254 19671878 0 23M = 19671877 -1 AGAGAGAGAGAGAGAGAGAGTCT 77543:<55#"4!&=964518A> XT:A:R CM:i:2 SM:i:0 AM:i:0 X0:i:4 X1:i:137 XM:i:2 XO:i:0 XG:i:0 MD:Z:23 +1378_35_263 179 chr16.nib:1-88827254 19671877 0 23M = 19671878 1 GAGAGAGAGAGAGAGAGAGAGTC LE7402DD34FL:27AKE>;432 XT:A:R CM:i:0 SM:i:0 AM:i:0 X0:i:265 XM:i:0 XO:i:0 XG:i:0 MD:Z:23 +1378_51_1671 117 chr2.nib:1-242951149 190342418 0 24M = 190342418 0 CTGGCGTTCTCGGCGTGGATGGGT #####$$##$#%#%%###%$#$## +1378_51_1671 153 chr2.nib:1-242951149 190342418 37 16M1I6M = 190342418 0 TCTAACTTAGCCTCATAATAGCT /<<!"0///////00/!!0121/ XT:A:U CM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:1 MD:Z:22 +1378_56_324 117 chr2.nib:1-242951149 80324999 0 24M = 80324999 0 TCCAGTCGCGTTGTTAGGTTCGGA #$#$$$#####%##%%###**#+/ +1378_56_324 153 chr2.nib:1-242951149 80324999 37 8M1I14M = 80324999 0 TTTAGCCCGAAATGCCTAGAGCA 4;6//11!"11100110////00 XT:A:U CM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:1 MD:Z:22 +1378_67_1795 81 chr16.nib:1-88827254 26739130 0 23M chrY.nib:1-57772954 57401793 0 TGGCATTCCTGTAGGCAGAGAGG AZWWZS]!"QNXZ]VQ]]]/2]] XT:A:R CM:i:2 SM:i:0 AM:i:0 X0:i:3 X1:i:0 XM:i:2 XO:i:0 XG:i:0 MD:Z:23 +1378_67_1795 161 chrY.nib:1-57772954 57401793 37 23M chr16.nib:1-88827254 26739130 0 GATCACCCAGGTGATGTAACTCC ]WV]]]]WW]]]]]]]]]]PU]] XT:A:U CM:i:0 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:23 +1378_69_800 16 chr11.nib:1-125234658 241 255 15M1D8M * 0 0 CGTGGCCGGCGGGCCGAAGGCAT IIIIIIIIIICCCCIII?IIIII +1378_69_1777 170 chrX.nib:1-59090954 59090793 37 23M chr16.nib:1-88827254 26739130 0 TATCAATAAGGTGATGTAACTCG ]WV]ABAWW]]]]]P]P//GU]] XT:A:U CM:i:0 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:23 +1378_72_1612 151 chrY.nib:1-124295114 190342418 37 19M1I3M = 190342418 0 TCTAACTTAGCCTCATAATAGCT /<<!"0/4//7//00/BC0121/ XT:A:U CM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:1 MD:Z:22
galaxy-commits@lists.galaxyproject.org