galaxy-dist commit 1dc82a345db2: Updated indel analysis tools: fixed counting bugs in indel_analysis and improved help section; standardized CIGAR regex across tools; updated test files for several tools
# HG changeset patch -- Bitbucket.org # Project galaxy-dist # URL http://bitbucket.org/galaxy/galaxy-dist/overview # User Kelly Vincent <kpvincent@bx.psu.edu> # Date 1279743946 14400 # Node ID 1dc82a345db24860b31b73624030fa2644c18d38 # Parent 95491a8e9e1bca9267ec5830ac5df2a6d30ed915 Updated indel analysis tools: fixed counting bugs in indel_analysis and improved help section; standardized CIGAR regex across tools; updated test files for several tools --- a/test-data/indel_analysis_out1.interval +++ b/test-data/indel_analysis_out1.interval @@ -1,4 +1,7 @@ -ref 10 11 1 1 9.09 -ref 12 14 2 1 7.69 -ref 13 14 1 1 7.69 -ref 18 20 2 1 8.33 +chr1 11 13 1 100.00 +chr1 21 22 1 25.00 +chr1 21 23 1 25.00 +chrM 16 18 1 9.09 +chrM 19 20 1 8.33 +chrM 21 22 1 9.09 +chrM 22 23 1 9.09 --- a/tools/indels/indel_sam2interval.xml +++ b/tools/indels/indel_sam2interval.xml @@ -1,5 +1,5 @@ -<tool id="indel_sam2interval" name="Convert SAM to interval/BED" version="1.0.0"> - <description>for indels</description> +<tool id="indel_sam2interval" name="Extract indels" version="1.0.0"> + <description>from SAM</description><command interpreter="python"> indel_sam2interval.py --input=$input1 @@ -30,7 +30,7 @@ <when value="false" /></conditional><conditional name="del_out"> - <param name="include_del_out" type="select" label="Include insertions output bed file?"> + <param name="include_del_out" type="select" label="Include deletions output bed file?"><option value="true">Yes</option><option value="false">No</option></param> @@ -41,10 +41,10 @@ <outputs><data format="interval" name="output1" /><data format="bed" name="output2"> - <filter>ins_out[ "include_ins_out" ] = "true"</filter> + <filter>ins_out[ "include_ins_out" ] == "true"</filter></data><data format="bed" name="output3"> - <filter>del_out[ "include_del_out" ] = "true"</filter> + <filter>del_out[ "include_del_out" ] == "true"</filter></data></outputs><tests> @@ -69,48 +69,66 @@ Given a SAM file containing indels, conv **Example** -Suppose you have the following:: +Suppose you have the following mapping results:: - r770 89 ref 116 37 17M1I5M = 72131356 0 CACACTGTGACAGACAGCGCAGC 00/02!!0//1200210AA44/1 XT:A:U CM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:1 MD:Z:22 - r780 181 ref 4567 0 24M = 72131356 0 TTGGTGCGCGCGGTTGAGGGTTGG $$(#%%#$%#%####$%%##$### XT:A:U CM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:1 MD:Z:22 - r1231 69 * 0 0 * * 0 0 AGACCGGGCGGGGTGGCGTTCGGT %##+'#######%###$#$##$(# - r1563 133 * 0 0 * * 0 0 GTTCGTGGCCGGTGGGTGTTTGGG ###$$#$#$&#####$'$#$###$ - r1945 177 ref 71908 0 23M 190342418 181247988 0 AGAGAGAGAGAGAGAGAGAGAGA SQQWZYURVYWX]]YXTSY]]ZM XT:A:R CM:i:0 SM:i:0 AM:i:0 X0:i:163148 XM:i:0 XO:i:0 XG:i:0 MD:Z:23 - r3671 117 ref 31903418 0 24M = 190342418 0 CTGGCGTTCTCGGCGTGGATGGGT #####$$##$#%#%%###%$#$## XT:A:U CM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:1 MD:Z:22 - r3673 153 ref 48819768 37 16M1I6M = 190342418 0 TCTAACTTAGCCTCATAATAGCT /<<!"0///////00/!!0121/ XT:A:U CM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:1 MD:Z:22 - r3824 117 ref 80729921 0 24M = 80324999 0 TCCAGTCGCGTTGTTAGGTTCGGA #$#$$$#####%##%%###**#+/ XT:A:U CM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:1 MD:Z:22 - r3911 153 ref 87824718 37 8M1I14M = 80324999 0 TTTAGCCCGAAATGCCTAGAGCA 4;6//11!"11100110////00 XT:A:U CM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:1 MD:Z:22 - r4795 81 ref 126739130 0 23M 57401793 57401793 0 TGGCATTCCTGTAGGCAGAGAGG AZWWZS]!"QNXZ]VQ]]]/2]] XT:A:R CM:i:2 SM:i:0 AM:i:0 X0:i:3 X1:i:0 XM:i:2 XO:i:0 XG:i:0 MD:Z:23 - r4797 161 ref 57401793 37 23M 26739130 26739130 0 GATCACCCAGGTGATGTAACTCC ]WV]]]]WW]]]]]]]]]]PU]] XT:A:U CM:i:0 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:23 - r4800 16 ref 241 255 15M2D8M = 1550 0 CGTGGCCGGCGGGCCGAAGGCAT IIIIIIIIIICCCCIII?IIIII XT:A:U CM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:1 MD:Z:22 - r5377 170 ref 52090793 37 23M 26739130 26739130 0 TATCAATAAGGTGATGTAACTCG ]WV]ABAWW]]]]]P]P//GU]] XT:A:U CM:i:0 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:23 - r5612 151 ref 190341152 37 19M1D4M = 190342418 0 TCTAACTTAGCCTCATAATGCTAA /<<!"0/4/*/7//00/BC0121/ XT:A:U CM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:1 MD:Z:22 - r5623 151 ref 188841418 37 19M1I3M = 190342418 0 TCTAACTTAGCCTCATAATAGCT /<<!"0/4//7//00/BC0121/ XT:A:U CM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:1 MD:Z:22 - r7899 69 * 0 0 * * 0 0 CTGCGTGTTGGTGTCTACTGGGGT #%#'##$#$##&%#%$$$%#%#'# - r9192 133 * 0 0 * * 0 0 GTGCGTCGGGGAGGGTGCTGTCGG ######%#$%#$$###($###&&% + r327 16 chrM 11 37 8M1D10M * 0 0 CTTACCAGATAGTCATCA -+<2;?@BA@?-,.+4=4 XT:A:U NM:i:1 X0:i:1 X1:i:0 XM:i:0 XO:i:1 XG:i:1 MD:Z:41^C35 + r457 0 chr1 14 37 14M * 0 0 ACCTGACAGATATC =/DF;?@1A@?-,. XT:A:U NM:i:0 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:76 + r501 16 chrM 6 23 7M1I13M * 0 0 TCTGTGCCTACCAGACATTCA +=$2;?@BA@?-,.+4=4=4A XT:A:U NM:i:3 X0:i:1 X1:i:1 XM:i:2 XO:i:1 XG:i:1 MD:Z:28C36G9 XA:Z:chrM,+134263658,14M1I61M,4; + r1288 16 chrM 8 37 11M1I7M * 0 0 TCACTTACCTGTACACACA /*F2;?@%A@?-,.+4=4= XT:A:U NM:i:4 X0:i:1 X1:i:0 XM:i:3 XO:i:1 XG:i:1 MD:Z:2T0T1A69 + r1902 0 chr1 4 37 7M2D18M * 0 0 AGTCTCTTACCTGACGGTTATGA <2;?@BA@?-,.+4=4=4AA663 XT:A:U NM:i:3 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:2 MD:Z:17^CA58A0 + r2204 16 chrM 9 0 19M * 0 0 CTGGTACCTGACAGGTATC 2;?@BA@?-,.+4=4=4AA XT:A:R NM:i:1 X0:i:2 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:0T75 XA:Z:chrM,-564927,76M,1; + r2314 16 chrM 6 37 10M2D8M * 0 0 TCACTCTTACGTCTGA <2;?@BA@?-,.+4=4 XT:A:U NM:i:3 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:2 MD:Z:25A5^CA45 + r3001 0 chrM 13 37 3M1D5M2I7M * 0 0 TACAGTCACCCTCATCA <2;?@BA/(@?-,$& XT:A:U NM:i:3 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:2 MD:Z:17^CA58A0 + r3218 0 chr1 13 37 8M1D7M * 0 0 TACAGTCACTCATCA <2;?@BA/(@?-,$& XT:A:U NM:i:3 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:2 MD:Z:17^CA58A0 + r4767 16 chr2 3 37 15M2I7M * 0 0 CAGACTCTCTTACCAAAGACAGAC <2;?@BA/(@?-,.+4=4=4AA66 XT:A:U NM:i:4 X0:i:1 X1:i:0 XM:i:3 XO:i:1 XG:i:1 MD:Z:2T1A4T65 + r5333 0 chrM 5 37 17M1D8M * 0 0 GTCTCTCATACCAGACAACGGCAT FB3$@BA/(@?-,.+4=4=4AA66 XT:A:U NM:i:4 X0:i:1 X1:i:0 XM:i:3 XO:i:1 XG:i:1 MD:Z:45C10^C0C5C13 + r6690 16 chrM 7 23 20M * 0 0 CTCTCTTACCAGACAGACAT 2;?@BA/(@?-,.+4=4=4A XT:A:U NM:i:0 X0:i:1 X1:i:1 XM:i:0 XO:i:0 XG:i:0 MD:Z:76 XA:Z:chrM,-568532,76M,1; + r7211 0 chrM 7 37 24M * 0 0 CGACAGAGACAAAATAACATTTAA //<2;?@BA@?-,.+4=442;;6: XT:A:U NM:i:3 X0:i:1 X1:i:0 XM:i:2 XO:i:1 XG:i:1 MD:Z:73G0G0 + r7899 69 * 0 0 * * 0 0 CTGCGTGTTGGTGTCTACTGGGGT #%#'##$#$##&%#%$$$%#%#'# + r9192 133 * 0 0 * * 0 0 GTGCGTCGGGGAGGGTGCTGTCGG ######%#$%#$$###($###&&% + r9922 16 chrM 4 0 7M3I9M * 0 0 CCAGACATTTGAAATCAGG F/D4=44^D++26632;;6 XT:A:U NM:i:0 X0:i:1 X1:i:1 XM:i:0 XO:i:0 XG:i:0 MD:Z:76 + r9987 16 chrM 4 0 9M1I18M * 0 0 AGGTTCTCATTACCTGACACTCATCTTG G/AD6"/+4=4426632;;6:<2;?@BA XT:A:U NM:i:0 X0:i:1 X1:i:1 XM:i:0 XO:i:0 XG:i:0 MD:Z:76 + r10145 16 chr1 16 0 5M2D7M * 0 0 CACATTGTTGTA G//+4=44=4AA XT:A:U NM:i:0 X0:i:1 X1:i:1 XM:i:0 XO:i:0 XG:i:0 MD:Z:76 + r10324 16 chrM 15 0 6M1D5M * 0 0 CCGTTCTACTTG A@??8.G//+4= XT:A:U NM:i:0 X0:i:1 X1:i:1 XM:i:0 XO:i:0 XG:i:0 MD:Z:76 + r12331 16 chrM 17 0 4M2I6M * 0 0 AGTCGAATACGTG 632;;6:<2;?@B XT:A:U NM:i:0 X0:i:1 X1:i:1 XM:i:0 XO:i:0 XG:i:0 MD:Z:76 + r12914 16 chr2 24 0 4M3I3M * 0 0 ACTACCCCAA G//+4=42,. XT:A:U NM:i:0 X0:i:1 X1:i:1 XM:i:0 XO:i:0 XG:i:0 MD:Z:76 + r13452 16 chrM 13 0 3M1D11M * 0 0 TACGTCACTCATCA IIIABCCCICCCCI XT:A:U NM:i:0 X0:i:1 X1:i:1 XM:i:0 XO:i:0 XG:i:0 MD:Z:76 The following three files will be produced (Interval, Insertions BED and Deletions BED):: - ref 133 134 I C 1 - ref 256 258 D - 1 - ref 48819784 48819785 I A 1 - ref 87824726 87824727 I G 1 - ref 188841437 188841419 I T 1 - ref 190341171 190341172 D - 1 + chr1 11 13 D - 1 + chr1 21 22 D - 1 + chr1 21 23 D - 1 + chr2 18 19 I AA 1 + chr2 28 29 I CCC 1 + chrM 11 12 I TTT 1 + chrM 13 14 I C 1 + chrM 13 14 I T 1 + chrM 16 17 D - 1 + chrM 16 18 D - 1 + chrM 19 20 D - 1 + chrM 19 20 I T 1 + chrM 21 22 D - 1 + chrM 21 22 I GA 1 + chrM 22 23 D - 1 - ref 133 134 - ref 48819784 48819785 - ref 87824726 87824727 - ref 188841437 188841419 + chr2 18 19 + chr2 28 29 + chrM 11 12 + chrM 13 14 + chrM 13 14 + chrM 19 20 + chrM 21 22 - ref 256 258 - ref 190341171 190341172 - - - - - + chr1 11 13 + chr1 21 22 + chr1 21 23 + chrM 16 17 + chrM 16 18 + chrM 19 20 + chrM 21 22 + chrM 22 23 For more information on SAM, please consult the `SAM format description`__. --- a/tools/indels/indel_sam2interval.py +++ b/tools/indels/indel_sam2interval.py @@ -72,11 +72,11 @@ def __main__(): output_bed_del = None # the pattern to match, assuming just one indel per cigar string - pat_indel = re.compile( '(?P<lmatch>\d+)M(?P<ins_del_width>\d+)(?P<ins_del>[ID])(?P<rmatch>\d+)M' ) + pat_indel = re.compile( '^(?P<lmatch>\d+)M(?P<ins_del_width>\d+)(?P<ins_del>[ID])(?P<rmatch>\d+)M$' ) pat_multi = re.compile( '(\d+[MIDNSHP])(\d+[MIDNSHP])(\d+[MIDNSHP])+' ) # go through all lines in input file - out_data = [] + out_data = {} multi_indel_lines = 0 for line in open( options.input, 'rb' ): if line and not line.startswith( '#' ) and not line.startswith( '@' ) : @@ -84,14 +84,14 @@ def __main__(): if split_line < 12: continue # grab relevant pieces - cigar = split_line[5] + cigar = split_line[5].strip() pos = int( split_line[3] ) chr = split_line[2] base_string = split_line[9] # parse cigar string - m = pat_indel.search( cigar ) + m = pat_indel.match( cigar ) if not m: - m = pat_multi.search( cigar ) + m = pat_multi.match( cigar ) # skip this line if no match if not m: continue @@ -109,37 +109,43 @@ def __main__(): start = left + pos if middle_type == 'D': end = start + middle - d = [ chr, start, end, 'D' ] + data = [ chr, start, end, 'D' ] if options.include_base == "true": - d.append( '-' ) - out_data.append( tuple( d ) ) - if output_bed_del: - output_bed_del.write( '%s\t%s\t%s\n' % ( chr, start, end ) ) + data.append( '-' ) else: - end = start + 1#+ middle - d = [ chr, start, end, 'I' ] + end = start + 1 + data = [ chr, start, end, 'I' ] if options.include_base == "true": - d.append( bases ) - out_data.append( tuple( d ) ) - if output_bed_ins: - output_bed_ins.write( '%s\t%s\t%s\n' % ( chr, start, end ) ) + data.append( bases ) + location = '\t'.join( [ '%s' % d for d in data ] ) + try: + out_data[ location ] += 1 + except KeyError: + out_data[ location ] = 1 # output to interval file - if options.collapse == 'true': - out_dict = {} - # first collapse and get counts - for data in out_data: - location = ' '.join( [ '%s' % d for d in data ] ) - try: - out_dict[ location ].append( data ) - except KeyError: - out_dict[ location ] = [ data ] - locations = out_dict.keys() - locations.sort( numeric_sort ) - for loc in locations: - output.write( '%s\t%s\n' % ( '\t'.join( [ '%s' % d for d in out_dict[ loc ][0] ] ), len( out_dict[ loc ] ) ) ) - else: - for data in out_data: - output.write( '%s\n' % '\t'.join( [ '%s' % d for d in data ] ) ) + # get all locations and sort + locations = out_data.keys() + locations.sort( numeric_sort ) + last_line = '' + # output each location, either with counts or each occurrence + for loc in locations: + sp_loc = loc.split( '\t' ) + cur_line = '\t'.join( sp_loc[:3] ) + if options.collapse == 'true': + output.write( '%s\t%s\n' % ( loc, out_data[ loc ] ) ) + if output_bed_del and sp_loc[3] == 'D': + output_bed_del.write( '%s\n' % cur_line ) + if output_bed_ins and sp_loc[3] == 'I' and last_line != cur_line: + output_bed_ins.write( '%s\n' % cur_line ) + last_line = cur_line + else: + for i in range( out_data[ loc ] ): + output.write( '%s\n' % loc ) + if output_bed_del or output_bed_ins: + if output_bed_del and sp_loc[3] == 'D': + output_bed_del.write( '%s\n' % cur_line ) + if output_bed_ins and sp_loc[3] == 'I': + output_bed_ins.write( '%s\n' % cur_line ) # cleanup, close files if output_bed_ins: @@ -150,6 +156,6 @@ def __main__(): # if skipped lines because of more than one indel, output message if multi_indel_lines > 0: - sys.stdout.write( '%s alignments were skipped because they contained more than one indel or had unhandled operations (N/S/H/P).' % multi_indel_lines ) + sys.stdout.write( '%s alignments were skipped because they contained more than one indel.' % multi_indel_lines ) if __name__=="__main__": __main__() --- a/test-data/indel_sam2interval_in1.sam +++ b/test-data/indel_sam2interval_in1.sam @@ -1,17 +1,24 @@ -r770 89 ref 116 37 17M1I5M = 72131356 0 CACACTGTGACAGACAGCGCAGC 00/02!!0//1200210AA44/1 XT:A:U CM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:1 MD:Z:22 -r780 181 ref 4567 0 24M = 72131356 0 TTGGTGCGCGCGGTTGAGGGTTGG $$(#%%#$%#%####$%%##$### XT:A:U CM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:1 MD:Z:22 +r327 16 chrM 11 37 8M1D10M * 0 0 CTTACCAGATAGTCATCA -+<2;?@BA@?-,.+4=4 XT:A:U NM:i:1 X0:i:1 X1:i:0 XM:i:0 XO:i:1 XG:i:1 MD:Z:41^C35 +r457 0 chr1 14 37 14M * 0 0 ACCTGACAGATATC =/DF;?@1A@?-,. XT:A:U NM:i:0 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:76 +r501 16 chrM 6 23 7M1I13M * 0 0 TCTGTGCCTACCAGACATTCA +=$2;?@BA@?-,.+4=4=4A XT:A:U NM:i:3 X0:i:1 X1:i:1 XM:i:2 XO:i:1 XG:i:1 MD:Z:28C36G9 XA:Z:chrM,+134263658,14M1I61M,4; r1231 69 * 0 0 * * 0 0 AGACCGGGCGGGGTGGCGTTCGGT %##+'#######%###$#$##$(# +r1288 16 chrM 8 37 11M1I7M * 0 0 TCACTTACCTGTACACACA /*F2;?@%A@?-,.+4=4= XT:A:U NM:i:4 X0:i:1 X1:i:0 XM:i:3 XO:i:1 XG:i:1 MD:Z:2T0T1A69 r1563 133 * 0 0 * * 0 0 GTTCGTGGCCGGTGGGTGTTTGGG ###$$#$#$####$'$#$###$ -r1945 177 ref 71908 0 23M 190342418 181247988 0 AGAGAGAGAGAGAGAGAGAGAGA SQQWZYURVYWX]]YXTSY]]ZM XT:A:R CM:i:0 SM:i:0 AM:i:0 X0:i:163148 XM:i:0 XO:i:0 XG:i:0 MD:Z:23 -r3671 117 ref 31903418 0 24M = 190342418 0 CTGGCGTTCTCGGCGTGGATGGGT #####$$##$#%#%%###%$#$## -r3673 153 ref 48819768 37 16M1I6M = 190342418 0 TCTAACTTAGCCTCATAATAGCT /<<!"0///////00/!!0121/ XT:A:U CM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:1 MD:Z:22 -r3824 117 ref 80729921 0 24M = 80324999 0 TCCAGTCGCGTTGTTAGGTTCGGA #$#$$$#####%##%%###**#+/ -r3911 153 ref 87824718 37 8M1I14M = 80324999 0 TTTAGCCCGAAATGCCTAGAGCA 4;6//11!"11100110////00 XT:A:U CM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:1 MD:Z:22 -r4795 81 ref 126739130 0 23M 57401793 57401793 0 TGGCATTCCTGTAGGCAGAGAGG AZWWZS]!"QNXZ]VQ]]]/2]] XT:A:R CM:i:2 SM:i:0 AM:i:0 X0:i:3 X1:i:0 XM:i:2 XO:i:0 XG:i:0 MD:Z:23 -r4797 161 ref 57401793 37 23M 26739130 26739130 0 GATCACCCAGGTGATGTAACTCC ]WV]]]]WW]]]]]]]]]]PU]] XT:A:U CM:i:0 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:23 -r4800 16 ref 241 255 15M2D8M = 1550 0 CGTGGCCGGCGGGCCGAAGGCAT IIIIIIIIIICCCCIII?IIIII -r5377 170 ref 52090793 37 23M 26739130 26739130 0 TATCAATAAGGTGATGTAACTCG ]WV]ABAWW]]]]]P]P//GU]] XT:A:U CM:i:0 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:23 -r5612 151 ref 190341152 37 19M1D4M = 190342418 0 TCTAACTTAGCCTCATAATGCTAA /<<!"0/4/*/7//00/BC0121/ XT:A:U CM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:1 MD:Z:22 -r5623 151 ref 188841418 37 19M1I3M = 190342418 0 TCTAACTTAGCCTCATAATAGCT /<<!"0/4//7//00/BC0121/ XT:A:U CM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:1 MD:Z:22 +r1902 0 chr1 4 37 7M2D18M * 0 0 AGTCTCTTACCTGACGGTTATGA <2;?@BA@?-,.+4=4=4AA663 XT:A:U NM:i:3 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:2 MD:Z:17^CA58A0 +r2204 16 chrM 9 0 19M * 0 0 CTGGTACCTGACAGGTATC 2;?@BA@?-,.+4=4=4AA XT:A:R NM:i:1 X0:i:2 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:0T75 XA:Z:chrM,-564927,76M,1; +r2314 16 chrM 6 37 10M2D8M * 0 0 TCACTCTTACGTCTGA <2;?@BA@?-,.+4=4 XT:A:U NM:i:3 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:2 MD:Z:25A5^CA45 +r3001 0 chrM 13 37 3M1D5M2I7M * 0 0 TACAGTCACCCTCATCA <2;?@BA/(@?-,$& XT:A:U NM:i:3 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:2 MD:Z:17^CA58A0 +r3218 0 chr1 13 37 8M1D7M * 0 0 TACAGTCACTCATCA <2;?@BA/(@?-,$& XT:A:U NM:i:3 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:2 MD:Z:17^CA58A0 +r4767 16 chr2 3 37 15M2I7M * 0 0 CAGACTCTCTTACCAAAGACAGAC <2;?@BA/(@?-,.+4=4=4AA66 XT:A:U NM:i:4 X0:i:1 X1:i:0 XM:i:3 XO:i:1 XG:i:1 MD:Z:2T1A4T65 +r5333 0 chrM 5 37 17M1D8M * 0 0 GTCTCTCATACCAGACAACGGCAT FB3$@BA/(@?-,.+4=4=4AA66 XT:A:U NM:i:4 X0:i:1 X1:i:0 XM:i:3 XO:i:1 XG:i:1 MD:Z:45C10^C0C5C13 +r6690 16 chrM 7 23 20M * 0 0 CTCTCTTACCAGACAGACAT 2;?@BA/(@?-,.+4=4=4A XT:A:U NM:i:0 X0:i:1 X1:i:1 XM:i:0 XO:i:0 XG:i:0 MD:Z:76 XA:Z:chrM,-568532,76M,1; +r7211 0 chrM 7 37 24M * 0 0 CGACAGAGACAAAATAACATTTAA //<2;?@BA@?-,.+4=442;;6: XT:A:U NM:i:3 X0:i:1 X1:i:0 XM:i:2 XO:i:1 XG:i:1 MD:Z:73G0G0 r7899 69 * 0 0 * * 0 0 CTGCGTGTTGGTGTCTACTGGGGT #%#'##$#$##&%#%$$$%#%#'# r9192 133 * 0 0 * * 0 0 GTGCGTCGGGGAGGGTGCTGTCGG ######%#$%#$$###($###&&% +r9922 16 chrM 4 0 7M3I9M * 0 0 CCAGACATTTGAAATCAGG F/D4=44^D++26632;;6 XT:A:U NM:i:0 X0:i:1 X1:i:1 XM:i:0 XO:i:0 XG:i:0 MD:Z:76 +r9987 16 chrM 4 0 9M1I18M * 0 0 AGGTTCTCATTACCTGACACTCATCTTG G/AD6"/+4=4426632;;6:<2;?@BA XT:A:U NM:i:0 X0:i:1 X1:i:1 XM:i:0 XO:i:0 XG:i:0 MD:Z:76 +r10145 16 chr1 16 0 5M2D7M * 0 0 CACATTGTTGTA G//+4=44=4AA XT:A:U NM:i:0 X0:i:1 X1:i:1 XM:i:0 XO:i:0 XG:i:0 MD:Z:76 +r10324 16 chrM 15 0 6M1D5M * 0 0 CCGTTCTACTTG A@??8.G//+4= XT:A:U NM:i:0 X0:i:1 X1:i:1 XM:i:0 XO:i:0 XG:i:0 MD:Z:76 +r12331 16 chrM 17 0 4M2I6M * 0 0 AGTCGAATACGTG 632;;6:<2;?@B XT:A:U NM:i:0 X0:i:1 X1:i:1 XM:i:0 XO:i:0 XG:i:0 MD:Z:76 +r12914 16 chr2 24 0 4M3I3M * 0 0 ACTACCCCAA G//+4=42,. XT:A:U NM:i:0 X0:i:1 X1:i:1 XM:i:0 XO:i:0 XG:i:0 MD:Z:76 +r13452 16 chrM 13 0 3M2D11M * 0 0 TACTCACTCATCAG IIIABCCCICCCCI XT:A:U NM:i:0 X0:i:1 X1:i:1 XM:i:0 XO:i:0 XG:i:0 MD:Z:76 --- a/tools/indels/indel_analysis.xml +++ b/tools/indels/indel_analysis.xml @@ -24,7 +24,7 @@ </test><test><param name="input1" value="indel_analysis_in2.sam" ftype="sam"/> - <param name="threshold" value="0.08"/> + <param name="threshold" value="0.09"/><output name="out_del" file="indel_analysis_out3.interval" ftype="interval"/><output name="out_ins" file="indel_analysis_out4.interval" ftype="interval"/></test> @@ -35,75 +35,87 @@ Given an input sam file, this tool provides analysis of the indels. It filters out matches that do not meet the frequency threshold. The way this frequency of occurence is calculated is different for deletions and insertions. The CIGAR string's "M" can indicate an exact match or a mismatch. For SAM containing the following bits of information (assuming the reference "ACTGCTCGAT"):: - CHROM POS CIGAR SEQ - ref 3 2M1I3M TATCTC - ref 2 3M1D2M ATGTC - ref 4 2M2I3M GTTGAAG - ref 1 2M2D2M ACCT - ref 2 3M1I2M TCCATC - ref 7 4M CTAT - ref 5 5M CGTGA + CHROM POS CIGAR SEQ + ref 3 2M1I3M TACTTC + ref 1 2M1D3M ACGCT + ref 4 4M2I3M GTTCAAGAT + ref 2 2M2D3M CTCCG + ref 1 3M1D4M AACCTGG + ref 6 3M1I2M TTCAAT + ref 5 3M1I3M CTCTGTT + ref 7 4M CTAT + ref 5 5M CGCTA + ref 3 2M1D2M TGCC -The following totals would be calculated:: +The following totals would be calculated (this is an intermediate step and not output):: - Counts for chromosome "ref", where "-" indicates a deletion and - "+" an insertion - ---------------------------------------------------------------- - POS BASE NUMREADS DELPROPCALC DELPROP INSPROPCALC INSPROP - 1 A 1 1/1 1.00 -- -- - 2 A 1 1/2 0.50 -- -- - C 1 1/2 0.50 -- -- - 3 T 3 3/4 0.75 -- -- - -- 1 1/4 0.25 -- -- - 4 A 1 1/5 0.20 -- -- - G 2 2/5 0.40 -- -- - C 1 1/5 0.20 -- -- - - 1 1/5 0.20 -- -- - 5 C 4 4/5 0.80 -- -- - T 1 1/5 0.20 -- -- - +A 1 --- --- 1/6 0.17 - +T 1 --- --- 1/6 0.17 - 6 A 1 1/6 0.17 -- -- - C 1 1/6 0.17 -- -- - T 4 4/6 0.67 -- -- - +TG 1 --- --- 1/7 0.14 - 7 A 1 1/6 0.17 -- -- - C 3 3/6 0.50 -- -- - G 1 1/6 0.17 -- -- - T 1 1/6 0.17 -- -- - 8 G 2 2/3 0.67 -- -- - T 1 1/3 0.33 -- -- - 9 A 2 2/2 1.00 -- -- - 10 T 1 1/1 1.00 -- -- + ------------------------------------------------------------------------------------------------------- + POS BASE NUMREADS DELPROPCALC DELPROP INSPROPSTARTCALC INSSTARTPROP INSPROPENDCALC INSENDPROP + ------------------------------------------------------------------------------------------------------- + 1 A 2 2/2 1.00 --- --- --- --- + 2 A 1 1/3 0.33 --- --- --- --- + C 2 2/3 0.67 --- --- --- --- + 3 C 1 1/5 0.20 --- --- --- --- + T 3 3/5 0.60 --- --- --- --- + - 1 1/5 0.20 --- --- --- --- + 4 A 1 1/6 0.17 --- --- --- --- + G 3 3/6 0.50 --- --- --- --- + - 1 1/6 0.17 --- --- --- --- + -- 1 1/6 0.17 --- --- --- --- + 5 C 4 4/7 0.57 --- --- --- --- + T 2 2/7 0.29 --- --- --- --- + - 1 1/7 0.14 --- --- --- --- + +C 1 --- --- 1/7 0.14 1/9 0.11 + 6 C 2 2/9 0.22 --- --- --- --- + G 1 1/9 0.11 --- --- --- --- + T 6 6/9 0.67 --- --- --- --- + 7 C 7 7/9 0.78 --- --- --- --- + G 1 1/9 0.11 --- --- --- --- + T 1 1/9 0.11 --- --- --- --- + 8 C 1 1/7 0.14 --- --- --- --- + G 4 4/7 0.57 --- --- --- --- + T 2 2/7 0.29 --- --- --- --- + +T 1 --- --- 1/8 0.13 1/6 0.17 + +AA 1 --- --- 1/8 0.13 1/6 0.17 + 9 A 4 4/5 0.80 --- --- --- --- + T 1 1/5 0.20 --- --- --- --- + +A 1 --- --- 1/6 0.17 1/5 0.20 + 10 T 4 4/4 1.00 --- --- --- --- -Note that the way these are calculated may not be immediately clear. First, the basic total number of reads at a given position is the number of reads with a particular base plus the number of reads with that a deletion at that given position. Note that deletions of two bases and one base would be counted separately. Insertions are not counted in this total, which is used to calculate the proportion of occurrences of each individual base and deletion. For position 4 above, the reference base is G, and there are 2 occurrences of it along with one each of mismatching bases A and C. Also, there is on 1-base deletion. So there are a total of 5 matches/mismatches/deletions, and the proportions for each base are either 1/5 = 0.20 or 2/5 = 0.40, and for the deletion it is 1/5 = 0.20. Insertions are slightly more complicated. Each insertion is regarded individually, and the total number of occurrences of that insertion is divided by the sum of the number of its occurrences and the b asic total. So for position 5, there are a total of 5 matches/mismatches/deletions, and two insertions that each occur once, so each has a insertion has a proportion of 1/6 = 0.17. +The general idea for calculating these is that we want to find out the proportion of times a particular event occurred at a position among all reads that touch that base in some way. First, the basic total number of reads at a given position is the number of reads with each particular base plus the number of reads with that a deletion at that given position (including the bases that are "mismatches"). Note that deletions of two bases and one base would be counted completely separately. Insertions are not counted in this total. For position 4 above, the reference base is G, and there are 3 occurrences of it along with one mismatching base, A. Also, there is a 1-base deletion and another 2-base deletion. So there are a total of 5 matches/mismatches/deletions, and the proportions for each base are 1/6 = 0.17 (A) and 3/6 = 0.50 (G), and for each deletion it is 1/6 = 0.17. -The DELPROP or INSPROP needs to be greater than the threshold frequency specified by the user. +Insertions are slightly more complicated. We actually want to get the frequency of occurrence for both the associated start and end positions, since an insertion appears between those two bases. Each insertion is regarded individually, and the total number of occurrences of that insertion is divided by the sum of the number of its occurrences and the basic total for either the start or end. So for the insertions at position 8, there are a total of 7 matches/mismatches/deletions at position 8, and two insertions that each occur once, so each has an INSSTARTPROP of 1/8 = 0.13. For the end position there are 5 matches/mismatches/deletions, so the INSENDPROP is 1/6 = 0.17 for both insertions (T and AA). -The output varies for deletions and insertions, though for both, the first three columns are chromosome, start position, and end position. +These proportions (DELPROP and either INSSTARTPROP or INSENDPROP) need to be greater than the threshold frequency specified by the user in order for that base, deletion or insertion to be included in the output. + + +** Output format ** + +The output varies for deletions and insertions, although for both, the first three columns are chromosome, start position, and end position. Columns in the deletions file:: - Column Description - ---------------------------- ------------------------------------------------------------------------------------ - 1 Chrom Chromosome - 2 Start Starting position - 3 End Ending position - 4 Number of Deleted Base(s) The number of bases deleted at Start position - 5 Frequency Percentage Frequency of this exact deletion (2 and 1 are mutually exclusive), as percentage (%) + Column Description + ----------------------------- --------------------------------------------------------------------------------------------------- + 1 Chrom Chromosome + 2 Start Starting position + 3 End Ending position + 4 Coverage Number of reads containing this exact deletion + 5 Frequency Percentage Frequency of this exact deletion (2 and 1 are mutually exclusive, for instance), as percentage (%) Columns in the insertions file:: - Column Description - -------------------------- ------------------------------------------------------------------------------------------------------------- - 1 Chrom Chromosome - 2 Start Starting position - 3 End Ending position (always Start + 1 for insertions) - 4 Inserted Base(s) The exact base(s) inserted at Start position - 5 Freq. Perc. at Start Frequency of this exact insertion given Start position ("GG" and "G" are considered distinct), as percentage (%) - 6 Freq. Perc. at End Frequency of this exact insertion given End position ("GG" and "G" are considered distinct), as percentage (%) + Column Description + ------------------------ ----------------------------------------------------------------------------------------------------------------- + 1 Chrom Chromosome + 2 Start Starting position + 3 End Ending position (always Start + 1 for insertions) + 4 Inserted Base(s) The exact base(s) inserted at Start position + 5 Coverage Number of reads containing this exact insertion + 6 Freq. Perc. at Start Frequency of this exact insertion given Start position ("GG" and "G" are considered distinct), as percentage (%) + 7 Freq. Perc. at End Frequency of this exact insertion given End position ("GG" and "G" are considered distinct), as percentage (%) -Before using this tool, you probably will want to use the Filter SAM for indels tool to filter out indels on bases with insufficient quality scores, but this is not required. +Before using this tool, you may will want to use the Filter SAM for indels tool to filter out indels on bases with insufficient quality scores, but this is not required. ----- @@ -112,38 +124,43 @@ Before using this tool, you probably wil If you set the threshold to 0.0 and have the following SAM file:: - r770 89 ref 6 37 7M1I5M = 0 0 TGGATCTTCATAG !0//110AA44/1 XT:A:U CM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:1 MD:Z:22 - r1124 113 ref 4 0 23M = 0 0 CATCGTTCTGTTAGATCTACGTA PQRVUMNXYRPUXYXWXSOSZ]M XT:A:R CM:i:0 SM:i:0 AM:i:0 X0:i:163148 XM:i:0 XO:i:0 XG:i:0 MD:Z:23 - r1789 177 ref 6 0 17M = 0 0 TCGATCGCTTAGTTCTC SQQWZY]]YXTSY]]ZM XT:A:R CM:i:0 SM:i:0 AM:i:0 X0:i:163148 XM:i:0 XO:i:0 XG:i:0 MD:Z:23 - r3671 153 ref 10 37 6M1I6M = 0 0 TCTCTTTAGGTCT /<<!"0/////// XT:A:U CM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:1 MD:Z:22 - r3824 153 ref 5 37 8M1I7M = 0 0 ATTGATGTTCTTAGAT 4;6//11!"100110/ XT:A:U CM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:1 MD:Z:22 - r4800 16 ref 7 255 5M2D6M = 0 0 CGATCTTTGAT IIIIIIIIIIC XT:A:U CM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:1 MD:Z:22 - r5612 151 ref 5 37 8M1D9M = 0 0 ATCTATCTTTTGATCTC /<<!"0/4/*/7//B0/ XT:A:U CM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:1 MD:Z:22 - r5612 151 ref 11 37 3M1I10M = 0 0 CTCCTTAGCTCTCC /<<!"0/4//7//0 XT:A:U CM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:1 MD:Z:22 - r9145 115 ref 11 0 19M = 0 -1 CTCTTAGCTCTCCGAATTAG 7753:<5#"4!&=9518A> XT:A:R CM:i:2 SM:i:0 AM:i:0 X0:i:4 X1:i:137 XM:i:2 XO:i:0 XG:i:0 MD:Z:23 - r11770 89 ref 10 37 10M2I8M = 0 0 TCTCTTAGATGGCTCCGTAT 00/02!!0/120210AA4/1 XT:A:U CM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:1 MD:Z:22 - r13671 153 ref 1 37 12M1I12M = 0 0 TCGCATCGATCTCCGTAGATCTCCG /<""<!"0///////00/!!0121/ XT:A:U CM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:1 MD:Z:22 - r13824 153 ref 13 37 9M1I7M = 0 0 CATAGATCTACCGGATT 4;6//11!"11100110 XT:A:U CM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:1 MD:Z:22 - r24800 16 ref 3 255 15M2D9M = 0 0 GCATCTATCTGATAGCTCCGAATT IIIIIIIII45"CCCIII?IIIII XT:A:U CM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:1 MD:Z:22 - r25612 151 ref 1 37 9M1D5M = 0 0 TCGCATCGACTCTT 0/4/*/7//00/1C XT:A:U CM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:1 MD:Z:22 - r25612 151 ref 21 37 4M1I7M = 0 0 TGCGTTATTGGG <!"0/70/BC01 XT:A:U CM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:1 MD:Z:22 - r29962 16 ref 20 37 4M1I7M = 0 0 CTCCGGTATGAGG <!"0/70/7BC01 XT:A:U CM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:1 MD:Z:22 + r327 16 chrM 11 37 8M1D10M * 0 0 CTTACCAGATAGTCATCA -+<2;?@BA@?-,.+4=4 XT:A:U NM:i:1 X0:i:1 X1:i:0 XM:i:0 XO:i:1 XG:i:1 MD:Z:41^C35 + r457 0 chr1 14 37 14M * 0 0 ACCTGACAGATATC =/DF;?@1A@?-,. XT:A:U NM:i:0 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:76 + r501 16 chrM 6 23 7M1I13M * 0 0 TCTGTGCCTACCAGACATTCA +=$2;?@BA@?-,.+4=4=4A XT:A:U NM:i:3 X0:i:1 X1:i:1 XM:i:2 XO:i:1 XG:i:1 MD:Z:28C36G9 XA:Z:chrM,+134263658,14M1I61M,4; + r1288 16 chrM 8 37 11M1I7M * 0 0 TCACTTACCTGTACACACA /*F2;?@%A@?-,.+4=4= XT:A:U NM:i:4 X0:i:1 X1:i:0 XM:i:3 XO:i:1 XG:i:1 MD:Z:2T0T1A69 + r1902 0 chr1 4 37 7M2D18M * 0 0 AGTCTCTTACCTGACGGTTATGA <2;?@BA@?-,.+4=4=4AA663 XT:A:U NM:i:3 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:2 MD:Z:17^CA58A0 + r2204 16 chrM 9 0 19M * 0 0 CTGGTACCTGACAGGTATC 2;?@BA@?-,.+4=4=4AA XT:A:R NM:i:1 X0:i:2 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:0T75 XA:Z:chrM,-564927,76M,1; + r2314 16 chrM 6 37 10M2D8M * 0 0 TCACTCTTACGTCTGA <2;?@BA@?-,.+4=4 XT:A:U NM:i:3 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:2 MD:Z:25A5^CA45 + r3001 0 chrM 13 37 3M1D5M2I7M * 0 0 TACAGTCACCCTCATCA <2;?@BA/(@?-,$& XT:A:U NM:i:3 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:2 MD:Z:17^CA58A0 + r3218 0 chr1 13 37 8M1D7M * 0 0 TACAGTCACTCATCA <2;?@BA/(@?-,$& XT:A:U NM:i:3 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:2 MD:Z:17^CA58A0 + r4767 16 chr2 3 37 15M2I7M * 0 0 CAGACTCTCTTACCAAAGACAGAC <2;?@BA/(@?-,.+4=4=4AA66 XT:A:U NM:i:4 X0:i:1 X1:i:0 XM:i:3 XO:i:1 XG:i:1 MD:Z:2T1A4T65 + r5333 0 chrM 5 37 17M1D8M * 0 0 GTCTCTCATACCAGACAACGGCAT FB3$@BA/(@?-,.+4=4=4AA66 XT:A:U NM:i:4 X0:i:1 X1:i:0 XM:i:3 XO:i:1 XG:i:1 MD:Z:45C10^C0C5C13 + r6690 16 chrM 7 23 20M * 0 0 CTCTCTTACCAGACAGACAT 2;?@BA/(@?-,.+4=4=4A XT:A:U NM:i:0 X0:i:1 X1:i:1 XM:i:0 XO:i:0 XG:i:0 MD:Z:76 XA:Z:chrM,-568532,76M,1; + r7211 0 chrM 7 37 24M * 0 0 CGACAGAGACAAAATAACATTTAA //<2;?@BA@?-,.+4=442;;6: XT:A:U NM:i:3 X0:i:1 X1:i:0 XM:i:2 XO:i:1 XG:i:1 MD:Z:73G0G0 + r9922 16 chrM 4 0 7M3I9M * 0 0 CCAGACATTTGAAATCAGG F/D4=44^D++26632;;6 XT:A:U NM:i:0 X0:i:1 X1:i:1 XM:i:0 XO:i:0 XG:i:0 MD:Z:76 + r9987 16 chrM 4 0 9M1I18M * 0 0 AGGTTCTCATTACCTGACACTCATCTTG G/AD6"/+4=4426632;;6:<2;?@BA XT:A:U NM:i:0 X0:i:1 X1:i:1 XM:i:0 XO:i:0 XG:i:0 MD:Z:76 + r10145 16 chr1 16 0 5M2D7M * 0 0 CACATTGTTGTA G//+4=44=4AA XT:A:U NM:i:0 X0:i:1 X1:i:1 XM:i:0 XO:i:0 XG:i:0 MD:Z:76 + r10324 16 chrM 15 0 6M1D5M * 0 0 CCGTTCTACTTG A@??8.G//+4= XT:A:U NM:i:0 X0:i:1 X1:i:1 XM:i:0 XO:i:0 XG:i:0 MD:Z:76 + r12331 16 chrM 17 0 4M2I6M * 0 0 AGTCGAATACGTG 632;;6:<2;?@B XT:A:U NM:i:0 X0:i:1 X1:i:1 XM:i:0 XO:i:0 XG:i:0 MD:Z:76 + r12914 16 chr2 24 0 4M3I3M * 0 0 ACTACCCCAA G//+4=42,. XT:A:U NM:i:0 X0:i:1 X1:i:1 XM:i:0 XO:i:0 XG:i:0 MD:Z:76 The following will be produced (deletions file followed by insertions file):: - ref 10 11 1 1 9.09 - ref 12 14 2 1 7.69 - ref 13 14 1 1 7.69 - ref 18 20 2 1 8.33 + chr1 11 13 1 100.00 + chr1 21 22 1 25.00 + chr1 21 23 1 25.00 + chrM 16 18 1 9.09 + chrM 19 20 1 8.33 + chrM 21 22 1 9.09 + chrM 22 23 1 9.09 - ref 13 14 C 1 6.25 6.67 - ref 13 14 T 2 12.50 13.33 - ref 14 15 C 1 6.67 7.14 - ref 16 17 T 1 7.14 7.69 - ref 20 21 GG 1 8.33 8.33 - ref 22 23 A 1 8.33 11.11 - ref 24 25 G 1 11.11 12.50 - ref 25 26 T 1 12.50 14.29 + chr2 18 19 AA 1 50.00 50.00 + chr2 28 29 CCC 1 50.00 50.00 + chrM 11 12 TTT 1 9.09 9.09 + chrM 13 14 C 1 9.09 9.09 + chrM 13 14 T 1 9.09 9.09 + chrM 19 20 T 1 7.69 8.33 + chrM 21 22 GA 1 8.33 8.33 </help> --- a/test-data/indel_analysis_out3.interval +++ b/test-data/indel_analysis_out3.interval @@ -1,2 +1,6 @@ -ref 10 11 1 1 9.09 -ref 18 20 2 1 8.33 +chr1 11 13 1 100.00 +chr1 21 22 1 25.00 +chr1 21 23 1 25.00 +chrM 16 18 1 9.09 +chrM 21 22 1 9.09 +chrM 22 23 1 9.09 --- a/test-data/indel_analysis_out4.interval +++ b/test-data/indel_analysis_out4.interval @@ -1,5 +1,5 @@ -ref 13 14 T 2 13.33 14.29 -ref 20 21 GG 1 8.33 8.33 -ref 22 23 A 1 8.33 11.11 -ref 24 25 G 1 11.11 14.29 -ref 25 26 T 1 12.50 14.29 +chr2 18 19 AA 1 50.00 50.00 +chr2 28 29 CCC 1 50.00 50.00 +chrM 11 12 TTT 1 9.09 9.09 +chrM 13 14 C 1 9.09 9.09 +chrM 13 14 T 1 9.09 9.09 --- a/tools/indels/indel_analysis.py +++ b/tools/indels/indel_analysis.py @@ -10,7 +10,7 @@ usage: %prog [options] [input3 sum3[ inp -D, --out_del=D: The interval output file showing deletions """ -import re, sys +import re, sets, sys from galaxy import eggs import pkg_resources; pkg_resources.require( "bx-python" ) from bx.cookbook import doc_optparse @@ -20,18 +20,18 @@ def stop_err( msg ): sys.stderr.write( '%s\n' % msg ) sys.exit() -def add_to_ref_pos( ref_pos, pos, bases ): +def add_to_mis_matches( mis_matches, pos, bases ): """ - Adds the bases and counts to the ref_pos dict + Adds the bases and counts to the mis_matches dict """ for j, base in enumerate( bases ): try: - ref_pos[ pos + j ][ base ] += 1 + mis_matches[ pos + j ][ base ] += 1 except KeyError: try: - ref_pos[ pos + j ][ base ] = 1 + mis_matches[ pos + j ][ base ] = 1 except KeyError: - ref_pos[ pos + j ] = { base: 1 } + mis_matches[ pos + j ] = { base: 1 } def __main__(): #Parse Command Line @@ -39,17 +39,16 @@ def __main__(): # prep output files out_ins = open( options.out_ins, 'wb' ) out_del = open( options.out_del, 'wb' ) - # pattern - pat = re.compile( '(^(?P<lmatch>\d+)M(?P<ins_del_width>\d+)(?P<ins_del>[ID])(?P<rmatch>\d+)M$)|(^(?P<match_width>\d+)M$)' ) + # patterns + pat = re.compile( '^((?P<lmatch>\d+)M(?P<ins_del_width>\d+)(?P<ins_del>[ID])(?P<rmatch>\d+)M)$|((?P<match_width>\d+)M)$' ) pat_multi = re.compile( '(\d+[MIDNSHP])(\d+[MIDNSHP])(\d+[MIDNSHP])+' ) # for tracking occurences at each pos of ref - ref_pos = {} + mis_matches = {} indels = {} - num_reads = {} multi_indel_lines = 0 # go through all lines in input file for i,line in enumerate( open( options.input, 'rb' ) ): - if line and not line.startswith( '#' ) and not line.startswith( '@' ) : + if line.strip() and not line.startswith( '#' ) and not line.startswith( '@' ) : split_line = line.split( '\t' ) chrom = split_line[2].strip() pos = int( split_line[3].strip() ) @@ -59,11 +58,11 @@ def __main__(): if chrom == '*': continue # find matches like 3M2D7M or 7M3I10M - matches = '' - m = pat.search( cigar ) + match = {} + m = pat.match( cigar ) # unprocessable CIGAR if not m: - m = pat_multi.search( cigar ) + m = pat_multi.match( cigar ) # skip this line if no match if not m: continue @@ -72,11 +71,9 @@ def __main__(): multi_indel_lines += 1 # get matching parts for the indel or full match if matching else: - if not ref_pos.has_key( chrom ): - ref_pos[ chrom ] = {} + if not mis_matches.has_key( chrom ): + mis_matches[ chrom ] = {} indels[ chrom ] = { 'D': {}, 'I': {} } - if not num_reads.has_key( chrom ): - num_reads[ chrom ] = {} parts = m.groupdict() if parts[ 'match_width' ] or ( parts[ 'lmatch' ] and parts[ 'ins_del_width' ] and parts[ 'rmatch' ] ): match = parts @@ -84,12 +81,7 @@ def __main__(): if match: # match/mismatch if parts[ 'match_width' ]: - add_to_ref_pos( ref_pos[ chrom ], pos, bases ) - for i, base in enumerate( bases ): - try: - num_reads[ chrom ][ i + pos ] += 1 - except KeyError: - num_reads[ chrom ][ i + pos ] = 1 + add_to_mis_matches( mis_matches[ chrom ], pos, bases ) # indel else: # pieces of CIGAR string @@ -104,86 +96,81 @@ def __main__(): right_bases = bases[ -right : ] start = pos + left # add data to ref_pos dict for match/mismatch bases on left and on right - add_to_ref_pos( ref_pos[ chrom ], pos, left_bases ) - for i, base in enumerate( left_bases ): - try: - num_reads[ chrom ][ i + pos ] += 1 - except KeyError: - num_reads[ chrom ][ i + pos ] = 1 + add_to_mis_matches( mis_matches[ chrom ], pos, left_bases ) if match[ 'ins_del' ] == 'I': - add_to_ref_pos( ref_pos[ chrom ], start, right_bases ) - indel_pos = start + add_to_mis_matches( mis_matches[ chrom ], start, right_bases ) else: - add_to_ref_pos( ref_pos[ chrom ], start + middle, right_bases ) - indel_pos = start + middle - for i, base in enumerate( right_bases ): - try: - num_reads[ chrom ][ i + indel_pos ] += 1 - except KeyError: - num_reads[ chrom ][ i + indel_pos ] = 1 + add_to_mis_matches( mis_matches[ chrom ], start + middle, right_bases ) # for insertions, count instances of particular inserted bases if match[ 'ins_del' ] == 'I': if indels[ chrom ][ 'I' ].has_key( start ): - if indels[ chrom ][ 'I' ][ start ].has_key( middle_bases ): + try: indels[ chrom ][ 'I' ][ start ][ middle_bases ] += 1 - else: + except KeyError: indels[ chrom ][ 'I' ][ start ][ middle_bases ] = 1 else: indels[ chrom ][ 'I' ][ start ] = { middle_bases: 1 } # for deletions, count number of deletions bases else: if indels[ chrom ][ 'D' ].has_key( start ): - if indels[ chrom ][ 'D' ][ start ].has_key( middle ): + try: indels[ chrom ][ 'D' ][ start ][ middle ] += 1 - else: + except KeyError: indels[ chrom ][ 'D' ][ start ][ middle ] = 1 else: indels[ chrom ][ 'D' ][ start ] = { middle: 1 } # compute deletion frequencies and insertion frequencies for checking against threshold freqs = {} ins_freqs = {} - chroms = ref_pos.keys() + chroms = mis_matches.keys() chroms.sort() for chrom in chroms: freqs[ chrom ] = {} ins_freqs[ chrom ] = {} - poses = num_reads[ chrom ].keys() - poses.sort() + poses = mis_matches[ chrom ].keys() + poses.extend( indels[ chrom ][ 'D' ].keys() ) + poses.extend( indels[ chrom ][ 'I' ].keys() ) + poses = list( sets.Set( poses ) ) for pos in poses: # all reads touching this particular position freqs[ chrom ][ pos ] = {} sum_counts = 0.0 sum_counts_end = 0.0 # get basic counts (match/mismatch) - if num_reads[ chrom ].has_key( pos ): - sum_counts += float( num_reads[ chrom ][ pos ] ) - try: - sum_counts_end += float( num_reads[ chrom ][ pos + 1 ] ) - except KeyError: - pass + try: + sum_counts += float( sum( mis_matches[ chrom ][ pos ].values() ) ) + except KeyError: + pass + try: + sum_counts_end += float( sum( mis_matches[ chrom ][ pos + 1 ].values() ) ) + except KeyError: + pass # add deletions also touching this position try: sum_counts += float( sum( indels[ chrom ][ 'D' ][ pos ].values() ) ) - try: - sum_counts_end += float( sum( indels[ chrom ][ 'D' ][ pos + 1 ].values() ) ) - except KeyError: - pass - for d in indels[ chrom ][ 'D' ][ pos ].keys(): - freqs[ chrom ][ pos ][ '-' * d ] = indels[ chrom ][ 'D' ][ pos ][ d ] / sum_counts except KeyError: pass + try: + sum_counts_end += float( sum( indels[ chrom ][ 'D' ][ pos + 1 ].values() ) ) + except KeyError: + pass + freqs[ chrom ][ pos ][ 'total' ] = sum_counts # calculate actual frequencies # deletions - freqs[ chrom ][ pos ][ 'total' ] = sum_counts - for base in ref_pos[ chrom ][ pos ].keys(): - try: - prop = float( ref_pos[ chrom ][ pos ][ base ] ) / sum_counts - freqs[ chrom ][ pos ][ base ] = prop - except ZeroDivisionError: - freqs[ chrom ][ pos ][ base ] = 0.0 + # frequencies for deletions try: for d in indels[ chrom ][ 'D' ][ pos ].keys(): - freqs[ chrom ][ pos ][ '-' * d ] = indels[ chrom ][ 'D' ][ pos ][ d ] / sum_counts + freqs[ chrom ][ pos ][ d ] = indels[ chrom ][ 'D' ][ pos ][ d ] / sum_counts + except KeyError: + pass + # frequencies for matches/mismatches + try: + for base in mis_matches[ chrom ][ pos ].keys(): + try: + prop = float( mis_matches[ chrom ][ pos ][ base ] ) / sum_counts + freqs[ chrom ][ pos ][ base ] = prop + except ZeroDivisionError: + freqs[ chrom ][ pos ][ base ] = 0.0 except KeyError: pass # insertions @@ -191,7 +178,7 @@ def __main__(): for bases in indels[ chrom ][ 'I' ][ pos ].keys(): prop_start = indels[ chrom ][ 'I' ][ pos ][ bases ] / ( indels[ chrom ][ 'I' ][ pos ][ bases ] + sum_counts ) try: - prop_end = indels[ chrom ][ 'I' ][ pos ][ bases ] / sum_counts_end + prop_end = indels[ chrom ][ 'I' ][ pos ][ bases ] / ( indels[ chrom ][ 'I' ][ pos ][ bases ] + sum_counts_end ) except ZeroDivisionError: prop_end = 0.0 try: @@ -204,7 +191,7 @@ def __main__(): threshold = float( options.threshold ) #out_del.write( '#Chrom\tStart\tEnd\t#Del\t#Reads\t%TotReads\n' ) #out_ins.write( '#Chrom\tStart\tEnd\tInsBases\t#Reads\t%TotReadsAtStart\t%ReadsAtEnd\n' ) - for chrom in ref_pos.keys(): + for chrom in chroms: # deletions file poses = indels[ chrom ][ 'D' ].keys() poses.sort() @@ -214,9 +201,9 @@ def __main__(): dels.sort() for d in dels: end = start + d - prop = freqs[ chrom ][ start ][ '-' * d ] + prop = freqs[ chrom ][ start ][ d ] if prop > threshold : - out_del.write( '%s\t%s\t%s\t%s\t%s\t%.2f\n' % ( chrom, start, end, d, indels[ chrom ][ 'D' ][ pos ][ d ], 100.0 * prop ) ) + out_del.write( '%s\t%s\t%s\t%s\t%.2f\n' % ( chrom, start, end, indels[ chrom ][ 'D' ][ pos ][ d ], 100.0 * prop ) ) # insertions file poses = indels[ chrom ][ 'I' ].keys() poses.sort() @@ -235,6 +222,6 @@ def __main__(): out_ins.close() # if skipped lines because of more than one indel, output message if multi_indel_lines > 0: - sys.stdout.write( '%s alignments were skipped because they contained more than one indel or had unhandled operations (N/S/H/P).' % multi_indel_lines ) + sys.stdout.write( '%s alignments were skipped because they contained more than one indel.' % multi_indel_lines ) if __name__=="__main__": __main__() --- a/test-data/indel_sam2interval_out1.interval +++ b/test-data/indel_sam2interval_out1.interval @@ -1,6 +1,14 @@ -ref 133 134 I C 1 -ref 256 258 D - 1 -ref 48819784 48819785 I A 1 -ref 87824726 87824727 I G 1 -ref 188841437 188841438 I A 1 -ref 190341171 190341172 D - 1 +chr1 11 13 D - 1 +chr1 21 22 D - 1 +chr1 21 23 D - 1 +chr2 18 19 I AA 1 +chr2 28 29 I CCC 1 +chrM 11 12 I TTT 1 +chrM 13 14 I C 1 +chrM 13 14 I T 1 +chrM 16 18 D - 2 +chrM 19 20 D - 1 +chrM 19 20 I T 1 +chrM 21 22 D - 1 +chrM 21 22 I GA 1 +chrM 22 23 D - 1 --- a/test-data/indel_analysis_in1.sam +++ b/test-data/indel_analysis_in1.sam @@ -1,22 +1,19 @@ -r770 89 ref 6 37 7M1I5M = 0 0 TCGATCTTCATAG !0//110AA44/1 XT:A:U CM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:1 MD:Z:22 -r1124 113 ref 4 0 23M = 0 0 CATCGTTCTGTTAGATCTACGTA PQRVUMNXYRPUXYXWXSOSZ]M XT:A:R CM:i:0 SM:i:0 AM:i:0 X0:i:163148 XM:i:0 XO:i:0 XG:i:0 MD:Z:23 -r1231 69 * 0 0 * * 0 0 AGACCGGGCGGGGTGGCGTTCGGT %##+'#######%###$#$##$(# -r1563 133 * 0 0 * * 0 0 GTTCGTGGCCGGTGGGTGTTTGGG ###$$#$#$####$'$#$###$ -r1789 177 ref 6 0 17M = 0 0 TCGATCGCTTAGTTCTC SQQWZY]]YXTSY]]ZM XT:A:R CM:i:0 SM:i:0 AM:i:0 X0:i:163148 XM:i:0 XO:i:0 XG:i:0 MD:Z:23 -r3671 153 ref 10 37 6M1I6M = 0 0 TCTCTTTAGGTCT /<<!"0/////// XT:A:U CM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:1 MD:Z:22 -r3824 153 ref 5 37 8M1I7M = 0 0 ATCGATGTTCTTAGAT 4;6//11!"100110/ XT:A:U CM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:1 MD:Z:22 -r4800 16 ref 7 255 5M2D6M = 0 0 CGATCTTTGAT IIIIIIIIIIC XT:A:U CM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:1 MD:Z:22 -r5612 151 ref 5 37 8M1D9M = 0 0 ATCTATCTTTTGATCTC /<<!"0/4/*/7//B0/ XT:A:U CM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:1 MD:Z:22 -r5929 151 ref 11 37 3M1I10M = 0 0 CTCCTTAGCTCTCC /<<!"0/4//7//0 XT:A:U CM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:1 MD:Z:22 -r6743 69 * 0 0 * * 0 0 TGCCGTGTCTTGCTAACGCCGATT #'#$$#$###%%##$$$$###### -r9145 115 ref 11 0 19M = 0 -1 CTCTTAGCTCTCCGAATTAG 7753:<5#"4!&=9518A> XT:A:R CM:i:2 SM:i:0 AM:i:0 X0:i:4 X1:i:137 XM:i:2 XO:i:0 XG:i:0 MD:Z:23 -r11770 89 ref 10 37 10M2I8M = 0 0 TCTCTTAGATGGCTCCGTAT 00/02!!0/120210AA4/1 XT:A:U CM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:1 MD:Z:22 -r13671 153 ref 1 37 12M1I12M = 0 0 TCGCATCGATCTCCTTAGATCTCCG /<""<!"0///////00/!!0121/ XT:A:U CM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:1 MD:Z:22 -r13762 133 * 0 0 * * 0 0 TGGGTGGATGTGTTGTCGTTCATG #$#$###$#$#######$#$#### -r13824 153 ref 13 37 9M1I7M = 0 0 CATAGATCTACCGGATT 4;6//11!"11100110 XT:A:U CM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:1 MD:Z:22 -r24800 16 ref 3 255 15M2D9M = 0 0 GCATCTATCTGATAGCTCCGAATT IIIIIIIII45"CCCIII?IIIII XT:A:U CM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:1 MD:Z:22 -r25612 151 ref 1 37 9M1D5M = 0 0 TCGCATCGACTCTT 0/4/*/7//00/1C XT:A:U CM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:1 MD:Z:22 -r25786 151 ref 21 37 4M1I7M = 0 0 TGCGTTATTGGG <!"0/70/BC01 XT:A:U CM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:1 MD:Z:22 -r27899 69 * 0 0 * * 0 0 CTGCGTGTTGGTGTCTACTGGGGT #%#'##$#$##&%#%$$$%#%#'# -r29192 133 * 0 0 * * 0 0 GTGCGTCGGGGAGGGTGCTGTCGG ######%#$%#$$###($###&&% -r29962 16 ref 20 37 4M1I7M = 0 0 CTCCGGTATGAGG <!"0/70/7BC01 XT:A:U CM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:1 MD:Z:22 +r327 16 chrM 11 37 8M1D10M * 0 0 CTTACCAGATAGTCATCA -+<2;?@BA@?-,.+4=4 XT:A:U NM:i:1 X0:i:1 X1:i:0 XM:i:0 XO:i:1 XG:i:1 MD:Z:41^C35 +r457 0 chr1 14 37 14M * 0 0 ACCTGACAGATATC =/DF;?@1A@?-,. XT:A:U NM:i:0 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:76 +r501 16 chrM 6 23 7M1I13M * 0 0 TCTGTGCCTACCAGACATTCA +=$2;?@BA@?-,.+4=4=4A XT:A:U NM:i:3 X0:i:1 X1:i:1 XM:i:2 XO:i:1 XG:i:1 MD:Z:28C36G9 XA:Z:chr5,+134263658,14M1I61M,4; +r1288 16 chrM 8 37 11M1I7M * 0 0 TCACTTACCTGTACACACA /*F2;?@%A@?-,.+4=4= XT:A:U NM:i:4 X0:i:1 X1:i:0 XM:i:3 XO:i:1 XG:i:1 MD:Z:2T0T1A69 +r1902 0 chr1 4 37 7M2D18M * 0 0 AGTCTCTTACCTGACGGTTATGA <2;?@BA@?-,.+4=4=4AA663 XT:A:U NM:i:3 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:2 MD:Z:17^CA58A0 +r2204 16 chrM 9 0 19M * 0 0 CTGGTACCTGACAGGTATC 2;?@BA@?-,.+4=4=4AA XT:A:R NM:i:1 X0:i:2 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:0T75 XA:Z:chr1,-564927,76M,1; +r2314 16 chrM 6 37 10M2D8M * 0 0 TCACTCTTACGTCTGA <2;?@BA@?-,.+4=4 XT:A:U NM:i:3 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:2 MD:Z:25A5^CA45 +r3001 0 chrM 13 37 3M1D5M2I7M * 0 0 TACAGTCACCCTCATCA <2;?@BA/(@?-,$& XT:A:U NM:i:3 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:2 MD:Z:17^CA58A0 +r3218 0 chr1 13 37 8M1D7M * 0 0 TACAGTCACTCATCA <2;?@BA/(@?-,$& XT:A:U NM:i:3 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:2 MD:Z:17^CA58A0 +r4767 16 chr2 3 37 15M2I7M * 0 0 CAGACTCTCTTACCAAAGACAGAC <2;?@BA/(@?-,.+4=4=4AA66 XT:A:U NM:i:4 X0:i:1 X1:i:0 XM:i:3 XO:i:1 XG:i:1 MD:Z:2T1A4T65 +r5333 0 chrM 5 37 17M1D8M * 0 0 GTCTCTCATACCAGACAACGGCAT FB3$@BA/(@?-,.+4=4=4AA66 XT:A:U NM:i:4 X0:i:1 X1:i:0 XM:i:3 XO:i:1 XG:i:1 MD:Z:45C10^C0C5C13 +r6690 16 chrM 7 23 20M * 0 0 CTCTCTTACCAGACAGACAT 2;?@BA/(@?-,.+4=4=4A XT:A:U NM:i:0 X0:i:1 X1:i:1 XM:i:0 XO:i:0 XG:i:0 MD:Z:76 XA:Z:chr1,-568532,76M,1; +r7211 0 chrM 7 37 24M * 0 0 CGACAGAGACAAAATAACATTTAA //<2;?@BA@?-,.+4=442;;6: XT:A:U NM:i:3 X0:i:1 X1:i:0 XM:i:2 XO:i:1 XG:i:1 MD:Z:73G0G0 +r9922 16 chrM 4 0 7M3I9M * 0 0 CCAGACATTTGAAATCAGG F/D4=44^D++26632;;6 XT:A:U NM:i:0 X0:i:1 X1:i:1 XM:i:0 XO:i:0 XG:i:0 MD:Z:76 +r9987 16 chrM 4 0 9M1I18M * 0 0 AGGTTCTCATTACCTGACACTCATCTTG G/AD6"/+4=4426632;;6:<2;?@BA XT:A:U NM:i:0 X0:i:1 X1:i:1 XM:i:0 XO:i:0 XG:i:0 MD:Z:76 +r10145 16 chr1 16 0 5M2D7M * 0 0 CACATTGTTGTA G//+4=44=4AA XT:A:U NM:i:0 X0:i:1 X1:i:1 XM:i:0 XO:i:0 XG:i:0 MD:Z:76 +r10324 16 chrM 15 0 6M1D5M * 0 0 CCGTTCTACTTG A@??8.G//+4= XT:A:U NM:i:0 X0:i:1 X1:i:1 XM:i:0 XO:i:0 XG:i:0 MD:Z:76 +r12331 16 chrM 17 0 4M2I6M * 0 0 AGTCGAATACGTG 632;;6:<2;?@B XT:A:U NM:i:0 X0:i:1 X1:i:1 XM:i:0 XO:i:0 XG:i:0 MD:Z:76 +r12914 16 chr2 24 0 4M3I3M * 0 0 ACTACCCCAA G//+4=42,. XT:A:U NM:i:0 X0:i:1 X1:i:1 XM:i:0 XO:i:0 XG:i:0 MD:Z:76 --- a/tools/indels/sam_indel_filter.py +++ b/tools/indels/sam_indel_filter.py @@ -26,11 +26,11 @@ def __main__(): # prep output file output = open( options.output, 'wb' ) # patterns - pat_indel = re.compile( '(?P<before_match>(\d+[MNSHP])*)(?P<lmatch>\d+)M(?P<ins_del_width>\d+)(?P<ins_del>[ID])(?P<rmatch>\d+)M(?P<after_match>(\d+[MNSHP])*)' ) - pat_matches = re.compile( '(\d+[MIDNSHP])+' ) + pat = re.compile( '^(?P<lmatch>\d+)M(?P<ins_del_width>\d+)(?P<ins_del>[ID])(?P<rmatch>\d+)M$' ) + pat_multi = re.compile( '(\d+[MIDNSHP])(\d+[MIDNSHP])(\d+[MIDNSHP])+' ) try: - qual_thresh = int( options.quality_threshold ) + 33 - if qual_thresh < 33 or qual_thresh > 126: + qual_thresh = int( options.quality_threshold ) + if qual_thresh < 0 or qual_thresh > 93: raise ValueError except ValueError: stop_err( 'Your quality threshold should be an integer between 0 and 93, inclusive.' ) @@ -46,54 +46,39 @@ def __main__(): for i,line in enumerate(open( options.input, 'rb' )): if line and not line.startswith( '#' ) and not line.startswith( '@' ) : split_line = line.split( '\t' ) - cigar = split_line[5] - # find all possible matches, like 3M2D7M and 7M3I10M in 3M2D7M3I10M - cigar_copy = cigar[:] - matches = [] - while len( cigar_copy ) >= 6: # nMnInM or nMnDnM - m = pat_indel.search( cigar_copy ) + cigar = split_line[5].strip() + # find matches like 3M2D7M or 7M3I10M + match = {} + m = pat.match( cigar ) + # if unprocessable CIGAR + if not m: + m = pat_multi.match( cigar ) + # skip this line if no match if not m: - break + continue + # account for multiple indels or operations we don't process else: - parts = m.groupdict() - if parts[ 'lmatch' ] and parts[ 'ins_del_width' ] and parts[ 'rmatch' ]: - pre_left = 0 - if m.start() > 0: - pre_left_groups = pat_matches.search( cigar_copy[ : m.start() ] ) - if pre_left_groups: - for pl in pre_left_groups.groups(): - if pl.endswith( 'M' ) or pl.endswith( 'S' ) or pl.endswith( 'P' ): - pre_left += pl[:-1] - parts[ 'pre_left' ] = pre_left - matches.append( parts ) - cigar_copy = cigar_copy[ len( parts[ 'lmatch' ] ) + 1 : ] - # see if matches meet filter requirements - if len( matches ) > 1: - multi_indel_lines += 1 - elif len( matches ) == 1: - pre_left = int( matches[0][ 'pre_left' ] ) - left = int( matches[0][ 'lmatch' ] ) - right = int( matches[0][ 'rmatch' ] ) - if matches[0][ 'ins_del' ] == 'D': - middle = int( matches[0][ 'ins_del_width' ] ) + multi_indel_lines += 1 + # otherwise get matching parts + else: + match = m.groupdict() + # process for indels + if match: + left = int( match[ 'lmatch' ] ) + right = int( match[ 'rmatch' ] ) + if match[ 'ins_del' ] == 'I': + middle = int( match[ 'ins_del_width' ] ) else: middle = 0 # if there are enough adjacent bases to check, then do so if left >= adj_bases and right >= adj_bases: quals = split_line[10] - left_quals = quals[ pre_left : pre_left + left ][ -adj_bases : ] - middle_quals = quals[ pre_left + left : pre_left + left + middle ] - right_quals = quals[ pre_left + left + middle : pre_left + left + middle + right ][ : adj_bases ] + eligible_quals = quals[ left - adj_bases : left + middle + adj_bases ] qual_thresh_met = True - for l in left_quals: - if ord( l ) < qual_thresh: + for q in eligible_quals: + if ord( q ) - 33 < qual_thresh: qual_thresh_met = False break - if qual_thresh_met: - for r in right_quals: - if ord( r ) < qual_thresh: - qual_thresh_met = False - break # if filter reqs met, output line if qual_thresh_met: output.write( line ) --- a/tools/indels/sam_indel_filter.xml +++ b/tools/indels/sam_indel_filter.xml @@ -1,5 +1,5 @@ -<tool id="sam_indel_filter" name="Filter SAM" version="1.0.0"> - <description>for indels</description> +<tool id="sam_indel_filter" name="Filter Indels" version="1.0.0"> + <description>for SAM</description><command interpreter="python"> sam_indel_filter.py --input=$input1 --- a/test-data/indel_sam2interval_out3.bed +++ b/test-data/indel_sam2interval_out3.bed @@ -1,2 +1,7 @@ -ref 256 258 -ref 190341171 190341172 +chr1 11 13 +chr1 21 22 +chr1 21 23 +chrM 16 18 +chrM 19 20 +chrM 21 22 +chrM 22 23 --- a/test-data/indel_sam2interval_out2.bed +++ b/test-data/indel_sam2interval_out2.bed @@ -1,4 +1,6 @@ -ref 133 134 -ref 48819784 48819785 -ref 87824726 87824727 -ref 188841437 188841438 +chr2 18 19 +chr2 28 29 +chrM 11 12 +chrM 13 14 +chrM 19 20 +chrM 21 22 --- a/test-data/indel_analysis_out2.interval +++ b/test-data/indel_analysis_out2.interval @@ -1,8 +1,7 @@ -ref 13 14 C 1 7.14 7.14 -ref 13 14 T 2 13.33 14.29 -ref 14 15 C 1 6.67 7.14 -ref 16 17 T 1 7.14 7.69 -ref 20 21 GG 1 8.33 8.33 -ref 22 23 A 1 8.33 11.11 -ref 24 25 G 1 11.11 14.29 -ref 25 26 T 1 12.50 14.29 +chr2 18 19 AA 1 50.00 50.00 +chr2 28 29 CCC 1 50.00 50.00 +chrM 11 12 TTT 1 9.09 9.09 +chrM 13 14 C 1 9.09 9.09 +chrM 13 14 T 1 9.09 9.09 +chrM 19 20 T 1 7.69 8.33 +chrM 21 22 GA 1 8.33 8.33 --- a/test-data/indel_analysis_in2.sam +++ b/test-data/indel_analysis_in2.sam @@ -1,16 +1,24 @@ -r770 89 ref 6 37 7M1I5M = 0 0 TCGATCTTCATAG !0//110AA44/1 XT:A:U CM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:1 MD:Z:22 -r1124 113 ref 4 0 23M = 0 0 CATCGTTCTGTTAGATCTACGTA PQRVUMNXYRPUXYXWXSOSZ]M XT:A:R CM:i:0 SM:i:0 AM:i:0 X0:i:163148 XM:i:0 XO:i:0 XG:i:0 MD:Z:23 -r1789 177 ref 6 0 17M = 0 0 TCGATCGCTTAGTTCTC SQQWZY]]YXTSY]]ZM XT:A:R CM:i:0 SM:i:0 AM:i:0 X0:i:163148 XM:i:0 XO:i:0 XG:i:0 MD:Z:23 -r3671 153 ref 10 37 6M1I6M = 0 0 TCTCTTTAGGTCT /<<!"0/////// XT:A:U CM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:1 MD:Z:22 -r3824 153 ref 5 37 8M1I7M = 0 0 ATCGATGTTCTTAGAT 4;6//11!"100110/ XT:A:U CM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:1 MD:Z:22 -r4800 16 ref 7 255 5M2D6M = 0 0 CGATCTTTGAT IIIIIIIIIIC XT:A:U CM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:1 MD:Z:22 -r5612 151 ref 5 37 8M1D9M = 0 0 ATCTATCTTTTGATCTC /<<!"0/4/*/7//B0/ XT:A:U CM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:1 MD:Z:22 -r5929 151 ref 11 37 3M1I10M = 0 0 CTCCTTAGCTCTCC /<<!"0/4//7//0 XT:A:U CM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:1 MD:Z:22 -r9145 115 ref 11 0 19M = 0 -1 CTCTTAGCTCTCCGAATTAG 7753:<5#"4!&=9518A> XT:A:R CM:i:2 SM:i:0 AM:i:0 X0:i:4 X1:i:137 XM:i:2 XO:i:0 XG:i:0 MD:Z:23 -r11770 89 ref 10 37 10M2I8M = 0 0 TCTCTTAGATGGCTCCGTAT 00/02!!0/120210AA4/1 XT:A:U CM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:1 MD:Z:22 -r13671 153 ref 1 37 12M1I12M = 0 0 TCGCATCGATCTCCTTAGATCTCCG /<""<!"0///////00/!!0121/ XT:A:U CM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:1 MD:Z:22 -r13824 153 ref 13 37 9M1I7M = 0 0 CATAGATCTACCGGATT 4;6//11!"11100110 XT:A:U CM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:1 MD:Z:22 -r24800 16 ref 3 255 15M2D9M = 0 0 GCATCTATCTGATAGCTCCGAATT IIIIIIIII45"CCCIII?IIIII XT:A:U CM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:1 MD:Z:22 -r25612 151 ref 1 37 9M1D5M = 0 0 TCGCATCGACTCTT 0/4/*/7//00/1C XT:A:U CM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:1 MD:Z:22 -r25786 151 ref 21 37 4M1I7M = 0 0 TGCGTTATTGGG <!"0/70/BC01 XT:A:U CM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:1 MD:Z:22 -r29962 16 ref 20 37 4M1I7M = 0 0 CTCCGGTATGAGG <!"0/70/7BC01 XT:A:U CM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:1 MD:Z:22 +r327 16 chrM 11 37 8M1D10M * 0 0 CTTACCAGATAGTCATCA -+<2;?@BA@?-,.+4=4 XT:A:U NM:i:1 X0:i:1 X1:i:0 XM:i:0 XO:i:1 XG:i:1 MD:Z:41^C35 +r457 0 chr1 14 37 14M * 0 0 ACCTGACAGATATC =/DF;?@1A@?-,. XT:A:U NM:i:0 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:76 +r501 16 chrM 6 23 7M1I13M * 0 0 TCTGTGCCTACCAGACATTCA +=$2;?@BA@?-,.+4=4=4A XT:A:U NM:i:3 X0:i:1 X1:i:1 XM:i:2 XO:i:1 XG:i:1 MD:Z:28C36G9 XA:Z:chr5,+134263658,14M1I61M,4; +r764 4 * 0 0 * * 0 0 TTAAGGGGAACGTGTGGGCTATTTAGGCTTTATG BBB=?BBA?>;?=B=AA=;A@B>=;;>:A=?:?9 +r1288 16 chrM 8 37 11M1I7M * 0 0 TCACTTACCTGTACACACA /*F2;?@%A@?-,.+4=4= XT:A:U NM:i:4 X0:i:1 X1:i:0 XM:i:3 XO:i:1 XG:i:1 MD:Z:2T0T1A69 +r1902 0 chr1 4 37 7M2D18M * 0 0 AGTCTCTTACCTGACGGTTATGA <2;?@BA@?-,.+4=4=4AA663 XT:A:U NM:i:3 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:2 MD:Z:17^CA58A0 +r2204 16 chrM 9 0 19M * 0 0 CTGGTACCTGACAGGTATC 2;?@BA@?-,.+4=4=4AA XT:A:R NM:i:1 X0:i:2 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:0T75 XA:Z:chr1,-564927,76M,1; +r2314 16 chrM 6 37 10M2D6M * 0 0 TCACTCTTACGTCTGA <2;?@BA@?-,.+4=4 XT:A:U NM:i:3 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:2 MD:Z:25A5^CA45 +r3001 0 chrM 13 37 3M1D5M2I7M * 0 0 TACAGTCACCCTCATCA <2;?@BA/(@?-,$& XT:A:U NM:i:3 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:2 MD:Z:17^CA58A0 +r3218 0 chr1 13 37 8M1D7M * 0 0 TACAGTCACTCATCA <2;?@BA/(@?-,$& XT:A:U NM:i:3 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:2 MD:Z:17^CA58A0 +r3760 4 * 0 0 * * 0 0 CCTGTGATCCATCGTGATGTCTTATTTAAGG BABCBCBBBABBACBCABCAACBBCBBB=?B +r4767 16 chr2 3 37 15M2I7M * 0 0 CAGACTCTCTTACCAAAGACAGAC <2;?@BA/(@?-,.+4=4=4AA66 XT:A:U NM:i:4 X0:i:1 X1:i:0 XM:i:3 XO:i:1 XG:i:1 MD:Z:2T1A4T65 +r5333 0 chrM 5 37 17M1D8M * 0 0 GTCTCTCATACCAGACAACGGCAT FB3$@BA/(@?-,.+4=4=4AA66 XT:A:U NM:i:4 X0:i:1 X1:i:0 XM:i:3 XO:i:1 XG:i:1 MD:Z:45C10^C0C5C13 +r6121 4 * 0 0 * * 0 0 GTTAATAGGGTGATAGA AB/BC==CC%ACBC/CB +r6690 16 chrM 7 23 20M * 0 0 CTCTCTTACCAGACAGACAT 2;?@BA/(@?-,.+4=4=4A XT:A:U NM:i:0 X0:i:1 X1:i:1 XM:i:0 XO:i:0 XG:i:0 MD:Z:76 XA:Z:chr1,-568532,76M,1; +r7211 0 chrM 7 37 24M * 0 0 CGACAGAGACAAAATAACATTTAA //<2;?@BA@?-,.+4=442;;6: XT:A:U NM:i:3 X0:i:1 X1:i:0 XM:i:2 XO:i:1 XG:i:1 MD:Z:73G0G0 +r8122 4 * 0 0 * * 0 0 GACCTGTGATCCATCGTGATGT CBBAB/$3BB<AB/,CBCABCA +r9922 16 chrM 4 0 7M3I9M * 0 0 CCAGACATTTGAAATCAGG F/D4=44^D++26632;;6 XT:A:U NM:i:0 X0:i:1 X1:i:1 XM:i:0 XO:i:0 XG:i:0 MD:Z:76 +r9987 16 chrM 4 0 9M1I18M * 0 0 AGGTTCTCATTACCTGACACTCATCTTG G/AD6"/+4=4426632;;6:<2;?@BA XT:A:U NM:i:0 X0:i:1 X1:i:1 XM:i:0 XO:i:0 XG:i:0 MD:Z:76 +r10145 16 chr1 16 0 5M2D7M * 0 0 CACATTGTTGTA G//+4=44=4AA XT:A:U NM:i:0 X0:i:1 X1:i:1 XM:i:0 XO:i:0 XG:i:0 MD:Z:76 +r10324 16 chrM 15 0 6M1D5M * 0 0 CCGTTCTACTTG A@??8.G//+4= XT:A:U NM:i:0 X0:i:1 X1:i:1 XM:i:0 XO:i:0 XG:i:0 MD:Z:76 +r11324 4 * 0 0 * * 0 0 AATAGGGTGATAGACCTG //CCCC#ACB;;C3BA5C +r12331 16 chrM 17 0 4M2I6M * 0 0 AGTCGAATACGTG 632;;6:<2;?@B XT:A:U NM:i:0 X0:i:1 X1:i:1 XM:i:0 XO:i:0 XG:i:0 MD:Z:76 +r12914 16 chr2 24 0 4M3I3M * 0 0 ACTACCCCAA G//+4=42,. XT:A:U NM:i:0 X0:i:1 X1:i:1 XM:i:0 XO:i:0 XG:i:0 MD:Z:76
participants (1)
-
commits-noreply@bitbucket.org