1 new commit in galaxy-central: https://bitbucket.org/galaxy/galaxy-central/commits/e69a88d32259/ Changeset: e69a88d32259 User: natefoo Date: 2013-11-11 18:11:54 Summary: Remove broken indel analysis tools. Affected #: 10 files diff -r 21c98615987e8ea08187c4455a2978290f25657b -r e69a88d3225969567b011eb09331e1c3f8fe9bf2 tool_conf.xml.main --- a/tool_conf.xml.main +++ b/tool_conf.xml.main @@ -277,10 +277,6 @@ <tool file="gatk/variant_eval.xml" /><tool file="gatk/variant_combine.xml" /></section> - <section id="indel_analysis" name="NGS: Indel Analysis"> - <tool file="indels/sam_indel_filter.xml" /> - <tool file="indels/indel_analysis.xml" /> - </section><section id="peak_calling" name="NGS: Peak Calling"><tool file="peak_calling/macs_wrapper.xml" /><tool file="peak_calling/sicer_wrapper.xml" /> diff -r 21c98615987e8ea08187c4455a2978290f25657b -r e69a88d3225969567b011eb09331e1c3f8fe9bf2 tool_conf.xml.sample --- a/tool_conf.xml.sample +++ b/tool_conf.xml.sample @@ -285,11 +285,6 @@ <tool file="sr_mapping/srma_wrapper.xml" /><tool file="sr_mapping/mosaik.xml" /></section> - <section id="indel_analysis" name="NGS: Indel Analysis"> - <tool file="indels/sam_indel_filter.xml" /> - <tool file="indels/indel_table.xml" /> - <tool file="indels/indel_analysis.xml" /> - </section><section id="ngs-rna-tools" name="NGS: RNA Analysis"><label id="rna_seq" text="RNA-seq" /> diff -r 21c98615987e8ea08187c4455a2978290f25657b -r e69a88d3225969567b011eb09331e1c3f8fe9bf2 tools/indels/indel_analysis.py --- a/tools/indels/indel_analysis.py +++ /dev/null @@ -1,227 +0,0 @@ -#!/usr/bin/env python - -""" -Given an input sam file, provides analysis of the indels.. - -usage: %prog [options] [input3 sum3[ input4 sum4[ input5 sum5[...]]]] - -i, --input=i: The sam file to analyze - -t, --threshold=t: The deletion frequency threshold - -I, --out_ins=I: The interval output file showing insertions - -D, --out_del=D: The interval output file showing deletions -""" - -import re, sys -from galaxy import eggs -import pkg_resources; pkg_resources.require( "bx-python" ) -from bx.cookbook import doc_optparse - - -def stop_err( msg ): - sys.stderr.write( '%s\n' % msg ) - sys.exit() - -def add_to_mis_matches( mis_matches, pos, bases ): - """ - Adds the bases and counts to the mis_matches dict - """ - for j, base in enumerate( bases ): - try: - mis_matches[ pos + j ][ base ] += 1 - except KeyError: - try: - mis_matches[ pos + j ][ base ] = 1 - except KeyError: - mis_matches[ pos + j ] = { base: 1 } - -def __main__(): - #Parse Command Line - options, args = doc_optparse.parse( __doc__ ) - # prep output files - out_ins = open( options.out_ins, 'wb' ) - out_del = open( options.out_del, 'wb' ) - # patterns - pat = re.compile( '^((?P<lmatch>\d+)M(?P<ins_del_width>\d+)(?P<ins_del>[ID])(?P<rmatch>\d+)M)$|((?P<match_width>\d+)M)$' ) - pat_multi = re.compile( '(\d+[MIDNSHP])(\d+[MIDNSHP])(\d+[MIDNSHP])+' ) - # for tracking occurences at each pos of ref - mis_matches = {} - indels = {} - multi_indel_lines = 0 - # go through all lines in input file - for i,line in enumerate( open( options.input, 'rb' ) ): - if line.strip() and not line.startswith( '#' ) and not line.startswith( '@' ) : - split_line = line.split( '\t' ) - chrom = split_line[2].strip() - pos = int( split_line[3].strip() ) - cigar = split_line[5].strip() - bases = split_line[9].strip() - # if not an indel or match, exit - if chrom == '*': - continue - # find matches like 3M2D7M or 7M3I10M - match = {} - m = pat.match( cigar ) - # unprocessable CIGAR - if not m: - m = pat_multi.match( cigar ) - # skip this line if no match - if not m: - continue - # account for multiple indels or operations we don't process - else: - multi_indel_lines += 1 - # get matching parts for the indel or full match if matching - else: - if not mis_matches.has_key( chrom ): - mis_matches[ chrom ] = {} - indels[ chrom ] = { 'D': {}, 'I': {} } - parts = m.groupdict() - if parts[ 'match_width' ] or ( parts[ 'lmatch' ] and parts[ 'ins_del_width' ] and parts[ 'rmatch' ] ): - match = parts - # see if matches meet filter requirements - if match: - # match/mismatch - if parts[ 'match_width' ]: - add_to_mis_matches( mis_matches[ chrom ], pos, bases ) - # indel - else: - # pieces of CIGAR string - left = int( match[ 'lmatch' ] ) - middle = int( match[ 'ins_del_width' ] ) - right = int( match[ 'rmatch' ] ) - left_bases = bases[ : left ] - if match[ 'ins_del' ] == 'I': - middle_bases = bases[ left : left + middle ] - else: - middle_bases = '' - right_bases = bases[ -right : ] - start = pos + left - # add data to ref_pos dict for match/mismatch bases on left and on right - add_to_mis_matches( mis_matches[ chrom ], pos, left_bases ) - if match[ 'ins_del' ] == 'I': - add_to_mis_matches( mis_matches[ chrom ], start, right_bases ) - else: - add_to_mis_matches( mis_matches[ chrom ], start + middle, right_bases ) - # for insertions, count instances of particular inserted bases - if match[ 'ins_del' ] == 'I': - if indels[ chrom ][ 'I' ].has_key( start ): - try: - indels[ chrom ][ 'I' ][ start ][ middle_bases ] += 1 - except KeyError: - indels[ chrom ][ 'I' ][ start ][ middle_bases ] = 1 - else: - indels[ chrom ][ 'I' ][ start ] = { middle_bases: 1 } - # for deletions, count number of deletions bases - else: - if indels[ chrom ][ 'D' ].has_key( start ): - try: - indels[ chrom ][ 'D' ][ start ][ middle ] += 1 - except KeyError: - indels[ chrom ][ 'D' ][ start ][ middle ] = 1 - else: - indels[ chrom ][ 'D' ][ start ] = { middle: 1 } - # compute deletion frequencies and insertion frequencies for checking against threshold - freqs = {} - ins_freqs = {} - chroms = mis_matches.keys() - chroms.sort() - for chrom in chroms: - freqs[ chrom ] = {} - ins_freqs[ chrom ] = {} - poses = mis_matches[ chrom ].keys() - poses.extend( indels[ chrom ][ 'D' ].keys() ) - poses.extend( indels[ chrom ][ 'I' ].keys() ) - poses = list( set( poses ) ) - for pos in poses: - # all reads touching this particular position - freqs[ chrom ][ pos ] = {} - sum_counts = 0.0 - sum_counts_end = 0.0 - # get basic counts (match/mismatch) - try: - sum_counts += float( sum( mis_matches[ chrom ][ pos ].values() ) ) - except KeyError: - pass - try: - sum_counts_end += float( sum( mis_matches[ chrom ][ pos + 1 ].values() ) ) - except KeyError: - pass - # add deletions also touching this position - try: - sum_counts += float( sum( indels[ chrom ][ 'D' ][ pos ].values() ) ) - except KeyError: - pass - try: - sum_counts_end += float( sum( indels[ chrom ][ 'D' ][ pos + 1 ].values() ) ) - except KeyError: - pass - freqs[ chrom ][ pos ][ 'total' ] = sum_counts - # calculate actual frequencies - # deletions - # frequencies for deletions - try: - for d in indels[ chrom ][ 'D' ][ pos ].keys(): - freqs[ chrom ][ pos ][ d ] = indels[ chrom ][ 'D' ][ pos ][ d ] / sum_counts - except KeyError: - pass - # frequencies for matches/mismatches - try: - for base in mis_matches[ chrom ][ pos ].keys(): - try: - prop = float( mis_matches[ chrom ][ pos ][ base ] ) / sum_counts - freqs[ chrom ][ pos ][ base ] = prop - except ZeroDivisionError: - freqs[ chrom ][ pos ][ base ] = 0.0 - except KeyError: - pass - # insertions - try: - for bases in indels[ chrom ][ 'I' ][ pos ].keys(): - prop_start = indels[ chrom ][ 'I' ][ pos ][ bases ] / ( indels[ chrom ][ 'I' ][ pos ][ bases ] + sum_counts ) - try: - prop_end = indels[ chrom ][ 'I' ][ pos ][ bases ] / ( indels[ chrom ][ 'I' ][ pos ][ bases ] + sum_counts_end ) - except ZeroDivisionError: - prop_end = 0.0 - try: - ins_freqs[ chrom ][ pos ][ bases ] = [ prop_start, prop_end ] - except KeyError: - ins_freqs[ chrom ][ pos ] = { bases: [ prop_start, prop_end ] } - except KeyError, e: - pass - # output to files if meet threshold requirement - threshold = float( options.threshold ) - #out_del.write( '#Chrom\tStart\tEnd\t#Del\t#Reads\t%TotReads\n' ) - #out_ins.write( '#Chrom\tStart\tEnd\tInsBases\t#Reads\t%TotReadsAtStart\t%ReadsAtEnd\n' ) - for chrom in chroms: - # deletions file - poses = indels[ chrom ][ 'D' ].keys() - poses.sort() - for pos in poses: - start = pos - dels = indels[ chrom ][ 'D' ][ start ].keys() - dels.sort() - for d in dels: - end = start + d - prop = freqs[ chrom ][ start ][ d ] - if prop > threshold : - out_del.write( '%s\t%s\t%s\t%s\t%.2f\n' % ( chrom, start, end, indels[ chrom ][ 'D' ][ pos ][ d ], 100.0 * prop ) ) - # insertions file - poses = indels[ chrom ][ 'I' ].keys() - poses.sort() - for pos in poses: - start = pos - end = pos + 1 - ins_bases = indels[ chrom ][ 'I' ][ start ].keys() - ins_bases.sort() - for bases in ins_bases: - prop_start = ins_freqs[ chrom ][ start ][ bases ][0] - prop_end = ins_freqs[ chrom ][ start ][ bases ][1] - if prop_start > threshold or prop_end > threshold: - out_ins.write( '%s\t%s\t%s\t%s\t%s\t%.2f\t%.2f\n' % ( chrom, start, end, bases, indels[ chrom ][ 'I' ][ start ][ bases ], 100.0 * prop_start, 100.0 * prop_end ) ) - # close out files - out_del.close() - out_ins.close() - # if skipped lines because of more than one indel, output message - if multi_indel_lines > 0: - sys.stdout.write( '%s alignments were skipped because they contained more than one indel.' % multi_indel_lines ) - -if __name__=="__main__": __main__() diff -r 21c98615987e8ea08187c4455a2978290f25657b -r e69a88d3225969567b011eb09331e1c3f8fe9bf2 tools/indels/indel_analysis.xml --- a/tools/indels/indel_analysis.xml +++ /dev/null @@ -1,167 +0,0 @@ -<tool id="indel_analysis" name="Indel Analysis" version="1.0.0"> - <description></description> - <command interpreter="python"> - indel_analysis.py - --input=$input1 - --threshold=$threshold - --out_ins=$out_ins - --out_del=$out_del - </command> - <inputs> - <param format="sam" name="input1" type="data" label="Select sam file to analyze" /> - <param name="threshold" type="float" value="0.015" size="5" label="Frequency threshold" help="Cutoff" /> - </inputs> - <outputs> - <data format="interval" name="out_del" /> - <data format="interval" name="out_ins" /> - </outputs> - <tests> - <test> - <param name="input1" value="indel_analysis_in1.sam" ftype="sam"/> - <param name="threshold" value="0.017"/> - <output name="out_del" file="indel_analysis_out1.interval" ftype="interval"/> - <output name="out_ins" file="indel_analysis_out2.interval" ftype="interval"/> - </test> - <test> - <param name="input1" value="indel_analysis_in2.sam" ftype="sam"/> - <param name="threshold" value="0.09"/> - <output name="out_del" file="indel_analysis_out3.interval" ftype="interval"/> - <output name="out_ins" file="indel_analysis_out4.interval" ftype="interval"/> - </test> - </tests> - <help> - -**What it does** - -Given an input sam file, this tool provides analysis of the indels. It filters out matches that do not meet the frequency threshold. The way this frequency of occurence is calculated is different for deletions and insertions. The CIGAR string's "M" can indicate an exact match or a mismatch. For SAM containing the following bits of information (assuming the reference "ACTGCTCGAT"):: - - CHROM POS CIGAR SEQ - ref 3 2M1I3M TACTTC - ref 1 2M1D3M ACGCT - ref 4 4M2I3M GTTCAAGAT - ref 2 2M2D3M CTCCG - ref 1 3M1D4M AACCTGG - ref 6 3M1I2M TTCAAT - ref 5 3M1I3M CTCTGTT - ref 7 4M CTAT - ref 5 5M CGCTA - ref 3 2M1D2M TGCC - -The following totals would be calculated (this is an intermediate step and not output):: - - ------------------------------------------------------------------------------------------------------- - POS BASE NUMREADS DELPROPCALC DELPROP INSPROPSTARTCALC INSSTARTPROP INSPROPENDCALC INSENDPROP - ------------------------------------------------------------------------------------------------------- - 1 A 2 2/2 1.00 --- --- --- --- - 2 A 1 1/3 0.33 --- --- --- --- - C 2 2/3 0.67 --- --- --- --- - 3 C 1 1/5 0.20 --- --- --- --- - T 3 3/5 0.60 --- --- --- --- - - 1 1/5 0.20 --- --- --- --- - 4 A 1 1/6 0.17 --- --- --- --- - G 3 3/6 0.50 --- --- --- --- - - 1 1/6 0.17 --- --- --- --- - -- 1 1/6 0.17 --- --- --- --- - 5 C 4 4/7 0.57 --- --- --- --- - T 2 2/7 0.29 --- --- --- --- - - 1 1/7 0.14 --- --- --- --- - +C 1 --- --- 1/7 0.14 1/9 0.11 - 6 C 2 2/9 0.22 --- --- --- --- - G 1 1/9 0.11 --- --- --- --- - T 6 6/9 0.67 --- --- --- --- - 7 C 7 7/9 0.78 --- --- --- --- - G 1 1/9 0.11 --- --- --- --- - T 1 1/9 0.11 --- --- --- --- - 8 C 1 1/7 0.14 --- --- --- --- - G 4 4/7 0.57 --- --- --- --- - T 2 2/7 0.29 --- --- --- --- - +T 1 --- --- 1/8 0.13 1/6 0.17 - +AA 1 --- --- 1/8 0.13 1/6 0.17 - 9 A 4 4/5 0.80 --- --- --- --- - T 1 1/5 0.20 --- --- --- --- - +A 1 --- --- 1/6 0.17 1/5 0.20 - 10 T 4 4/4 1.00 --- --- --- --- - -The general idea for calculating these is that we want to find out the proportion of times a particular event occurred at a position among all reads that touch that base in some way. First, the basic total number of reads at a given position is the number of reads with each particular base plus the number of reads with that a deletion at that given position (including the bases that are "mismatches"). Note that deletions of two bases and one base would be counted completely separately. Insertions are not counted in this total. For position 4 above, the reference base is G, and there are 3 occurrences of it along with one mismatching base, A. Also, there is a 1-base deletion and another 2-base deletion. So there are a total of 5 matches/mismatches/deletions, and the proportions for each base are 1/6 = 0.17 (A) and 3/6 = 0.50 (G), and for each deletion it is 1/6 = 0.17. - -Insertions are slightly more complicated. We actually want to get the frequency of occurrence for both the associated start and end positions, since an insertion appears between those two bases. Each insertion is regarded individually, and the total number of occurrences of that insertion is divided by the sum of the number of its occurrences and the basic total for either the start or end. So for the insertions at position 8, there are a total of 7 matches/mismatches/deletions at position 8, and two insertions that each occur once, so each has an INSSTARTPROP of 1/8 = 0.13. For the end position there are 5 matches/mismatches/deletions, so the INSENDPROP is 1/6 = 0.17 for both insertions (T and AA). - -These proportions (DELPROP and either INSSTARTPROP or INSENDPROP) need to be greater than the threshold frequency specified by the user in order for that base, deletion or insertion to be included in the output. - - -** Output format ** - -The output varies for deletions and insertions, although for both, the first three columns are chromosome, start position, and end position. - -Columns in the deletions file:: - - Column Description - ----------------------------- --------------------------------------------------------------------------------------------------- - 1 Chrom Chromosome - 2 Start Starting position - 3 End Ending position - 4 Coverage Number of reads containing this exact deletion - 5 Frequency Percentage Frequency of this exact deletion (2 and 1 are mutually exclusive, for instance), as percentage (%) - -Columns in the insertions file:: - - Column Description - ------------------------ ----------------------------------------------------------------------------------------------------------------- - 1 Chrom Chromosome - 2 Start Starting position - 3 End Ending position (always Start + 1 for insertions) - 4 Inserted Base(s) The exact base(s) inserted at Start position - 5 Coverage Number of reads containing this exact insertion - 6 Freq. Perc. at Start Frequency of this exact insertion given Start position ("GG" and "G" are considered distinct), as percentage (%) - 7 Freq. Perc. at End Frequency of this exact insertion given End position ("GG" and "G" are considered distinct), as percentage (%) - -Before using this tool, you may will want to use the Filter SAM for indels tool to filter out indels on bases with insufficient quality scores, but this is not required. - - ------ - -**Example** - -If you set the threshold to 0.0 and have the following SAM file:: - - r327 16 chrM 11 37 8M1D10M * 0 0 CTTACCAGATAGTCATCA -+<2;?@BA@?-,.+4=4 XT:A:U NM:i:1 X0:i:1 X1:i:0 XM:i:0 XO:i:1 XG:i:1 MD:Z:41^C35 - r457 0 chr1 14 37 14M * 0 0 ACCTGACAGATATC =/DF;?@1A@?-,. XT:A:U NM:i:0 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:76 - r501 16 chrM 6 23 7M1I13M * 0 0 TCTGTGCCTACCAGACATTCA +=$2;?@BA@?-,.+4=4=4A XT:A:U NM:i:3 X0:i:1 X1:i:1 XM:i:2 XO:i:1 XG:i:1 MD:Z:28C36G9 XA:Z:chrM,+134263658,14M1I61M,4; - r1288 16 chrM 8 37 11M1I7M * 0 0 TCACTTACCTGTACACACA /*F2;?@%A@?-,.+4=4= XT:A:U NM:i:4 X0:i:1 X1:i:0 XM:i:3 XO:i:1 XG:i:1 MD:Z:2T0T1A69 - r1902 0 chr1 4 37 7M2D18M * 0 0 AGTCTCTTACCTGACGGTTATGA <2;?@BA@?-,.+4=4=4AA663 XT:A:U NM:i:3 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:2 MD:Z:17^CA58A0 - r2204 16 chrM 9 0 19M * 0 0 CTGGTACCTGACAGGTATC 2;?@BA@?-,.+4=4=4AA XT:A:R NM:i:1 X0:i:2 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:0T75 XA:Z:chrM,-564927,76M,1; - r2314 16 chrM 6 37 10M2D8M * 0 0 TCACTCTTACGTCTGA <2;?@BA@?-,.+4=4 XT:A:U NM:i:3 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:2 MD:Z:25A5^CA45 - r3001 0 chrM 13 37 3M1D5M2I7M * 0 0 TACAGTCACCCTCATCA <2;?@BA/(@?-,$& XT:A:U NM:i:3 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:2 MD:Z:17^CA58A0 - r3218 0 chr1 13 37 8M1D7M * 0 0 TACAGTCACTCATCA <2;?@BA/(@?-,$& XT:A:U NM:i:3 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:2 MD:Z:17^CA58A0 - r4767 16 chr2 3 37 15M2I7M * 0 0 CAGACTCTCTTACCAAAGACAGAC <2;?@BA/(@?-,.+4=4=4AA66 XT:A:U NM:i:4 X0:i:1 X1:i:0 XM:i:3 XO:i:1 XG:i:1 MD:Z:2T1A4T65 - r5333 0 chrM 5 37 17M1D8M * 0 0 GTCTCTCATACCAGACAACGGCAT FB3$@BA/(@?-,.+4=4=4AA66 XT:A:U NM:i:4 X0:i:1 X1:i:0 XM:i:3 XO:i:1 XG:i:1 MD:Z:45C10^C0C5C13 - r6690 16 chrM 7 23 20M * 0 0 CTCTCTTACCAGACAGACAT 2;?@BA/(@?-,.+4=4=4A XT:A:U NM:i:0 X0:i:1 X1:i:1 XM:i:0 XO:i:0 XG:i:0 MD:Z:76 XA:Z:chrM,-568532,76M,1; - r7211 0 chrM 7 37 24M * 0 0 CGACAGAGACAAAATAACATTTAA //<2;?@BA@?-,.+4=442;;6: XT:A:U NM:i:3 X0:i:1 X1:i:0 XM:i:2 XO:i:1 XG:i:1 MD:Z:73G0G0 - r9922 16 chrM 4 0 7M3I9M * 0 0 CCAGACATTTGAAATCAGG F/D4=44^D++26632;;6 XT:A:U NM:i:0 X0:i:1 X1:i:1 XM:i:0 XO:i:0 XG:i:0 MD:Z:76 - r9987 16 chrM 4 0 9M1I18M * 0 0 AGGTTCTCATTACCTGACACTCATCTTG G/AD6"/+4=4426632;;6:<2;?@BA XT:A:U NM:i:0 X0:i:1 X1:i:1 XM:i:0 XO:i:0 XG:i:0 MD:Z:76 - r10145 16 chr1 16 0 5M2D7M * 0 0 CACATTGTTGTA G//+4=44=4AA XT:A:U NM:i:0 X0:i:1 X1:i:1 XM:i:0 XO:i:0 XG:i:0 MD:Z:76 - r10324 16 chrM 15 0 6M1D5M * 0 0 CCGTTCTACTTG A@??8.G//+4= XT:A:U NM:i:0 X0:i:1 X1:i:1 XM:i:0 XO:i:0 XG:i:0 MD:Z:76 - r12331 16 chrM 17 0 4M2I6M * 0 0 AGTCGAATACGTG 632;;6:<2;?@B XT:A:U NM:i:0 X0:i:1 X1:i:1 XM:i:0 XO:i:0 XG:i:0 MD:Z:76 - r12914 16 chr2 24 0 4M3I3M * 0 0 ACTACCCCAA G//+4=42,. XT:A:U NM:i:0 X0:i:1 X1:i:1 XM:i:0 XO:i:0 XG:i:0 MD:Z:76 - -The following will be produced (deletions file followed by insertions file):: - - chr1 11 13 1 100.00 - chr1 21 22 1 25.00 - chr1 21 23 1 25.00 - chrM 16 18 1 9.09 - chrM 19 20 1 8.33 - chrM 21 22 1 9.09 - chrM 22 23 1 9.09 - - chr2 18 19 AA 1 50.00 50.00 - chr2 28 29 CCC 1 50.00 50.00 - chrM 11 12 TTT 1 9.09 9.09 - chrM 13 14 C 1 9.09 9.09 - chrM 13 14 T 1 9.09 9.09 - chrM 19 20 T 1 7.69 8.33 - chrM 21 22 GA 1 8.33 8.33 - - - </help> -</tool> diff -r 21c98615987e8ea08187c4455a2978290f25657b -r e69a88d3225969567b011eb09331e1c3f8fe9bf2 tools/indels/indel_sam2interval.py --- a/tools/indels/indel_sam2interval.py +++ /dev/null @@ -1,161 +0,0 @@ -#!/usr/bin/env python - -""" -Allows user to filter out non-indels from SAM. - -usage: %prog [options] - -i, --input=i: The input SAM file - -u, --include_base=u: Whether or not to include the base for insertions - -c, --collapse=c: Wheter to collapse multiple occurrences of a location with counts shown - -o, --int_out=o: The interval output file for the converted SAM file - -b, --bed_ins_out=b: The bed output file with insertions only for the converted SAM file - -d, --bed_del_out=d: The bed output file with deletions only for the converted SAM file -""" - -import re, sys -from galaxy import eggs -import pkg_resources; pkg_resources.require( "bx-python" ) -from bx.cookbook import doc_optparse - - -def stop_err( msg ): - sys.stderr.write( '%s\n' % msg ) - sys.exit() - -def numeric_sort( text1, text2 ): - """ - For two items containing space-separated text, compares equivalent pieces - numerically if both numeric or as text otherwise - """ - pieces1 = text1.split() - pieces2 = text2.split() - if len( pieces1 ) == 0: - return 1 - if len( pieces2 ) == 0: - return -1 - for i, pc1 in enumerate( pieces1 ): - if i == len( pieces2 ): - return 1 - if not pieces2[i].isdigit(): - if pc1.isdigit(): - return -1 - else: - if pc1 > pieces2[i]: - return 1 - elif pc1 < pieces2[i]: - return -1 - else: - if not pc1.isdigit(): - return 1 - else: - if int( pc1 ) > int( pieces2[i] ): - return 1 - elif int( pc1 ) < int( pieces2[i] ): - return -1 - if i < len( pieces2 ) - 1: - return -1 - return 0 - -def __main__(): - #Parse Command Line - options, args = doc_optparse.parse( __doc__ ) - - # open up output files - output = open( options.int_out, 'wb' ) - if options.bed_ins_out != 'None': - output_bed_ins = open( options.bed_ins_out, 'wb' ) - else: - output_bed_ins = None - if options.bed_del_out != 'None': - output_bed_del = open( options.bed_del_out, 'wb' ) - else: - output_bed_del = None - - # the pattern to match, assuming just one indel per cigar string - pat_indel = re.compile( '^(?P<lmatch>\d+)M(?P<ins_del_width>\d+)(?P<ins_del>[ID])(?P<rmatch>\d+)M$' ) - pat_multi = re.compile( '(\d+[MIDNSHP])(\d+[MIDNSHP])(\d+[MIDNSHP])+' ) - - # go through all lines in input file - out_data = {} - multi_indel_lines = 0 - for line in open( options.input, 'rb' ): - if line and not line.startswith( '#' ) and not line.startswith( '@' ) : - split_line = line.split( '\t' ) - if split_line < 12: - continue - # grab relevant pieces - cigar = split_line[5].strip() - pos = int( split_line[3] ) - chr = split_line[2] - base_string = split_line[9] - # parse cigar string - m = pat_indel.match( cigar ) - if not m: - m = pat_multi.match( cigar ) - # skip this line if no match - if not m: - continue - # account for multiple indels or operations we don't process - else: - multi_indel_lines += 1 - continue - else: - match = m.groupdict() - left = int( match[ 'lmatch' ] ) - middle = int( match[ 'ins_del_width' ] ) - middle_type = match[ 'ins_del' ] - bases = base_string[ left : left + middle ] - # calculate start and end positions, and output to insertion or deletion file - start = left + pos - if middle_type == 'D': - end = start + middle - data = [ chr, start, end, 'D' ] - if options.include_base == "true": - data.append( '-' ) - else: - end = start + 1 - data = [ chr, start, end, 'I' ] - if options.include_base == "true": - data.append( bases ) - location = '\t'.join( [ '%s' % d for d in data ] ) - try: - out_data[ location ] += 1 - except KeyError: - out_data[ location ] = 1 - # output to interval file - # get all locations and sort - locations = out_data.keys() - locations.sort( numeric_sort ) - last_line = '' - # output each location, either with counts or each occurrence - for loc in locations: - sp_loc = loc.split( '\t' ) - cur_line = '\t'.join( sp_loc[:3] ) - if options.collapse == 'true': - output.write( '%s\t%s\n' % ( loc, out_data[ loc ] ) ) - if output_bed_del and sp_loc[3] == 'D': - output_bed_del.write( '%s\n' % cur_line ) - if output_bed_ins and sp_loc[3] == 'I' and last_line != cur_line: - output_bed_ins.write( '%s\n' % cur_line ) - last_line = cur_line - else: - for i in range( out_data[ loc ] ): - output.write( '%s\n' % loc ) - if output_bed_del or output_bed_ins: - if output_bed_del and sp_loc[3] == 'D': - output_bed_del.write( '%s\n' % cur_line ) - if output_bed_ins and sp_loc[3] == 'I': - output_bed_ins.write( '%s\n' % cur_line ) - - # cleanup, close files - if output_bed_ins: - output_bed_ins.close() - if output_bed_del: - output_bed_del.close() - output.close() - - # if skipped lines because of more than one indel, output message - if multi_indel_lines > 0: - sys.stdout.write( '%s alignments were skipped because they contained more than one indel.' % multi_indel_lines ) - -if __name__=="__main__": __main__() diff -r 21c98615987e8ea08187c4455a2978290f25657b -r e69a88d3225969567b011eb09331e1c3f8fe9bf2 tools/indels/indel_sam2interval.xml --- a/tools/indels/indel_sam2interval.xml +++ /dev/null @@ -1,139 +0,0 @@ -<tool id="indel_sam2interval" name="Extract indels" version="1.0.0"> - <description>from SAM</description> - <command interpreter="python"> - indel_sam2interval.py - --input=$input1 - --include_base=$include_base - --collapse=$collapse - --int_out=$output1 - #if $ins_out.include_ins_out == "true" - --bed_ins_out=$output2 - #else - --bed_ins_out="None" - #end if - #if $del_out.include_del_out == "true" - --bed_del_out=$output3 - #else - --bed_del_out="None" - #end if - </command> - <inputs> - <param format="sam" name="input1" type="data" label="Select dataset to convert" /> - <param name="include_base" type="boolean" checked="true" truevalue="true" falsevalue="false" label="Include the relevant base(s) for each insertion (and a dash (-) for deletions)" /> - <param name="collapse" type="boolean" checked="true" truevalue="true" falsevalue="false" label="Collapse repeated locations onto single line with counts" /> - <conditional name="ins_out"> - <param name="include_ins_out" type="select" label="Include insertions output bed file?"> - <option value="true">Yes</option> - <option value="false">No</option> - </param> - <when value="true" /> - <when value="false" /> - </conditional> - <conditional name="del_out"> - <param name="include_del_out" type="select" label="Include deletions output bed file?"> - <option value="true">Yes</option> - <option value="false">No</option> - </param> - <when value="true" /> - <when value="false" /> - </conditional> - </inputs> - <outputs> - <data format="interval" name="output1" /> - <data format="bed" name="output2"> - <filter>ins_out[ "include_ins_out" ] == "true"</filter> - </data> - <data format="bed" name="output3"> - <filter>del_out[ "include_del_out" ] == "true"</filter> - </data> - </outputs> - <tests> - <test> - <param name="input1" value="indel_sam2interval_in1.sam" ftype="sam"/> - <param name="include_base" value="true"/> - <param name="collapse" value="true"/> - <param name="include_ins_out" value="true" /> - <param name="include_del_out" value="true" /> - <output name="output1" file="indel_sam2interval_out1.interval" ftype="interval"/> - <output name="output2" file="indel_sam2interval_out2.bed" ftype="bed"/> - <output name="output3" file="indel_sam2interval_out3.bed" ftype="bed"/> - </test> - </tests> - <help> - -**What it does** - -Given a SAM file containing indels, converts these to an interval file with a column indicating whether it is an insertion or a deletion, and then also can create a BED file for each type (one for insertions, one for deletions). The interval file can be combined with other like files to create a table useful for analysis with the Indel Analysis Table tool. The BED files can be useful for visualizing the reads. - ------ - -**Example** - -Suppose you have the following mapping results:: - - r327 16 chrM 11 37 8M1D10M * 0 0 CTTACCAGATAGTCATCA -+<2;?@BA@?-,.+4=4 XT:A:U NM:i:1 X0:i:1 X1:i:0 XM:i:0 XO:i:1 XG:i:1 MD:Z:41^C35 - r457 0 chr1 14 37 14M * 0 0 ACCTGACAGATATC =/DF;?@1A@?-,. XT:A:U NM:i:0 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:76 - r501 16 chrM 6 23 7M1I13M * 0 0 TCTGTGCCTACCAGACATTCA +=$2;?@BA@?-,.+4=4=4A XT:A:U NM:i:3 X0:i:1 X1:i:1 XM:i:2 XO:i:1 XG:i:1 MD:Z:28C36G9 XA:Z:chrM,+134263658,14M1I61M,4; - r1288 16 chrM 8 37 11M1I7M * 0 0 TCACTTACCTGTACACACA /*F2;?@%A@?-,.+4=4= XT:A:U NM:i:4 X0:i:1 X1:i:0 XM:i:3 XO:i:1 XG:i:1 MD:Z:2T0T1A69 - r1902 0 chr1 4 37 7M2D18M * 0 0 AGTCTCTTACCTGACGGTTATGA <2;?@BA@?-,.+4=4=4AA663 XT:A:U NM:i:3 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:2 MD:Z:17^CA58A0 - r2204 16 chrM 9 0 19M * 0 0 CTGGTACCTGACAGGTATC 2;?@BA@?-,.+4=4=4AA XT:A:R NM:i:1 X0:i:2 X1:i:0 XM:i:1 XO:i:0 XG:i:0 MD:Z:0T75 XA:Z:chrM,-564927,76M,1; - r2314 16 chrM 6 37 10M2D8M * 0 0 TCACTCTTACGTCTGA <2;?@BA@?-,.+4=4 XT:A:U NM:i:3 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:2 MD:Z:25A5^CA45 - r3001 0 chrM 13 37 3M1D5M2I7M * 0 0 TACAGTCACCCTCATCA <2;?@BA/(@?-,$& XT:A:U NM:i:3 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:2 MD:Z:17^CA58A0 - r3218 0 chr1 13 37 8M1D7M * 0 0 TACAGTCACTCATCA <2;?@BA/(@?-,$& XT:A:U NM:i:3 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:2 MD:Z:17^CA58A0 - r4767 16 chr2 3 37 15M2I7M * 0 0 CAGACTCTCTTACCAAAGACAGAC <2;?@BA/(@?-,.+4=4=4AA66 XT:A:U NM:i:4 X0:i:1 X1:i:0 XM:i:3 XO:i:1 XG:i:1 MD:Z:2T1A4T65 - r5333 0 chrM 5 37 17M1D8M * 0 0 GTCTCTCATACCAGACAACGGCAT FB3$@BA/(@?-,.+4=4=4AA66 XT:A:U NM:i:4 X0:i:1 X1:i:0 XM:i:3 XO:i:1 XG:i:1 MD:Z:45C10^C0C5C13 - r6690 16 chrM 7 23 20M * 0 0 CTCTCTTACCAGACAGACAT 2;?@BA/(@?-,.+4=4=4A XT:A:U NM:i:0 X0:i:1 X1:i:1 XM:i:0 XO:i:0 XG:i:0 MD:Z:76 XA:Z:chrM,-568532,76M,1; - r7211 0 chrM 7 37 24M * 0 0 CGACAGAGACAAAATAACATTTAA //<2;?@BA@?-,.+4=442;;6: XT:A:U NM:i:3 X0:i:1 X1:i:0 XM:i:2 XO:i:1 XG:i:1 MD:Z:73G0G0 - r7899 69 * 0 0 * * 0 0 CTGCGTGTTGGTGTCTACTGGGGT #%#'##$#$##&%#%$$$%#%#'# - r9192 133 * 0 0 * * 0 0 GTGCGTCGGGGAGGGTGCTGTCGG ######%#$%#$$###($###&&% - r9922 16 chrM 4 0 7M3I9M * 0 0 CCAGACATTTGAAATCAGG F/D4=44^D++26632;;6 XT:A:U NM:i:0 X0:i:1 X1:i:1 XM:i:0 XO:i:0 XG:i:0 MD:Z:76 - r9987 16 chrM 4 0 9M1I18M * 0 0 AGGTTCTCATTACCTGACACTCATCTTG G/AD6"/+4=4426632;;6:<2;?@BA XT:A:U NM:i:0 X0:i:1 X1:i:1 XM:i:0 XO:i:0 XG:i:0 MD:Z:76 - r10145 16 chr1 16 0 5M2D7M * 0 0 CACATTGTTGTA G//+4=44=4AA XT:A:U NM:i:0 X0:i:1 X1:i:1 XM:i:0 XO:i:0 XG:i:0 MD:Z:76 - r10324 16 chrM 15 0 6M1D5M * 0 0 CCGTTCTACTTG A@??8.G//+4= XT:A:U NM:i:0 X0:i:1 X1:i:1 XM:i:0 XO:i:0 XG:i:0 MD:Z:76 - r12331 16 chrM 17 0 4M2I6M * 0 0 AGTCGAATACGTG 632;;6:<2;?@B XT:A:U NM:i:0 X0:i:1 X1:i:1 XM:i:0 XO:i:0 XG:i:0 MD:Z:76 - r12914 16 chr2 24 0 4M3I3M * 0 0 ACTACCCCAA G//+4=42,. XT:A:U NM:i:0 X0:i:1 X1:i:1 XM:i:0 XO:i:0 XG:i:0 MD:Z:76 - r13452 16 chrM 13 0 3M1D11M * 0 0 TACGTCACTCATCA IIIABCCCICCCCI XT:A:U NM:i:0 X0:i:1 X1:i:1 XM:i:0 XO:i:0 XG:i:0 MD:Z:76 - - -The following three files will be produced (Interval, Insertions BED and Deletions BED):: - - chr1 11 13 D - 1 - chr1 21 22 D - 1 - chr1 21 23 D - 1 - chr2 18 19 I AA 1 - chr2 28 29 I CCC 1 - chrM 11 12 I TTT 1 - chrM 13 14 I C 1 - chrM 13 14 I T 1 - chrM 16 17 D - 1 - chrM 16 18 D - 1 - chrM 19 20 D - 1 - chrM 19 20 I T 1 - chrM 21 22 D - 1 - chrM 21 22 I GA 1 - chrM 22 23 D - 1 - - chr2 18 19 - chr2 28 29 - chrM 11 12 - chrM 13 14 - chrM 13 14 - chrM 19 20 - chrM 21 22 - - chr1 11 13 - chr1 21 22 - chr1 21 23 - chrM 16 17 - chrM 16 18 - chrM 19 20 - chrM 21 22 - chrM 22 23 - -For more information on SAM, please consult the `SAM format description`__. - -.. __: http://www.ncbi.nlm.nih.gov/pubmed/19505943 - - - </help> -</tool> diff -r 21c98615987e8ea08187c4455a2978290f25657b -r e69a88d3225969567b011eb09331e1c3f8fe9bf2 tools/indels/indel_table.py --- a/tools/indels/indel_table.py +++ /dev/null @@ -1,113 +0,0 @@ -#!/usr/bin/env python - -""" -Combines several interval files containing indels with counts. All input files need to have the same number of columns. - -usage: %prog [options] [input3 sum3[ input4 sum4[ input5 sum5[...]]]] - -1, --input1=1: The first input file - -s, --sum1=s: Whether or not to include the totals from first file in overall total - -2, --input2=2: The second input file - -S, --sum2=S: Whether or not to include the totals from second file in overall total - -o, --output=o: The interval output file for the combined files -""" - -import re, sys -from galaxy import eggs -import pkg_resources; pkg_resources.require( "bx-python" ) -from bx.cookbook import doc_optparse - - -def stop_err( msg ): - sys.stderr.write( '%s\n' % msg ) - sys.exit() - -def numeric_sort( text1, text2 ): - """ - For two items containing space-separated text, compares equivalent pieces - numerically if both numeric or as text otherwise - """ - pieces1 = text1.split() - pieces2 = text2.split() - if len( pieces1 ) == 0: - return 1 - if len( pieces2 ) == 0: - return -1 - for i, pc1 in enumerate( pieces1 ): - if i == len( pieces2 ): - return 1 - if not pieces2[i].isdigit(): - if pc1.isdigit(): - return -1 - else: - if pc1 > pieces2[i]: - return 1 - elif pc1 < pieces2[i]: - return -1 - else: - if not pc1.isdigit(): - return 1 - else: - if int( pc1 ) > int( pieces2[i] ): - return 1 - elif int( pc1 ) < int( pieces2[i] ): - return -1 - if i < len( pieces2 ) - 1: - return -1 - return 0 - -def __main__(): - # Parse Command Line - options, args = doc_optparse.parse( __doc__ ) - inputs = [ options.input1, options.input2 ] - includes = [ options.sum1, options.sum2 ] - inputs.extend( [ a for i, a in enumerate( args ) if i % 2 == 0 ] ) - includes.extend( [ a for i, a in enumerate( args ) if i % 2 == 1 ] ) - num_cols = 0 - counts = {} - # read in data from all files and get total counts - try: - for i, input in enumerate( inputs ): - for line in open( input, 'rb' ): - sp_line = line.strip().split( '\t' ) - # set num_cols on first pass - if num_cols == 0: - if len( sp_line ) < 4: - raise Exception, 'There need to be at least 4 columns in the file: Chrom, Start, End, and Count' - num_cols = len( sp_line ) - # deal with differing number of columns - elif len( sp_line ) != num_cols: - raise Exception, 'All of the files need to have the same number of columns (current %s != %s of first line)' % ( len( sp_line ), num_cols ) - # get actual counts for each indel - indel = '\t'.join( sp_line[:-1] ) - try: - count = int( sp_line[-1] ) - except ValueError, e: - raise Exception, 'The last column of each file must be numeric, with the count of the number of instances of that indel: %s' % str( e ) - # total across all included files - if includes[i] == "true": - try: - counts[ indel ]['tot'] += count - except ( IndexError, KeyError ): - counts[ indel ] = { 'tot': count } - # counts for ith file - counts[ indel ][i] = count - except Exception, e: - stop_err( 'Failed to read all input files:\n%s' % str( e ) ) - # output combined results to table file - try: - output = open( options.output, 'wb' ) - count_keys = counts.keys() - count_keys.sort( numeric_sort ) - for indel in count_keys: - count_out = [ str( counts[ indel ][ 'tot' ] ) ] - for i in range( len( inputs ) ): - try: - count_out.append( str( counts[ indel ][i] ) ) - except KeyError: - count_out.append( '0' ) - output.write( '%s\t%s\n' % ( indel, '\t'.join( count_out ) ) ) - output.close() - except Exception, e: - stop_err( 'Failed to output data: %s' % str( e ) ) - -if __name__=="__main__": __main__() diff -r 21c98615987e8ea08187c4455a2978290f25657b -r e69a88d3225969567b011eb09331e1c3f8fe9bf2 tools/indels/indel_table.xml --- a/tools/indels/indel_table.xml +++ /dev/null @@ -1,122 +0,0 @@ -<tool id="indel_table" name="Indel Analysis Table" version="1.0.0"> - <description>for combining indel interval data</description> - <command interpreter="python"> - indel_table.py - --input1=$input1 - --sum1=$sum1 - --input2=$input2 - --sum2=$sum2 - --output=$output1 - #for $i in $inputs - ${i.input} - ${i.sum} - #end for - </command> - <inputs> - <param format="interval" name="input1" type="data" label="Select first file to add" /> - <param name="sum1" type="boolean" checked="true" truevalue="true" falsevalue="false" label="Include first file's totals in overall total" /> - <param format="interval" name="input2" type="data" label="Select second file to add" /> - <param name="sum2" type="boolean" checked="true" truevalue="true" falsevalue="false" label="Include second file's totals in overall total" /> - <repeat name="inputs" title="Input Files"> - <param name="input" label="Add file" type="data" format="interval" /> - <param name="sum" type="boolean" checked="true" truevalue="true" falsevalue="false" label="Include file's totals in overall total" /> - </repeat> - </inputs> - <outputs> - <data format="interval" name="output1" /> - </outputs> - <tests> - <test> - <param name="input1" value="indel_table_in1.interval" ftype="interval" /> - <param name="sum1" value="true"/> - <param name="input2" value="indel_table_in2.interval" ftype="interval" /> - <param name="sum2" value="true" /> - <param name="input" value="indel_table_in3.interval" ftype="interval" /> - <param name="sum" value="true" /> - <output name="output1" file="indel_table_out1.interval" ftype="interval" /> - </test> - </tests> - <help> - -**What it does** - -Creates a table allowing for analysis and comparison of indel data. Combines any number of interval files that have been produced by the tool that converts indel SAM data to interval format. Includes overall total counts for all or some files. The tool has the option to not include a given file's counts in the total column. This could be useful for combined data if the counts for certain indels might be included more than once. - -The exact columns of the output will depend on the columns of the input. Here is the detailed specification of the output columns:: - - Column Description - ------------------------------- ---------------------------------------------------------------------------------- - 1 ... m "Indel" All the "indel" columns, which contain the info that will be checked for equality - m + 1 Total Occurrences Total number of occurrences of this indel across all (included) files - m + 2 Occurrences for File 1 Number of occurrences of this indel for first file - m + 3 Occurrences for File 2 Number of occurrences of this indel for second file - [m + ...] [...] [Number of occurrences of this indel for ... file] - -The most likely columns would be from the output of the Convert SAM to Interval/BED tool, so: Chromosome, Start position, End position, I/D (Insertion/Deletion), -/<base(s)> (Deletion/Inserted base(s)), Total Occurrences (across files), Occurrences for File 1, Occurrences for File 2, etc. See below for an example. - - ------ - -**Example** - -Suppose you have the following 4 files:: - - chrM 300 301 D - 6 - chrM 303 304 D - 19 - chrM 359 360 D - 1 - chrM 410 411 D - 1 - chrM 435 436 D - 1 - - chrM 410 411 D - 1 - chrM 714 715 D - 1 - chrM 995 997 D - 1 - chrM 1168 1169 I A 1 - chrM 1296 1297 D - 1 - - chrM 300 301 D - 8 - chrM 525 526 D - 1 - chrM 958 959 D - 1 - chrM 995 996 D - 3 - chrM 1168 1169 I C 1 - chrM 1296 1297 D - 1 - - chrM 303 304 D - 22 - chrM 410 411 D - 1 - chrM 435 436 D - 1 - chrM 714 715 D - 1 - chrM 753 754 I A 1 - chrM 1168 1169 I A 1 - -and the fifth file:: - - chrM 303 304 D - 22 - chrM 410 411 D - 2 - chrM 435 436 D - 1 - chrM 714 715 D - 2 - chrM 753 754 I A 1 - chrM 995 997 D - 1 - chrM 1168 1169 I A 2 - chrM 1296 1297 D - 1 - -The following will be produced if you include the first four files in the sum, but not the fifth:: - - chrM 300 301 D - 14 6 0 8 0 0 - chrM 303 304 D - 41 19 0 0 22 22 - chrM 359 360 D - 1 1 0 0 0 0 - chrM 410 411 D - 3 1 1 0 1 2 - chrM 435 436 D - 2 1 0 0 1 2 - chrM 525 526 D - 1 0 0 1 0 0 - chrM 714 715 D - 2 0 1 0 1 2 - chrM 753 754 I A 1 0 0 0 1 1 - chrM 958 959 D - 1 0 0 1 0 0 - chrM 995 996 D - 3 0 0 3 0 0 - chrM 995 997 D - 1 0 1 0 0 1 - chrM 1168 1169 I A 2 0 1 0 1 2 - chrM 1168 1169 I C 1 0 0 1 0 0 - chrM 1296 1297 D - 2 0 1 1 0 1 - -The first numeric column includes the total or the next four columns, but not the fifth. - - - </help> -</tool> diff -r 21c98615987e8ea08187c4455a2978290f25657b -r e69a88d3225969567b011eb09331e1c3f8fe9bf2 tools/indels/sam_indel_filter.py --- a/tools/indels/sam_indel_filter.py +++ /dev/null @@ -1,91 +0,0 @@ -#!/usr/bin/env python - -""" -Allows user to filter out non-indels from SAM. - -usage: %prog [options] - -i, --input=i: Input SAM file to be filtered - -q, --quality_threshold=q: Minimum quality value for adjacent bases - -a, --adjacent_bases=a: Number of adjacent bases on each size to check qualities - -o, --output=o: Filtered output SAM file -""" - -import re, sys -from galaxy import eggs -import pkg_resources; pkg_resources.require( "bx-python" ) -from bx.cookbook import doc_optparse - - -def stop_err( msg ): - sys.stderr.write( '%s\n' % msg ) - sys.exit() - -def __main__(): - #Parse Command Line - options, args = doc_optparse.parse( __doc__ ) - # prep output file - output = open( options.output, 'wb' ) - # patterns - pat = re.compile( '^(?P<lmatch>\d+)M(?P<ins_del_width>\d+)(?P<ins_del>[ID])(?P<rmatch>\d+)M$' ) - pat_multi = re.compile( '(\d+[MIDNSHP])(\d+[MIDNSHP])(\d+[MIDNSHP])+' ) - try: - qual_thresh = int( options.quality_threshold ) - if qual_thresh < 0 or qual_thresh > 93: - raise ValueError - except ValueError: - stop_err( 'Your quality threshold should be an integer between 0 and 93, inclusive.' ) - try: - adj_bases = int( options.adjacent_bases ) - if adj_bases < 1: - raise ValueError - except ValueError: - stop_err( 'The number of adjacent bases should be an integer greater than 1.' ) - # record lines skipped because of more than one indel - multi_indel_lines = 0 - # go through all lines in input file - for i,line in enumerate(open( options.input, 'rb' )): - if line and not line.startswith( '#' ) and not line.startswith( '@' ) : - split_line = line.split( '\t' ) - cigar = split_line[5].strip() - # find matches like 3M2D7M or 7M3I10M - match = {} - m = pat.match( cigar ) - # if unprocessable CIGAR - if not m: - m = pat_multi.match( cigar ) - # skip this line if no match - if not m: - continue - # account for multiple indels or operations we don't process - else: - multi_indel_lines += 1 - # otherwise get matching parts - else: - match = m.groupdict() - # process for indels - if match: - left = int( match[ 'lmatch' ] ) - right = int( match[ 'rmatch' ] ) - if match[ 'ins_del' ] == 'I': - middle = int( match[ 'ins_del_width' ] ) - else: - middle = 0 - # if there are enough adjacent bases to check, then do so - if left >= adj_bases and right >= adj_bases: - quals = split_line[10] - eligible_quals = quals[ left - adj_bases : left + middle + adj_bases ] - qual_thresh_met = True - for q in eligible_quals: - if ord( q ) - 33 < qual_thresh: - qual_thresh_met = False - break - # if filter reqs met, output line - if qual_thresh_met: - output.write( line ) - # close out file - output.close() - # if skipped lines because of more than one indel, output message - if multi_indel_lines > 0: - sys.stdout.write( '%s alignments were skipped because they contained more than one indel.' % multi_indel_lines ) - -if __name__=="__main__": __main__() diff -r 21c98615987e8ea08187c4455a2978290f25657b -r e69a88d3225969567b011eb09331e1c3f8fe9bf2 tools/indels/sam_indel_filter.xml --- a/tools/indels/sam_indel_filter.xml +++ /dev/null @@ -1,77 +0,0 @@ -<tool id="sam_indel_filter" name="Filter Indels" version="1.0.0"> - <description>for SAM</description> - <command interpreter="python"> - sam_indel_filter.py - --input=$input1 - --quality_threshold=$quality_threshold - --adjacent_bases=$adjacent_bases - --output=$out_file1 - </command> - <inputs> - <param format="sam" name="input1" type="data" label="Select dataset to filter" /> - <param name="quality_threshold" type="integer" value="40" label="Quality threshold for adjacent bases" help="Takes Phred value assuming Sanger scale; usually between 0 and 40, but up to 93" /> - <param name="adjacent_bases" type="integer" value="1" label="The number of adjacent bases to match on either side of the indel" help="If one side is shorter than this width, the read will be excluded" /> - </inputs> - <outputs> - <data format="sam" name="out_file1" /> - </outputs> - <tests> - <test> - <param name="input1" value="sam_indel_filter_in1.sam" ftype="sam"/> - <param name="quality_threshold" value="14"/> - <param name="adjacent_bases" value="2"/> - <output name="out_file1" file="sam_indel_filter_out1.sam" ftype="sam"/> - </test> - <test> - <param name="input1" value="sam_indel_filter_in1.sam" ftype="sam"/> - <param name="quality_threshold" value="29"/> - <param name="adjacent_bases" value="5"/> - <output name="out_file1" file="sam_indel_filter_out2.sam" ftype="sam"/> - </test> - <test> - <param name="input1" value="sam_indel_filter_in2.sam" ftype="sam"/> - <param name="quality_threshold" value="7"/> - <param name="adjacent_bases" value="1"/> - <output name="out_file1" file="sam_indel_filter_out3.sam" ftype="sam"/> - </test> - </tests> - <help> - -**What it does** - -Allows extracting indels from SAM produced by BWA. Currently it can handle SAM with alignments that have only one insertion or one deletion, and will skip that alignment if it encounters one with more than one indel. It matches CIGAR strings (column 6 in the SAM file) like 5M3I5M or 4M2D10M, so there must be a match or mismatch of sufficient length on either side of the indel. - ------ - -**Example** - -Suppose you have the following:: - - r770 89 ref 116 37 17M1I5M = 72131356 0 CACACTGTGACAGACAGCGCAGC 00/02!!0//1200210AA44/1 XT:A:U CM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:1 MD:Z:22 - r770 181 ref 116 0 24M = 72131356 0 TTGGTGCGCGCGGTTGAGGGTTGG $$(#%%#$%#%####$%%##$### - r1945 177 ref 41710908 0 23M 190342418 181247988 0 AGAGAGAGAGAGAGAGAGAGAGA SQQWZYURVYWX]]YXTSY]]ZM XT:A:R CM:i:0 SM:i:0 AM:i:0 X0:i:163148 XM:i:0 XO:i:0 XG:i:0 MD:Z:23 - r3671 117 ref 190342418 0 24M = 190342418 0 CTGGCGTTCTCGGCGTGGATGGGT #####$$##$#%#%%###%$#$## - r3671 153 ref 190342418 37 16M1I6M = 190342418 0 TCTAACTTAGCCTCATAATAGCT /<<!"0///////00/!!0121/ XT:A:U CM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:1 MD:Z:22 - r3824 117 ref 80324999 0 24M = 80324999 0 TCCAGTCGCGTTGTTAGGTTCGGA #$#$$$#####%##%%###**#+/ - r3824 153 ref 80324999 37 8M1I14M = 80324999 0 TTTAGCCCGAAATGCCTAGAGCA 4;6//11!"11100110////00 XT:A:U CM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:1 MD:Z:22 - r4795 81 ref 26739130 0 23M 57401793 57401793 0 TGGCATTCCTGTAGGCAGAGAGG AZWWZS]!"QNXZ]VQ]]]/2]] XT:A:R CM:i:2 SM:i:0 AM:i:0 X0:i:3 X1:i:0 XM:i:2 XO:i:0 XG:i:0 MD:Z:23 - r4795 161 ref 57401793 37 23M 26739130 26739130 0 GATCACCCAGGTGATGTAACTCC ]WV]]]]WW]]]]]]]]]]PU]] XT:A:U CM:i:0 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:23 - r4800 16 ref 241 255 15M1D8M = 0 0 CGTGGCCGGCGGGCCGAAGGCAT IIIIIIIIIICCCCIII?IIIII XT:A:U CM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:1 MD:Z:22 - r5377 170 ref 59090793 37 23M 26739130 26739130 0 TATCAATAAGGTGATGTAACTCG ]WV]ABAWW]]]]]P]P//GU]] XT:A:U CM:i:0 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:0 XO:i:0 XG:i:0 MD:Z:23 - r5612 151 ref 190342418 37 19M1I3M = 190342418 0 TCTAACTTAGCCTCATAATAGCT /<<!"0/4//7//00/BC0121/ XT:A:U CM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:1 MD:Z:22 - - -To select only alignments with indels, you need to determine the minimum quality you want the adjacent bases to have, as well as the number of adjacent bases to check. If you set the quality threshold to 47 and the number of bases to check to 2, you will get the following output:: - - r770 89 ref 116 37 17M1I5M = 72131356 0 CACACTGTGACAGACAGCGCAGC 00/02!!0//1200210AA44/1 XT:A:U CM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:1 MD:Z:22 - r4800 16 ref 241 255 15M1D8M = 0 0 CGTGGCCGGCGGGCCGAAGGCAT IIIIIIIIIICCCCIII?IIIII XT:A:U CM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:1 MD:Z:22 - r5612 151 ref 190342418 37 19M1I3M = 190342418 0 TCTAACTTAGCCTCATAATAGCT /<<!"0/4//7//00/BC0121/ XT:A:U CM:i:2 SM:i:37 AM:i:0 X0:i:1 X1:i:0 XM:i:1 XO:i:1 XG:i:1 MD:Z:22 - - -For more information on SAM, please consult the `SAM format description`__. - -.. __: http://www.ncbi.nlm.nih.gov/pubmed/19505943 - - - </help> -</tool> Repository URL: https://bitbucket.org/galaxy/galaxy-central/ -- This is a commit notification from bitbucket.org. You are receiving this because you have the service enabled, addressing the recipient of this email.