[hg] galaxy 2706: Added sam2interval converter
details: http://www.bx.psu.edu/hg/galaxy/rev/a9c4d314ac89 changeset: 2706:a9c4d314ac89 user: Anton Nekrutenko <anton@bx.psu.edu> date: Wed Sep 16 16:00:09 2009 -0400 description: Added sam2interval converter 6 file(s) affected in this change: test-data/sam2interval_noprintAll.dat test-data/sam2interval_printAll.dat test-data/sam_bioinf_example.sam tool_conf.xml.sample tools/samtools/sam2interval.py tools/samtools/sam2interval.xml diffs (218 lines): diff -r e2d37ea1a436 -r a9c4d314ac89 test-data/sam2interval_noprintAll.dat --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/sam2interval_noprintAll.dat Wed Sep 16 16:00:09 2009 -0400 @@ -0,0 +1,6 @@ +ref 6 22 + +ref 8 19 + +ref 8 14 + +ref 15 40 + +ref 28 33 - +ref 36 45 - diff -r e2d37ea1a436 -r a9c4d314ac89 test-data/sam2interval_printAll.dat --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/sam2interval_printAll.dat Wed Sep 16 16:00:09 2009 -0400 @@ -0,0 +1,6 @@ +ref 6 22 + r001 163 ref 7 30 8M2I4M1D3M = 37 39 TTAGATAAAGGATACTA * +ref 8 19 + r002 0 ref 9 30 3S6M1P1I4M * 0 0 AAAAGATAAGGATA * +ref 8 14 + r003 0 ref 9 30 5H6M * 0 0 AGCTAA * NM:i:1 +ref 15 40 + r004 0 ref 16 30 6M14N5M * 0 0 ATAGCTTCAGC * +ref 28 33 - r003 16 ref 29 30 6H5M * 0 0 TAGGC * NM:i:0 +ref 36 45 - r001 83 ref 37 30 9M = 7 -39 CAGCGCCAT * diff -r e2d37ea1a436 -r a9c4d314ac89 test-data/sam_bioinf_example.sam --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/sam_bioinf_example.sam Wed Sep 16 16:00:09 2009 -0400 @@ -0,0 +1,6 @@ +r001 163 ref 7 30 8M2I4M1D3M = 37 39 TTAGATAAAGGATACTA * +r002 0 ref 9 30 3S6M1P1I4M * 0 0 AAAAGATAAGGATA * +r003 0 ref 9 30 5H6M * 0 0 AGCTAA * NM:i:1 +r004 0 ref 16 30 6M14N5M * 0 0 ATAGCTTCAGC * +r003 16 ref 29 30 6H5M * 0 0 TAGGC * NM:i:0 +r001 83 ref 37 30 9M = 7 -39 CAGCGCCAT * diff -r e2d37ea1a436 -r a9c4d314ac89 tool_conf.xml.sample --- a/tool_conf.xml.sample Wed Sep 16 15:30:17 2009 -0400 +++ b/tool_conf.xml.sample Wed Sep 16 16:00:09 2009 -0400 @@ -343,6 +343,7 @@ </section> 
<section name="SAM Tools" id="samtools"> <tool file="samtools/sam_bitwise_flag_filter.xml" /> + <tool file="samtools/sam2interval.xml" /> <tool file="samtools/sam_to_bam.xml" /> <tool file="samtools/sam_merge.xml" /> <tool file="samtools/sam_pileup.xml" /> diff -r e2d37ea1a436 -r a9c4d314ac89 tools/samtools/sam2interval.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/samtools/sam2interval.py Wed Sep 16 16:00:09 2009 -0400 @@ -0,0 +1,100 @@ +#!/usr/bin/env python + +import sys +import optparse +import re + +def stop_err( msg ): + sys.stderr.write( msg ) + sys.exit() + +def main(): + usage = """%prog [options] + +options (listed below) default to 'None' if omitted + """ + parser = optparse.OptionParser(usage=usage) + + parser.add_option( + '-f','--input_sam_file', + metavar="INPUT_SAM_FILE", + dest='input_sam', + default = False, + help='Name of the SAM file to be filtered. STDIN is default') + + parser.add_option( + '-c','--flag_column', + dest='flag_col', + default = '2', + help='Column containing SAM bitwise flag. 1-based') + + parser.add_option( + '-s','--start_column', + dest='start_col', + default = '4', + help='Column containing position. 1-based') + + parser.add_option( + '-g','--cigar_column', + dest='cigar_col', + default = '6', + help='Column containing CIGAR or extended CIGAR string') + + parser.add_option( + '-r','--ref_column', + dest='ref_col', + default = '3', + help='Column containing name of the refernce sequence coordinate. 1-based') + + parser.add_option( + '-e','--read_column', + dest='read_col', + default = '1', + help='Column containing read name. 
1-based') + + parser.add_option( + '-d','--debug', + dest='debug', + action='store_true', + default = False, + help='Print debugging info') + + parser.add_option( + '-p','--print_all', + dest='prt_all', + action='store_true', + default = False, + help='Print coordinates and original SAM?') + + + options, args = parser.parse_args() + + if options.input_sam: + infile = open ( options.input_sam, 'r') + else: + infile = sys.stdin + + cigar = re.compile( '\d+M|\d+N|\d+D|\d+P' ) + + for line in infile: + line = line.rstrip( '\r\n' ) + if line and not line.startswith( '#,@' ): + fields = line.split( '\t' ) + start = int( fields[ int( options.start_col ) - 1 ] ) - 1 + end = 0 + for op in cigar.findall( fields[ int( options.cigar_col) - 1 ] ): + end += int( op[ 0:len( op ) - 1 ] ) + + strand = '+' + if bool( int( fields[ int( options.flag_col ) - 1 ] ) & 0x0010 ): + strand = '-' + read_name = fields[ int( options.read_col ) - 1 ] + ref_name = fields[ int( options.ref_col ) - 1 ] + + if options.prt_all: + print '%s\t%s\t%s\t%s\t%s' % (ref_name, str(start), str(end+start), strand, line) + else: + print '%s\t%s\t%s\t%s' % (ref_name, str(start), str(end+start), strand) + +if __name__ == "__main__": main() + diff -r e2d37ea1a436 -r a9c4d314ac89 tools/samtools/sam2interval.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/samtools/sam2interval.xml Wed Sep 16 16:00:09 2009 -0400 @@ -0,0 +1,69 @@ +<tool id="sam2interval" name="Convert SAM" version="1.0.0"> + <description>to interval</description> + <command interpreter="python">sam2interval.py --input_sam_file=$input1 $print_all > $out_file1 + </command> + <inputs> + <param format="sam" name="input1" type="data" label="Select dataset to convert"/> + <param name="print_all" type="select" label="Print all?" help="Do you want to retain original SAM fields? 
See example below."> + <option value="-p">Yes</option> + <option value="">No</option> + </param> + </inputs> + <outputs> + <data format="interval" name="out_file1" /> + </outputs> +<tests> + <test> + <param name="input1" value="sam_bioinf_example.sam" ftype="sam"/> + <param name="flags" value="Read is mapped in a proper pair"/> + <param name="print_all" value="Yes"/> + <output name="out_file1" file="sam2interval_printAll.dat" ftype="interval"/> + </test> + <test> + <param name="input1" value="sam_bioinf_example.sam" ftype="sam"/> + <param name="flags" value="Read is mapped in a proper pair"/> + <param name="print_all" value="No"/> + <output name="out_file1" file="sam2interval_noprintAll.dat" ftype="interval"/> + </test> + +</tests> + <help> + +**What it does** + +Converts positional information from a SAM dataset into interval format with 0-based start and 1-based end. To calculate the end position the tool uses the CIGAR string. + +----- + +**Example** + +Converting the following dataset:: + + r001 163 ref 7 30 8M2I4M1D3M = 37 39 TTAGATAAAGGATACTA * + r002 0 ref 9 30 3S6M1P1I4M * 0 0 AAAAGATAAGGATA * + r003 0 ref 9 30 5H6M * 0 0 AGCTAA * NM:i:1 + r004 0 ref 16 30 6M14N5M * 0 0 ATAGCTTCAGC * + r003 16 ref 29 30 6H5M * 0 0 TAGGC * NM:i:0 + r001 83 ref 37 30 9M = 7 -39 CAGCGCCAT * + +into Interval format will produce the following if *Print all?* is set to **Yes**:: + + ref 6 22 + r001 163 ref 7 30 8M2I4M1D3M = 37 39 TTAGATAAAGGATACTA * + ref 8 19 + r002 0 ref 9 30 3S6M1P1I4M * 0 0 AAAAGATAAGGATA * + ref 8 14 + r003 0 ref 9 30 5H6M * 0 0 AGCTAA * NM:i:1 + ref 15 40 + r004 0 ref 16 30 6M14N5M * 0 0 ATAGCTTCAGC * + ref 28 33 - r003 16 ref 29 30 6H5M * 0 0 TAGGC * NM:i:0 + ref 36 45 - r001 83 ref 37 30 9M = 7 -39 CAGCGCCAT * + +Setting *Print all?* to **No** will generate the following:: + + ref 6 22 + r001 + ref 8 19 + r002 + ref 8 14 + r003 + ref 15 40 + r004 + ref 28 33 - r003 + ref 36 45 - r001 + + + </help> +</tool>
participants (1)
-
Greg Von Kuster