details: http://www.bx.psu.edu/hg/galaxy/rev/1d326855ba89 changeset: 1517:1d326855ba89 user: wychung date: Thu Sep 18 15:41:23 2008 -0400 description: update shrimp_wrapper. 2 file(s) affected in this change: tools/metag_tools/shrimp_wrapper.py tools/metag_tools/shrimp_wrapper.xml diffs (621 lines): diff -r f1da9b95549b -r 1d326855ba89 tools/metag_tools/shrimp_wrapper.py --- a/tools/metag_tools/shrimp_wrapper.py Thu Sep 18 15:24:51 2008 -0400 +++ b/tools/metag_tools/shrimp_wrapper.py Thu Sep 18 15:41:23 2008 -0400 @@ -61,17 +61,13 @@ reversed_s.reverse() return "".join(reversed_s) -def generate_sub_table(result_file, ref_file, score_files, table_outfile, hit_per_read): +def generate_sub_table(result_file, ref_file, score_files, table_outfile, hit_per_read, insertion_size): - """ - TODO: the cross-over error has not been addressed yet. - """ + invalid_editstring_char = 0 - insertion_size = 600 + all_score_file = score_files.split(',') - all_score_file = score_files.split('&') - - if len(all_score_file) != hit_per_read: stop_err('Un-equal number of files!') + if len(all_score_file) != hit_per_read: stop_err('One or more query files is missing. Please check your dataset.') temp_table_name = tempfile.NamedTemporaryFile().name temp_table = open(temp_table_name, 'w') @@ -178,7 +174,7 @@ hits_score[readname] = {} hits_score[readname][endindex] = score - # mutation call to all mappings + # call to all mappings for readkey in hits.keys(): if len(hits[readkey]) != hit_per_read: continue @@ -211,6 +207,7 @@ match_count += 1 if match_count == 1: + for x, end_data in enumerate(matches[0]): end_strand, end_editstring, end_chr_start, end_chr_end, end_read_start, end_chrom = end_data @@ -226,20 +223,26 @@ gap_read = 0 while editindex < len(end_editstring): + editchr = end_editstring[editindex] chrA = '' chrB = '' locIndex = [] + if editchr.isdigit(): editcode = '' + while editchr.isdigit() and editindex < len(end_editstring): editcode += editchr editindex += 1 if editindex < len(end_editstring): editchr = end_editstring[editindex] + for baseIndex in range(int(editcode)): chrA += refsegment[match_len+baseIndex] chrB = chrA + match_len += int(editcode) + elif editchr == 'x': # crossover: inserted between the appropriate two bases # Two sequencing errors: 4x15x6 (25 matches with 2 crossovers) @@ -263,18 +266,21 @@ elif editchr == '(': editcode = '' + while editchr != ')' and editindex < len(end_editstring): if editindex < len(end_editstring): editchr = end_editstring[editindex] editcode += editchr editindex += 1 + editcode = editcode[1:-1] chrA = '-'*len(editcode) chrB = editcode else: - print 'Warning! Unknown symbols', editchr - + invalid_editstring_char += 1 + if end_strand == '-': + chrA = reverse_complement(chrA) chrB = reverse_complement(chrB) @@ -288,9 +294,12 @@ chrBx = chrB[mappingIndex] if chrAx and chrBx and chrBx.upper() != 'N': + if end_strand == '+': + chrom_loc = end_chr_start+match_len-len(chrA)+mappingIndex read_loc = end_read_start+match_len-len(chrA)+mappingIndex-gap_read + if chrAx == '-': chrom_loc -= 1 if chrBx == '-': @@ -300,9 +309,12 @@ # 1-based on chrom_loc and read_loc pos_line = pos_line + '\t'.join([end_chrom, str(chrom_loc+1), readkey+'/'+str(x+1), str(read_loc+1), chrAx, chrBx, scoreBx]) + '\n' + else: + chrom_loc = end_chr_end-match_len+mappingIndex read_loc = end_read_start+match_len-1-mappingIndex-gap_read + if chrAx == '-': chrom_loc -= 1 if chrBx == '-': @@ -314,11 +326,14 @@ rev_line = '\t'.join([end_chrom, str(chrom_loc+1), readkey+'/'+str(x+1), str(read_loc+1), chrAx, chrBx, scoreBx]) +'\n' + rev_line if chrom_cov.has_key(end_chrom): + if chrom_cov[end_chrom].has_key(chrom_loc): chrom_cov[end_chrom][chrom_loc] += 1 else: chrom_cov[end_chrom][chrom_loc] = 1 + else: + chrom_cov[end_chrom] = {} chrom_cov[end_chrom][chrom_loc] = 1 @@ -329,6 +344,7 @@ # chrom-wide coverage for i, line in enumerate(open(temp_table_name)): + line = line.rstrip() if not line or line.startswith('#'): continue @@ -348,6 +364,9 @@ outfile.close() if os.path.exists(temp_table_name): os.remove(temp_table_name) + + if invalid_editstring_char: + print 'Skip ', invalid_editstring_char, ' invalid characters in editstrings' return True @@ -359,7 +378,7 @@ seq_title_startswith = '' qual_title_startswith = '' - default_coding_value = 64 + default_coding_value = 64 # Solexa ascii-code fastq_block_lines = 0 for i, line in enumerate( file( infile_name ) ): @@ -448,16 +467,63 @@ def __main__(): + # SHRiMP path + shrimp = 'rmapper-ls' + # I/O - type_of_reads = sys.argv[1] # single or paired - input_target = sys.argv[2] # fasta - shrimp_outfile = sys.argv[3] # shrimp output - table_outfile = sys.argv[4] # table output - - # SHRiMP parameters: total = 15 - # TODO: put threshold on each of these parameters - if len(sys.argv) == 21 or len(sys.argv) == 22: - spaced_seed = sys.argv[5] + input_target_file = sys.argv[1] # fasta + shrimp_outfile = sys.argv[2] # shrimp output + table_outfile = sys.argv[3] # table output + single_or_paired = sys.argv[4].split(',') + + insertion_size = 600 + + if len(single_or_paired) == 1: # single or paired + type_of_reads = 'single' + hit_per_read = 1 + input_query = single_or_paired[0] + query_fasta = tempfile.NamedTemporaryFile().name + query_qual = tempfile.NamedTemporaryFile().name + + else: # paired-end + type_of_reads = 'paired' + hit_per_read = 2 + input_query_end1 = single_or_paired[0] + input_query_end2 = single_or_paired[1] + insertion_size = int(single_or_paired[2]) + query_fasta_end1 = tempfile.NamedTemporaryFile().name + query_fasta_end2 = tempfile.NamedTemporaryFile().name + query_qual_end1 = tempfile.NamedTemporaryFile().name + query_qual_end2 = tempfile.NamedTemporaryFile().name + + # SHRiMP parameters: total = 15, default values + spaced_seed = '111111011111' + seed_matches_per_window = '2' + seed_hit_taboo_length = '4' + seed_generation_taboo_length = '0' + seed_window_length = '115.0' + max_hits_per_read = '100' + max_read_length = '1000' + kmer = '-1' + sw_match_value = '100' + sw_mismatch_value = '-150' + sw_gap_open_ref = '-400' + sw_gap_open_query = '-400' + sw_gap_ext_ref = '-70' + sw_gap_ext_query = '-70' + sw_hit_threshold = '68.0' + + # TODO: put the threshold on each of these parameters + if len(sys.argv) > 5: + + try: + if sys.argv[5].isdigit(): + spaced_seed = sys.argv[5] + else: + stop_err('Error in assigning parameter: Spaced seed.') + except: + stop_err('Spaced seed must be a combination of 1s and 0s.') + seed_matches_per_window = sys.argv[6] seed_hit_taboo_length = sys.argv[7] seed_generation_taboo_length = sys.argv[8] @@ -473,53 +539,6 @@ sw_gap_ext_query = sys.argv[18] sw_hit_threshold = sys.argv[19] - # Single-end parameters - if type_of_reads == 'single': - input_query = sys.argv[20] # single-end - hit_per_read = 1 - query_fasta = tempfile.NamedTemporaryFile().name - query_qual = tempfile.NamedTemporaryFile().name - else: # Paired-end parameters - input_query_end1 = sys.argv[20] # paired-end - input_query_end2 = sys.argv[21] - hit_per_read = 2 - query_fasta_end1 = tempfile.NamedTemporaryFile().name - query_fasta_end2 = tempfile.NamedTemporaryFile().name - query_qual_end1 = tempfile.NamedTemporaryFile().name - query_qual_end2 = tempfile.NamedTemporaryFile().name - else: - spaced_seed = '111111011111' - seed_matches_per_window = '2' - seed_hit_taboo_length = '4' - seed_generation_taboo_length = '0' - seed_window_length = '115.0' - max_hits_per_read = '100' - max_read_length = '1000' - kmer = '-1' - sw_match_value = '100' - sw_mismatch_value = '-150' - sw_gap_open_ref = '-400' - sw_gap_open_query = '-400' - sw_gap_ext_ref = '-70' - sw_gap_ext_query = '-70' - sw_hit_threshold = '68.0' - - # Single-end parameters - if type_of_reads == 'single': - input_query = sys.argv[5] # single-end - hit_per_read = 1 - query_fasta = tempfile.NamedTemporaryFile().name - query_qual = tempfile.NamedTemporaryFile().name - else: # Paired-end parameters - input_query_end1 = sys.argv[5] # paired-end - input_query_end2 = sys.argv[6] - hit_per_read = 2 - query_fasta_end1 = tempfile.NamedTemporaryFile().name - query_fasta_end2 = tempfile.NamedTemporaryFile().name - query_qual_end1 = tempfile.NamedTemporaryFile().name - query_qual_end2 = tempfile.NamedTemporaryFile().name - - # temp file for shrimp log file shrimp_log = tempfile.NamedTemporaryFile().name @@ -532,7 +551,7 @@ # SHRiMP command if type_of_reads == 'single': - command = ' '.join(['rmapper-ls', '-s', spaced_seed, '-n', seed_matches_per_window, '-t', seed_hit_taboo_length, '-9', seed_generation_taboo_length, '-w', seed_window_length, '-o', max_hits_per_read, '-r', max_read_length, '-d', kmer, '-m', sw_match_value, '-i', sw_mismatch_value, '-g', sw_gap_open_ref, '-q', sw_gap_open_query, '-e', sw_gap_ext_ref, '-f', sw_gap_ext_query, '-h', sw_hit_threshold, query_fasta, input_target, '>', shrimp_outfile, '2>', shrimp_log]) + command = ' '.join([shrimp, '-s', spaced_seed, '-n', seed_matches_per_window, '-t', seed_hit_taboo_length, '-9', seed_generation_taboo_length, '-w', seed_window_length, '-o', max_hits_per_read, '-r', max_read_length, '-d', kmer, '-m', sw_match_value, '-i', sw_mismatch_value, '-g', sw_gap_open_ref, '-q', sw_gap_open_query, '-e', sw_gap_ext_ref, '-f', sw_gap_ext_query, '-h', sw_hit_threshold, query_fasta, input_target_file, '>', shrimp_outfile, '2>', shrimp_log]) try: os.system(command) @@ -541,9 +560,9 @@ if os.path.exists(query_qual): os.remove(query_qual) stop_err(str(e)) - else: - command_end1 = ' '.join(['rmapper-ls', '-s', spaced_seed, '-n', seed_matches_per_window, '-t', seed_hit_taboo_length, '-9', seed_generation_taboo_length, '-w', seed_window_length, '-o', max_hits_per_read, '-r', max_read_length, '-d', kmer, '-m', sw_match_value, '-i', sw_mismatch_value, '-g', sw_gap_open_ref, '-q', sw_gap_open_query, '-e', sw_gap_ext_ref, '-f', sw_gap_ext_query, '-h', sw_hit_threshold, query_fasta_end1, input_target, '>', shrimp_outfile, '2>', shrimp_log]) - command_end2 = ' '.join(['rmapper-ls', '-s', spaced_seed, '-n', seed_matches_per_window, '-t', seed_hit_taboo_length, '-9', seed_generation_taboo_length, '-w', seed_window_length, '-o', max_hits_per_read, '-r', max_read_length, '-d', kmer, '-m', sw_match_value, '-i', sw_mismatch_value, '-g', sw_gap_open_ref, '-q', sw_gap_open_query, '-e', sw_gap_ext_ref, '-f', sw_gap_ext_query, '-h', sw_hit_threshold, query_fasta_end2, input_target, '>>', shrimp_outfile, '2>>', shrimp_log]) + else: # paired + command_end1 = ' '.join([shrimp, '-s', spaced_seed, '-n', seed_matches_per_window, '-t', seed_hit_taboo_length, '-9', seed_generation_taboo_length, '-w', seed_window_length, '-o', max_hits_per_read, '-r', max_read_length, '-d', kmer, '-m', sw_match_value, '-i', sw_mismatch_value, '-g', sw_gap_open_ref, '-q', sw_gap_open_query, '-e', sw_gap_ext_ref, '-f', sw_gap_ext_query, '-h', sw_hit_threshold, query_fasta_end1, input_target_file, '>', shrimp_outfile, '2>', shrimp_log]) + command_end2 = ' '.join([shrimp, '-s', spaced_seed, '-n', seed_matches_per_window, '-t', seed_hit_taboo_length, '-9', seed_generation_taboo_length, '-w', seed_window_length, '-o', max_hits_per_read, '-r', max_read_length, '-d', kmer, '-m', sw_match_value, '-i', sw_mismatch_value, '-g', sw_gap_open_ref, '-q', sw_gap_open_query, '-e', sw_gap_ext_ref, '-f', sw_gap_ext_query, '-h', sw_hit_threshold, query_fasta_end2, input_target_file, '>>', shrimp_outfile, '2>>', shrimp_log]) try: os.system(command_end1) @@ -557,9 +576,9 @@ # convert to table if type_of_reads == 'single': - return_value = generate_sub_table(shrimp_outfile, input_target, query_qual, table_outfile, hit_per_read) + return_value = generate_sub_table(shrimp_outfile, input_target_file, query_qual, table_outfile, hit_per_read, insertion_size) else: - return_value = generate_sub_table(shrimp_outfile, input_target, query_qual_end1+'&'+query_qual_end2, table_outfile, hit_per_read) + return_value = generate_sub_table(shrimp_outfile, input_target_file, query_qual_end1+','+query_qual_end2, table_outfile, hit_per_read, insertion_size) # remove temp. files if type_of_reads == 'single': diff -r f1da9b95549b -r 1d326855ba89 tools/metag_tools/shrimp_wrapper.xml --- a/tools/metag_tools/shrimp_wrapper.xml Thu Sep 18 15:24:51 2008 -0400 +++ b/tools/metag_tools/shrimp_wrapper.xml Thu Sep 18 15:41:23 2008 -0400 @@ -1,50 +1,51 @@ <tool id="shrimp_wrapper" name="SHRiMP" version="1.0.0"> <description>SHort Read Mapping Package</description> <command interpreter="python"> - #if ($type_of_reads.single_or_paired=="single" and $param.skip_or_full=="skip"):#shrimp_wrapper.py $type_of_reads.single_or_paired $input_target $output1 $output2 $input_query - #elif ($type_of_reads.single_or_paired=="paired" and $param.skip_or_full=="skip"):#shrimp_wrapper.py $type_of_reads.single_or_paired $input_target $output1 $output2 ${type_of_reads.input1} ${type_of_reads.input2} - #elif ($type_of_reads.single_or_paired=="single" and $param.skip_or_full=="full"):#shrimp_wrapper.py $type_of_reads.single_or_paired $input_target $output1 $output2 $param.spaced_seed $param.seed_matches_per_window $param.seed_hit_taboo_length $param.seed_generation_taboo_length $param.seed_window_length $param.max_hits_per_read $param.max_read_length $param.kmer $param.sw_match_value $param.sw_mismatch_value $param.sw_gap_open_ref $param.sw_gap_open_query $param.sw_gap_ext_ref $param.sw_gap_ext_query $param.sw_hit_threshold $input_query - #elif ($type_of_reads.single_or_paired=="paired" and $param.skip_or_full=="full"):#shrimp_wrapper.py $type_of_reads.single_or_paired $input_target $output1 $output2 $param.spaced_seed $param.seed_matches_per_window $param.seed_hit_taboo_length $param.seed_generation_taboo_length $param.seed_window_length $param.max_hits_per_read $param.max_read_length $param.kmer $param.sw_match_value $param.sw_mismatch_value $param.sw_gap_open_ref $param.sw_gap_open_query $param.sw_gap_ext_ref $param.sw_gap_ext_query $param.sw_hit_threshold ${type_of_reads.input1} ${type_of_reads.input2} + #if ($type_of_reads.single_or_paired=="single" and $param.skip_or_full=="skip"):#shrimp_wrapper.py $input_target $output1 $output2 $input_query + #elif ($type_of_reads.single_or_paired=="paired" and $param.skip_or_full=="skip"):#shrimp_wrapper.py $input_target $output1 $output2 $type_of_reads.input1,$type_of_reads.input2,$type_of_reads.insertion_size + #elif ($type_of_reads.single_or_paired=="single" and $param.skip_or_full=="full"):#shrimp_wrapper.py $input_target $output1 $output2 $input_query $param.spaced_seed $param.seed_matches_per_window $param.seed_hit_taboo_length $param.seed_generation_taboo_length $param.seed_window_length $param.max_hits_per_read $param.max_read_length $param.kmer $param.sw_match_value $param.sw_mismatch_value $param.sw_gap_open_ref $param.sw_gap_open_query $param.sw_gap_ext_ref $param.sw_gap_ext_query $param.sw_hit_threshold + #elif ($type_of_reads.single_or_paired=="paired" and $param.skip_or_full=="full"):#shrimp_wrapper.py $input_target $output1 $output2 $type_of_reads.input1,$type_of_reads.input2,$type_of_reads.insertion_size $param.spaced_seed $param.seed_matches_per_window $param.seed_hit_taboo_length $param.seed_generation_taboo_length $param.seed_window_length $param.max_hits_per_read $param.max_read_length $param.kmer $param.sw_match_value $param.sw_mismatch_value $param.sw_gap_open_ref $param.sw_gap_open_query $param.sw_gap_ext_ref $param.sw_gap_ext_query $param.sw_hit_threshold #end if </command> <inputs> <page> - <param name="input_target" type="data" format="fasta" label="Reference sequence" /> <conditional name="type_of_reads"> <param name="single_or_paired" type="select" label="Single- or Paired-ends"> <option value="single">Single-end</option> <option value="paired">Paired-end</option> </param> <when value="single"> - <param name="input_query" type="data" format="fastqsolexa" label="Sequence file" /> + <param name="input_query" type="data" format="fastqsolexa" label="Align sequencing reads" /> </when> <when value="paired"> - <param name="input1" type="data" format="fastqsolexa" label="One end" /> - <param name="input2" type="data" format="fastqsolexa" label="The other end" /> + <param name="insertion_size" type="integer" size="5" value="600" label="Insertion length between two ends" help="bp" /> + <param name="input1" type="data" format="fastqsolexa" label="Align sequencing reads, one end" /> + <param name="input2" type="data" format="fastqsolexa" label="and the other end" /> </when> </conditional> + <param name="input_target" type="data" format="fasta" label="against reference" /> <conditional name="param"> - <param name="skip_or_full" type="select" label="SHRiMP parameter selection"> - <option value="skip">Default setting</option> - <option value="full">Full list</option> + <param name="skip_or_full" type="select" label="SHRiMP settings to use" help="For most mapping needs use Commonly used settings. If you want full control use Full List"> + <option value="skip">Commonly used</option> + <option value="full">Full Parameter List</option> </param> <when value="skip" /> <when value="full"> - <param name="spaced_seed" type="text" size="30" value="111111011111" label="Spaced Seed" /> - <param name="seed_matches_per_window" type="integer" size="5" value="2" label="Seed Matches per Window" /> - <param name="seed_hit_taboo_length" type="integer" size="5" value="4" label="Seed Hit Taboo Length" /> - <param name="seed_generation_taboo_length" type="integer" size="5" value="0" label="Seed Generation Taboo Length" /> - <param name="seed_window_length" type="float" size="10" value="115.0" label="Seed Window Length" help="in percentage"/> - <param name="max_hits_per_read" type="integer" size="10" value="100" label="Maximum Hits per Read" /> - <param name="max_read_length" type="integer" size="10" value="1000" label="Maximum Read Length" /> - <param name="kmer" type="integer" size="10" value="-1" label="Kmer Std. Deviation Limit" help="-1 as None"/> - <param name="sw_match_value" type="integer" size="10" value="100" label="S-W Match Value" /> - <param name="sw_mismatch_value" type="integer" size="10" value="-150" label="S-W Mismatch Value" /> - <param name="sw_gap_open_ref" type="integer" size="10" value="-400" label="S-W Gap Open Penalty (Reference)" /> - <param name="sw_gap_open_query" type="integer" size="10" value="-400" label="S-W Gap Open Penalty (Query)" /> - <param name="sw_gap_ext_ref" type="integer" size="10" value="-70" label="S-W Gap Extend Penalty (Reference)" /> - <param name="sw_gap_ext_query" type="integer" size="10" value="-70" label="S-W Gap Extend Penalty (Query)" /> - <param name="sw_hit_threshold" type="float" size="10" value="68.0" label="S-W Hit Threshold" help="in percentage"/> + <param name="spaced_seed" type="text" size="30" value="111111011111" label="Spaced Seed" /> + <param name="seed_matches_per_window" type="integer" size="5" value="2" label="Seed Matches per Window" /> + <param name="seed_hit_taboo_length" type="integer" size="5" value="4" label="Seed Hit Taboo Length" /> + <param name="seed_generation_taboo_length" type="integer" size="5" value="0" label="Seed Generation Taboo Length" /> + <param name="seed_window_length" type="float" size="10" value="115.0" label="Seed Window Length" help="in percentage"/> + <param name="max_hits_per_read" type="integer" size="10" value="100" label="Maximum Hits per Read" /> + <param name="max_read_length" type="integer" size="10" value="1000" label="Maximum Read Length" /> + <param name="kmer" type="integer" size="10" value="-1" label="Kmer Std. Deviation Limit" help="-1 as None"/> + <param name="sw_match_value" type="integer" size="10" value="100" label="S-W Match Value" /> + <param name="sw_mismatch_value" type="integer" size="10" value="-150" label="S-W Mismatch Value" /> + <param name="sw_gap_open_ref" type="integer" size="10" value="-400" label="S-W Gap Open Penalty (Reference)" /> + <param name="sw_gap_open_query" type="integer" size="10" value="-400" label="S-W Gap Open Penalty (Query)" /> + <param name="sw_gap_ext_ref" type="integer" size="10" value="-70" label="S-W Gap Extend Penalty (Reference)" /> + <param name="sw_gap_ext_query" type="integer" size="10" value="-70" label="S-W Gap Extend Penalty (Query)" /> + <param name="sw_hit_threshold" type="float" size="10" value="68.0" label="S-W Hit Threshold" help="in percentage"/> </when> </conditional> </page> @@ -54,7 +55,7 @@ <data name="output2" format="tabular"/> </outputs> <requirements> - <requirement type="binary">SHRiMP_rmapper</requirement> + <requirement type="binary">rmapper-ls</requirement> </requirements> <tests> <test> @@ -64,13 +65,14 @@ <param name="input_query" value="shrimp_wrapper_test1.fastq" ftype="fastqsolexa"/> <output name="output1" file="shrimp_wrapper_test1.out1" /> </test> - <!-- + <!-- <test> - <param name="input1" value="shrimp_wrapper_test2_end1.fastq" ftype="fastqsolexa" /> - <param name="input2" value="shrimp_wrapper_test2_end2.fastq" ftype="fastqsolexa" /> <param name="single_or_paired" value="paired" /> <param name="skip_or_full" value="skip" /> <param name="input_target" value="shrimp_eca_chrMT.fa" ftype="fasta" /> + <param name="input1" value="shrimp_wrapper_test2_end1.fastq" ftype="fastqsolexa" /> + <param name="input2" value="shrimp_wrapper_test2_end2.fastq" ftype="fastqsolexa" /> + <param name="insertion_size" value="600" /> <output name="output1" file="shrimp_wrapper_test2.out1" /> </test> <test> @@ -116,6 +118,7 @@ <param name="sw_hit_threshold" value="68.0" /> <param name="input1" value="shrimp_wrapper_test2_end1.fastq" ftype="fastqsolexa"/> <param name="input2" value="shrimp_wrapper_test2_end2.fastq" ftype="fastqsolexa"/> + <param name="insertion_size" value="600" /> <output name="output1" file="shrimp_wrapper_test2.out1" /> </test> --> @@ -124,67 +127,146 @@ .. class:: warningmark -Only nucleotide sequences as query. +Please note that only **nucleotide** sequences (letter-space) can be used as query. ----- **What it does** -Run SHRiMP on letter-space reads. - +SHRiMP (SHort Read Mapping Package) is a software package for aligning genomic reads against a target genome. + +This wrapper post-processes the default SHRiMP/rmapper-ls output and generates a table with all information from reads and reference for the mapping. The tool takes single- or paired-end reads. For single-end reads, only uniquely mapped alignment is considered. In paired-end reads, only pairs that meet the following criteria will be used to generate the table: 1). the ends fall within the insertion size; 2). the ends are mapped at the opposite directions. If there are still multiple mappings after applying the criteria, this paired-end read will be discarded. + + ----- - -**Example** - -- Input a multiple-fastq file like the following:: + +**Input formats** + +A multiple-fastq file, for example:: @seq1 TACCCGATTTTTTGCTTTCCACTTTATCCTACCCTT - +seq2 + +seq1 hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh -- Use default settings (for detail explanations, please see **Parameters** section) - -- Search against your own uploaded file, result will be in the following format:: - - +-------+-------+--------+----------+----------+---------+--------+--------+-------+------------+ - | id | chrom | strand | t.start | t.end | q.start | q.end | length | score | editstring | - +-------+-------+--------+----------+----------+---------+--------+--------+-------+------------+ - | >seq1 | chrMT | + | 14712 | 14747 | 1 | 36 | 36 | 3350 | 24T11 | - +-------+-------+--------+----------+----------+---------+--------+--------+-------+------------+ - -- The result will be formatted Table:: - - +-------+---------+---------+----------+---------+----------+---------+----------+ - | chrom | ref_loc | read_id | read_loc | ref_nuc | read_nuc | quality | coverage | - +-------+---------+---------+----------+---------+----------+---------+----------+ - | chrMT | 14711 | seq1 | 0 | T | T | 40 | 1 | - | chrMT | 14712 | seq1 | 1 | A | A | 40 | 1 | - | chrMT | 14713 | seq1 | 2 | C | C | 40 | 1 | - +-------+---------+---------+----------+---------+----------+---------+----------+ ----- -**Parameters** +**Outputs** -Parameter list with default value settings:: +The tool gives two outputs. + +**Table output** + +Table output contains 8 columns:: + + 1 2 3 4 5 6 7 8 + ---------------------------------------------------- + chrM 14711 seq1 0 T A 40 1 + chrM 14712 seq1 1 T T 40 1 + +where:: + + 1. (chrM) - Reference sequence id + 2. (14711) - Position of the mapping in the reference + 3. (seq1) - Read id + 4. (0) - Position of the mapping in the read + 5. (T) - Nucleotide in the reference + 6. (A) - Nucleotide in the read + 7. (40) - Quality score for the nucleotide in the position of the read + 8. (1) - The number of times this position is covered by reads + + +**SHRiMP output** + +This is the default output from SHRiMP/rmapper-ls:: + + 1 2 3 4 5 6 7 8 9 10 + ------------------------------------------------------------------- + seq1 chrM + 3644 3679 1 36 36 3600 36 + +where:: + + 1. (seq1) - Read id + 2. (chrM) - Reference sequence id + 3. (+) - Strand of the read + 4. (3466) - Start position of the alignment in the reference + 5. (3679) - End position of the alignment in the reference + 6. (1) - Start position of the alignment in the read + 7. (36) - End position of the alignment in the read + 8. (36) - Length of the read + 9. (3600) - Score + 10. (36) - Edit string + + +----- + +**SHRiMP parameter list** + +The commonly used parameters with default value setting:: -s Spaced Seed (default: 111111011111) + The spaced seed is a single contiguous string of 0's and 1's. + 0's represent wildcards, or positions which will always be + considered as matching, whereas 1's dictate positions that + must match. A string of all 1's will result in a simple kmer scan. -n Seed Matches per Window (default: 2) + The number of seed matches per window dictates how many seeds + must match within some window length of the genome before that + region is considered for Smith-Waterman alignment. A lower + value will increase sensitivity while drastically increasing + running time. Higher values will have the opposite effect. -t Seed Hit Taboo Length (default: 4) + The seed taboo length specifies how many target genome bases + or colours must exist prior to a previous seed match in order + to count another seed match as a hit. -9 Seed Generation Taboo Length (default: 0) + -w Seed Window Length (default: 115.00%) + This parameter specifies the genomic span in bases (or colours) + in which *seed_matches_per_window* must exist before the read + is given consideration by the Simth-Waterman alignment machinery. -o Maximum Hits per Read (default: 100) + This parameter specifies how many hits to remember for each read. + If more hits are encountered, ones with lower scores are dropped + to make room. -r Maximum Read Length (default: 1000) + This parameter specifies the maximum length of reads that will + be encountered in the dataset. If larger reads than the default + are used, an appropriate value must be passed to *rmapper*. -d Kmer Std. Deviation Limit (default: -1 [None]) + This option permits pruning read kmers, which occur with + frequencies greater than *kmer_std_dev_limit* standard + deviations above the average. This can shorten running + time at the cost of some sensitivity. + *Note*: A negative value disables this option. + -m S-W Match Value (default: 100) + The value applied to matches during the Smith-Waterman score calculation. + -i S-W Mismatch Value (default: -150) + The value applied to mismatches during the Smith-Waterman + score calculation. + -g S-W Gap Open Penalty (Reference) (default: -400) + The value applied to gap opens along the reference sequence + during the Smith-Waterman score calculation. + *Note*: Note that for backward compatibility, if -g is set + and -q is not set, the gap open penalty for the query will + be set to the same value as specified for the reference. + -q S-W Gap Open Penalty (Query) (default: -400) + The value applied to gap opens along the query sequence during + the Smith-Waterman score calculation. + -e S-W Gap Extend Penalty (Reference) (default: -70) + The value applied to gap extends during the Smith-Waterman score calculation. + *Note*: Note that for backward compatibility, if -e is set + and -f is not set, the gap exten penalty for the query will + be set to the same value as specified for the reference. + -f S-W Gap Extend Penalty (Query) (default: -70) + The value applied to gap extends during the Smith-Waterman score calculation. + -h S-W Hit Threshold (default: 68.00%) + In letter-space, this parameter determines the threshold + score for both vectored and full Smith-Waterman alignments. + Any values less than this quanitity will be thrown away. + *Note* This option differs slightly in meaning between letter-space and colour-space. - -m S-W Match Value (default: 100) - -i S-W Mismatch Value (default: -150) - -g S-W Gap Open Penalty (Reference) (default: -400) - -q S-W Gap Open Penalty (Query) (default: -400) - -e S-W Gap Extend Penalty (Reference) (default: -70) - -f S-W Gap Extend Penalty (Query) (default: -70) - -h S-W Hit Threshold (default: 68.00%) -----