[hg] galaxy 1517: update shrimp_wrapper.

22 Sep 2008

details:   http://www.bx.psu.edu/hg/galaxy/rev/1d326855ba89
changeset: 1517:1d326855ba89
user:      wychung
date:      Thu Sep 18 15:41:23 2008 -0400
description:
update shrimp_wrapper.

2 file(s) affected in this change:

tools/metag_tools/shrimp_wrapper.py
tools/metag_tools/shrimp_wrapper.xml

diffs (621 lines):

diff -r f1da9b95549b -r 1d326855ba89 tools/metag_tools/shrimp_wrapper.py

--- a/tools/metag_tools/shrimp_wrapper.py	Thu Sep 18 15:24:51 2008 -0400
+++ b/tools/metag_tools/shrimp_wrapper.py	Thu Sep 18 15:41:23 2008 -0400
@@ -61,17 +61,13 @@
     reversed_s.reverse()
     return "".join(reversed_s)
 
-def generate_sub_table(result_file, ref_file, score_files, table_outfile, hit_per_read):
+def generate_sub_table(result_file, ref_file, score_files, table_outfile, hit_per_read, insertion_size):
     
-    """
-    TODO: the cross-over error has not been addressed yet.
-    """
+    invalid_editstring_char = 0
     
-    insertion_size = 600
+    all_score_file = score_files.split(',')
     
-    all_score_file = score_files.split('&')
-    
-    if len(all_score_file) != hit_per_read: stop_err('Un-equal number of files!')
+    if len(all_score_file) != hit_per_read: stop_err('One or more query files is missing. Please check your dataset.')
         
     temp_table_name = tempfile.NamedTemporaryFile().name
     temp_table = open(temp_table_name, 'w')
@@ -178,7 +174,7 @@
                         hits_score[readname] = {}
                         hits_score[readname][endindex] = score
     
-    # mutation call to all mappings            
+    # call to all mappings            
     for readkey in hits.keys():
         if len(hits[readkey]) != hit_per_read: continue
 
@@ -211,6 +207,7 @@
                             match_count += 1
 
         if match_count == 1:
+            
             for x, end_data in enumerate(matches[0]):
                 
                 end_strand, end_editstring, end_chr_start, end_chr_end, end_read_start, end_chrom = end_data
@@ -226,20 +223,26 @@
                 gap_read = 0
                 
                 while editindex < len(end_editstring):
+                    
                     editchr = end_editstring[editindex]
                     chrA = ''
                     chrB = ''
                     locIndex = []
+                    
                     if editchr.isdigit():
                         editcode = ''
+                        
                         while editchr.isdigit() and editindex < len(end_editstring):
                             editcode += editchr
                             editindex += 1
                             if editindex < len(end_editstring): editchr = end_editstring[editindex]
+                        
                         for baseIndex in range(int(editcode)):
                             chrA += refsegment[match_len+baseIndex]
                             chrB = chrA
+                        
                         match_len += int(editcode)
+                        
                     elif editchr == 'x':
                         # crossover: inserted between the appropriate two bases
                         # Two sequencing errors: 4x15x6 (25 matches with 2 crossovers)
@@ -263,18 +266,21 @@
                         
                     elif editchr == '(':
                         editcode = ''
+                        
                         while editchr != ')' and editindex < len(end_editstring):
                             if editindex < len(end_editstring): editchr = end_editstring[editindex]
                             editcode += editchr
                             editindex += 1
+                        
                         editcode = editcode[1:-1]
                         chrA = '-'*len(editcode)
                         chrB = editcode
                         
                     else:
-                        print 'Warning! Unknown symbols', editchr
-                    
+                        invalid_editstring_char += 1
+                        
                     if end_strand == '-':
+                        
                         chrA = reverse_complement(chrA)
                         chrB = reverse_complement(chrB)
                         
@@ -288,9 +294,12 @@
                         chrBx = chrB[mappingIndex]
 
                         if chrAx and chrBx and chrBx.upper() != 'N':
+                            
                             if end_strand == '+':
+                            
                                 chrom_loc = end_chr_start+match_len-len(chrA)+mappingIndex         
                                 read_loc = end_read_start+match_len-len(chrA)+mappingIndex-gap_read
+                                
                                 if chrAx == '-': chrom_loc -= 1
                                 
                                 if chrBx == '-': 
@@ -300,9 +309,12 @@
                                 
                                 # 1-based on chrom_loc and read_loc
                                 pos_line = pos_line + '\t'.join([end_chrom, str(chrom_loc+1), readkey+'/'+str(x+1), str(read_loc+1), chrAx, chrBx, scoreBx]) + '\n'
+                                
                             else:
+                                
                                 chrom_loc = end_chr_end-match_len+mappingIndex
                                 read_loc = end_read_start+match_len-1-mappingIndex-gap_read
+                                
                                 if chrAx == '-': chrom_loc -= 1
                                 
                                 if chrBx == '-': 
@@ -314,11 +326,14 @@
                                 rev_line = '\t'.join([end_chrom, str(chrom_loc+1), readkey+'/'+str(x+1), str(read_loc+1), chrAx, chrBx, scoreBx]) +'\n' + rev_line
 
                             if chrom_cov.has_key(end_chrom):
+                                
                                 if chrom_cov[end_chrom].has_key(chrom_loc):
                                     chrom_cov[end_chrom][chrom_loc] += 1
                                 else:
                                     chrom_cov[end_chrom][chrom_loc] = 1
+                                    
                             else:
+                                
                                 chrom_cov[end_chrom] = {}
                                 chrom_cov[end_chrom][chrom_loc] = 1
                     
@@ -329,6 +344,7 @@
 
     # chrom-wide coverage
     for i, line in enumerate(open(temp_table_name)):
+        
         line = line.rstrip()
         if not line or line.startswith('#'): continue
         
@@ -348,6 +364,9 @@
     outfile.close()
     
     if os.path.exists(temp_table_name): os.remove(temp_table_name)
+    
+    if invalid_editstring_char:
+        print 'Skip ', invalid_editstring_char, ' invalid characters in editstrings'
         
     return True 
    
@@ -359,7 +378,7 @@
     seq_title_startswith = ''
     qual_title_startswith = ''
     
-    default_coding_value = 64
+    default_coding_value = 64       # Solexa ascii-code
     fastq_block_lines = 0
     
     for i, line in enumerate( file( infile_name ) ):
@@ -448,16 +467,63 @@
 
 def __main__():
     
+    # SHRiMP path
+    shrimp = 'rmapper-ls'
+    
     # I/O
-    type_of_reads = sys.argv[1]     # single or paired
-    input_target = sys.argv[2]      # fasta
-    shrimp_outfile = sys.argv[3]    # shrimp output
-    table_outfile = sys.argv[4]     # table output
-     
-    # SHRiMP parameters: total = 15
-    # TODO: put threshold on each of these parameters
-    if len(sys.argv) == 21 or len(sys.argv) == 22:
-        spaced_seed = sys.argv[5]
+    input_target_file = sys.argv[1]                  # fasta
+    shrimp_outfile    = sys.argv[2]                # shrimp output
+    table_outfile     = sys.argv[3]                 # table output
+    single_or_paired  = sys.argv[4].split(',')       
+    
+    insertion_size = 600
+    
+    if len(single_or_paired) == 1:                  # single or paired
+        type_of_reads = 'single'
+        hit_per_read  = 1
+        input_query   = single_or_paired[0]
+        query_fasta   = tempfile.NamedTemporaryFile().name
+        query_qual    = tempfile.NamedTemporaryFile().name
+
+    else:                                           # paired-end
+        type_of_reads    = 'paired'
+        hit_per_read     = 2
+        input_query_end1 = single_or_paired[0]  
+        input_query_end2 = single_or_paired[1]
+        insertion_size = int(single_or_paired[2])
+        query_fasta_end1 = tempfile.NamedTemporaryFile().name
+        query_fasta_end2 = tempfile.NamedTemporaryFile().name
+        query_qual_end1  = tempfile.NamedTemporaryFile().name
+        query_qual_end2  = tempfile.NamedTemporaryFile().name
+        
+    # SHRiMP parameters: total = 15, default values
+    spaced_seed = '111111011111'
+    seed_matches_per_window = '2'
+    seed_hit_taboo_length = '4'
+    seed_generation_taboo_length = '0'
+    seed_window_length = '115.0'
+    max_hits_per_read = '100'
+    max_read_length = '1000'
+    kmer = '-1'
+    sw_match_value = '100'
+    sw_mismatch_value = '-150'
+    sw_gap_open_ref = '-400'
+    sw_gap_open_query = '-400'
+    sw_gap_ext_ref = '-70'
+    sw_gap_ext_query = '-70'
+    sw_hit_threshold = '68.0'
+
+    # TODO: put the threshold on each of these parameters
+    if len(sys.argv) > 5:
+        
+        try:
+            if sys.argv[5].isdigit():
+                spaced_seed = sys.argv[5]
+            else:
+                stop_err('Error in assigning parameter: Spaced seed.')
+        except:
+            stop_err('Spaced seed must be a combination of 1s and 0s.')
+        
         seed_matches_per_window = sys.argv[6]
         seed_hit_taboo_length = sys.argv[7]
         seed_generation_taboo_length = sys.argv[8]
@@ -473,53 +539,6 @@
         sw_gap_ext_query = sys.argv[18]
         sw_hit_threshold = sys.argv[19]
     
-        # Single-end parameters
-        if type_of_reads == 'single':
-            input_query = sys.argv[20]   # single-end
-            hit_per_read = 1
-            query_fasta = tempfile.NamedTemporaryFile().name
-            query_qual = tempfile.NamedTemporaryFile().name
-        else: # Paired-end parameters
-            input_query_end1 = sys.argv[20]  # paired-end
-            input_query_end2 = sys.argv[21]
-            hit_per_read = 2
-            query_fasta_end1 = tempfile.NamedTemporaryFile().name
-            query_fasta_end2 = tempfile.NamedTemporaryFile().name
-            query_qual_end1 = tempfile.NamedTemporaryFile().name
-            query_qual_end2 = tempfile.NamedTemporaryFile().name
-    else:
-        spaced_seed = '111111011111'
-        seed_matches_per_window = '2'
-        seed_hit_taboo_length = '4'
-        seed_generation_taboo_length = '0'
-        seed_window_length = '115.0'
-        max_hits_per_read = '100'
-        max_read_length = '1000'
-        kmer = '-1'
-        sw_match_value = '100'
-        sw_mismatch_value = '-150'
-        sw_gap_open_ref = '-400'
-        sw_gap_open_query = '-400'
-        sw_gap_ext_ref = '-70'
-        sw_gap_ext_query = '-70'
-        sw_hit_threshold = '68.0'
-
-        # Single-end parameters
-        if type_of_reads == 'single':
-            input_query = sys.argv[5]   # single-end
-            hit_per_read = 1
-            query_fasta = tempfile.NamedTemporaryFile().name
-            query_qual = tempfile.NamedTemporaryFile().name
-        else: # Paired-end parameters
-            input_query_end1 = sys.argv[5]  # paired-end
-            input_query_end2 = sys.argv[6]
-            hit_per_read = 2
-            query_fasta_end1 = tempfile.NamedTemporaryFile().name
-            query_fasta_end2 = tempfile.NamedTemporaryFile().name
-            query_qual_end1 = tempfile.NamedTemporaryFile().name
-            query_qual_end2 = tempfile.NamedTemporaryFile().name
-
-            
     # temp file for shrimp log file
     shrimp_log = tempfile.NamedTemporaryFile().name
     
@@ -532,7 +551,7 @@
         
     # SHRiMP command
     if type_of_reads == 'single':
-        command = ' '.join(['rmapper-ls',  '-s', spaced_seed, '-n', seed_matches_per_window, '-t', seed_hit_taboo_length, '-9', seed_generation_taboo_length, '-w', seed_window_length, '-o', max_hits_per_read, '-r', max_read_length, '-d', kmer, '-m', sw_match_value, '-i', sw_mismatch_value, '-g', sw_gap_open_ref, '-q', sw_gap_open_query, '-e', sw_gap_ext_ref, '-f', sw_gap_ext_query, '-h', sw_hit_threshold, query_fasta, input_target, '>', shrimp_outfile, '2>', shrimp_log])
+        command = ' '.join([shrimp,  '-s', spaced_seed, '-n', seed_matches_per_window, '-t', seed_hit_taboo_length, '-9', seed_generation_taboo_length, '-w', seed_window_length, '-o', max_hits_per_read, '-r', max_read_length, '-d', kmer, '-m', sw_match_value, '-i', sw_mismatch_value, '-g', sw_gap_open_ref, '-q', sw_gap_open_query, '-e', sw_gap_ext_ref, '-f', sw_gap_ext_query, '-h', sw_hit_threshold, query_fasta, input_target_file, '>', shrimp_outfile, '2>', shrimp_log])
     
         try:
             os.system(command)
@@ -541,9 +560,9 @@
             if os.path.exists(query_qual): os.remove(query_qual)
             stop_err(str(e))
             
-    else:
-        command_end1 = ' '.join(['rmapper-ls', '-s', spaced_seed, '-n', seed_matches_per_window, '-t', seed_hit_taboo_length, '-9', seed_generation_taboo_length, '-w', seed_window_length, '-o', max_hits_per_read, '-r', max_read_length, '-d', kmer, '-m', sw_match_value, '-i', sw_mismatch_value, '-g', sw_gap_open_ref, '-q', sw_gap_open_query, '-e', sw_gap_ext_ref, '-f', sw_gap_ext_query, '-h', sw_hit_threshold, query_fasta_end1, input_target, '>', shrimp_outfile, '2>', shrimp_log])
-        command_end2 = ' '.join(['rmapper-ls', '-s', spaced_seed, '-n', seed_matches_per_window, '-t', seed_hit_taboo_length, '-9', seed_generation_taboo_length, '-w', seed_window_length, '-o', max_hits_per_read, '-r', max_read_length, '-d', kmer, '-m', sw_match_value, '-i', sw_mismatch_value, '-g', sw_gap_open_ref, '-q', sw_gap_open_query, '-e', sw_gap_ext_ref, '-f', sw_gap_ext_query, '-h', sw_hit_threshold, query_fasta_end2, input_target, '>>', shrimp_outfile, '2>>', shrimp_log])
+    else: # paired
+        command_end1 = ' '.join([shrimp, '-s', spaced_seed, '-n', seed_matches_per_window, '-t', seed_hit_taboo_length, '-9', seed_generation_taboo_length, '-w', seed_window_length, '-o', max_hits_per_read, '-r', max_read_length, '-d', kmer, '-m', sw_match_value, '-i', sw_mismatch_value, '-g', sw_gap_open_ref, '-q', sw_gap_open_query, '-e', sw_gap_ext_ref, '-f', sw_gap_ext_query, '-h', sw_hit_threshold, query_fasta_end1, input_target_file, '>', shrimp_outfile, '2>', shrimp_log])
+        command_end2 = ' '.join([shrimp, '-s', spaced_seed, '-n', seed_matches_per_window, '-t', seed_hit_taboo_length, '-9', seed_generation_taboo_length, '-w', seed_window_length, '-o', max_hits_per_read, '-r', max_read_length, '-d', kmer, '-m', sw_match_value, '-i', sw_mismatch_value, '-g', sw_gap_open_ref, '-q', sw_gap_open_query, '-e', sw_gap_ext_ref, '-f', sw_gap_ext_query, '-h', sw_hit_threshold, query_fasta_end2, input_target_file, '>>', shrimp_outfile, '2>>', shrimp_log])
         
         try:
             os.system(command_end1)
@@ -557,9 +576,9 @@
         
     # convert to table
     if type_of_reads == 'single':
-        return_value = generate_sub_table(shrimp_outfile, input_target, query_qual, table_outfile, hit_per_read)
+        return_value = generate_sub_table(shrimp_outfile, input_target_file, query_qual, table_outfile, hit_per_read, insertion_size)
     else:
-        return_value = generate_sub_table(shrimp_outfile, input_target, query_qual_end1+'&'+query_qual_end2, table_outfile, hit_per_read)
+        return_value = generate_sub_table(shrimp_outfile, input_target_file, query_qual_end1+','+query_qual_end2, table_outfile, hit_per_read, insertion_size)
         
     # remove temp. files
     if type_of_reads == 'single':
diff -r f1da9b95549b -r 1d326855ba89 tools/metag_tools/shrimp_wrapper.xml
--- a/tools/metag_tools/shrimp_wrapper.xml	Thu Sep 18 15:24:51 2008 -0400
+++ b/tools/metag_tools/shrimp_wrapper.xml	Thu Sep 18 15:41:23 2008 -0400
@@ -1,50 +1,51 @@
 <tool id="shrimp_wrapper" name="SHRiMP" version="1.0.0">
   <description>SHort Read Mapping Package</description>
   <command interpreter="python">
-    #if     ($type_of_reads.single_or_paired=="single" and $param.skip_or_full=="skip"):#shrimp_wrapper.py $type_of_reads.single_or_paired $input_target $output1 $output2 $input_query
-    #elif   ($type_of_reads.single_or_paired=="paired" and $param.skip_or_full=="skip"):#shrimp_wrapper.py $type_of_reads.single_or_paired $input_target $output1 $output2 ${type_of_reads.input1} ${type_of_reads.input2} 
-    #elif   ($type_of_reads.single_or_paired=="single" and $param.skip_or_full=="full"):#shrimp_wrapper.py $type_of_reads.single_or_paired $input_target $output1 $output2 $param.spaced_seed $param.seed_matches_per_window $param.seed_hit_taboo_length $param.seed_generation_taboo_length $param.seed_window_length $param.max_hits_per_read $param.max_read_length $param.kmer $param.sw_match_value $param.sw_mismatch_value $param.sw_gap_open_ref $param.sw_gap_open_query $param.sw_gap_ext_ref $param.sw_gap_ext_query $param.sw_hit_threshold $input_query 
-    #elif   ($type_of_reads.single_or_paired=="paired" and $param.skip_or_full=="full"):#shrimp_wrapper.py $type_of_reads.single_or_paired $input_target $output1 $output2 $param.spaced_seed $param.seed_matches_per_window $param.seed_hit_taboo_length $param.seed_generation_taboo_length $param.seed_window_length $param.max_hits_per_read $param.max_read_length $param.kmer $param.sw_match_value $param.sw_mismatch_value $param.sw_gap_open_ref $param.sw_gap_open_query $param.sw_gap_ext_ref $param.sw_gap_ext_query $param.sw_hit_threshold ${type_of_reads.input1} ${type_of_reads.input2}
+    #if     ($type_of_reads.single_or_paired=="single" and $param.skip_or_full=="skip"):#shrimp_wrapper.py $input_target $output1 $output2 $input_query
+    #elif   ($type_of_reads.single_or_paired=="paired" and $param.skip_or_full=="skip"):#shrimp_wrapper.py $input_target $output1 $output2 $type_of_reads.input1,$type_of_reads.input2,$type_of_reads.insertion_size
+    #elif   ($type_of_reads.single_or_paired=="single" and $param.skip_or_full=="full"):#shrimp_wrapper.py $input_target $output1 $output2 $input_query                                                              $param.spaced_seed $param.seed_matches_per_window $param.seed_hit_taboo_length $param.seed_generation_taboo_length $param.seed_window_length $param.max_hits_per_read $param.max_read_length $param.kmer $param.sw_match_value $param.sw_mismatch_value $param.sw_gap_open_ref $param.sw_gap_open_query $param.sw_gap_ext_ref $param.sw_gap_ext_query $param.sw_hit_threshold 
+    #elif   ($type_of_reads.single_or_paired=="paired" and $param.skip_or_full=="full"):#shrimp_wrapper.py $input_target $output1 $output2 $type_of_reads.input1,$type_of_reads.input2,$type_of_reads.insertion_size $param.spaced_seed $param.seed_matches_per_window $param.seed_hit_taboo_length $param.seed_generation_taboo_length $param.seed_window_length $param.max_hits_per_read $param.max_read_length $param.kmer $param.sw_match_value $param.sw_mismatch_value $param.sw_gap_open_ref $param.sw_gap_open_query $param.sw_gap_ext_ref $param.sw_gap_ext_query $param.sw_hit_threshold
     #end if
   </command>
     <inputs>
         <page>
-        <param name="input_target" type="data" format="fasta" label="Reference sequence" />
         <conditional name="type_of_reads">
             <param name="single_or_paired" type="select" label="Single- or Paired-ends">
                 <option value="single">Single-end</option>
                 <option value="paired">Paired-end</option>
             </param>
             <when value="single">
-                <param name="input_query" type="data" format="fastqsolexa" label="Sequence file" />
+                <param name="input_query" type="data" format="fastqsolexa" label="Align sequencing reads" />
             </when>
             <when value="paired">
-                <param name="input1" type="data" format="fastqsolexa" label="One end" />
-                <param name="input2" type="data" format="fastqsolexa" label="The other end" />
+                <param name="insertion_size" type="integer" size="5" value="600" label="Insertion length between two ends" help="bp" />
+                <param name="input1" type="data" format="fastqsolexa" label="Align sequencing reads, one end" />
+                <param name="input2" type="data" format="fastqsolexa" label="and the other end" />
             </when> 
         </conditional>
+        <param name="input_target" type="data" format="fasta" label="against reference" />
         <conditional name="param">
-            <param name="skip_or_full" type="select" label="SHRiMP parameter selection">
-                <option value="skip">Default setting</option>
-                <option value="full">Full list</option>
+            <param name="skip_or_full" type="select" label="SHRiMP settings to use" help="For most mapping needs use Commonly used settings. If you want full control use Full List">
+                <option value="skip">Commonly used</option>
+                <option value="full">Full Parameter List</option>
             </param>
             <when value="skip" />
             <when value="full">
-                <param name="spaced_seed" type="text" size="30" value="111111011111" label="Spaced Seed" />
-                <param name="seed_matches_per_window" type="integer" size="5" value="2" label="Seed Matches per Window" />
-                <param name="seed_hit_taboo_length" type="integer" size="5" value="4" label="Seed Hit Taboo Length" />
-                <param name="seed_generation_taboo_length" type="integer" size="5" value="0" label="Seed Generation Taboo Length" />
-                <param name="seed_window_length" type="float" size="10" value="115.0" label="Seed Window Length" help="in percentage"/>
-                <param name="max_hits_per_read" type="integer" size="10" value="100" label="Maximum Hits per Read" />
-                <param name="max_read_length" type="integer" size="10" value="1000" label="Maximum Read Length" />
-                <param name="kmer" type="integer" size="10" value="-1" label="Kmer Std. Deviation Limit" help="-1 as None"/>
-                <param name="sw_match_value" type="integer" size="10" value="100" label="S-W Match Value" />
-                <param name="sw_mismatch_value" type="integer" size="10" value="-150" label="S-W Mismatch Value" />
-                <param name="sw_gap_open_ref" type="integer" size="10" value="-400" label="S-W Gap Open Penalty (Reference)" />
-                <param name="sw_gap_open_query" type="integer" size="10" value="-400" label="S-W Gap Open Penalty (Query)" />
-                <param name="sw_gap_ext_ref" type="integer" size="10" value="-70" label="S-W Gap Extend Penalty (Reference)" />
-                <param name="sw_gap_ext_query" type="integer" size="10" value="-70" label="S-W Gap Extend Penalty (Query)" />
-                <param name="sw_hit_threshold" type="float" size="10" value="68.0" label="S-W Hit Threshold" help="in percentage"/>
+                <param name="spaced_seed"                   type="text"     size="30"   value="111111011111"    label="Spaced Seed" />
+                <param name="seed_matches_per_window"       type="integer"  size="5"    value="2"               label="Seed Matches per Window" />
+                <param name="seed_hit_taboo_length"         type="integer"  size="5"    value="4"               label="Seed Hit Taboo Length" />
+                <param name="seed_generation_taboo_length"  type="integer"  size="5"    value="0"               label="Seed Generation Taboo Length" />
+                <param name="seed_window_length"            type="float"    size="10"   value="115.0"           label="Seed Window Length"          help="in percentage"/>
+                <param name="max_hits_per_read"             type="integer"  size="10"   value="100"             label="Maximum Hits per Read" />
+                <param name="max_read_length"               type="integer"  size="10"   value="1000"            label="Maximum Read Length" />
+                <param name="kmer"                          type="integer"  size="10"   value="-1"              label="Kmer Std. Deviation Limit"   help="-1 as None"/>
+                <param name="sw_match_value"                type="integer"  size="10"   value="100"             label="S-W Match Value" />
+                <param name="sw_mismatch_value"             type="integer"  size="10"   value="-150"            label="S-W Mismatch Value" />
+                <param name="sw_gap_open_ref"               type="integer"  size="10"   value="-400"            label="S-W Gap Open Penalty (Reference)" />
+                <param name="sw_gap_open_query"             type="integer"  size="10"   value="-400"            label="S-W Gap Open Penalty (Query)" />
+                <param name="sw_gap_ext_ref"                type="integer"  size="10"   value="-70"             label="S-W Gap Extend Penalty (Reference)" />
+                <param name="sw_gap_ext_query"              type="integer"  size="10"   value="-70"             label="S-W Gap Extend Penalty (Query)" />
+                <param name="sw_hit_threshold"              type="float"    size="10"   value="68.0"            label="S-W Hit Threshold"           help="in percentage"/>
             </when>
         </conditional>
         </page>
@@ -54,7 +55,7 @@
         <data name="output2" format="tabular"/>
     </outputs>
     <requirements>
-      <requirement type="binary">SHRiMP_rmapper</requirement>
+      <requirement type="binary">rmapper-ls</requirement>
     </requirements>
     <tests>
         <test>
@@ -64,13 +65,14 @@
             <param name="input_query" value="shrimp_wrapper_test1.fastq" ftype="fastqsolexa"/>
             <output name="output1" file="shrimp_wrapper_test1.out1" />
         </test>
-        <!-- 
+        <!--  
         <test>
-            <param name="input1" value="shrimp_wrapper_test2_end1.fastq" ftype="fastqsolexa" />
-            <param name="input2" value="shrimp_wrapper_test2_end2.fastq" ftype="fastqsolexa" />
             <param name="single_or_paired" value="paired" />
             <param name="skip_or_full" value="skip" />
             <param name="input_target" value="shrimp_eca_chrMT.fa" ftype="fasta" />
+            <param name="input1" value="shrimp_wrapper_test2_end1.fastq" ftype="fastqsolexa" />
+            <param name="input2" value="shrimp_wrapper_test2_end2.fastq" ftype="fastqsolexa" />
+            <param name="insertion_size" value="600" />
             <output name="output1" file="shrimp_wrapper_test2.out1" />
         </test>
         <test>
@@ -116,6 +118,7 @@
             <param name="sw_hit_threshold" value="68.0" />
             <param name="input1" value="shrimp_wrapper_test2_end1.fastq" ftype="fastqsolexa"/>
             <param name="input2" value="shrimp_wrapper_test2_end2.fastq" ftype="fastqsolexa"/>
+            <param name="insertion_size" value="600" />
             <output name="output1" file="shrimp_wrapper_test2.out1" />
         </test>
         -->
@@ -124,67 +127,146 @@
     
 .. class:: warningmark 
 
-Only nucleotide sequences as query.
+Please note that only **nucleotide** sequences (letter-space) can be used as query.   
 
 -----
     
 **What it does**
  
-Run SHRiMP on letter-space reads. 
- 
+SHRiMP (SHort Read Mapping Package) is a software package for aligning genomic reads against a target genome. 
+
+This wrapper post-processes the default SHRiMP/rmapper-ls output and generates a table with all information from reads and reference for the mapping. The tool takes single- or paired-end reads. For single-end reads, only uniquely mapped alignment is considered. In paired-end reads, only pairs that meet the following criteria will be used to generate the table: 1). the ends fall within the insertion size; 2). the ends are mapped at the opposite directions. If there are still multiple mappings after applying the criteria, this paired-end read will be discarded. 
+  
+
 -----
- 
-**Example**
- 
-- Input a multiple-fastq file like the following::
+
+**Input formats**
+
+A multiple-fastq file, for example::
 
     @seq1
     TACCCGATTTTTTGCTTTCCACTTTATCCTACCCTT
-    +seq2
+    +seq1
     hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh
 
-- Use default settings (for detail explanations, please see **Parameters** section)
-
-- Search against your own uploaded file, result will be in the following format::
-
-    +-------+-------+--------+----------+----------+---------+--------+--------+-------+------------+
-    | id    | chrom | strand | t.start  | t.end    | q.start | q.end  | length | score | editstring |
-    +-------+-------+--------+----------+----------+---------+--------+--------+-------+------------+
-    | >seq1 | chrMT | +      | 14712    | 14747    | 1       | 36     | 36     | 3350  | 24T11      |
-    +-------+-------+--------+----------+----------+---------+--------+--------+-------+------------+
-    
-- The result will be formatted Table::
-
-    +-------+---------+---------+----------+---------+----------+---------+----------+
-    | chrom | ref_loc | read_id | read_loc | ref_nuc | read_nuc | quality | coverage |
-    +-------+---------+---------+----------+---------+----------+---------+----------+ 
-    | chrMT | 14711   | seq1    | 0        | T       | T        | 40      | 1        |
-    | chrMT | 14712   | seq1    | 1        | A       | A        | 40      | 1        |
-    | chrMT | 14713   | seq1    | 2        | C       | C        | 40      | 1        |
-    +-------+---------+---------+----------+---------+----------+---------+----------+
 
 -----
 
-**Parameters**
+**Outputs**
 
-Parameter list with default value settings::
+The tool gives two outputs.
+
+**Table output**
+
+Table output contains 8 columns::
+
+     1     2        3       4     5     6     7     8 
+  ----------------------------------------------------
+  chrM   14711     seq1     0     T     A    40     1 
+  chrM   14712     seq1     1     T     T    40     1 
+
+where::
+    
+  1. (chrM)   - Reference sequence id
+  2. (14711)  - Position of the mapping in the reference
+  3. (seq1)   - Read id
+  4. (0)      - Position of the mapping in the read
+  5. (T)      - Nucleotide in the reference
+  6. (A)      - Nucleotide in the read
+  7. (40)     - Quality score for the nucleotide in the position of the read
+  8. (1)      - The number of times this position is covered by reads
+
+     
+**SHRiMP output**
+
+This is the default output from SHRiMP/rmapper-ls::
+ 
+     1     2     3       4      5      6     7     8      9      10
+  -------------------------------------------------------------------
+   seq1  chrM    +     3644    3679    1    36     36    3600    36  
+
+where::
+
+  1. (seq1)   - Read id 
+  2. (chrM)   - Reference sequence id
+  3. (+)      - Strand of the read
+  4. (3466)   - Start position of the alignment in the reference
+  5. (3679)   - End position of the alignment in the reference
+  6. (1)      - Start position of the alignment in the read
+  7. (36)     - End position of the alignment in the read
+  8. (36)     - Length of the read
+  9. (3600)   - Score 
+ 10. (36)     - Edit string
+
+ 
+-----
+
+**SHRiMP parameter list**
+
+The commonly used parameters with default value setting::
 
     -s    Spaced Seed                             (default: 111111011111)
+          The spaced seed is a single contiguous string of 0's and 1's. 
+          0's represent wildcards, or positions which will always be 
+          considered as matching, whereas 1's dictate positions that 
+          must match. A string of all 1's will result in a simple kmer scan.
     -n    Seed Matches per Window                 (default: 2)
+          The number of seed matches per window dictates how many seeds 
+          must match within some window length of the genome before that 
+          region is considered for Smith-Waterman alignment. A lower 
+          value will increase sensitivity while drastically increasing 
+          running time. Higher values will have the opposite effect.
     -t    Seed Hit Taboo Length                   (default: 4)
+          The seed taboo length specifies how many target genome bases 
+          or colours must exist prior to a previous seed match in order 
+          to count another seed match as a hit.
     -9    Seed Generation Taboo Length            (default: 0)
+          
     -w    Seed Window Length                      (default: 115.00%)
+          This parameter specifies the genomic span in bases (or colours) 
+          in which *seed_matches_per_window* must exist before the read 
+          is given consideration by the Simth-Waterman alignment machinery.
     -o    Maximum Hits per Read                   (default: 100)
+          This parameter specifies how many hits to remember for each read. 
+          If more hits are encountered, ones with lower scores are dropped 
+          to make room.
     -r    Maximum Read Length                     (default: 1000)
+          This parameter specifies the maximum length of reads that will 
+          be encountered in the dataset. If larger reads than the default 
+          are used, an appropriate value must be passed to *rmapper*.
     -d    Kmer Std. Deviation Limit               (default: -1 [None])
+          This option permits pruning read kmers, which occur with 
+          frequencies greater than *kmer_std_dev_limit* standard 
+          deviations above the average. This can shorten running 
+          time at the cost of some sensitivity. 
+          *Note*: A negative value disables this option.            
+    -m    S-W Match Value                         (default: 100)
+          The value applied to matches during the Smith-Waterman score calculation.
+    -i    S-W Mismatch Value                      (default: -150)
+          The value applied to mismatches during the Smith-Waterman 
+          score calculation.
+    -g    S-W Gap Open Penalty (Reference)        (default: -400)
+          The value applied to gap opens along the reference sequence 
+          during the Smith-Waterman score calculation.
+          *Note*: Note that for backward compatibility, if -g is set 
+          and -q is not set, the gap open penalty for the query will 
+          be set to the same value as specified for the reference.
+    -q    S-W Gap Open Penalty (Query)            (default: -400)
+          The value applied to gap opens along the query sequence during 
+          the Smith-Waterman score calculation.        
+    -e    S-W Gap Extend Penalty (Reference)      (default: -70)
+          The value applied to gap extends during the Smith-Waterman score calculation.
+          *Note*: Note that for backward compatibility, if -e is set 
+          and -f is not set, the gap exten penalty for the query will 
+          be set to the same value as specified for the reference. 
+    -f    S-W Gap Extend Penalty (Query)          (default: -70)
+          The value applied to gap extends during the Smith-Waterman score calculation.
+    -h    S-W Hit Threshold                       (default: 68.00%)
+          In letter-space, this parameter determines the threshold 
+          score for both vectored and full Smith-Waterman alignments. 
+          Any values less than this quanitity will be thrown away.
+          *Note* This option differs slightly in meaning between letter-space and colour-space.
 
-    -m    S-W Match Value                         (default: 100)
-    -i    S-W Mismatch Value                      (default: -150)
-    -g    S-W Gap Open Penalty (Reference)        (default: -400)
-    -q    S-W Gap Open Penalty (Query)            (default: -400)
-    -e    S-W Gap Extend Penalty (Reference)      (default: -70)
-    -f    S-W Gap Extend Penalty (Query)          (default: -70)
-    -h    S-W Hit Threshold                       (default: 68.00%)
 
 -----

    

greg＠scofield.bx.psu.edu

tags

participants (1)