[hg] galaxy 1520: Fix a bug in shrimp_wrapper and add a tool for splitting paired-end reads

22 Sep 2008

details:   http://www.bx.psu.edu/hg/galaxy/rev/9ef55e79068b
changeset: 1520:9ef55e79068b
user:      wychung
date:      Fri Sep 19 12:02:13 2008 -0400
description:
Fix a bug in shrimp_wrapper and add a tool for splitting paired-end reads.
Update datatype/fastqsolexa so the number of sequences is correct.

7 file(s) affected in this change:

lib/galaxy/datatypes/sequence.py
test-data/split_paired_reads_test1.fastq
test-data/split_paired_reads_test1.out1
tool_conf.xml.sample
tools/metag_tools/shrimp_wrapper.py
tools/metag_tools/split_paired_reads.py
tools/metag_tools/split_paired_reads.xml

diffs (216 lines):

diff -r 0f735b21dc12 -r 9ef55e79068b lib/galaxy/datatypes/sequence.py

--- a/lib/galaxy/datatypes/sequence.py	Thu Sep 18 16:48:29 2008 -0400
+++ b/lib/galaxy/datatypes/sequence.py	Fri Sep 19 12:02:13 2008 -0400
@@ -98,8 +98,8 @@
         dataset.peek = data.get_file_peek( dataset.file_name )
         count = size = 0
         bases_regexp = re.compile("^[NGTAC]*$")
-        for line in file( dataset.file_name ):
-            if line and line[0] == "@":
+        for i, line in enumerate(file( dataset.file_name )):
+            if line and line[0] == "@" and i % 4 == 0:
                 count += 1
             elif bases_regexp.match(line):
                 line = line.strip()
diff -r 0f735b21dc12 -r 9ef55e79068b test-data/split_paired_reads_test1.fastq
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/split_paired_reads_test1.fastq	Fri Sep 19 12:02:13 2008 -0400
@@ -0,0 +1,21 @@
+@HWI-EAS91_1_30788AAXX:7:21:1542:1758
+GTCAATTGTACTGGTCAATACTAAAAGAATAGGATCGCTCCTAGCATCTGGAGTCTCTATCACCTGAGCCCA
++HWI-EAS91_1_30788AAXX:7:21:1542:1758
+hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh`hfhhVZSWehR
+@HWI-EAS91_1_30788AAXX:7:22:1621:462
+ATAATGGCTATTATTGTGGGGGGGATGATGCTGGAAACTAGCCCCAATATCAATCCTATATCAAATCTCACC
++HWI-EAS91_1_30788AAXX:7:22:1621:462
+hhhhhhhhhhhhQAhh@hhhhNhhhfhMbCIScC?hhJhhhhChhhJhhhRhhKhePhc\KhhV\KhXhJhh
+@HWI-EAS91_1_30788AAXX:7:45:408:807
+TACCCGATTTTTTGCTTTCCACTTTATCCTACCCTTATGAGTGCTAGGATCAGGATGGAGAGGATTAGGGCT
++HWI-EAS91_1_30788AAXX:7:45:408:807
+hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh`hhhZh`hhhhhRXhhYh
+@HWI-EAS91_1_30788AAXX:7:49:654:1439
+CTAACTCTATTTATTGTATTTCAACTAAAAATCTCATAGGTTTATTGATAGTTGTGTTGTTGGTGTAAATGG
++HWI-EAS91_1_30788AAXX:7:49:654:1439
+hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhdhh_hG\XhU@
+@HWI-EAS91_1_30788AAXX:7:64:947:234
+TATCAAAAAAGAATATAATCTGAATCAACACTACAACCTATTAGTGTGTAGAATAGGAAGTAGAGGCCTGCG
++HWI-EAS91_1_30788AAXX:7:64:947:234
+hhhhhhhhhhhhhhhhhhhhhhhRhhehhahhhhhJhhhhhhhh^hPhWfhhhhThWUhhfhh_hhNIVPUd
+
diff -r 0f735b21dc12 -r 9ef55e79068b test-data/split_paired_reads_test1.out1
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/split_paired_reads_test1.out1	Fri Sep 19 12:02:13 2008 -0400
@@ -0,0 +1,20 @@
+@HWI-EAS91_1_30788AAXX:7:21:1542:1758/1
+GTCAATTGTACTGGTCAATACTAAAAGAATAGGATC
++HWI-EAS91_1_30788AAXX:7:21:1542:1758/1
+hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh
+@HWI-EAS91_1_30788AAXX:7:22:1621:462/1
+ATAATGGCTATTATTGTGGGGGGGATGATGCTGGAA
++HWI-EAS91_1_30788AAXX:7:22:1621:462/1
+hhhhhhhhhhhhQAhh@hhhhNhhhfhMbCIScC?h
+@HWI-EAS91_1_30788AAXX:7:45:408:807/1
+TACCCGATTTTTTGCTTTCCACTTTATCCTACCCTT
++HWI-EAS91_1_30788AAXX:7:45:408:807/1
+hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh
+@HWI-EAS91_1_30788AAXX:7:49:654:1439/1
+CTAACTCTATTTATTGTATTTCAACTAAAAATCTCA
++HWI-EAS91_1_30788AAXX:7:49:654:1439/1
+hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh
+@HWI-EAS91_1_30788AAXX:7:64:947:234/1
+TATCAAAAAAGAATATAATCTGAATCAACACTACAA
++HWI-EAS91_1_30788AAXX:7:64:947:234/1
+hhhhhhhhhhhhhhhhhhhhhhhRhhehhahhhhhJ
diff -r 0f735b21dc12 -r 9ef55e79068b tool_conf.xml.sample
--- a/tool_conf.xml.sample	Thu Sep 18 16:48:29 2008 -0400
+++ b/tool_conf.xml.sample	Fri Sep 19 12:02:13 2008 -0400
@@ -274,6 +274,7 @@
     <tool file="metag_tools/short_reads_figure_high_quality_length.xml" />
     <tool file="metag_tools/short_reads_trim_seq.xml" />
     <tool file="metag_tools/blat_coverage_report.xml" />
+    <tool file="metag_tools/split_paired_reads.xml" />
   </section>
   <section name="Short Read Mapping" id="solexa_tools">
    <tool file="metag_tools/shrimp_wrapper.xml" />
diff -r 0f735b21dc12 -r 9ef55e79068b tools/metag_tools/shrimp_wrapper.py
--- a/tools/metag_tools/shrimp_wrapper.py	Thu Sep 18 16:48:29 2008 -0400
+++ b/tools/metag_tools/shrimp_wrapper.py	Fri Sep 19 12:02:13 2008 -0400
@@ -162,6 +162,7 @@
                     readname, endindex = line[1:].split('/')
             else:
                 score = line
+                
         if score:   # the last one
             if hits.has_key(readname):
                 if len(hits[readname]) == hit_per_read:
@@ -182,8 +183,9 @@
         match_count = 0
         
         if hit_per_read == 1:
-            matches = [ hits[readkey]['1'] ]
-            match_count = 1
+            if len(hits[readkey]['1']) == 1:
+                matches = [ hits[readkey]['1'] ]
+                match_count = 1
         else:
             end1_data = hits[readkey]['1']
             end2_data = hits[readkey]['2']
@@ -591,6 +593,7 @@
         if os.path.exists(query_qual_end2): os.remove(query_qual_end2)    
     
     if os.path.exists(shrimp_log): os.remove(shrimp_log)
+
     
 if __name__ == '__main__': __main__()
     
diff -r 0f735b21dc12 -r 9ef55e79068b tools/metag_tools/split_paired_reads.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/metag_tools/split_paired_reads.py	Fri Sep 19 12:02:13 2008 -0400
@@ -0,0 +1,46 @@
+#! /usr/bin/python
+
+"""
+Split Solexa paired end reads
+"""
+
+import os, sys
+
+if __name__ == '__main__':
+    
+    infile = sys.argv[1]
+    outfile_end1 = open(sys.argv[2], 'w')
+    outfile_end2 = open(sys.argv[3], 'w')
+    
+    for i, line in enumerate(file(infile)):
+        line = line.rstrip()
+        if not line or line.startswith('#'): continue
+        
+        end1 = ''
+        end2 = ''
+        
+        line_index = i % 4
+        
+        if line_index == 0:
+            end1 = line + '/1'
+            end2 = line + '/2'
+        
+        elif line_index == 1:
+            seq_len = len(line)/2
+            end1 = line[0:seq_len]
+            end2 = line[seq_len:]
+        
+        elif line_index == 2:
+            end1 = line + '/1'
+            end2 = line + '/2'
+        
+        else:
+            qual_len = len(line)/2
+            end1 = line[0:qual_len]
+            end2 = line[qual_len:]
+            
+        outfile_end1.write('%s\n' %(end1))
+        outfile_end2.write('%s\n' %(end2))
+        
+    outfile_end1.close()
+    outfile_end2.close()    
\ No newline at end of file
diff -r 0f735b21dc12 -r 9ef55e79068b tools/metag_tools/split_paired_reads.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/metag_tools/split_paired_reads.xml	Fri Sep 19 12:02:13 2008 -0400
@@ -0,0 +1,56 @@
+<tool id="split_paired_reads" name="Split" version="1.0.0">
+  <description>paired-end reads into two ends</description>
+  <command interpreter="python">
+    split_paired_reads.py $input $output1 $output2
+  </command>
+    <inputs>
+        <param name="input" type="data" format="fastqsolexa" label="Your paired-end file" />
+    </inputs>
+    <outputs>
+        <data name="output1" format="fastqsolexa"/>
+        <data name="output2" format="fastqsolexa"/>
+    </outputs>
+    <tests>
+        <test>
+            <param name="input" value="split_paired_reads_test1.fastq" ftype="fastqsolexa" />
+            <output name="output1" file="split_paired_reads_test1.out1" fype="fastqsolexa" />
+        </test>
+    </tests>
+<help>
+        
+**What it does**
+ 
+This tool splits each read of a single paired-end file in half and returns two files, one for each end.
+
+-----
+
+**Input formats**
+
+A multiple-fastq file, for example::
+
+    @HWI-EAS91_1_30788AAXX:7:21:1542:1758
+    GTCAATTGTACTGGTCAATACTAAAAGAATAGGATCGCTCCTAGCATCTGGAGTCTCTATCACCTGAGCCCA
+    +HWI-EAS91_1_30788AAXX:7:21:1542:1758
+    hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh`hfhhVZSWehR
+
+
+-----
+
+**Outputs**
+
+One end::
+
+    @HWI-EAS91_1_30788AAXX:7:21:1542:1758/1
+    GTCAATTGTACTGGTCAATACTAAAAGAATAGGATC
+    +HWI-EAS91_1_30788AAXX:7:21:1542:1758/1
+    hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh
+
+The other end::
+
+    @HWI-EAS91_1_30788AAXX:7:21:1542:1758/2
+    GCTCCTAGCATCTGGAGTCTCTATCACCTGAGCCCA
+    +HWI-EAS91_1_30788AAXX:7:21:1542:1758/2
+    hhhhhhhhhhhhhhhhhhhhhhhh`hfhhVZSWehR
+    
+</help>
+</tool>

    

greg＠scofield.bx.psu.edu

tags

participants (1)