details:
http://www.bx.psu.edu/hg/galaxy/rev/9ef55e79068b
changeset: 1520:9ef55e79068b
user: wychung
date: Fri Sep 19 12:02:13 2008 -0400
description:
Fix a bug in shrimp_wrapper and add a tool for splitting paired-end reads.
Update datatype/fastqsolexa so the number of sequences is correct.
7 file(s) affected in this change:
lib/galaxy/datatypes/sequence.py
test-data/split_paired_reads_test1.fastq
test-data/split_paired_reads_test1.out1
tool_conf.xml.sample
tools/metag_tools/shrimp_wrapper.py
tools/metag_tools/split_paired_reads.py
tools/metag_tools/split_paired_reads.xml
diffs (216 lines):
diff -r 0f735b21dc12 -r 9ef55e79068b lib/galaxy/datatypes/sequence.py
--- a/lib/galaxy/datatypes/sequence.py Thu Sep 18 16:48:29 2008 -0400
+++ b/lib/galaxy/datatypes/sequence.py Fri Sep 19 12:02:13 2008 -0400
@@ -98,8 +98,8 @@
dataset.peek = data.get_file_peek( dataset.file_name )
count = size = 0
bases_regexp = re.compile("^[NGTAC]*$")
- for line in file( dataset.file_name ):
- if line and line[0] == "@":
+ for i, line in enumerate(file( dataset.file_name )):
+ if line and line[0] == "@" and i % 4 == 0:
count += 1
elif bases_regexp.match(line):
line = line.strip()
diff -r 0f735b21dc12 -r 9ef55e79068b test-data/split_paired_reads_test1.fastq
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/split_paired_reads_test1.fastq Fri Sep 19 12:02:13 2008 -0400
@@ -0,0 +1,21 @@
+@HWI-EAS91_1_30788AAXX:7:21:1542:1758
+GTCAATTGTACTGGTCAATACTAAAAGAATAGGATCGCTCCTAGCATCTGGAGTCTCTATCACCTGAGCCCA
++HWI-EAS91_1_30788AAXX:7:21:1542:1758
+hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh`hfhhVZSWehR
+@HWI-EAS91_1_30788AAXX:7:22:1621:462
+ATAATGGCTATTATTGTGGGGGGGATGATGCTGGAAACTAGCCCCAATATCAATCCTATATCAAATCTCACC
++HWI-EAS91_1_30788AAXX:7:22:1621:462
+hhhhhhhhhhhhQAhh@hhhhNhhhfhMbCIScC?hhJhhhhChhhJhhhRhhKhePhc\KhhV\KhXhJhh
+@HWI-EAS91_1_30788AAXX:7:45:408:807
+TACCCGATTTTTTGCTTTCCACTTTATCCTACCCTTATGAGTGCTAGGATCAGGATGGAGAGGATTAGGGCT
++HWI-EAS91_1_30788AAXX:7:45:408:807
+hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh`hhhZh`hhhhhRXhhYh
+@HWI-EAS91_1_30788AAXX:7:49:654:1439
+CTAACTCTATTTATTGTATTTCAACTAAAAATCTCATAGGTTTATTGATAGTTGTGTTGTTGGTGTAAATGG
++HWI-EAS91_1_30788AAXX:7:49:654:1439
+hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhdhh_hG\XhU@
+@HWI-EAS91_1_30788AAXX:7:64:947:234
+TATCAAAAAAGAATATAATCTGAATCAACACTACAACCTATTAGTGTGTAGAATAGGAAGTAGAGGCCTGCG
++HWI-EAS91_1_30788AAXX:7:64:947:234
+hhhhhhhhhhhhhhhhhhhhhhhRhhehhahhhhhJhhhhhhhh^hPhWfhhhhThWUhhfhh_hhNIVPUd
+
diff -r 0f735b21dc12 -r 9ef55e79068b test-data/split_paired_reads_test1.out1
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/split_paired_reads_test1.out1 Fri Sep 19 12:02:13 2008 -0400
@@ -0,0 +1,20 @@
+@HWI-EAS91_1_30788AAXX:7:21:1542:1758/1
+GTCAATTGTACTGGTCAATACTAAAAGAATAGGATC
++HWI-EAS91_1_30788AAXX:7:21:1542:1758/1
+hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh
+@HWI-EAS91_1_30788AAXX:7:22:1621:462/1
+ATAATGGCTATTATTGTGGGGGGGATGATGCTGGAA
++HWI-EAS91_1_30788AAXX:7:22:1621:462/1
+hhhhhhhhhhhhQAhh@hhhhNhhhfhMbCIScC?h
+@HWI-EAS91_1_30788AAXX:7:45:408:807/1
+TACCCGATTTTTTGCTTTCCACTTTATCCTACCCTT
++HWI-EAS91_1_30788AAXX:7:45:408:807/1
+hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh
+@HWI-EAS91_1_30788AAXX:7:49:654:1439/1
+CTAACTCTATTTATTGTATTTCAACTAAAAATCTCA
++HWI-EAS91_1_30788AAXX:7:49:654:1439/1
+hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh
+@HWI-EAS91_1_30788AAXX:7:64:947:234/1
+TATCAAAAAAGAATATAATCTGAATCAACACTACAA
++HWI-EAS91_1_30788AAXX:7:64:947:234/1
+hhhhhhhhhhhhhhhhhhhhhhhRhhehhahhhhhJ
diff -r 0f735b21dc12 -r 9ef55e79068b tool_conf.xml.sample
--- a/tool_conf.xml.sample Thu Sep 18 16:48:29 2008 -0400
+++ b/tool_conf.xml.sample Fri Sep 19 12:02:13 2008 -0400
@@ -274,6 +274,7 @@
<tool file="metag_tools/short_reads_figure_high_quality_length.xml" />
<tool file="metag_tools/short_reads_trim_seq.xml" />
<tool file="metag_tools/blat_coverage_report.xml" />
+ <tool file="metag_tools/split_paired_reads.xml" />
</section>
<section name="Short Read Mapping" id="solexa_tools">
<tool file="metag_tools/shrimp_wrapper.xml" />
diff -r 0f735b21dc12 -r 9ef55e79068b tools/metag_tools/shrimp_wrapper.py
--- a/tools/metag_tools/shrimp_wrapper.py Thu Sep 18 16:48:29 2008 -0400
+++ b/tools/metag_tools/shrimp_wrapper.py Fri Sep 19 12:02:13 2008 -0400
@@ -162,6 +162,7 @@
readname, endindex = line[1:].split('/')
else:
score = line
+
if score: # the last one
if hits.has_key(readname):
if len(hits[readname]) == hit_per_read:
@@ -182,8 +183,9 @@
match_count = 0
if hit_per_read == 1:
- matches = [ hits[readkey]['1'] ]
- match_count = 1
+ if len(hits[readkey]['1']) == 1:
+ matches = [ hits[readkey]['1'] ]
+ match_count = 1
else:
end1_data = hits[readkey]['1']
end2_data = hits[readkey]['2']
@@ -591,6 +593,7 @@
if os.path.exists(query_qual_end2): os.remove(query_qual_end2)
if os.path.exists(shrimp_log): os.remove(shrimp_log)
+
if __name__ == '__main__': __main__()
diff -r 0f735b21dc12 -r 9ef55e79068b tools/metag_tools/split_paired_reads.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/metag_tools/split_paired_reads.py Fri Sep 19 12:02:13 2008 -0400
@@ -0,0 +1,46 @@
+#! /usr/bin/python
+
+"""
+Split Solexa paired end reads
+"""
+
+import os, sys
+
+if __name__ == '__main__':
+
+ infile = sys.argv[1]
+ outfile_end1 = open(sys.argv[2], 'w')
+ outfile_end2 = open(sys.argv[3], 'w')
+
+ for i, line in enumerate(file(infile)):
+ line = line.rstrip()
+ if not line or line.startswith('#'): continue
+
+ end1 = ''
+ end2 = ''
+
+ line_index = i % 4
+
+ if line_index == 0:
+ end1 = line + '/1'
+ end2 = line + '/2'
+
+ elif line_index == 1:
+ seq_len = len(line)/2
+ end1 = line[0:seq_len]
+ end2 = line[seq_len:]
+
+ elif line_index == 2:
+ end1 = line + '/1'
+ end2 = line + '/2'
+
+ else:
+ qual_len = len(line)/2
+ end1 = line[0:qual_len]
+ end2 = line[qual_len:]
+
+ outfile_end1.write('%s\n' %(end1))
+ outfile_end2.write('%s\n' %(end2))
+
+ outfile_end1.close()
+ outfile_end2.close()
\ No newline at end of file
diff -r 0f735b21dc12 -r 9ef55e79068b tools/metag_tools/split_paired_reads.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/metag_tools/split_paired_reads.xml Fri Sep 19 12:02:13 2008 -0400
@@ -0,0 +1,56 @@
+<tool id="split_paired_reads" name="Split" version="1.0.0">
+ <description>paired-end reads into two ends</description>
+ <command interpreter="python">
+ split_paired_reads.py $input $output1 $output2
+ </command>
+ <inputs>
+ <param name="input" type="data" format="fastqsolexa" label="Your paired-end file" />
+ </inputs>
+ <outputs>
+ <data name="output1" format="fastqsolexa"/>
+ <data name="output2" format="fastqsolexa"/>
+ </outputs>
+ <tests>
+ <test>
+ <param name="input" value="split_paired_reads_test1.fastq" ftype="fastqsolexa" />
+ <output name="output1" file="split_paired_reads_test1.out1" ftype="fastqsolexa" />
+ </test>
+ </tests>
+<help>
+
+**What it does**
+
+This tool splits each read of a single paired-end file in half and returns two files, one for each end.
+
+-----
+
+**Input formats**
+
+A multiple-fastq file, for example::
+
+ @HWI-EAS91_1_30788AAXX:7:21:1542:1758
+ GTCAATTGTACTGGTCAATACTAAAAGAATAGGATCGCTCCTAGCATCTGGAGTCTCTATCACCTGAGCCCA
+ +HWI-EAS91_1_30788AAXX:7:21:1542:1758
+ hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh`hfhhVZSWehR
+
+
+-----
+
+**Outputs**
+
+One end::
+
+ @HWI-EAS91_1_30788AAXX:7:21:1542:1758/1
+ GTCAATTGTACTGGTCAATACTAAAAGAATAGGATC
+ +HWI-EAS91_1_30788AAXX:7:21:1542:1758/1
+ hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh
+
+The other end::
+
+ @HWI-EAS91_1_30788AAXX:7:21:1542:1758/2
+ GCTCCTAGCATCTGGAGTCTCTATCACCTGAGCCCA
+ +HWI-EAS91_1_30788AAXX:7:21:1542:1758/2
+ hhhhhhhhhhhhhhhhhhhhhhhh`hfhhVZSWehR
+
+</help>
+</tool>