details: http://www.bx.psu.edu/hg/galaxy/rev/9ef55e79068b changeset: 1520:9ef55e79068b user: wychung date: Fri Sep 19 12:02:13 2008 -0400 description: Fix a bug in shrimp_wrapper and add a tool for splitting paired-end reads. Update datatype/fastqsolexa so the number of sequences is correct. 7 file(s) affected in this change: lib/galaxy/datatypes/sequence.py test-data/split_paired_reads_test1.fastq test-data/split_paired_reads_test1.out1 tool_conf.xml.sample tools/metag_tools/shrimp_wrapper.py tools/metag_tools/split_paired_reads.py tools/metag_tools/split_paired_reads.xml diffs (216 lines): diff -r 0f735b21dc12 -r 9ef55e79068b lib/galaxy/datatypes/sequence.py --- a/lib/galaxy/datatypes/sequence.py Thu Sep 18 16:48:29 2008 -0400 +++ b/lib/galaxy/datatypes/sequence.py Fri Sep 19 12:02:13 2008 -0400 @@ -98,8 +98,8 @@ dataset.peek = data.get_file_peek( dataset.file_name ) count = size = 0 bases_regexp = re.compile("^[NGTAC]*$") - for line in file( dataset.file_name ): - if line and line[0] == "@": + for i, line in enumerate(file( dataset.file_name )): + if line and line[0] == "@" and i % 4 == 0: count += 1 elif bases_regexp.match(line): line = line.strip() diff -r 0f735b21dc12 -r 9ef55e79068b test-data/split_paired_reads_test1.fastq --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/split_paired_reads_test1.fastq Fri Sep 19 12:02:13 2008 -0400 @@ -0,0 +1,21 @@ +@HWI-EAS91_1_30788AAXX:7:21:1542:1758 +GTCAATTGTACTGGTCAATACTAAAAGAATAGGATCGCTCCTAGCATCTGGAGTCTCTATCACCTGAGCCCA ++HWI-EAS91_1_30788AAXX:7:21:1542:1758 +hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh`hfhhVZSWehR +@HWI-EAS91_1_30788AAXX:7:22:1621:462 +ATAATGGCTATTATTGTGGGGGGGATGATGCTGGAAACTAGCCCCAATATCAATCCTATATCAAATCTCACC ++HWI-EAS91_1_30788AAXX:7:22:1621:462 +hhhhhhhhhhhhQAhh@hhhhNhhhfhMbCIScC?hhJhhhhChhhJhhhRhhKhePhc\KhhV\KhXhJhh +@HWI-EAS91_1_30788AAXX:7:45:408:807 +TACCCGATTTTTTGCTTTCCACTTTATCCTACCCTTATGAGTGCTAGGATCAGGATGGAGAGGATTAGGGCT ++HWI-EAS91_1_30788AAXX:7:45:408:807 
+hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh`hhhZh`hhhhhRXhhYh +@HWI-EAS91_1_30788AAXX:7:49:654:1439 +CTAACTCTATTTATTGTATTTCAACTAAAAATCTCATAGGTTTATTGATAGTTGTGTTGTTGGTGTAAATGG ++HWI-EAS91_1_30788AAXX:7:49:654:1439 +hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhdhh_hG\XhU@ +@HWI-EAS91_1_30788AAXX:7:64:947:234 +TATCAAAAAAGAATATAATCTGAATCAACACTACAACCTATTAGTGTGTAGAATAGGAAGTAGAGGCCTGCG ++HWI-EAS91_1_30788AAXX:7:64:947:234 +hhhhhhhhhhhhhhhhhhhhhhhRhhehhahhhhhJhhhhhhhh^hPhWfhhhhThWUhhfhh_hhNIVPUd + diff -r 0f735b21dc12 -r 9ef55e79068b test-data/split_paired_reads_test1.out1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/split_paired_reads_test1.out1 Fri Sep 19 12:02:13 2008 -0400 @@ -0,0 +1,20 @@ +@HWI-EAS91_1_30788AAXX:7:21:1542:1758/1 +GTCAATTGTACTGGTCAATACTAAAAGAATAGGATC ++HWI-EAS91_1_30788AAXX:7:21:1542:1758/1 +hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh +@HWI-EAS91_1_30788AAXX:7:22:1621:462/1 +ATAATGGCTATTATTGTGGGGGGGATGATGCTGGAA ++HWI-EAS91_1_30788AAXX:7:22:1621:462/1 +hhhhhhhhhhhhQAhh@hhhhNhhhfhMbCIScC?h +@HWI-EAS91_1_30788AAXX:7:45:408:807/1 +TACCCGATTTTTTGCTTTCCACTTTATCCTACCCTT ++HWI-EAS91_1_30788AAXX:7:45:408:807/1 +hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh +@HWI-EAS91_1_30788AAXX:7:49:654:1439/1 +CTAACTCTATTTATTGTATTTCAACTAAAAATCTCA ++HWI-EAS91_1_30788AAXX:7:49:654:1439/1 +hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh +@HWI-EAS91_1_30788AAXX:7:64:947:234/1 +TATCAAAAAAGAATATAATCTGAATCAACACTACAA ++HWI-EAS91_1_30788AAXX:7:64:947:234/1 +hhhhhhhhhhhhhhhhhhhhhhhRhhehhahhhhhJ diff -r 0f735b21dc12 -r 9ef55e79068b tool_conf.xml.sample --- a/tool_conf.xml.sample Thu Sep 18 16:48:29 2008 -0400 +++ b/tool_conf.xml.sample Fri Sep 19 12:02:13 2008 -0400 @@ -274,6 +274,7 @@ <tool file="metag_tools/short_reads_figure_high_quality_length.xml" /> <tool file="metag_tools/short_reads_trim_seq.xml" /> <tool file="metag_tools/blat_coverage_report.xml" /> + <tool file="metag_tools/split_paired_reads.xml" /> </section> <section name="Short Read Mapping" 
id="solexa_tools"> <tool file="metag_tools/shrimp_wrapper.xml" /> diff -r 0f735b21dc12 -r 9ef55e79068b tools/metag_tools/shrimp_wrapper.py --- a/tools/metag_tools/shrimp_wrapper.py Thu Sep 18 16:48:29 2008 -0400 +++ b/tools/metag_tools/shrimp_wrapper.py Fri Sep 19 12:02:13 2008 -0400 @@ -162,6 +162,7 @@ readname, endindex = line[1:].split('/') else: score = line + if score: # the last one if hits.has_key(readname): if len(hits[readname]) == hit_per_read: @@ -182,8 +183,9 @@ match_count = 0 if hit_per_read == 1: - matches = [ hits[readkey]['1'] ] - match_count = 1 + if len(hits[readkey]['1']) == 1: + matches = [ hits[readkey]['1'] ] + match_count = 1 else: end1_data = hits[readkey]['1'] end2_data = hits[readkey]['2'] @@ -591,6 +593,7 @@ if os.path.exists(query_qual_end2): os.remove(query_qual_end2) if os.path.exists(shrimp_log): os.remove(shrimp_log) + if __name__ == '__main__': __main__() diff -r 0f735b21dc12 -r 9ef55e79068b tools/metag_tools/split_paired_reads.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/metag_tools/split_paired_reads.py Fri Sep 19 12:02:13 2008 -0400 @@ -0,0 +1,46 @@ +#! 
/usr/bin/python + +""" +Split Solexa paired end reads +""" + +import os, sys + +if __name__ == '__main__': + + infile = sys.argv[1] + outfile_end1 = open(sys.argv[2], 'w') + outfile_end2 = open(sys.argv[3], 'w') + + for i, line in enumerate(file(infile)): + line = line.rstrip() + if not line or line.startswith('#'): continue + + end1 = '' + end2 = '' + + line_index = i % 4 + + if line_index == 0: + end1 = line + '/1' + end2 = line + '/2' + + elif line_index == 1: + seq_len = len(line)/2 + end1 = line[0:seq_len] + end2 = line[seq_len:] + + elif line_index == 2: + end1 = line + '/1' + end2 = line + '/2' + + else: + qual_len = len(line)/2 + end1 = line[0:qual_len] + end2 = line[qual_len:] + + outfile_end1.write('%s\n' %(end1)) + outfile_end2.write('%s\n' %(end2)) + + outfile_end1.close() + outfile_end2.close() \ No newline at end of file diff -r 0f735b21dc12 -r 9ef55e79068b tools/metag_tools/split_paired_reads.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/metag_tools/split_paired_reads.xml Fri Sep 19 12:02:13 2008 -0400 @@ -0,0 +1,56 @@ +<tool id="split_paired_reads" name="Split" version="1.0.0"> + <description>paired-end reads into two ends</description> + <command interpreter="python"> + split_paired_reads.py $input $output1 $output2 + </command> + <inputs> + <param name="input" type="data" format="fastqsolexa" label="Your paired-end file" /> + </inputs> + <outputs> + <data name="output1" format="fastqsolexa"/> + <data name="output2" format="fastqsolexa"/> + </outputs> + <tests> + <test> + <param name="input" value="split_paired_reads_test1.fastq" ftype="fastqsolexa" /> + <output name="output1" file="split_paired_reads_test1.out1" fype="fastqsolexa" /> + </test> + </tests> +<help> + +**What it does** + +This tool splits a single paired-end file in half and returns two files, one for each end. 
+ +----- + +**Input formats** + +A multiple-fastq file, for example:: + + @HWI-EAS91_1_30788AAXX:7:21:1542:1758 + GTCAATTGTACTGGTCAATACTAAAAGAATAGGATCGCTCCTAGCATCTGGAGTCTCTATCACCTGAGCCCA + +HWI-EAS91_1_30788AAXX:7:21:1542:1758 + hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh`hfhhVZSWehR + + +----- + +**Outputs** + +One end:: + + @HWI-EAS91_1_30788AAXX:7:21:1542:1758/1 + GTCAATTGTACTGGTCAATACTAAAAGAATAGGATC + +HWI-EAS91_1_30788AAXX:7:21:1542:1758/1 + hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh + +The other end:: + + @HWI-EAS91_1_30788AAXX:7:21:1542:1758/2 + GCTCCTAGCATCTGGAGTCTCTATCACCTGAGCCCA + +HWI-EAS91_1_30788AAXX:7:21:1542:1758/2 + hhhhhhhhhhhhhhhhhhhhhhhh`hfhhVZSWehR + +</help> +</tool>