[hg] galaxy 3561: Add FASTQ <--> Tabular converter tools.

16 Apr 2010

details:   http://www.bx.psu.edu/hg/galaxy/rev/44f2713a7279
changeset: 3561:44f2713a7279
user:      Dan Blankenberg <dan@bx.psu.edu>
date:      Wed Mar 24 15:13:57 2010 -0400
description:
Add FASTQ <--> Tabular converter tools.

diffstat:

 test-data/fastq_to_tabular_out_1.tabular |   2 +
 test-data/fastq_to_tabular_out_2.tabular |   2 +
 tool_conf.xml.main                       |   2 +
 tool_conf.xml.sample                     |   2 +
 tools/fastq/fastq_to_tabular.py          |  21 ++++++++++++++++++
 tools/fastq/fastq_to_tabular.xml         |  30 +++++++++++++++++++++++++
 tools/fastq/tabular_to_fastq.py          |  29 +++++++++++++++++++++++++
 tools/fastq/tabular_to_fastq.xml         |  37 ++++++++++++++++++++++++++++++++
 8 files changed, 125 insertions(+), 0 deletions(-)

diffs (169 lines):

diff -r 4c95f1a101f1 -r 44f2713a7279 test-data/fastq_to_tabular_out_1.tabular

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/fastq_to_tabular_out_1.tabular	Wed Mar 24 15:13:57 2010 -0400
@@ -0,0 +1,2 @@
+FAKE0001 Original version has PHRED scores from 0 to 93 inclusive (in that order)	ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTAC	!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~
+FAKE0002 Original version has PHRED scores from 93 to 0 inclusive (in that order)	CATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCA	~}|{zyxwvutsrqponmlkjihgfedcba`_^]\[ZYXWVUTSRQPONMLKJIHGFEDCBA@?>=<;:9876543210/.-,+*)('&%$#"!
diff -r 4c95f1a101f1 -r 44f2713a7279 test-data/fastq_to_tabular_out_2.tabular
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/fastq_to_tabular_out_2.tabular	Wed Mar 24 15:13:57 2010 -0400
@@ -0,0 +1,2 @@
+FAKE0001 Original version has PHRED scores from 0 to 93 inclusive (in that order)	G2131313131313131313131313131313131313131313131313131313131313131313131313131313131313131313131	!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~
+FAKE0002 Original version has PHRED scores from 93 to 0 inclusive (in that order)	G3131313131313131313131313131313131313131313131313131313131313131313131313131313131313131313131	~}|{zyxwvutsrqponmlkjihgfedcba`_^]\[ZYXWVUTSRQPONMLKJIHGFEDCBA@?>=<;:9876543210/.-,+*)('&%$#"!
diff -r 4c95f1a101f1 -r 44f2713a7279 tool_conf.xml.main
--- a/tool_conf.xml.main	Wed Mar 24 14:14:58 2010 -0400
+++ b/tool_conf.xml.main	Wed Mar 24 15:13:57 2010 -0400
@@ -297,6 +297,8 @@
         <tool file="fastq/fastq_trimmer.xml" />
         <tool file="fastq/fastq_manipulation.xml" />
         <tool file="fastq/fastq_to_fasta.xml" />
+        <tool file="fastq/fastq_to_tabular.xml" />
+        <tool file="fastq/tabular_to_fastq.xml" />
   </section>
   <section name="NGS: Mapping" id="ngs_mapping">
     <label text="Illumina" id="illumina"/>
diff -r 4c95f1a101f1 -r 44f2713a7279 tool_conf.xml.sample
--- a/tool_conf.xml.sample	Wed Mar 24 14:14:58 2010 -0400
+++ b/tool_conf.xml.sample	Wed Mar 24 15:13:57 2010 -0400
@@ -208,6 +208,8 @@
         <tool file="fastq/fastq_trimmer.xml" />
         <tool file="fastq/fastq_manipulation.xml" />
         <tool file="fastq/fastq_to_fasta.xml" />
+        <tool file="fastq/fastq_to_tabular.xml" />
+        <tool file="fastq/tabular_to_fastq.xml" />
   </section>
   <section name="NGS: Mapping" id="solexa_tools">
    <tool file="sr_mapping/lastz_wrapper.xml" />
diff -r 4c95f1a101f1 -r 44f2713a7279 tools/fastq/fastq_to_tabular.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/fastq/fastq_to_tabular.py	Wed Mar 24 15:13:57 2010 -0400
@@ -0,0 +1,21 @@
+#Dan Blankenberg
+import sys
+from galaxy_utils.sequence.fastq import fastqReader
+
+def main():
+    input_filename = sys.argv[1]
+    output_filename = sys.argv[2]
+    input_type = sys.argv[3] or 'sanger' #input type should ordinarily be unnecessary
+    
+    num_reads = None
+    fastq_read = None
+    out = open( output_filename, 'wb' )
+    for num_reads, fastq_read in enumerate( fastqReader( open( input_filename ), format = input_type ) ):
+        out.write( "%s\t%s\t%s\n" % ( fastq_read.identifier[1:].replace( '\t', ' ' ), fastq_read.sequence.replace( '\t', ' ' ), fastq_read.quality.replace( '\t', ' ' ) ) )
+    out.close()
+    if num_reads is None:
+        print "No valid FASTQ reads could be processed."
+    else:
+        print "%i FASTQ reads were converted to Tabular." % ( num_reads + 1 )
+    
+if __name__ == "__main__": main()
diff -r 4c95f1a101f1 -r 44f2713a7279 tools/fastq/fastq_to_tabular.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/fastq/fastq_to_tabular.xml	Wed Mar 24 15:13:57 2010 -0400
@@ -0,0 +1,30 @@
+<tool id="fastq_to_tabular" name="FASTQ to Tabular" version="1.0.0">
+  <description>converter</description>
+  <command interpreter="python">fastq_to_tabular.py '$input_file' '$output_file' '${input_file.extension[len( 'fastq' ):]}'</command>
+  <inputs>
+    <param name="input_file" type="data" format="fastqsanger,fastqcssanger,fastqillumina,fastqsolexa" label="FASTQ file to convert" />
+  </inputs>
+  <outputs>
+    <data name="output_file" format="tabular" />
+  </outputs>
+  <tests>
+    <!-- basic test -->
+    <test>
+      <param name="input_file" value="sanger_full_range_original_sanger.fastqsanger" ftype="fastqsanger" />
+      <output name="output_file" file="fastq_to_tabular_out_1.tabular" />
+    </test>
+    <!-- color space test -->
+    <test>
+      <param name="input_file" value="sanger_full_range_as_cssanger.fastqcssanger" ftype="fastqcssanger" />
+      <output name="output_file" file="fastq_to_tabular_out_2.tabular" />
+    </test>
+  </tests>
+  <help>
+**What it does**
+
+This tool converts FASTQ sequencing reads to a Tabular file.
+
+Tab characters, if present in the source FASTQ file, will be converted to spaces.
+
+  </help>
+</tool>
diff -r 4c95f1a101f1 -r 44f2713a7279 tools/fastq/tabular_to_fastq.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/fastq/tabular_to_fastq.py	Wed Mar 24 15:13:57 2010 -0400
@@ -0,0 +1,29 @@
+#Dan Blankenberg
+import sys
+
+def main():
+    input_filename = sys.argv[1]
+    output_filename = sys.argv[2]
+    identifier_col = int( sys.argv[3] ) - 1
+    sequence_col = int( sys.argv[4] ) - 1
+    quality_col = int( sys.argv[5] ) - 1
+    
+    max_col = max( identifier_col, sequence_col, quality_col )
+    num_reads = None
+    fastq_read = None
+    skipped_lines = 0
+    out = open( output_filename, 'wb' )
+    for num_reads, line in enumerate( open( input_filename ) ):
+        fields = line.rstrip( '\n\r' ).split( '\t' )
+        if len( fields ) > max_col:
+            out.write( "@%s\n%s\n+\n%s\n" % ( fields[identifier_col], fields[sequence_col], fields[quality_col] ) )
+        else:
+            skipped_lines += 1
+    
+    out.close()
+    if num_reads is None:
+        print "Input was empty."
+    else:
+        print "%i tabular lines were written as FASTQ reads. Be sure to use the FASTQ Groomer tool on this output before further analysis." % ( num_reads + 1 - skipped_lines )
+    
+if __name__ == "__main__": main()
diff -r 4c95f1a101f1 -r 44f2713a7279 tools/fastq/tabular_to_fastq.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/fastq/tabular_to_fastq.xml	Wed Mar 24 15:13:57 2010 -0400
@@ -0,0 +1,37 @@
+<tool id="tabular_to_fastq" name="Tabular to FASTQ" version="1.0.0">
+  <description>converter</description>
+  <command interpreter="python">tabular_to_fastq.py '$input_file' '$output_file' '$identifier' '$sequence' '$quality'</command>
+  <inputs>
+    <param name="input_file" type="data" format="tabular" label="Tabular file to convert" />
+    <param name="identifier" label="Identifier column" type="data_column" data_ref="input_file" />
+    <param name="sequence" label="Sequence column" type="data_column" data_ref="input_file" />
+    <param name="quality" label="Quality column" type="data_column" data_ref="input_file" />
+  </inputs>
+  <outputs>
+    <data name="output_file" format="fastq" />
+  </outputs>
+  <tests>
+    <!-- basic test -->
+    <test>
+      <param name="input_file" value="fastq_to_tabular_out_1.tabular" ftype="tabular" />
+      <param name="identifier" value="1" />
+      <param name="sequence" value="2" />
+      <param name="quality" value="3" />
+      <output name="output_file" file="sanger_full_range_original_sanger.fastqsanger" />
+    </test>
+    <!-- color space test -->
+    <test>
+      <param name="input_file" value="fastq_to_tabular_out_2.tabular" ftype="tabular" />
+      <param name="identifier" value="1" />
+      <param name="sequence" value="2" />
+      <param name="quality" value="3" />
+      <output name="output_file" file="sanger_full_range_as_cssanger.fastqcssanger" />
+    </test>
+  </tests>
+  <help>
+**What it does**
+
+This tool attempts to convert a tabular file containing sequencing read data to a FASTQ formatted file. The FASTQ Groomer tool should always be used on the output of this tool. 
+
+  </help>
+</tool>

    

Greg Von Kuster

tags

participants (1)