[hg] galaxy 2836: Added FASTQ \"Groomer\" tool to converters sec...

9 Oct 2009

details:   http://www.bx.psu.edu/hg/galaxy/rev/b25297e88f96
changeset: 2836:b25297e88f96
user:      Kelly Vincent <kpvincent@bx.psu.edu>
date:      Tue Oct 06 21:25:01 2009 -0400
description:
Added FASTQ \"Groomer\" tool to converters section. Relies on new datatype (fastq) which will be added later.

7 file(s) affected in this change:

test-data/fastq_gen_conv_in1.fastq
test-data/fastq_gen_conv_in2.fastq
test-data/fastq_gen_conv_out1.fastqsanger
test-data/fastq_gen_conv_out2.fastqsanger
tool_conf.xml.sample
tools/next_gen_conversion/fastq_gen_conv.py
tools/next_gen_conversion/fastq_gen_conv.xml

diffs (370 lines):

diff -r 2fb0a64c6aaa -r b25297e88f96 test-data/fastq_gen_conv_in1.fastq

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/fastq_gen_conv_in1.fastq	Tue Oct 06 21:25:01 2009 -0400
@@ -0,0 +1,16 @@
+@seq1
+AGTCGTGGTCATCGTGACTAGTCGATCTAGCTAGCTCTCTAGAGTGT
++
+;>@BCEFGHJKLMNOPQRSTUVWXYZ[\]^_?abcdefghijklmno
+@seq2
+AGTCGTTGTCATCGTGACTAGTCGATCTAGCTAGCTCTCTAGAGTGT
++
+;>@BCElcH@KLMNOPQ>STZVWbYu[\]^_?a=;d?fghijklmno
+@seq3
+AGTCGTCGTCATCGTGACTAGTCGATCTAGCTAGCTCTCTAGAGTGT
++
+7>@BCEFGHJKLMNOPQRSTUVWXYZ[\]^_?abcdefghijklmno
+@seq4
+AGTCGTAGTCATCGTGACTAGTCGATCTAGCTAGCTCTCTAGAGTGT
++
+;>@BCEFGHJKLMNOPQRSTUVWXYZ[\]^_?abcdefghijklmno
diff -r 2fb0a64c6aaa -r b25297e88f96 test-data/fastq_gen_conv_in2.fastq
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/fastq_gen_conv_in2.fastq	Tue Oct 06 21:25:01 2009 -0400
@@ -0,0 +1,24 @@
+@seq1
+AAAGGTTTCTCTTTTGGAAATATCTAAATCCC
++
+!"#$%&\'()*+,-./0123456789:;<=>.
+@seq2
+GGGTCTCCCAGAATGATTAGAGCCGTATAGGA
++
+?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]
+@seq3
+GCGGTTCAATACGATTACCACCATGATAAATA
++
+?Aa.1ghB2K!#lk(02GY[[II])Kwl+,5M
+@seq4
+AGTCTTTTCCTCTAAAATAACATAGGATACTA
++
+ghY)N375Nh.,Ol>==/<:2#i&d%#KdNII
+@seq5
+GAGGACTCATGGTAGGTATTTTACATGACATT
++
+IIgy%hf6#394bd&hNMWL$OPB63II*,+-
+@seq6
+GGCCTACATTCATTTACGAGACTAATTAGGGA
++
+IIIIIgd6#5%jKO&.,D+s3aW=cdGB#a1$
\ No newline at end of file
diff -r 2fb0a64c6aaa -r b25297e88f96 test-data/fastq_gen_conv_out1.fastqsanger
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/fastq_gen_conv_out1.fastqsanger	Tue Oct 06 21:25:01 2009 -0400
@@ -0,0 +1,12 @@
+@seq1
+AGTCGTGGTCATCGTGACTAGTCGATCTAGCTAGCTCTCTAGAGTGT
++
+"#$%%''()+,-./0123456789:;<=>?@#BCDEFGHIJKLMNOP
+@seq2
+AGTCGTTGTCATCGTGACTAGTCGATCTAGCTAGCTCTCTAGAGTGT
++
+"#$%%'MD)$,-./012#45;78C:V<=>?@#B""E#GHIJKLMNOP
+@seq4
+AGTCGTAGTCATCGTGACTAGTCGATCTAGCTAGCTCTCTAGAGTGT
++
+"#$%%''()+,-./0123456789:;<=>?@#BCDEFGHIJKLMNOP
diff -r 2fb0a64c6aaa -r b25297e88f96 test-data/fastq_gen_conv_out2.fastqsanger
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/fastq_gen_conv_out2.fastqsanger	Tue Oct 06 21:25:01 2009 -0400
@@ -0,0 +1,12 @@
+@seq1
+AAAGGTTTCTCTTTTGGAAATATCTAAATCCC
++
+!"#$%&\'()*+,-./0123456789:;<=>.
+@seq2
+GGGTCTCCCAGAATGATTAGAGCCGTATAGGA
++
+?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]
+@seq3
+GCGGTTCAATACGATTACCACCATGATAAATA
++
+?Aa.1ghB2K!#lk(02GY[[II])Kwl+,5M
diff -r 2fb0a64c6aaa -r b25297e88f96 tool_conf.xml.sample
--- a/tool_conf.xml.sample	Tue Oct 06 16:55:47 2009 -0400
+++ b/tool_conf.xml.sample	Tue Oct 06 21:25:01 2009 -0400
@@ -75,6 +75,7 @@
     <tool file="next_gen_conversion/solid_to_fastq.xml" />
     <tool file="next_gen_conversion/fastq_conversions.xml" />
     <tool file="fastx_toolkit/fastq_quality_converter.xml" />
+    <tool file="next_gen_conversion/fastq_gen_conv.xml" />
   </section>
   <section name="Extract Features" id="features">
     <tool file="filters/ucsc_gene_bed_to_exon_bed.xml" />
diff -r 2fb0a64c6aaa -r b25297e88f96 tools/next_gen_conversion/fastq_gen_conv.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/next_gen_conversion/fastq_gen_conv.py	Tue Oct 06 21:25:01 2009 -0400
@@ -0,0 +1,169 @@
+"""
+Converts any type of FASTQ file to Sanger type  and makes small adjustments if necessary.
+
+usage: %prog [options]
+   -i, --input=i: Input FASTQ candidate file
+   -r, --origType=r: Original type
+   -a, --allOrNot=a: Whether or not to check all blocks
+   -b, --blocks=b: Number of blocks to check
+   -o, --output=o: Output file
+
+usage: %prog input_file oroutput_file
+"""
+
+import math, sys
+from galaxy import eggs
+import pkg_resources; pkg_resources.require( "bx-python" )
+from bx.cookbook import doc_optparse
+
+def stop_err( msg ):
+    sys.stderr.write( "%s\n" % msg )
+    sys.exit()
+    
+def all_bases_valid(seq):
+    """Confirm that the sequence contains only bases"""
+    valid_bases = ['a', 'A', 'c', 'C', 'g', 'G', 't', 'T', 'N']
+    for base in seq:
+        if base not in valid_bases:
+            return False
+    return True
+
+def __main__():
+    #Parse Command Line
+    options, args = doc_optparse.parse( __doc__ )
+    orig_type = options.origType
+    if orig_type == 'sanger' and options.allOrNot == 'not':
+        max_blocks = int(options.blocks)
+    else:
+        max_blocks = -1
+    fin = file(options.input, 'r')
+    fout = file(options.output, 'w')
+    range_min = 1000
+    range_max = -5
+    block_num = 0
+    bad_blocks = 0
+    base_len = -1
+    line_count = 0
+    lines = []
+    line = fin.readline()
+    while line:
+        if max_blocks >= 0 and block_num > 0 and orig_type == 'sanger' and max_blocks < block_num:
+            print 'break'
+            break
+        if line.strip():
+            # the line that starts of a block, with a name
+            if line_count % 4 == 0 and line.startswith('@'):
+                lines.append(line)
+                block_num += 1
+            else:
+                # if we expect a sequence of bases
+                if line_count % 4 == 1 and all_bases_valid(line.strip()):
+                    lines.append(line)
+                    base_len = len(line.strip())
+                # if we expect the second name line
+                elif line_count % 4 == 2 and line.startswith('+'):
+                    lines.append(line)
+                # if we expect a sequence of qualities and it's the expected length
+                elif line_count % 4 == 3:
+                    split_line = line.strip().split()
+                    # decimal qualities
+                    if len(split_line) == base_len:
+                        # convert
+                        phred_list = []
+                        for ch in split_line:
+                            int_ch = int(ch)
+                            if int_ch < range_min:
+                                range_min = int_ch
+                            if int_ch > range_max:
+                                range_max = int_ch
+                            if int_ch >= 0 and int_ch <= 93:
+                                phred_list.append(chr(int_ch + 33))
+                        # make sure we haven't lost any quality values
+                        if len(phred_list) == base_len:
+                            # print first three lines
+                            for l in lines:
+                                fout.write(l)
+                            # print converted quality line
+                            fout.write(''.join(phred_list))
+                            # reset
+                            lines = []
+                            base_len = -1
+                        # abort if so
+                        else:
+                            bad_blocks += 1
+                            lines = []
+                            base_len = -1
+                    # ascii qualities
+                    elif len(split_line[0]) == base_len:
+                        qualities = []
+                        # print converted quality line
+                        if orig_type == 'illumina':
+                            for c in line.strip():
+                                if ord(c) - 64 < range_min:
+                                    range_min = ord(c) - 64
+                                if ord(c) - 64 > range_max:
+                                    range_max = ord(c) - 64
+                                if ord(c) < 64 or ord(c) > 126:
+                                    bad_blocks += 1
+                                    base_len = -1
+                                    lines = []
+                                    break
+                                else:
+                                    qualities.append( chr( ord(c) - 31 ) )
+                            quals = ''.join(qualities)
+                        elif orig_type == 'solexa':
+                            for c in line.strip():
+                                if ord(c) - 64 < range_min:
+                                    range_min = ord(c) - 64
+                                if ord(c) - 64 > range_max:
+                                    range_max = ord(c) - 64
+                                if ord(c) < 59 or ord(c) > 126:
+                                    bad_blocks += 1
+                                    base_len = -1
+                                    lines = []
+                                    break
+                                else:
+                                    p = 10.0**( ( ord(c) - 64 ) / -10.0 ) / ( 1 + 10.0**( ( ord(c) - 64 ) / -10.0 ) )
+                                    qualities.append( chr( int( -10.0*math.log10( p ) ) + 33 ) )
+                            quals = ''.join(qualities)
+                        else:  # 'sanger'
+                            for c in line.strip():
+                                if ord(c) - 33 < range_min:
+                                    range_min = ord(c) - 33
+                                if ord(c) - 33 > range_max:
+                                    range_max = ord(c) - 33
+                                if ord(c) < 33 or ord(c) > 126:
+                                    bad_blocks += 1
+                                    base_len = -1
+                                    lines = []
+                                    break
+                                else:
+                                    qualities.append(c)
+                            quals = ''.join(qualities)
+                        # make sure we don't have bad qualities
+                        if len(quals) == base_len:
+                            # print first three lines
+                            for l in lines:
+                                fout.write(l)
+                            # print out quality line
+                            fout.write(quals+'\n')
+                        # reset
+                        lines = []
+                        base_len = -1
+                    else:
+                        bad_blocks += 1
+                        base_len = -1
+                        lines = []
+            line_count += 1
+        line = fin.readline()
+    fout.close()
+    fin.close()
+    if range_min != 1000 and range_min != -5:
+        outmsg = 'The range of quality values found were: %s to %s' % (range_min, range_max)
+    else:
+        outmsg = ''
+    if bad_blocks > 0:
+        outmsg += '\nThere were %s bad blocks skipped' % (bad_blocks)
+    sys.stdout.write(outmsg)
+
+if __name__=="__main__": __main__() 
\ No newline at end of file
diff -r 2fb0a64c6aaa -r b25297e88f96 tools/next_gen_conversion/fastq_gen_conv.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/next_gen_conversion/fastq_gen_conv.xml	Tue Oct 06 21:25:01 2009 -0400
@@ -0,0 +1,100 @@
+<tool id="fastq_gen_conv" name="FASTQ Groomer" version="1.0.0">
+  <description>converts any type of FASTQ file to Sanger type and validates data</description>
+  <command interpreter="python">
+    fastq_gen_conv.py 
+     --input=$input 
+     --origType=$origTypeChoice.origType
+     #if $origTypeChoice.origType == "sanger":
+      --allOrNot=$origTypeChoice.howManyBlocks.allOrNot 
+      #if $origTypeChoice.howManyBlocks.allOrNot == "not":
+       --blocks=$origTypeChoice.howManyBlocks.blocks
+      #else:
+       --blocks="None"
+      #end if
+     #else:
+      --allOrNot="None"
+      --blocks="None"
+     #end if
+     --output=$output
+  </command>
+  <inputs>
+    <param name="input" type="data" format="fastq" label="FASTQ file to check:" />
+    <conditional name="origTypeChoice">
+      <param name="origType" type="select" label="What type of FASTQ do you think this is?">
+        <option value="solexa">Solexa</option>
+        <option value="illumina">Illumina</option>
+        <option value="sanger">Sanger</option>
+      </param>
+      <when value="solexa" />
+      <when value="illumina" />
+      <when value="sanger">
+        <conditional name="howManyBlocks">
+          <param name="allOrNot" type="select" label="Do you want to do a subset of lines, or do the whole file?">
+            <option value="all">Check all</option> 
+            <option value="not">Select blocks</option>
+          </param>
+          <when value="all" />
+          <when value="not">
+            <param name="blocks" type="integer" value="1000" label="How many blocks (four lines each) do you want to do?" />
+          </when>
+        </conditional>
+      </when>
+    </conditional>
+  </inputs>
+  <outputs>
+    <data name="output" format="fastqsanger"/>
+  </outputs>
+  <tests>
+    <test>
+      <param name="input" value="fastq_gen_conv_in1.fastq" ftype="fastq" />
+      <param name="origType" value="solexa" />
+      <output name="output" format="fastqsanger" file="fastq_gen_conv_out1.fastqsanger" />
+    </test>
+    <test>
+      <param name="input" value="fastq_gen_conv_in2.fastq" ftype="fastq" />
+      <param name="origType" value="sanger" />
+      <param name="allOrNot" value="not" />
+      <param name="blocks" value="3" />
+      <output name="output" format="fastqsanger" file="fastq_gen_conv_out2.fastqsanger" />
+    </test>
+  </tests>
+  <help>
+
+**What it does**
+
+This tool takes a FASTQ file (Solexa or Illumina) and converts it to Sanger format. It only converts valid blocks. It also can confirm the validity of Sanger FASTQ.
+
+-----
+
+**Example**
+
+- Converting the following Solexa FASTQ file::
+
+    @seq1
+    AGTCGTGGTCATCGTGACTAGTCGATCTAGCTAGCTCTCTAGAGTGT
+    +
+    ;>@BCEFGHJKLMNOPQRSTUVWXYZ[\]^_?abcdefghijklmno
+    @seq2
+    AGTCGTTGTCATCGTGACTAGTCGATCTAGCTAGCTCTCTAGAGTGT
+    +
+    ;>@BCElcH@KLMNOPQ>STZVWbYu[\]^_?a=;d?fghijklmno
+    @seq3
+    AGTCGTCGTCATCGTGACTAGTCGATCTAGCTAGCTCTCTAGAGTGT
+    +
+    7>@BCEFGHJKLMNOPQRSTUVWXYZ[\]^_?abcdefghijklmno
+
+- will produce the following Sanger FASTQ data::
+
+    @seq1
+    AGTCGTGGTCATCGTGACTAGTCGATCTAGCTAGCTCTCTAGAGTGT
+    +
+    "#$%%''()+,-./0123456789:;<=>?@#BCDEFGHIJKLMNOP
+    @seq2
+    AGTCGTTGTCATCGTGACTAGTCGATCTAGCTAGCTCTCTAGAGTGT
+    +
+    "#$%%'MD)$,-./012#45;78C:V%lt;=>?@#B""E#GHIJKLMNOP
+    
+- Note that seq3 was not converted, because it contained an invalid Solexa quality value (7).
+
+    </help>
+</tool>

    

Greg Von Kuster

tags

participants (1)