[hg] galaxy 2836: Added FASTQ \"Groomer\" tool to converters sec...
details: http://www.bx.psu.edu/hg/galaxy/rev/b25297e88f96 changeset: 2836:b25297e88f96 user: Kelly Vincent <kpvincent@bx.psu.edu> date: Tue Oct 06 21:25:01 2009 -0400 description: Added FASTQ \"Groomer\" tool to converters section. Relies on new datatype (fastq) which will be added later. 7 file(s) affected in this change: test-data/fastq_gen_conv_in1.fastq test-data/fastq_gen_conv_in2.fastq test-data/fastq_gen_conv_out1.fastqsanger test-data/fastq_gen_conv_out2.fastqsanger tool_conf.xml.sample tools/next_gen_conversion/fastq_gen_conv.py tools/next_gen_conversion/fastq_gen_conv.xml diffs (370 lines): diff -r 2fb0a64c6aaa -r b25297e88f96 test-data/fastq_gen_conv_in1.fastq --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/fastq_gen_conv_in1.fastq Tue Oct 06 21:25:01 2009 -0400 @@ -0,0 +1,16 @@ +@seq1 +AGTCGTGGTCATCGTGACTAGTCGATCTAGCTAGCTCTCTAGAGTGT ++ +;>@BCEFGHJKLMNOPQRSTUVWXYZ[\]^_?abcdefghijklmno +@seq2 +AGTCGTTGTCATCGTGACTAGTCGATCTAGCTAGCTCTCTAGAGTGT ++ +;>@BCElcH@KLMNOPQ>STZVWbYu[\]^_?a=;d?fghijklmno +@seq3 +AGTCGTCGTCATCGTGACTAGTCGATCTAGCTAGCTCTCTAGAGTGT ++ +7>@BCEFGHJKLMNOPQRSTUVWXYZ[\]^_?abcdefghijklmno +@seq4 +AGTCGTAGTCATCGTGACTAGTCGATCTAGCTAGCTCTCTAGAGTGT ++ +;>@BCEFGHJKLMNOPQRSTUVWXYZ[\]^_?abcdefghijklmno diff -r 2fb0a64c6aaa -r b25297e88f96 test-data/fastq_gen_conv_in2.fastq --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/fastq_gen_conv_in2.fastq Tue Oct 06 21:25:01 2009 -0400 @@ -0,0 +1,24 @@ +@seq1 +AAAGGTTTCTCTTTTGGAAATATCTAAATCCC ++ +!"#$%&\'()*+,-./0123456789:;<=>. +@seq2 +GGGTCTCCCAGAATGATTAGAGCCGTATAGGA ++ +?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\] +@seq3 +GCGGTTCAATACGATTACCACCATGATAAATA ++ +?Aa.1ghB2K!#lk(02GY[[II])Kwl+,5M +@seq4 +AGTCTTTTCCTCTAAAATAACATAGGATACTA ++ +ghY)N375Nh.,Ol>==/<:2#i&d%#KdNII +@seq5 +GAGGACTCATGGTAGGTATTTTACATGACATT ++ +IIgy%hf6#394bd&hNMWL$OPB63II*,+- +@seq6 +GGCCTACATTCATTTACGAGACTAATTAGGGA ++ +IIIIIgd6#5%jKO&.,D+s3aW=cdGB#a1$ \ No newline at end of file diff -r 2fb0a64c6aaa -r b25297e88f96 test-data/fastq_gen_conv_out1.fastqsanger --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/fastq_gen_conv_out1.fastqsanger Tue Oct 06 21:25:01 2009 -0400 @@ -0,0 +1,12 @@ +@seq1 +AGTCGTGGTCATCGTGACTAGTCGATCTAGCTAGCTCTCTAGAGTGT ++ +"#$%%''()+,-./0123456789:;<=>?@#BCDEFGHIJKLMNOP +@seq2 +AGTCGTTGTCATCGTGACTAGTCGATCTAGCTAGCTCTCTAGAGTGT ++ +"#$%%'MD)$,-./012#45;78C:V<=>?@#B""E#GHIJKLMNOP +@seq4 +AGTCGTAGTCATCGTGACTAGTCGATCTAGCTAGCTCTCTAGAGTGT ++ +"#$%%''()+,-./0123456789:;<=>?@#BCDEFGHIJKLMNOP diff -r 2fb0a64c6aaa -r b25297e88f96 test-data/fastq_gen_conv_out2.fastqsanger --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/fastq_gen_conv_out2.fastqsanger Tue Oct 06 21:25:01 2009 -0400 @@ -0,0 +1,12 @@ +@seq1 +AAAGGTTTCTCTTTTGGAAATATCTAAATCCC ++ +!"#$%&\'()*+,-./0123456789:;<=>. +@seq2 +GGGTCTCCCAGAATGATTAGAGCCGTATAGGA ++ +?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\] +@seq3 +GCGGTTCAATACGATTACCACCATGATAAATA ++ +?Aa.1ghB2K!#lk(02GY[[II])Kwl+,5M diff -r 2fb0a64c6aaa -r b25297e88f96 tool_conf.xml.sample --- a/tool_conf.xml.sample Tue Oct 06 16:55:47 2009 -0400 +++ b/tool_conf.xml.sample Tue Oct 06 21:25:01 2009 -0400 @@ -75,6 +75,7 @@ <tool file="next_gen_conversion/solid_to_fastq.xml" /> <tool file="next_gen_conversion/fastq_conversions.xml" /> <tool file="fastx_toolkit/fastq_quality_converter.xml" /> + <tool file="next_gen_conversion/fastq_gen_conv.xml" /> </section> <section name="Extract Features" id="features"> <tool file="filters/ucsc_gene_bed_to_exon_bed.xml" /> diff -r 2fb0a64c6aaa -r b25297e88f96 tools/next_gen_conversion/fastq_gen_conv.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/next_gen_conversion/fastq_gen_conv.py Tue Oct 06 21:25:01 2009 -0400 @@ -0,0 +1,169 @@ +""" +Converts any type of FASTQ file to Sanger type and makes small adjustments if necessary. + +usage: %prog [options] + -i, --input=i: Input FASTQ candidate file + -r, --origType=r: Original type + -a, --allOrNot=a: Whether or not to check all blocks + -b, --blocks=b: Number of blocks to check + -o, --output=o: Output file + +usage: %prog input_file oroutput_file +""" + +import math, sys +from galaxy import eggs +import pkg_resources; pkg_resources.require( "bx-python" ) +from bx.cookbook import doc_optparse + +def stop_err( msg ): + sys.stderr.write( "%s\n" % msg ) + sys.exit() + +def all_bases_valid(seq): + """Confirm that the sequence contains only bases""" + valid_bases = ['a', 'A', 'c', 'C', 'g', 'G', 't', 'T', 'N'] + for base in seq: + if base not in valid_bases: + return False + return True + +def __main__(): + #Parse Command Line + options, args = doc_optparse.parse( __doc__ ) + orig_type = options.origType + if orig_type == 'sanger' and options.allOrNot == 'not': + max_blocks = int(options.blocks) + else: + max_blocks = -1 + fin = file(options.input, 'r') + fout = file(options.output, 'w') + range_min = 1000 + range_max = -5 + block_num = 0 + bad_blocks = 0 + base_len = -1 + line_count = 0 + lines = [] + line = fin.readline() + while line: + if max_blocks >= 0 and block_num > 0 and orig_type == 'sanger' and max_blocks < block_num: + print 'break' + break + if line.strip(): + # the line that starts of a block, with a name + if line_count % 4 == 0 and line.startswith('@'): + lines.append(line) + block_num += 1 + else: + # if we expect a sequence of bases + if line_count % 4 == 1 and all_bases_valid(line.strip()): + lines.append(line) + base_len = len(line.strip()) + # if we expect the second name line + elif line_count % 4 == 2 and line.startswith('+'): + lines.append(line) + # if we expect a sequence of qualities and it's the expected length + elif line_count % 4 == 3: + split_line = line.strip().split() + # decimal qualities + if len(split_line) == base_len: + # convert + phred_list = [] + for ch in split_line: + int_ch = int(ch) + if int_ch < range_min: + range_min = int_ch + if int_ch > range_max: + range_max = int_ch + if int_ch >= 0 and int_ch <= 93: + phred_list.append(chr(int_ch + 33)) + # make sure we haven't lost any quality values + if len(phred_list) == base_len: + # print first three lines + for l in lines: + fout.write(l) + # print converted quality line + fout.write(''.join(phred_list)) + # reset + lines = [] + base_len = -1 + # abort if so + else: + bad_blocks += 1 + lines = [] + base_len = -1 + # ascii qualities + elif len(split_line[0]) == base_len: + qualities = [] + # print converted quality line + if orig_type == 'illumina': + for c in line.strip(): + if ord(c) - 64 < range_min: + range_min = ord(c) - 64 + if ord(c) - 64 > range_max: + range_max = ord(c) - 64 + if ord(c) < 64 or ord(c) > 126: + bad_blocks += 1 + base_len = -1 + lines = [] + break + else: + qualities.append( chr( ord(c) - 31 ) ) + quals = ''.join(qualities) + elif orig_type == 'solexa': + for c in line.strip(): + if ord(c) - 64 < range_min: + range_min = ord(c) - 64 + if ord(c) - 64 > range_max: + range_max = ord(c) - 64 + if ord(c) < 59 or ord(c) > 126: + bad_blocks += 1 + base_len = -1 + lines = [] + break + else: + p = 10.0**( ( ord(c) - 64 ) / -10.0 ) / ( 1 + 10.0**( ( ord(c) - 64 ) / -10.0 ) ) + qualities.append( chr( int( -10.0*math.log10( p ) ) + 33 ) ) + quals = ''.join(qualities) + else: # 'sanger' + for c in line.strip(): + if ord(c) - 33 < range_min: + range_min = ord(c) - 33 + if ord(c) - 33 > range_max: + range_max = ord(c) - 33 + if ord(c) < 33 or ord(c) > 126: + bad_blocks += 1 + base_len = -1 + lines = [] + break + else: + qualities.append(c) + quals = ''.join(qualities) + # make sure we don't have bad qualities + if len(quals) == base_len: + # print first three lines + for l in lines: + fout.write(l) + # print out quality line + fout.write(quals+'\n') + # reset + lines = [] + base_len = -1 + else: + bad_blocks += 1 + base_len = -1 + lines = [] + line_count += 1 + line = fin.readline() + fout.close() + fin.close() + if range_min != 1000 and range_min != -5: + outmsg = 'The range of quality values found were: %s to %s' % (range_min, range_max) + else: + outmsg = '' + if bad_blocks > 0: + outmsg += '\nThere were %s bad blocks skipped' % (bad_blocks) + sys.stdout.write(outmsg) + +if __name__=="__main__": __main__() \ No newline at end of file diff -r 2fb0a64c6aaa -r b25297e88f96 tools/next_gen_conversion/fastq_gen_conv.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/next_gen_conversion/fastq_gen_conv.xml Tue Oct 06 21:25:01 2009 -0400 @@ -0,0 +1,100 @@ +<tool id="fastq_gen_conv" name="FASTQ Groomer" version="1.0.0"> + <description>converts any type of FASTQ file to Sanger type and validates data</description> + <command interpreter="python"> + fastq_gen_conv.py + --input=$input + --origType=$origTypeChoice.origType + #if $origTypeChoice.origType == "sanger": + --allOrNot=$origTypeChoice.howManyBlocks.allOrNot + #if $origTypeChoice.howManyBlocks.allOrNot == "not": + --blocks=$origTypeChoice.howManyBlocks.blocks + #else: + --blocks="None" + #end if + #else: + --allOrNot="None" + --blocks="None" + #end if + --output=$output + </command> + <inputs> + <param name="input" type="data" format="fastq" label="FASTQ file to check:" /> + <conditional name="origTypeChoice"> + <param name="origType" type="select" label="What type of FASTQ do you think this is?"> + <option value="solexa">Solexa</option> + <option value="illumina">Illumina</option> + <option value="sanger">Sanger</option> + </param> + <when value="solexa" /> + <when value="illumina" /> + <when value="sanger"> + <conditional name="howManyBlocks"> + <param name="allOrNot" type="select" label="Do you want to do a subset of lines, or do the whole file?"> + <option value="all">Check all</option> + <option value="not">Select blocks</option> + </param> + <when value="all" /> + <when value="not"> + <param name="blocks" type="integer" value="1000" label="How many blocks (four lines each) do you want to do?" /> + </when> + </conditional> + </when> + </conditional> + </inputs> + <outputs> + <data name="output" format="fastqsanger"/> + </outputs> + <tests> + <test> + <param name="input" value="fastq_gen_conv_in1.fastq" ftype="fastq" /> + <param name="origType" value="solexa" /> + <output name="output" format="fastqsanger" file="fastq_gen_conv_out1.fastqsanger" /> + </test> + <test> + <param name="input" value="fastq_gen_conv_in2.fastq" ftype="fastq" /> + <param name="origType" value="sanger" /> + <param name="allOrNot" value="not" /> + <param name="blocks" value="3" /> + <output name="output" format="fastqsanger" file="fastq_gen_conv_out2.fastqsanger" /> + </test> + </tests> + <help> + +**What it does** + +This tool takes a FASTQ file (Solexa or Illumina) and converts it to Sanger format. It only converts valid blocks. It also can confirm the validity of Sanger FASTQ. + +----- + +**Example** + +- Converting the following Solexa FASTQ file:: + + @seq1 + AGTCGTGGTCATCGTGACTAGTCGATCTAGCTAGCTCTCTAGAGTGT + + + ;>@BCEFGHJKLMNOPQRSTUVWXYZ[\]^_?abcdefghijklmno + @seq2 + AGTCGTTGTCATCGTGACTAGTCGATCTAGCTAGCTCTCTAGAGTGT + + + ;>@BCElcH@KLMNOPQ>STZVWbYu[\]^_?a=;d?fghijklmno + @seq3 + AGTCGTCGTCATCGTGACTAGTCGATCTAGCTAGCTCTCTAGAGTGT + + + 7>@BCEFGHJKLMNOPQRSTUVWXYZ[\]^_?abcdefghijklmno + +- will produce the following Sanger FASTQ data:: + + @seq1 + AGTCGTGGTCATCGTGACTAGTCGATCTAGCTAGCTCTCTAGAGTGT + + + "#$%%''()+,-./0123456789:;<=>?@#BCDEFGHIJKLMNOP + @seq2 + AGTCGTTGTCATCGTGACTAGTCGATCTAGCTAGCTCTCTAGAGTGT + + + "#$%%'MD)$,-./012#45;78C:V%lt;=>?@#B""E#GHIJKLMNOP + +- Note that seq3 was not converted, because it contained an invalid Solexa quality value (7). + + </help> +</tool>
participants (1)
-
Greg Von Kuster