details: http://www.bx.psu.edu/hg/galaxy/rev/3ad620871b25 changeset: 2839:3ad620871b25 user: Anton Nekrutenko <anton@bx.psu.edu> date: Wed Oct 07 11:18:14 2009 -0400 description: ngs updates 6 file(s) affected in this change: tool_conf.xml.sample tools/fastx_toolkit/fastq_quality_converter.xml tools/fastx_toolkit/fastq_to_fasta.xml tools/fastx_toolkit/fastx_quality_statistics.xml tools/metag_tools/split_paired_reads.xml tools/next_gen_conversion/fastq_gen_conv.xml diffs (251 lines): diff -r 9a75d2428e21 -r 3ad620871b25 tool_conf.xml.sample --- a/tool_conf.xml.sample Wed Oct 07 11:12:39 2009 -0400 +++ b/tool_conf.xml.sample Wed Oct 07 11:18:14 2009 -0400 @@ -72,10 +72,6 @@ <tool file="maf/maf_to_fasta.xml" /> <tool file="fasta_tools/tabular_to_fasta.xml" /> <tool file="fastx_toolkit/fastq_to_fasta.xml" /> - <tool file="next_gen_conversion/solid_to_fastq.xml" /> - <tool file="next_gen_conversion/fastq_conversions.xml" /> - <tool file="fastx_toolkit/fastq_quality_converter.xml" /> - <tool file="next_gen_conversion/fastq_gen_conv.xml" /> </section> <section name="Extract Features" id="features"> <tool file="filters/ucsc_gene_bed_to_exon_bed.xml" /> @@ -175,32 +171,27 @@ </section> <section name="NGS: QC and manipulation" id="cshl_library_information"> <label text="Generic FASTQ data" id="fastq" /> + <tool file="next_gen_conversion/fastq_gen_conv.xml" /> + <tool file="fastx_toolkit/fastq_quality_converter.xml" /> <tool file="fastx_toolkit/fastx_quality_statistics.xml" /> <tool file="fastx_toolkit/fastq_quality_boxplot.xml" /> <tool file="fastx_toolkit/fastx_nucleotides_distribution.xml" /> - <!-- <tool file="fastx_toolkit/fasta_clipping_histogram.xml" /> --> - <!-- <tool file="fastx_toolkit/fastx_clipper.xml" /> --> - <tool file="fastx_toolkit/fastx_trimmer.xml" /> - <tool file="fastx_toolkit/fastx_renamer.xml" /> - <tool file="fastx_toolkit/fastx_reverse_complement.xml" /> - <tool file="fastx_toolkit/fastx_artifacts_filter.xml" /> - <tool 
file="fastx_toolkit/fastq_quality_filter.xml" /> - <!--<tool file="fastx_toolkit/fastx_barcode_splitter.xml" />--> <tool file="metag_tools/split_paired_reads.xml" /> <label text="Roche-454 data" id="454" /> <tool file="metag_tools/short_reads_figure_score.xml" /> <tool file="metag_tools/short_reads_trim_seq.xml" /> <label text="AB-SOLiD data" id="solid" /> + <tool file="next_gen_conversion/solid_to_fastq.xml" /> <tool file="solid_tools/solid_qual_stats.xml" /> <tool file="solid_tools/solid_qual_boxplot.xml" /> </section> <section name="NGS: Mapping" id="solexa_tools"> <!-- <tool file="sr_mapping/lastz_wrapper.xml" /> --> + <tool file="sr_mapping/bowtie_wrapper.xml" /> + <tool file="sr_mapping/bwa_wrapper.xml" /> <tool file="metag_tools/megablast_wrapper.xml" /> <tool file="metag_tools/megablast_xml_parser.xml" /> - <tool file="sr_mapping/bowtie_wrapper.xml" /> - <tool file="sr_mapping/bwa_wrapper.xml" /> - </section> + </section> <section name="NGS: SAM Tools" id="samtools"> <tool file="samtools/sam_bitwise_flag_filter.xml" /> <tool file="samtools/sam2interval.xml" /> diff -r 9a75d2428e21 -r 3ad620871b25 tools/fastx_toolkit/fastq_quality_converter.xml --- a/tools/fastx_toolkit/fastq_quality_converter.xml Wed Oct 07 11:12:39 2009 -0400 +++ b/tools/fastx_toolkit/fastq_quality_converter.xml Wed Oct 07 11:18:14 2009 -0400 @@ -2,7 +2,7 @@ <description>(ASCII-Numeric)</description> <command>zcat -f $input | fastq_quality_converter $QUAL_FORMAT -o $output -Q $offset</command> <inputs> - <param format="fastqsolexa,fastqsanger" name="input" type="data" label="Library to convert" /> + <param format="fastq" name="input" type="data" label="Library to convert" /> <param name="QUAL_FORMAT" type="select" label="Desired output format"> <option value="-a">ASCII (letters) quality scores</option> @@ -11,7 +11,7 @@ <param name="offset" type="select" label="FASTQ ASCII offset"> <option value="33">33</option> - <option value="64">64</option> + <option selected="true" 
value="64">64</option> </param> </inputs> @@ -47,7 +47,7 @@ </tests> <outputs> - <data format="fastqsolexa" name="output" metadata_source="input" /> + <data format="fastq" name="output" metadata_source="input" /> </outputs> <help> diff -r 9a75d2428e21 -r 3ad620871b25 tools/fastx_toolkit/fastq_to_fasta.xml --- a/tools/fastx_toolkit/fastq_to_fasta.xml Wed Oct 07 11:12:39 2009 -0400 +++ b/tools/fastx_toolkit/fastq_to_fasta.xml Wed Oct 07 11:18:14 2009 -0400 @@ -3,7 +3,7 @@ <command>gunzip -cf $input | fastq_to_fasta $SKIPN $RENAMESEQ -o $output -v </command> <inputs> - <param format="fastqsolexa,fastqsanger" name="input" type="data" label="FASTQ Library to convert" /> + <param format="fastq" name="input" type="data" label="FASTQ Library to convert" /> <param name="SKIPN" type="select" label="Discard sequences with unknown (N) bases "> <option value="">yes</option> diff -r 9a75d2428e21 -r 3ad620871b25 tools/fastx_toolkit/fastx_quality_statistics.xml --- a/tools/fastx_toolkit/fastx_quality_statistics.xml Wed Oct 07 11:12:39 2009 -0400 +++ b/tools/fastx_toolkit/fastx_quality_statistics.xml Wed Oct 07 11:18:14 2009 -0400 @@ -3,11 +3,8 @@ <command>zcat -f $input | fastx_quality_stats -o $output -Q $offset</command> <inputs> - <param format="fasta,fastqsolexa,fastqsanger" name="input" type="data" label="Library to analyse" /> - <param name="offset" type="select" label="FASTQ ASCII offset"> - <option value="33">33</option> - <option value="64">64</option> - </param> + <param format="fastqsanger" name="input" type="data" label="Library to analyse" /> + <param name="offset" type="hidden" value="33"/> </inputs> <tests> diff -r 9a75d2428e21 -r 3ad620871b25 tools/metag_tools/split_paired_reads.xml --- a/tools/metag_tools/split_paired_reads.xml Wed Oct 07 11:12:39 2009 -0400 +++ b/tools/metag_tools/split_paired_reads.xml Wed Oct 07 11:18:14 2009 -0400 @@ -4,7 +4,7 @@ split_paired_reads.py $input $output1 $output2 </command> <inputs> - <param name="input" type="data" 
format="fastqsolexa,fastqsanger" label="Your paired-end file" /> + <param name="input" type="data" format="fastqsanger" label="Your paired-end file" /> </inputs> <outputs> <data name="output1" format="input"/> @@ -12,8 +12,8 @@ </outputs> <tests> <test> - <param name="input" value="split_paired_reads_test1.fastq" ftype="fastqsolexa" /> - <output name="output1" file="split_paired_reads_test1.out1" fype="fastqsolexa" /> + <param name="input" value="split_paired_reads_test1.fastq" ftype="fastqsanger"/> + <output name="output1" file="split_paired_reads_test1.out1" ftype="fastqsanger"/> </test> </tests> <help> diff -r 9a75d2428e21 -r 3ad620871b25 tools/next_gen_conversion/fastq_gen_conv.xml --- a/tools/next_gen_conversion/fastq_gen_conv.xml Wed Oct 07 11:12:39 2009 -0400 +++ b/tools/next_gen_conversion/fastq_gen_conv.xml Wed Oct 07 11:18:14 2009 -0400 @@ -1,5 +1,5 @@ <tool id="fastq_gen_conv" name="FASTQ Groomer" version="1.0.0"> - <description>converts any type of FASTQ file to Sanger type and validates data</description> + <description>converts any FASTQ to Sanger</description> <command interpreter="python"> fastq_gen_conv.py --input=$input @@ -18,24 +18,24 @@ --output=$output </command> <inputs> - <param name="input" type="data" format="fastq" label="FASTQ file to check:" /> + <param name="input" type="data" format="fastq" label="Groom this dataset" /> <conditional name="origTypeChoice"> - <param name="origType" type="select" label="What type of FASTQ do you think this is?"> - <option value="solexa">Solexa</option> - <option value="illumina">Illumina</option> - <option value="sanger">Sanger</option> + <param name="origType" type="select" label="How do you think quality values are scaled?" 
help="See below for explanation"> + <option value="solexa">Solexa/Illumina 1.0</option> + <option value="illumina">Illumina 1.3+</option> + <option value="sanger">Sanger (validation only)</option> </param> <when value="solexa" /> <when value="illumina" /> <when value="sanger"> <conditional name="howManyBlocks"> - <param name="allOrNot" type="select" label="Do you want to do a subset of lines, or do the whole file?"> - <option value="all">Check all</option> - <option value="not">Select blocks</option> + <param name="allOrNot" type="select" label="Since your fastq is already in Sanger format you can check it for consistency"> + <option value="all">Check all (may take a while)</option> + <option selected="true" value="not">Check selected number of blocks</option> </param> <when value="all" /> <when value="not"> - <param name="blocks" type="integer" value="1000" label="How many blocks (four lines each) do you want to do?" /> + <param name="blocks" type="integer" value="1000" label="How many blocks (four lines each) do you want to check?" /> </when> </conditional> </when> @@ -62,39 +62,45 @@ **What it does** -This tool takes a FASTQ file (Solexa or Illumina) and converts it to Sanger format. It only converts valid blocks. It also can confirm the validity of Sanger FASTQ. +Galaxy pipeline for mapping of Illumina data requires data to be in fastq format with quality values conforming to so called "Sanger" format. Unfortunately there are many other types of fastq. Thus the main objective of this tool is to "groom" multiple types of fastq into Sanger-conforming fastq that can be used in downstream application such as mapping. + +.. class:: infomark + +**TIP**: If the input dataset is already in Sanger format the tool does not perform conversion. However validation (described below) is still performed. 
----- -**Example** +**Types of fastq datasets** -- Converting the following Solexa FASTQ file:: +A good description of fastq datasets can be found `here`__, while a description of Galaxy's fastq "logic" can be found `here`__. Because ranges of quality values within different types of fastq datasets overlap it is very difficult to detect them automatically. This tool supports conversion of two commonly found types (Solexa/Illumina 1.0 and Illumina 1.3+) into fastq Sanger. - @seq1 - AGTCGTGGTCATCGTGACTAGTCGATCTAGCTAGCTCTCTAGAGTGT - + - ;>@BCEFGHJKLMNOPQRSTUVWXYZ[\]^_?abcdefghijklmno - @seq2 - AGTCGTTGTCATCGTGACTAGTCGATCTAGCTAGCTCTCTAGAGTGT - + - ;>@BCElcH@KLMNOPQ>STZVWbYu[\]^_?a=;d?fghijklmno - @seq3 - AGTCGTCGTCATCGTGACTAGTCGATCTAGCTAGCTCTCTAGAGTGT - + - 7>@BCEFGHJKLMNOPQRSTUVWXYZ[\]^_?abcdefghijklmno + .. __: http://en.wikipedia.org/wiki/FASTQ_format + .. __: http://bitbucket.org/galaxy/galaxy-central/wiki/NGS -- will produce the following Sanger FASTQ data:: +.. class:: warningmark - @seq1 - AGTCGTGGTCATCGTGACTAGTCGATCTAGCTAGCTCTCTAGAGTGT - + - "#$%%''()+,-./0123456789:;<=>?@#BCDEFGHIJKLMNOP - @seq2 - AGTCGTTGTCATCGTGACTAGTCGATCTAGCTAGCTCTCTAGAGTGT - + - "#$%%'MD)$,-./012#45;78C:V%lt;=>?@#B""E#GHIJKLMNOP - -- Note that seq3 was not converted, because it contained an invalid Solexa quality value (7). +**NOTE** that there is also a type of fastq format where quality values are represented by a list of space-delimited integers (e.g., 40 40 20 15 -5 20 ...). This tool **does not** handle such fastq. If you have such a dataset, it needs to be converted into ASCII-type fastq (where quality values are encoded by characters) by "Numeric-to-ASCII" utility before it can be accepted by this tool. + +----- + +**Validation** + +In addition to converting quality values to Sanger format the tool also checks the input dataset for consistency. Specifically, it performs these four checks: + +- skips empty lines +- checks that blocks are properly formed by making sure that: + + #. 
there are four lines per block + #. the first line starts with "@" + #. the third line starts with "+" + #. lengths of second line (sequences) and the fourth line (quality string) are identical + +- checks that quality values are within range for the chosen fastq format (i.e., the format provided by the user in **How do you think quality values are scaled?** drop down). + +To see exactly what the tool does you can take a look at its source code `here`__. + + .. __: http://bitbucket.org/galaxy/galaxy-central/src/tip/tools/next_gen_conversion... + </help> </tool>