details: http://www.bx.psu.edu/hg/galaxy/rev/81de8629eeb6 changeset: 3620:81de8629eeb6 user: Dan Blankenberg <dan@bx.psu.edu> date: Thu Apr 08 15:25:38 2010 -0400 description: Add FASTQ Quality Trimmer by sliding window tool. diffstat: test-data/sanger_full_range_quality_trimmed_out_1.fastqsanger | 8 + test-data/sanger_full_range_quality_trimmed_out_2.fastqsanger | 8 + test-data/sanger_full_range_quality_trimmed_out_3.fastqsanger | 8 + tool_conf.xml.main | 1 + tool_conf.xml.sample | 1 + tools/fastq/fastq_trimmer_by_quality.py | 121 ++++++++++ tools/fastq/fastq_trimmer_by_quality.xml | 112 +++++++++ 7 files changed, 259 insertions(+), 0 deletions(-) diffs (299 lines): diff -r 61b09dc1dff2 -r 81de8629eeb6 test-data/sanger_full_range_quality_trimmed_out_1.fastqsanger --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/sanger_full_range_quality_trimmed_out_1.fastqsanger Thu Apr 08 15:25:38 2010 -0400 @@ -0,0 +1,8 @@ +@FAKE0001 Original version has PHRED scores from 0 to 93 inclusive (in that order) +ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTAC ++ +56789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~ +@FAKE0002 Original version has PHRED scores from 93 to 0 inclusive (in that order) +CATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCA ++ +~}|{zyxwvutsrqponmlkjihgfedcba`_^]\[ZYXWVUTSRQPONMLKJIHGFEDCBA@?>=<;:98765 diff -r 61b09dc1dff2 -r 81de8629eeb6 test-data/sanger_full_range_quality_trimmed_out_2.fastqsanger --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/sanger_full_range_quality_trimmed_out_2.fastqsanger Thu Apr 08 15:25:38 2010 -0400 @@ -0,0 +1,8 @@ +@FAKE0001 Original version has PHRED scores from 0 to 93 inclusive (in that order) +ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTAC ++ +56789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~ +@FAKE0002 Original version has PHRED scores from 93 to 0 inclusive (in that order) 
+CATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCA ++ +~}|{zyxwvutsrqponmlkjihgfedcba`_^]\[ZYXWVUTSRQPONMLKJIHGFEDCBA@?>=<;:9876543210/.-,+*)('&%$#"! diff -r 61b09dc1dff2 -r 81de8629eeb6 test-data/sanger_full_range_quality_trimmed_out_3.fastqsanger --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/sanger_full_range_quality_trimmed_out_3.fastqsanger Thu Apr 08 15:25:38 2010 -0400 @@ -0,0 +1,8 @@ +@FAKE0001 Original version has PHRED scores from 0 to 93 inclusive (in that order) +ACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTAC ++ +!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~ +@FAKE0002 Original version has PHRED scores from 93 to 0 inclusive (in that order) +CATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCA ++ +~}|{zyxwvutsrqponmlkjihgfedcba`_^]\[ZYXWVUTSRQPONMLKJIHGFEDCBA@?>=<;:98765 diff -r 61b09dc1dff2 -r 81de8629eeb6 tool_conf.xml.main --- a/tool_conf.xml.main Wed Apr 07 11:12:00 2010 -0400 +++ b/tool_conf.xml.main Thu Apr 08 15:25:38 2010 -0400 @@ -295,6 +295,7 @@ <label text="Generic FASTQ manipulation" id="generic_fastq" /> <tool file="fastq/fastq_filter.xml" /> <tool file="fastq/fastq_trimmer.xml" /> + <tool file="fastq/fastq_trimmer_by_quality.xml" /> <tool file="fastq/fastq_manipulation.xml" /> <tool file="fastq/fastq_to_fasta.xml" /> <tool file="fastq/fastq_to_tabular.xml" /> diff -r 61b09dc1dff2 -r 81de8629eeb6 tool_conf.xml.sample --- a/tool_conf.xml.sample Wed Apr 07 11:12:00 2010 -0400 +++ b/tool_conf.xml.sample Thu Apr 08 15:25:38 2010 -0400 @@ -209,6 +209,7 @@ <label text="Generic FASTQ manipulation" id="generic_fastq" /> <tool file="fastq/fastq_filter.xml" /> <tool file="fastq/fastq_trimmer.xml" /> + <tool file="fastq/fastq_trimmer_by_quality.xml" /> <tool file="fastq/fastq_manipulation.xml" /> <tool file="fastq/fastq_to_fasta.xml" /> <tool file="fastq/fastq_to_tabular.xml" 
/> diff -r 61b09dc1dff2 -r 81de8629eeb6 tools/fastq/fastq_trimmer_by_quality.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/fastq/fastq_trimmer_by_quality.py Thu Apr 08 15:25:38 2010 -0400 @@ -0,0 +1,121 @@ +#Dan Blankenberg +from optparse import OptionParser +from galaxy_utils.sequence.fastq import fastqReader, fastqWriter + +def mean( score_list ): + return float( sum( score_list ) ) / float( len( score_list ) ) + +ACTION_METHODS = { 'min':min, 'max':max, 'sum':sum, 'mean':mean } + +def compare( aggregated_value, operator, threshold_value ): + if operator == '>': + return aggregated_value > threshold_value + elif operator == '>=': + return aggregated_value >= threshold_value + elif operator == '==': + return aggregated_value == threshold_value + elif operator == '<': + return aggregated_value < threshold_value + elif operator == '<=': + return aggregated_value <= threshold_value + elif operator == '!=': + return aggregated_value != threshold_value + +def exclude( value_list, exclude_indexes ): + rval = [] + for i, val in enumerate( value_list ): + if i not in exclude_indexes: + rval.append( val ) + return rval + +def exclude_and_compare( aggregate_action, aggregate_list, operator, threshold_value, exclude_indexes = None ): + if not aggregate_list or compare( aggregate_action( aggregate_list ), operator, threshold_value ): + return True + if exclude_indexes: + for exclude_index in exclude_indexes: + excluded_list = exclude( aggregate_list, exclude_index ) + if not excluded_list or compare( aggregate_action( excluded_list ), operator, threshold_value ): + return True + return False + +def main(): + usage = "usage: %prog [options] input_file output_file" + parser = OptionParser( usage=usage ) + parser.add_option( '-f', '--format', dest='format', type='choice', default='sanger', choices=( 'sanger', 'cssanger', 'solexa', 'illumina' ), help='FASTQ variant type' ) + parser.add_option( '-s', '--window_size', type="int", dest='window_size', default='1', 
help='Window size' ) + parser.add_option( '-t', '--window_step', type="int", dest='window_step', default='1', help='Window step' ) + parser.add_option( '-e', '--trim_ends', type="choice", dest='trim_ends', default='53', choices=('5','3','53','35' ), help='Ends to Trim' ) + parser.add_option( '-a', '--aggregation_action', type="choice", dest='aggregation_action', default='min', choices=('min','max','sum','mean' ), help='Aggregate action for window' ) + parser.add_option( '-x', '--exclude_count', type="int", dest='exclude_count', default='0', help='Maximum number of bases to exclude from the window during aggregation' ) + parser.add_option( '-c', '--score_comparison', type="choice", dest='score_comparison', default='>=', choices=('>','>=','==','<', '<=', '!=' ), help='Keep read when aggregate score is' ) + parser.add_option( '-q', '--quality_score', type="float", dest='quality_score', default='0', help='Quality Score' ) + parser.add_option( "-k", "--keep_zero_length", action="store_true", dest="keep_zero_length", default=False, help="Keep reads with zero length") + ( options, args ) = parser.parse_args() + + if len ( args ) != 2: + parser.error( "Need to specify an input file and an output file" ) + + #determine an exhaustive list of window indexes that can be excluded from aggregation + exclude_window_indexes = [] + last_exclude_indexes = [] + for exclude_count in range( min( options.exclude_count, options.window_size ) ): + if last_exclude_indexes: + new_exclude_indexes = [] + for exclude_list in last_exclude_indexes: + for window_index in range( options.window_size ): + if window_index not in exclude_list: + new_exclude = sorted( exclude_list + [ window_index ] ) + if new_exclude not in exclude_window_indexes + new_exclude_indexes: + new_exclude_indexes.append( new_exclude ) + exclude_window_indexes += new_exclude_indexes + last_exclude_indexes = new_exclude_indexes + else: + for window_index in range( options.window_size ): + last_exclude_indexes.append( [ 
window_index ] ) + exclude_window_indexes = list( last_exclude_indexes ) + + out = fastqWriter( open( args[1], 'wb' ), format = options.format ) + action = ACTION_METHODS[ options.aggregation_action ] + window_step = abs( options.window_step ) + + num_reads = None + num_reads_excluded = 0 + for num_reads, fastq_read in enumerate( fastqReader( open( args[0] ), format = options.format ) ): + for trim_end in options.trim_ends: + quality_list = fastq_read.get_decimal_quality_scores() + if trim_end == '5': + lwindow_position = 0 #left position of window + while True: + if lwindow_position >= len( quality_list ): + fastq_read.sequence = '' + fastq_read.quality = '' + break + if exclude_and_compare( action, quality_list[ lwindow_position:lwindow_position + options.window_size ], options.score_comparison, options.quality_score, exclude_window_indexes ): + fastq_read = fastq_read.slice( lwindow_position, None ) + break + lwindow_position += window_step + else: + rwindow_position = len( quality_list ) #right position of window + while True: + lwindow_position = rwindow_position - options.window_size #left position of window + if rwindow_position <= 0 or lwindow_position < 0: + fastq_read.sequence = '' + fastq_read.quality = '' + break + if exclude_and_compare( action, quality_list[ lwindow_position:rwindow_position ], options.score_comparison, options.quality_score, exclude_window_indexes ): + fastq_read = fastq_read.slice( None, rwindow_position ) + break + rwindow_position -= window_step + if options.keep_zero_length or len( fastq_read ): + out.write( fastq_read ) + else: + num_reads_excluded += 1 + out.close() + if num_reads is None: + print "No valid FASTQ reads could be processed." + else: + print "%i FASTQ reads were processed." % ( num_reads + 1 ) + if num_reads_excluded: + print "%i reads of zero length were excluded from the output." 
% num_reads_excluded + +if __name__ == "__main__": main() diff -r 61b09dc1dff2 -r 81de8629eeb6 tools/fastq/fastq_trimmer_by_quality.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/fastq/fastq_trimmer_by_quality.xml Thu Apr 08 15:25:38 2010 -0400 @@ -0,0 +1,112 @@ +<tool id="fastq_quality_trimmer" name="FASTQ Quality Trimmer" version="1.0.0"> + <description>by sliding window</description> + <command interpreter="python">fastq_trimmer_by_quality.py '$input_file' '$output_file' -f '${input_file.extension[len( 'fastq' ):]}' -s '$window_size' + -t '$step_size' -e '$trim_ends' -a '$aggregation_action' -x '$exclude_count' -c '$score_comparison' -q '$quality_score' + #if $keep_zero_length.value: + -k + #end if + </command> + <inputs> + <param name="input_file" type="data" format="fastqsanger,fastqcssanger" label="FASTQ File"/> + <param name="keep_zero_length" label="Keep reads with zero length" type="boolean" truevalue="keep_zero_length" falsevalue="exclude_zero_length" selected="False"/> + <param name="trim_ends" type="select" label="Trim ends"> + <option value="53" selected="True">5' and 3'</option> + <option value="5">5' only</option> + <option value="3">3' only</option> + </param> + <param name="window_size" type="integer" value="1" label="Window size"/> + <param name="step_size" type="integer" value="1" label="Step Size" /> + <param name="exclude_count" label="Maximum number of bases to exclude from the window during aggregation" value="0" type="integer" /> + <param name="aggregation_action" type="select" label="Aggregate action for window"> + <option value="min" selected="True">min score</option> + <option value="max">max score</option> + <option value="sum">sum of scores</option> + <option value="mean">mean of scores</option> + </param> + <param name="score_comparison" type="select" label="Trim until aggregate score is"> + <sanitizer> + <valid initial="none"> + <add value="&lt;>=!"/> <!-- only allow lt, gt, e, le, ge, ne for this parameter; will be 
single-quote escaped on commandline --> + </valid> + </sanitizer> + <option value=">">></option> + <option value=">=" selected="true">>=</option> + <option value="==">==</option> + <option value="!=">!=</option> + <option value="&lt;">&lt;</option> + <option value="&lt;=">&lt;=</option> + </param> + <param name="quality_score" label="Quality Score" value="0" type="float" /> + </inputs> + <outputs> + <data name="output_file" format="input" /> + </outputs> + <tests> + <test> + <!-- Trim until window size 1 >= 20;both ends --> + <param name="input_file" value="sanger_full_range_original_sanger.fastqsanger" ftype="fastqsanger" /> + <param name="keep_zero_length" value="exclude_zero_length" /> + <param name="trim_ends" value="53"/> + <param name="window_size" value="1"/> + <param name="step_size" value="1"/> + <param name="exclude_count" value="0"/> + <param name="aggregation_action" value="min"/> + <param name="score_comparison" value=">="/> + <param name="quality_score" value="20"/> + <output name="output_file" file="sanger_full_range_quality_trimmed_out_1.fastqsanger" /> + </test> + <test> + <!-- Trim until window size 1 >= 20; 5' end only --> + <param name="input_file" value="sanger_full_range_original_sanger.fastqsanger" ftype="fastqsanger" /> + <param name="keep_zero_length" value="exclude_zero_length" /> + <param name="trim_ends" value="5"/> + <param name="window_size" value="1"/> + <param name="step_size" value="1"/> + <param name="exclude_count" value="0"/> + <param name="aggregation_action" value="min"/> + <param name="score_comparison" value=">="/> + <param name="quality_score" value="20"/> + <output name="output_file" file="sanger_full_range_quality_trimmed_out_2.fastqsanger" /> + </test> + <test> + <!-- Trim until window size 1 >= 20; 3' end only --> + <param name="input_file" value="sanger_full_range_original_sanger.fastqsanger" ftype="fastqsanger" /> + <param name="keep_zero_length" value="exclude_zero_length" /> + <param name="trim_ends" value="3"/> + <param 
name="window_size" value="1"/> + <param name="step_size" value="1"/> + <param name="exclude_count" value="0"/> + <param name="aggregation_action" value="min"/> + <param name="score_comparison" value=">="/> + <param name="quality_score" value="20"/> + <output name="output_file" file="sanger_full_range_quality_trimmed_out_3.fastqsanger" /> + </test> + <test> + <!-- Trim until window size 2 >= 1;both ends, 1 deviant score --> + <param name="input_file" value="sanger_full_range_original_sanger.fastqsanger" ftype="fastqsanger" /> + <param name="keep_zero_length" value="exclude_zero_length" /> + <param name="trim_ends" value="53"/> + <param name="window_size" value="2"/> + <param name="step_size" value="1"/> + <param name="exclude_count" value="1"/> + <param name="aggregation_action" value="min"/> + <param name="score_comparison" value=">="/> + <param name="quality_score" value="1"/> + <output name="output_file" file="sanger_full_range_original_sanger.fastqsanger" /> + </test> + </tests> + <help> +This tool allows you to trim the ends of reads based upon the aggregate value of quality scores found within a sliding window; a sliding window of size 1 is equivalent to 'simple' trimming of the ends. + +The user specifies the aggregating action (min, max, sum, mean) to perform on the quality score values found within the sliding window to be used with the user defined comparison operation and comparison value. + +The user can provide a maximum count of bases that can be excluded from the aggregation within the window. When set, this tool will first check the aggregation of the entire window, then after removing 1 value, then after removing 2 values, up to the number declared. Setting this value to be equal to or greater than the window size will cause no trimming to occur. + +----- + +.. class:: warningmark + +Trimming a color space read will cause any adapter base to be lost. + + </help> +</tool>