[hg] galaxy 3803: Add a tool to mask FASTQ bases according to quality score.
details: http://www.bx.psu.edu/hg/galaxy/rev/c9b41f94d707 changeset: 3803:c9b41f94d707 user: Dan Blankenberg <dan@bx.psu.edu> date: Fri May 21 16:48:59 2010 -0400 description: Add a tool to mask FASTQ bases according to quality score. Currently replacement by Ns or lowercase is allowed. diffstat: test-data/sanger_full_range_masked_N.fastqsanger | 8 + test-data/sanger_full_range_masked_lowercase.fastqsanger | 8 + tool_conf.xml.sample | 1 + tools/fastq/fastq_masker_by_quality.py | 83 ++++++++++++++++ tools/fastq/fastq_masker_by_quality.xml | 53 ++++++++++ 5 files changed, 153 insertions(+), 0 deletions(-) diffs (179 lines): diff -r 3b8e4af25be2 -r c9b41f94d707 test-data/sanger_full_range_masked_N.fastqsanger --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/sanger_full_range_masked_N.fastqsanger Fri May 21 16:48:59 2010 -0400 @@ -0,0 +1,8 @@ +@FAKE0001 Original version has PHRED scores from 0 to 93 inclusive (in that order) +NNNNNNNNNNNNNNNNNNNNNCGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTAC ++ +!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~ +@FAKE0002 Original version has PHRED scores from 93 to 0 inclusive (in that order) +CATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCNNNNNNNNNNNNNNNNNNNNN ++ +~}|{zyxwvutsrqponmlkjihgfedcba`_^]\[ZYXWVUTSRQPONMLKJIHGFEDCBA@?>=<;:9876543210/.-,+*)('&%$#"! 
diff -r 3b8e4af25be2 -r c9b41f94d707 test-data/sanger_full_range_masked_lowercase.fastqsanger --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/sanger_full_range_masked_lowercase.fastqsanger Fri May 21 16:48:59 2010 -0400 @@ -0,0 +1,8 @@ +@FAKE0001 Original version has PHRED scores from 0 to 93 inclusive (in that order) +acgtacgtacgtacgtacgtaCGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTACGTAC ++ +!"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~ +@FAKE0002 Original version has PHRED scores from 93 to 0 inclusive (in that order) +CATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCATGCatgcatgcatgcatgcatgca ++ +~}|{zyxwvutsrqponmlkjihgfedcba`_^]\[ZYXWVUTSRQPONMLKJIHGFEDCBA@?>=<;:9876543210/.-,+*)('&%$#"! diff -r 3b8e4af25be2 -r c9b41f94d707 tool_conf.xml.sample --- a/tool_conf.xml.sample Fri May 21 15:50:49 2010 -0400 +++ b/tool_conf.xml.sample Fri May 21 16:48:59 2010 -0400 @@ -221,6 +221,7 @@ <tool file="fastq/fastq_filter.xml" /> <tool file="fastq/fastq_trimmer.xml" /> <tool file="fastq/fastq_trimmer_by_quality.xml" /> + <tool file="fastq/fastq_masker_by_quality.xml" /> <tool file="fastq/fastq_manipulation.xml" /> <tool file="fastq/fastq_to_fasta.xml" /> <tool file="fastq/fastq_to_tabular.xml" /> diff -r 3b8e4af25be2 -r c9b41f94d707 tools/fastq/fastq_masker_by_quality.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/fastq/fastq_masker_by_quality.py Fri May 21 16:48:59 2010 -0400 @@ -0,0 +1,83 @@ +#Dan Blankenberg +import string +from optparse import OptionParser +from galaxy_utils.sequence.fastq import fastqReader, fastqWriter + + +def get_score_comparer( operator ): + if operator == 'gt': + return compare_gt + elif operator == 'ge': + return compare_ge + elif operator == 'eq': + return compare_eq + elif operator == 'lt': + return compare_lt + elif operator == 'le': + return compare_le + elif operator == 'ne': + return compare_ne + raise 'Invalid operator 
provided: %s' % operator + +def compare_gt( quality_score, threshold_value ): + return quality_score > threshold_value + +def compare_ge( quality_score, threshold_value ): + return quality_score >= threshold_value + +def compare_eq( quality_score, threshold_value ): + return quality_score == threshold_value + +def compare_ne( quality_score, threshold_value ): + return quality_score != threshold_value + +def compare_lt( quality_score, threshold_value ): + return quality_score < threshold_value + +def compare_le( quality_score, threshold_value ): + return quality_score <= threshold_value + +class BaseReplacer( object ): + def __init__( self, replace_character ): + self.replace_character = replace_character + def __call__( self, base_character ): + return self.replace_character + +def main(): + usage = "usage: %prog [options] input_file output_file" + parser = OptionParser( usage=usage ) + parser.add_option( '-f', '--format', dest='format', type='choice', default='sanger', choices=( 'sanger', 'cssanger', 'solexa', 'illumina' ), help='FASTQ variant type' ) + parser.add_option( '-m', '--mask_character', dest='mask_character', default='N', help='Mask Character to use' ) + parser.add_option( '-c', '--score_comparison', type="choice", dest='score_comparison', default='le', choices=('gt','ge','eq','lt', 'le', 'ne' ), help='Mask base when score is' ) + parser.add_option( '-s', '--quality_score', type="float", dest='quality_score', default='0', help='Quality Score' ) + parser.add_option( "-l", "--lowercase", action="store_true", dest="lowercase", default=False, help="Use lowercase masking") + ( options, args ) = parser.parse_args() + + if len ( args ) != 2: + parser.error( "Need to specify an input file and an output file" ) + + score_comparer = get_score_comparer( options.score_comparison ) + + if options.lowercase: + base_masker = string.lower + else: + base_masker = BaseReplacer( options.mask_character ) + + out = fastqWriter( open( args[1], 'wb' ), format = options.format 
) + + num_reads = None + num_reads_excluded = 0 + for num_reads, fastq_read in enumerate( fastqReader( open( args[0] ), format = options.format ) ): + sequence_list = list( fastq_read.sequence ) + for i, quality_score in enumerate( fastq_read.get_decimal_quality_scores() ): + if score_comparer( quality_score, options.quality_score ): + sequence_list[ i ] = base_masker( sequence_list[ i ] ) + fastq_read.sequence = "".join( sequence_list ) + out.write( fastq_read ) + + if num_reads is not None: + print "Processed %i %s reads." % ( num_reads + 1, options.format ) + else: + print "No valid FASTQ reads were provided." + +if __name__ == "__main__": main() diff -r 3b8e4af25be2 -r c9b41f94d707 tools/fastq/fastq_masker_by_quality.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/fastq/fastq_masker_by_quality.xml Fri May 21 16:48:59 2010 -0400 @@ -0,0 +1,53 @@ +<tool id="fastq_masker_by_quality" name="FASTQ Masker" version="1.0.0"> + <description>by quality score</description> + <command interpreter="python">fastq_masker_by_quality.py '$input_file' '$output_file' -f '${input_file.extension[len( 'fastq' ):]}' -s '${quality_score}' -c '${score_comparison}' + #if $mask_type.value == 'lowercase' + --lowercase + #else + -m '${mask_type}' + #end if + </command> + <inputs> + <param name="input_file" type="data" format="fastqsanger" label="File to mask" /> + <param name="mask_type" type="select" label="Mask input with"> + <option value="N">N's</option> + <option value="lowercase">Lowercase</option> + </param> + <param name="score_comparison" type="select" label="When score is"> + <option value="le" selected="True">Less than or equal</option> + <option value="lt">Less than</option> + <option value="eq">Equal to</option> + <option value="ne">Not Equal to</option> + <option value="ge">Greater than</option> + <option value="gt">Greater than or equal</option> + </param> + <param name="quality_score" type="integer" value="0"/> + </inputs> + <outputs> + <data name="output_file" 
format="fastqsanger" /> + </outputs> + <tests> + <test> + <param name="input_file" value="sanger_full_range_original_sanger.fastqsanger" ftype="fastqsanger" /> + <param name="mask_type" value="N" /> + <param name="score_comparison" value="le" /> + <param name="quality_score" value="20" /> + <output name="output_file" file="sanger_full_range_masked_N.fastqsanger" /> + </test> + <test> + <param name="input_file" value="sanger_full_range_original_sanger.fastqsanger" ftype="fastqsanger" /> + <param name="mask_type" value="lowercase" /> + <param name="score_comparison" value="le" /> + <param name="quality_score" value="20" /> + <output name="output_file" file="sanger_full_range_masked_lowercase.fastqsanger" /> + </test> + </tests> + <help> +**What it does** + +This tool allows masking base characters in FASTQ format files dependent upon user specified quality score value and comparison method. + +This tool is not available for use on color space (csSanger) formats. + + </help> +</tool>
participants (1)
-
Nate Coraor