[hg] galaxy 2807: Added text trimmer that can be used to groom fastq data
details: http://www.bx.psu.edu/hg/galaxy/rev/aafecc059ba6 changeset: 2807:aafecc059ba6 user: Anton Nekrutenko <anton@bx.psu.edu> date: Wed Sep 30 14:11:01 2009 -0400 description: Added text trimmer that can be used to groom fastq data 6 file(s) affected in this change: test-data/trimmer_a_f_c0_s1_e13_i62.dat test-data/trimmer_a_f_c2_s1_e2_i62.dat test-data/trimmer_tab_delimited.dat tool_conf.xml.sample tools/filters/trimmer.py tools/filters/trimmer.xml diffs (273 lines): diff -r 268ab95fc8a7 -r aafecc059ba6 test-data/trimmer_a_f_c0_s1_e13_i62.dat --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/trimmer_a_f_c0_s1_e13_i62.dat Wed Sep 30 14:11:01 2009 -0400 @@ -0,0 +1,5 @@ +12345 abcdef +67890 ghjkl g +>assa lljlj ljlj +sasas hghg hg +@dgf gfgf gfg diff -r 268ab95fc8a7 -r aafecc059ba6 test-data/trimmer_a_f_c2_s1_e2_i62.dat --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/trimmer_a_f_c2_s1_e2_i62.dat Wed Sep 30 14:11:01 2009 -0400 @@ -0,0 +1,5 @@ +12345 ab xyz +67890 gh ghjt +>assa lljlj ljlj +sasas hg hghg +@dgf gf gfgf diff -r 268ab95fc8a7 -r aafecc059ba6 test-data/trimmer_tab_delimited.dat --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/test-data/trimmer_tab_delimited.dat Wed Sep 30 14:11:01 2009 -0400 @@ -0,0 +1,5 @@ +12345 abcdef xyz +67890 ghjkl ghjt +>assa lljlj ljlj +sasas hghg hghg +@dgf gfgf gfgf diff -r 268ab95fc8a7 -r aafecc059ba6 tool_conf.xml.sample --- a/tool_conf.xml.sample Wed Sep 30 14:06:06 2009 -0400 +++ b/tool_conf.xml.sample Wed Sep 30 14:11:01 2009 -0400 @@ -45,6 +45,7 @@ <tool file="filters/remove_beginning.xml" /> <tool file="filters/headWrapper.xml" /> <tool file="filters/tailWrapper.xml" /> + <tool file="filters/trimmer.xml" /> </section> <section name="Filter and Sort" id="filter"> <tool file="stats/filtering.xml" /> diff -r 268ab95fc8a7 -r aafecc059ba6 tools/filters/trimmer.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/filters/trimmer.py Wed Sep 30 14:11:01 2009 -0400 @@ -0,0 +1,108 @@ 
+#!/usr/bin/env python + +import sys +import optparse + +def stop_err( msg ): + sys.stderr.write( msg ) + sys.exit() + +def main(): + usage = """%prog [options] + +options (listed below) default to 'None' if omitted + """ + parser = optparse.OptionParser(usage=usage) + + parser.add_option( + '-a','--ascii', + dest='ascii', + action='store_true', + default = False, + help='Use ascii codes to defined ignored beginnings instead of raw characters') + + parser.add_option( + '-q','--fastq', + dest='fastq', + action='store_true', + default = False, + help='The input data in fastq format. It selected the script skips every even line since they contain sequence ids') + + parser.add_option( + '-i','--ignore', + dest='ignore', + help='A comma separated list on ignored beginnings (e.g., ">,@"), or its ascii codes (e.g., "60,42") if option -a is enabled') + + parser.add_option( + '-s','--start', + dest='start', + default = '0', + help='Trim from beginning to here (1-based)') + + parser.add_option( + '-e','--end', + dest='end', + default = '0', + help='Trim from here to the ned (1-based)') + + parser.add_option( + '-f','--file', + dest='input_txt', + default = False, + help='Name of file to be chopped. STDIN is default') + + parser.add_option( + '-c','--column', + dest='col', + default = '0', + help='Column to chop. 
If 0 = chop the whole line') + + + options, args = parser.parse_args() + invalid_starts = [] + + if options.input_txt: + infile = open ( options.input_txt, 'r') + else: + infile = sys.stdin + + if options.ignore and options.ignore != "None": + invalid_starts = options.ignore.split(',') + + if options.ascii and options.ignore and options.ignore != "None": + for i, item in enumerate( invalid_starts ): + invalid_starts[i] = chr( int( item ) ) + + col = int( options.col ) + + for i, line in enumerate( infile ): + line = line.rstrip( '\r\n' ) + if line: + + if options.fastq and i % 2 == 0: + print line + continue + + if options.fastq and line.startswith('@'): + stop_err('Malformed fastq file: even numbered line starts with @') + + if line[0] not in invalid_starts: + if col == 0: + if int( options.end ) > 0: + line = line[ int( options.start )-1 : int( options.end ) ] + else: + line = line[ int( options.start )-1 : ] + else: + fields = line.split( '\t' ) + if col-1 > len( fields ): + stop_err('Column %d does not exist. 
Check input parameters\n' % col) + + if int( options.end ) > 0: + fields[col - 1] = fields[col - 1][ int( options.start )-1 : int( options.end ) ] + else: + fields[col - 1] = fields[col - 1][ int( options.start )-1 : ] + line = '\t'.join(fields) + print line + +if __name__ == "__main__": main() + diff -r 268ab95fc8a7 -r aafecc059ba6 tools/filters/trimmer.xml --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/tools/filters/trimmer.xml Wed Sep 30 14:11:01 2009 -0400 @@ -0,0 +1,119 @@ +<tool id="trimmer" name="Trim" version="0.0.1"> + <description>leading or trailing characters</description> + <command interpreter="python"> + chopper.py -a -f $input1 -c $col -s $start -e $end -i $ignore $fastq > $out_file1 + </command> + <inputs> + <param format="tabular,text" name="input1" type="data" label="this dataset"/> + <param name="col" type="integer" value="0" label="Trim this column only" help="0 = process entire line" /> + <param name="start" type="integer" size="10" value="1" label="Trim from the beginning to this position" help="1 = do not trim the beginning"/> + <param name="end" type="integer" size="10" value="0" label="Remove everything from this position to the end" help="0 = do not trim the end"/> + <param name="fastq" type="select" label="Is input dataset in fastq format?" 
help="If set to YES, the tool will not trim evenly numbered lines (0, 2, 4, etc...)"> + <option value="-q">Yes</option> + <option value="">No</option> + </param> + <param name="ignore" type="select" display="checkboxes" multiple="True" label="Ignore lines beginning with these characters" help="lines beginning with these are not trimmed"> + <option value="62">></option> + <option value="64">@</option> + <option value="43">+</option> + <option value="60"><</option> + <option value="42">*</option> + <option value="45">-</option> + <option value="61">=</option> + <option value="124">|</option> + <option value="63">?</option> + <option value="36">$</option> + <option value="46">.</option> + <option value="58">:</option> + <option value="38">&</option> + <option value="37">%</option> + <option value="94">^</option> + </param> + </inputs> + <outputs> + <data name="out_file1" format="input" metadata_source="input1"/> + </outputs> + <tests> + <test> + <param name="input1" value="trimmer_tab_delimited.dat"/> + <param name="col" value="0"/> + <param name="start" value="1"/> + <param name="end" value="13"/> + <param name="ignore" value="62"/> + <param name="fastq" value="No"/> + <output name="out_file1" file="trimmer_a_f_c0_s1_e13_i62.dat"/> + </test> + <test> + <param name="input1" value="trimmer_tab_delimited.dat"/> + <param name="col" value="2"/> + <param name="start" value="1"/> + <param name="end" value="2"/> + <param name="ignore" value="62"/> + <param name="fastq" value="No"/> + <output name="out_file1" file="trimmer_a_f_c2_s1_e2_i62.dat"/> + </test> + + </tests> + + <help> + + +**What it does** + +Trims specified number of characters from a dataset or its field (if dataset is tab-delimited). 
+ +----- + +**Example 1** + +Trimming this dataset:: + + 1234567890 + abcdefghijk + +by setting **Trim from the beginning to this position** to *2* and **Remove everything from this position to the end** to *6* will produce:: + + 23456 + bcdef + +----- + +**Example 2** + +Trimming column 2 of this dataset:: + + abcde 12345 fghij 67890 + fghij 67890 abcde 12345 + +by setting **Trim this column only** to *2*, **Trim from the beginning to this position** to *2*, and **Remove everything from this position to the end** to *4* will produce:: + + abcde 234 fghij 67890 + fghij 789 abcde 12345 + +----- + +**Trimming FASTQ datasets** + +This tool can be used to trim sequences and quality strings in fastq datasets. This is done by selecting *Yes* from the **Is input dataset in fastq format?** dropdown. If set to *Yes*, the tool will skip all even numbered lines (see warning below). For example, trimming the last 5 bases of this dataset:: + + @081017-and-081020:1:1:1715:1759 + GGACTCAGATAGTAATCCACGCTCCTTTAAAATATC + + + II#IIIIIII$5+.(9IIIIIII$%*$G$A31I&&B + +can be done by setting **Remove everything from this position to the end** to 31:: + + @081017-and-081020:1:1:1715:1759 + GGACTCAGATAGTAATCCACGCTCCTTTAAA + + + II#IIIIIII$5+.(9IIIIIII$%*$G$A3 + +**Note** that headers are skipped. + +.. class:: warningmark + +**WARNING:** This tool will only work on properly formatted fastq datasets where (1) each read and quality string occupy one line and (2) '@' (read header) and "+" (quality header) lines are evenly numbered like in the above example. + + + </help> +</tool>
participants (1)
-
Nate Coraor