#!/usr/bin/env python
# Kanwei Li, 2010
# Selects N random lines from a file and outputs to another file
#
# Provenance (Galaxy changeset notification):
#   details:     http://www.bx.psu.edu/hg/galaxy/rev/b07142ce9dbb
#   changeset:   3599:b07142ce9dbb
#   user:        Kanwei Li <kanwei@gmail.com>
#   date:        Thu Apr 01 22:48:57 2010 -0400
#   description: New tool to select N random lines from a file/dataset.
#                Closes #238
#   diffstat:    tool_conf.xml.sample          |   1 +
#                tools/filters/randomlines.py  |  33 +++
#                tools/filters/randomlines.xml |  42 +++
#                3 files changed, 76 insertions(+), 0 deletions(-)
#
# Companion changes in the same changeset:
#   * tool_conf.xml.sample registers filters/randomlines.xml, placed between
#     filters/remove_beginning.xml and filters/headWrapper.xml.
#   * tools/filters/randomlines.xml defines the Galaxy tool "Select random
#     lines" (id "random_lines1"), which runs
#         randomlines.py $input $num_lines $out_file1
#     with an integer parameter num_lines (default 1) and a "txt" dataset as
#     input; the output inherits the input's format and metadata.
#     Its functional test selects 65 lines from 1.bed and expects 1.bed back.
#     Help text: "This tool selects N random lines from a file, with no
#     repeats, and preserving ordering."

import random
import sys


def main(argv=None):
    """Write N randomly selected lines of an input file to an output file.

    The selected lines are distinct input lines and appear in the output in
    the same relative order as in the input.  Lines are written exactly as
    they were read (terminators included), so selecting every line of a file
    reproduces that file.

    argv -- argument list laid out like sys.argv (defaults to sys.argv):
            argv[1] input file path
            argv[2] number of lines to select (integer >= 1)
            argv[3] output file path

    Exits with status 1, after writing a message to stderr, when fewer than
    one line is requested or when the input has fewer lines than requested;
    in both cases the output file is not written.
    """
    if argv is None:
        argv = sys.argv

    total_lines = int(argv[2])
    if total_lines < 1:
        sys.stderr.write( "Must select at least one line." )
        sys.exit(1)

    # Reservoir sampling: a single pass over the input, holding at most
    # total_lines lines in memory.
    kept = []
    n = 0
    with open(argv[1], 'r') as infile:
        for line in infile:
            n += 1
            if n <= total_lines:
                # The first total_lines lines fill the reservoir.
                kept.append(line)
            elif random.randint(1, n) <= total_lines:
                # Line n enters with probability total_lines / n and evicts
                # a random resident.  Appending (instead of overwriting the
                # evicted slot) keeps the reservoir in input order.
                kept.pop(random.randint(0, total_lines - 1))
                kept.append(line)

    if n < total_lines:
        sys.stderr.write( "Error: asked to select more lines than there were in the file." )
        sys.exit(1)

    # Only the last input line can lack a terminator, and if it was kept it
    # is also the last kept line, so plain concatenation never merges lines.
    with open(argv[3], 'w') as outfile:
        outfile.writelines(kept)


if __name__ == "__main__":
    main()