1 new changeset in galaxy-central:
http://bitbucket.org/galaxy/galaxy-central/changeset/2469c53051ea/
changeset: 2469c53051ea
user: dan
date: 2011-09-25 19:27:14
summary: Some random lines tool tweaks.
affected #: 3 files (-1 bytes)
--- a/test-data/1_bed_random_lines_1_seed_asdf_out.bed Sat Sep 24 10:45:34 2011 -0400
+++ b/test-data/1_bed_random_lines_1_seed_asdf_out.bed Sun Sep 25 13:27:14 2011 -0400
@@ -1,1 +1,1 @@
-chrX 152694029 152694263 CCDS14736.1_cds_0_0_chrX_152694030_r 0 -
+chr5 131424298 131424460 CCDS4149.1_cds_0_0_chr5_131424299_f 0 +
--- a/tools/filters/random_lines_two_pass.py Sat Sep 24 10:45:34 2011 -0400
+++ b/tools/filters/random_lines_two_pass.py Sun Sep 25 13:27:14 2011 -0400
@@ -2,9 +2,25 @@
#Dan Blankenberg
#Selects N random lines from a file and outputs to another file, maintaining original line order
#allows specifying a seed
-#does two passes to determine line counts and offsets, and then to output contents
+#does two passes to determine line offsets/count, and then to output contents
-import optparse, random
+import optparse, random
+
+def get_random_by_subtraction( line_offsets, num_lines ):
+ while len( line_offsets ) > num_lines:
+ del line_offsets[ random.randint( 0, len( line_offsets ) - 1 ) ]
+ return line_offsets
+
+def get_random_by_sample( line_offsets, num_lines ):
+ line_offsets = random.sample( line_offsets, num_lines )
+ line_offsets.sort()
+ return line_offsets
+
+def get_random( line_offsets, num_lines ):
+ if num_lines > ( len( line_offsets ) / 2 ):
+ return get_random_by_subtraction( line_offsets, num_lines )
+ else:
+ return get_random_by_sample( line_offsets, num_lines )
def __main__():
#Parse Command Line
@@ -14,7 +30,7 @@
assert len( args ) == 3, "Invalid command line specified."
- input = open( args[0] )
+ input = open( args[0], 'rb' )
output = open( args[1], 'wb' )
num_lines = int( args[2] )
assert num_lines > 0, "You must select at least one line."
@@ -24,10 +40,13 @@
#get line offsets
line_offsets = []
+ teller = input.tell
+ readliner = input.readline
+ appender = line_offsets.append
while True:
- offset = input.tell()
- if input.readline():
- line_offsets.append( offset )
+ offset = teller()
+ if readliner():
+ appender( offset )
else:
break
@@ -35,13 +54,14 @@
assert num_lines <= total_lines, "Error: asked to select more lines (%i) than there were in the file (%i)." % ( num_lines, total_lines )
#get random line offsets
- while len( line_offsets ) > num_lines:
- line_offsets.pop( random.randint( 0, len( line_offsets ) - 1 ) )
+ line_offsets = get_random( line_offsets, num_lines )
#write out random lines
+ seeker = input.seek
+ writer = output.write
for line_offset in line_offsets:
- input.seek( line_offset )
- output.write( input.readline() )
+ seeker( line_offset )
+ writer( readliner() )
input.close()
output.close()
print "Kept %i of %i total lines." % ( num_lines, total_lines )
--- a/tools/filters/randomlines.xml Sat Sep 24 10:45:34 2011 -0400
+++ b/tools/filters/randomlines.xml Sun Sep 25 13:27:14 2011 -0400
@@ -1,4 +1,4 @@
-<tool id="random_lines1" name="Select random lines" version="2.0.0">
+<tool id="random_lines1" name="Select random lines" version="2.0.1">
<description>from a file</description>
<command interpreter="python">random_lines_two_pass.py "${input}" "${out_file1}" "${num_lines}"
#if str( $seed_source.seed_source_selector ) == "set_seed":
Repository URL:
https://bitbucket.org/galaxy/galaxy-central/
--
This is a commit notification from
bitbucket.org. You are receiving
this because you have the notification service enabled with this
email address as the recipient.