1 new changeset in galaxy-central: http://bitbucket.org/galaxy/galaxy-central/changeset/2469c53051ea/ changeset: 2469c53051ea user: dan date: 2011-09-25 19:27:14 summary: Some random lines tool tweaks. affected #: 3 files (-1 bytes) --- a/test-data/1_bed_random_lines_1_seed_asdf_out.bed Sat Sep 24 10:45:34 2011 -0400 +++ b/test-data/1_bed_random_lines_1_seed_asdf_out.bed Sun Sep 25 13:27:14 2011 -0400 @@ -1,1 +1,1 @@ -chrX 152694029 152694263 CCDS14736.1_cds_0_0_chrX_152694030_r 0 - +chr5 131424298 131424460 CCDS4149.1_cds_0_0_chr5_131424299_f 0 + --- a/tools/filters/random_lines_two_pass.py Sat Sep 24 10:45:34 2011 -0400 +++ b/tools/filters/random_lines_two_pass.py Sun Sep 25 13:27:14 2011 -0400 @@ -2,9 +2,25 @@ #Dan Blankenberg #Selects N random lines from a file and outputs to another file, maintaining original line order #allows specifying a seed -#does two passes to determine line counts and offsets, and then to output contents +#does two passes to determine line offsets/count, and then to output contents -import optparse, random +import optparse, random + +def get_random_by_subtraction( line_offsets, num_lines ): + while len( line_offsets ) > num_lines: + del line_offsets[ random.randint( 0, len( line_offsets ) - 1 ) ] + return line_offsets + +def get_random_by_sample( line_offsets, num_lines ): + line_offsets = random.sample( line_offsets, num_lines ) + line_offsets.sort() + return line_offsets + +def get_random( line_offsets, num_lines ): + if num_lines > ( len( line_offsets ) / 2 ): + return get_random_by_subtraction( line_offsets, num_lines ) + else: + return get_random_by_sample( line_offsets, num_lines ) def __main__(): #Parse Command Line @@ -14,7 +30,7 @@ assert len( args ) == 3, "Invalid command line specified." - input = open( args[0] ) + input = open( args[0], 'rb' ) output = open( args[1], 'wb' ) num_lines = int( args[2] ) assert num_lines > 0, "You must select at least one line." 
@@ -24,10 +40,13 @@ #get line offsets line_offsets = [] + teller = input.tell + readliner = input.readline + appender = line_offsets.append while True: - offset = input.tell() - if input.readline(): - line_offsets.append( offset ) + offset = teller() + if readliner(): + appender( offset ) else: break @@ -35,13 +54,14 @@ assert num_lines <= total_lines, "Error: asked to select more lines (%i) than there were in the file (%i)." % ( num_lines, total_lines ) #get random line offsets - while len( line_offsets ) > num_lines: - line_offsets.pop( random.randint( 0, len( line_offsets ) - 1 ) ) + line_offsets = get_random( line_offsets, num_lines ) #write out random lines + seeker = input.seek + writer = output.write for line_offset in line_offsets: - input.seek( line_offset ) - output.write( input.readline() ) + seeker( line_offset ) + writer( readliner() ) input.close() output.close() print "Kept %i of %i total lines." % ( num_lines, total_lines ) --- a/tools/filters/randomlines.xml Sat Sep 24 10:45:34 2011 -0400 +++ b/tools/filters/randomlines.xml Sun Sep 25 13:27:14 2011 -0400 @@ -1,4 +1,4 @@ -<tool id="random_lines1" name="Select random lines" version="2.0.0"> +<tool id="random_lines1" name="Select random lines" version="2.0.1"><description>from a file</description><command interpreter="python">random_lines_two_pass.py "${input}" "${out_file1}" "${num_lines}" #if str( $seed_source.seed_source_selector ) == "set_seed": Repository URL: https://bitbucket.org/galaxy/galaxy-central/ -- This is a commit notification from bitbucket.org. You are receiving this because you have the commit notification service enabled and this email address is set as its recipient.