[hg] galaxy 1713: remove the usage of hash tables in the fasta-t...
details: http://www.bx.psu.edu/hg/galaxy/rev/6d849785ff86 changeset: 1713:6d849785ff86 user: wychung date: Mon Jan 19 15:47:02 2009 -0500 description: remove the usage of hast tables in the fasta-tools 8 file(s) affected in this change: test-data/fasta_to_tabular_out1.tabular test-data/fasta_to_tabular_out3.tabular test-data/fasta_tool_compute_length_1.out test-data/fasta_tool_compute_length_3.out test-data/fasta_tool_filter_length_1.out tools/fasta_tools/fasta_compute_length.py tools/fasta_tools/fasta_filter_by_length.py tools/fasta_tools/fasta_to_tabular.py diffs (305 lines): diff -r e76a153769d4 -r 6d849785ff86 test-data/fasta_to_tabular_out1.tabular --- a/test-data/fasta_to_tabular_out1.tabular Mon Jan 19 11:47:29 2009 -0500 +++ b/test-data/fasta_to_tabular_out1.tabular Mon Jan 19 15:47:02 2009 -0500 @@ -14,5 +14,5 @@ EYKX4VC01BB4QL length=57 xy=0431_0363 region=1 run=R_2007_11_07_16_15_57_ GGGGAGGAGCTAATAATATGCTCTTGGGGAGGAGCTAATTATATGCTCTTGGGGAGG EYKX4VC01BJ37M length=64 xy=0522_0192 region=1 run=R_2007_11_07_16_15_57_ TCGAGTATGTATCAAGGACTACATACAAATTTGCCAAAAGAGATTATGCACTATCCCGACTTCC EYKX4VC01BV9R8 length=54 xy=0660_2038 region=1 run=R_2007_11_07_16_15_57_ AAAACTCGGAGAAACTATTCAGCAGCACTGCGTTTCGCTGAATTTTAGACCGTT +EYKX4VC01CEPP8 length=60 xy=0870_2350 region=1 run=R_2007_11_07_16_15_57_ CTGGGTGGGTGCACTACAGGAACGTCATTTGTTCAATCCTCACGTTGTTGTTAGTGTCAG EYKX4VC01BTLME length=78 xy=0630_0292 region=1 run=R_2007_11_07_16_15_57_ TTATCCACACGCTGTCCGGATCCAGCGCCAGGCGCCGACGCTGGACTTCCGCCGCCTGCGCCCAGTTGCCCTGACTTC -EYKX4VC01CEPP8 length=60 xy=0870_2350 region=1 run=R_2007_11_07_16_15_57_ CTGGGTGGGTGCACTACAGGAACGTCATTTGTTCAATCCTCACGTTGTTGTTAGTGTCAG diff -r e76a153769d4 -r 6d849785ff86 test-data/fasta_to_tabular_out3.tabular --- a/test-data/fasta_to_tabular_out3.tabular Mon Jan 19 11:47:29 2009 -0500 +++ b/test-data/fasta_to_tabular_out3.tabular Mon Jan 19 15:47:02 2009 -0500 @@ -14,5 +14,5 @@ EYKX4VC01BB4QL GGGGAGGAGCTAATAATATGCTCTTGGGGAGGAGCTAATTATATGCTCTTGGGGAGG EYKX4VC01BJ37M 
TCGAGTATGTATCAAGGACTACATACAAATTTGCCAAAAGAGATTATGCACTATCCCGACTTCC EYKX4VC01BV9R8 AAAACTCGGAGAAACTATTCAGCAGCACTGCGTTTCGCTGAATTTTAGACCGTT +EYKX4VC01CEPP8 CTGGGTGGGTGCACTACAGGAACGTCATTTGTTCAATCCTCACGTTGTTGTTAGTGTCAG EYKX4VC01BTLME TTATCCACACGCTGTCCGGATCCAGCGCCAGGCGCCGACGCTGGACTTCCGCCGCCTGCGCCCAGTTGCCCTGACTTC -EYKX4VC01CEPP8 CTGGGTGGGTGCACTACAGGAACGTCATTTGTTCAATCCTCACGTTGTTGTTAGTGTCAG diff -r e76a153769d4 -r 6d849785ff86 test-data/fasta_tool_compute_length_1.out --- a/test-data/fasta_tool_compute_length_1.out Mon Jan 19 11:47:29 2009 -0500 +++ b/test-data/fasta_tool_compute_length_1.out Mon Jan 19 15:47:02 2009 -0500 @@ -14,5 +14,5 @@ EYKX4VC01BB4QL length=57 xy=0431_0363 region=1 run=R_2007_11_07_16_15_57_ 57 EYKX4VC01BJ37M length=64 xy=0522_0192 region=1 run=R_2007_11_07_16_15_57_ 64 EYKX4VC01BV9R8 length=54 xy=0660_2038 region=1 run=R_2007_11_07_16_15_57_ 54 +EYKX4VC01CEPP8 length=60 xy=0870_2350 region=1 run=R_2007_11_07_16_15_57_ 60 EYKX4VC01BTLME length=78 xy=0630_0292 region=1 run=R_2007_11_07_16_15_57_ 78 -EYKX4VC01CEPP8 length=60 xy=0870_2350 region=1 run=R_2007_11_07_16_15_57_ 60 diff -r e76a153769d4 -r 6d849785ff86 test-data/fasta_tool_compute_length_3.out --- a/test-data/fasta_tool_compute_length_3.out Mon Jan 19 11:47:29 2009 -0500 +++ b/test-data/fasta_tool_compute_length_3.out Mon Jan 19 15:47:02 2009 -0500 @@ -14,5 +14,5 @@ EYKX4VC01BB4QL 57 EYKX4VC01BJ37M 64 EYKX4VC01BV9R8 54 +EYKX4VC01CEPP8 60 EYKX4VC01BTLME 78 -EYKX4VC01CEPP8 60 diff -r e76a153769d4 -r 6d849785ff86 test-data/fasta_tool_filter_length_1.out --- a/test-data/fasta_tool_filter_length_1.out Mon Jan 19 11:47:29 2009 -0500 +++ b/test-data/fasta_tool_filter_length_1.out Mon Jan 19 15:47:02 2009 -0500 @@ -53,9 +53,9 @@
EYKX4VC01BV9R8 length=54 xy=0660_2038 region=1 run=R_2007_11_07_16_15_57_ AAAACTCGGAGAAACTATTCAGCAGCACTGCGTTTCGCTGAATTTTAGAC CGTT +>EYKX4VC01CEPP8 length=60 xy=0870_2350 region=1 run=R_2007_11_07_16_15_57_ +CTGGGTGGGTGCACTACAGGAACGTCATTTGTTCAATCCTCACGTTGTTG +TTAGTGTCAG EYKX4VC01BTLME length=78 xy=0630_0292 region=1 run=R_2007_11_07_16_15_57_ TTATCCACACGCTGTCCGGATCCAGCGCCAGGCGCCGACGCTGGACTTCC GCCGCCTGCGCCCAGTTGCCCTGACTTC ->EYKX4VC01CEPP8 length=60 xy=0870_2350 region=1 run=R_2007_11_07_16_15_57_ -CTGGGTGGGTGCACTACAGGAACGTCATTTGTTCAATCCTCACGTTGTTG -TTAGTGTCAG diff -r e76a153769d4 -r 6d849785ff86 tools/fasta_tools/fasta_compute_length.py --- a/tools/fasta_tools/fasta_compute_length.py Mon Jan 19 11:47:29 2009 -0500 +++ b/tools/fasta_tools/fasta_compute_length.py Mon Jan 19 15:47:02 2009 -0500 @@ -1,8 +1,8 @@ #! /usr/bin/python """ -Input: fasta, minimal length, maximal length -Output: fasta -Return sequences whose lengths are within the range. +Input: fasta, int +Output: tabular +Return titles with lengths of corresponding seq """
import sys, os @@ -10,41 +10,37 @@ assert sys.version_info[:2] >= ( 2, 4 ) def __main__(): - input_filename = sys.argv[1] - output_filename = sys.argv[2] + + infile = sys.argv[1] + outfile = sys.argv[2] keep_first = int( sys.argv[3] ) - tmp_title = tmp_seq = '' - tmp_seq_count = 0 - seq_hash = {} + + fasta_title = fasta_seq = '' + # number of char to keep in the title if keep_first == 0: keep_first = None else: keep_first += 1 - for i, line in enumerate( file( input_filename ) ): + out = open(outfile, 'w') + + for i, line in enumerate( file( infile ) ): line = line.rstrip( '\r\n' ) if not line or line.startswith( '#' ): continue if line[0] == '>': - if len( tmp_seq ) > 0: - tmp_seq_count += 1 - seq_hash[ ( tmp_seq_count, tmp_title ) ] = tmp_seq - tmp_title = line - tmp_seq = '' + if len( fasta_seq ) > 0 : + out.write( "%s\t%d\n" % ( fasta_title[ 1:keep_first ], len( fasta_seq ) ) ) + fasta_title = line + fasta_seq = '' else: - tmp_seq = "%s%s" % ( tmp_seq, line ) - if line.split() and line.split()[0].isdigit(): - tmp_seq = "%s " % tmp_seq - if len( tmp_seq ) > 0: - seq_hash[ ( tmp_seq_count, tmp_title ) ] = tmp_seq - - title_keys = seq_hash.keys() - title_keys.sort() - output_handle = open( output_filename, 'w' ) - for i, fasta_title in title_keys: - tmp_seq = seq_hash[ ( i, fasta_title ) ] - output_handle.write( "%s\t%d\n" % ( fasta_title[ 1:keep_first ], len( tmp_seq ) ) ) - output_handle.close() + fasta_seq = "%s%s" % ( fasta_seq, line ) + + # check the last sequence + if len( fasta_seq ) > 0: + out.write( "%s\t%d\n" % ( fasta_title[ 1:keep_first ], len( fasta_seq ) ) ) + + out.close() if __name__ == "__main__" : __main__() \ No newline at end of file diff -r e76a153769d4 -r 6d849785ff86 tools/fasta_tools/fasta_filter_by_length.py --- a/tools/fasta_tools/fasta_filter_by_length.py Mon Jan 19 11:47:29 2009 -0500 +++ b/tools/fasta_tools/fasta_filter_by_length.py Mon Jan 19 15:47:02 2009 -0500 @@ -15,7 +15,7 @@ def __main__(): - input_filename = sys.argv[1] + infile 
= sys.argv[1] try: min_length = int( sys.argv[2] ) except: @@ -24,49 +24,62 @@ max_length = int( sys.argv[3] ) except: stop_err( "Maximum length of the return sequence requires a numerical value." ) - output_filename = sys.argv[4] - tmp_title = tmp_seq = '' - tmp_seq_count = 0 - seq_hash = {} + outfile = sys.argv[4] + fasta_title = fasta_seq = '' + at_least_one = 0 + + out = open( outfile, 'w' ) - for i, line in enumerate( file( input_filename ) ): + for i, line in enumerate( file( infile ) ): line = line.rstrip( '\r\n' ) if not line or line.startswith( '#' ): continue + if line[0] == '>': - if len( tmp_seq ) > 0: - tmp_seq_count += 1 - seq_hash[ ( tmp_seq_count, tmp_title ) ] = tmp_seq - tmp_title = line - tmp_seq = '' + if len( fasta_seq ) > 0: + + if max_length <= 0: + compare_max_length = len( fasta_seq ) + 1 + else: + compare_max_length = max_length + + l = len( fasta_seq ) + + if l >= min_length and l <= compare_max_length: + at_least_one += 1 + out.write( "%s\n" % fasta_title ) + c = 0 + s = fasta_seq + while c < l: + b = min( c + 50, l ) + out.write( "%s\n" % s[ c:b ] ) + c = b + + fasta_title = line + fasta_seq = '' else: - tmp_seq = "%s%s" % ( tmp_seq, line ) - if line.split()[0].isdigit(): - tmp_seq = "%s " % tmp_seq - if len( tmp_seq ) > 0: - seq_hash[ ( tmp_seq_count, tmp_title ) ] = tmp_seq + fasta_seq = "%s%s" % ( fasta_seq, line ) - title_keys = seq_hash.keys() - title_keys.sort() - output_handle = open( output_filename, 'w' ) - at_least_one = 0 - for i, fasta_title in title_keys: - tmp_seq = seq_hash[ ( i, fasta_title ) ] + if len( fasta_seq ) > 0: + if max_length <= 0: - compare_max_length = len( tmp_seq ) + 1 + compare_max_length = len( fasta_seq ) + 1 else: compare_max_length = max_length - l = len( tmp_seq ) + + l = len( fasta_seq ) + if l >= min_length and l <= compare_max_length: at_least_one += 1 - output_handle.write( "%s\n" % fasta_title ) + out.write( "%s\n" % fasta_title ) c = 0 - s = tmp_seq + s = fasta_seq while c < l: b = min( c + 50, 
l ) - output_handle.write( "%s\n" % s[ c:b ] ) + out.write( "%s\n" % s[ c:b ] ) c = b - output_handle.close() + + out.close() if at_least_one == 0: print "There is no sequence that falls within your range." diff -r e76a153769d4 -r 6d849785ff86 tools/fasta_tools/fasta_to_tabular.py --- a/tools/fasta_tools/fasta_to_tabular.py Mon Jan 19 11:47:29 2009 -0500 +++ b/tools/fasta_tools/fasta_to_tabular.py Mon Jan 19 15:47:02 2009 -0500 @@ -1,9 +1,9 @@ #! /usr/bin/python # This code exists in 2 places: ~/datatypes/converters and ~/tools/fasta_tools """ -Input: fasta, minimal length, maximal length -Output: fasta -Return sequences whose lengths are within the range. +Input: fasta, int +Output: tabular +format convert: fasta to tabular """ import sys, os @@ -14,38 +14,31 @@ infile = sys.argv[1] outfile = sys.argv[2] keep_first = int( sys.argv[3] ) - title = '' - sequence = '' - sequence_count = 0 + fasta_title = fasta_seq = '' if keep_first == 0: keep_first = None else: keep_first += 1 + out = open( outfile, 'w' ) + for i, line in enumerate( open( infile ) ): line = line.rstrip( '\r\n' ) if not line or line.startswith( '#' ): continue if line.startswith( '>' ): - if sequence: - sequence_count += 1 - seq_hash[( sequence_count, title )] = sequence - title = line - sequence = '' + if fasta_seq: + out.write( "%s\t%s\n" %( fasta_title[ 1:keep_first ], fasta_seq ) ) + fasta_title = line + fasta_seq = '' else: - sequence = "%s%s" % ( sequence, line ) - if line.split() and line.split()[0].isdigit(): - sequence += ' ' - if sequence: - seq_hash[( sequence_count, title )] = sequence - # return only those lengths are in the range - title_keys = seq_hash.keys() - title_keys.sort() - out = open( outfile, 'w' ) - for i, fasta_title in title_keys: - sequence = seq_hash[( i, fasta_title )] - out.write( "%s\t%s\n" %( fasta_title[ 1:keep_first ], sequence ) ) + if line: + fasta_seq = "%s%s" % ( fasta_seq, line ) + + if fasta_seq: + out.write( "%s\t%s\n" %( fasta_title[ 1:keep_first ], fasta_seq 
) ) + out.close() if __name__ == "__main__" : __main__() \ No newline at end of file
participants (1)
-
Greg Von Kuster