details: http://www.bx.psu.edu/hg/galaxy/rev/2177d6f7bf80 changeset: 2492:2177d6f7bf80 user: Kelly Vincent <kpvincent@bx.psu.edu> date: Thu Jul 23 12:16:44 2009 -0400 description: Performance improvements for fasta_filter_by_length and fasta_to_tabular 2 file(s) affected in this change: tools/fasta_tools/fasta_filter_by_length.py tools/fasta_tools/fasta_to_tabular.py diffs (143 lines): diff -r cd366f615220 -r 2177d6f7bf80 tools/fasta_tools/fasta_filter_by_length.py --- a/tools/fasta_tools/fasta_filter_by_length.py Thu Jul 23 11:11:41 2009 -0400 +++ b/tools/fasta_tools/fasta_filter_by_length.py Thu Jul 23 12:16:44 2009 -0400 @@ -14,8 +14,7 @@ sys.exit() def __main__(): - - infile = sys.argv[1] + input_filename = sys.argv[1] try: min_length = int( sys.argv[2] ) except: @@ -24,65 +23,30 @@ max_length = int( sys.argv[3] ) except: stop_err( "Maximum length of the return sequence requires a numerical value." ) - outfile = sys.argv[4] - fasta_title = fasta_seq = '' + output_filename = sys.argv[4] + output_handle = open( output_filename, 'w' ) + tmp_size = 0 #-1 + tmp_buf = '' at_least_one = 0 - - out = open( outfile, 'w' ) - - for i, line in enumerate( file( infile ) ): - line = line.rstrip( '\r\n' ) - if not line or line.startswith( '#' ): + for line in file(input_filename): + if not line or line.startswith('#'): continue - if line[0] == '>': - if len( fasta_seq ) > 0: - - if max_length <= 0: - compare_max_length = len( fasta_seq ) + 1 - else: - compare_max_length = max_length - - l = len( fasta_seq ) - - if l >= min_length and l <= compare_max_length: - at_least_one += 1 - out.write( "%s\n" % fasta_title ) - c = 0 - s = fasta_seq - while c < l: - b = min( c + 50, l ) - out.write( "%s\n" % s[ c:b ] ) - c = b - - fasta_title = line - fasta_seq = '' + if min_length <= tmp_size <= max_length or (min_length <= tmp_size and max_length == 0): + output_handle.write(tmp_buf) + at_least_one = 1 + tmp_buf = line + tmp_size = 0 else: - fasta_seq = "%s%s" % ( fasta_seq, line ) - - if len( fasta_seq ) > 0: - - if max_length <= 0: - compare_max_length = len( fasta_seq ) + 1 - else: - compare_max_length = max_length - - l = len( fasta_seq ) - - if l >= min_length and l <= compare_max_length: - at_least_one += 1 - out.write( "%s\n" % fasta_title ) - c = 0 - s = fasta_seq - while c < l: - b = min( c + 50, l ) - out.write( "%s\n" % s[ c:b ] ) - c = b - - out.close() - + if max_length == 0 or tmp_size < max_length: + tmp_size += len(line.rstrip('\r\n')) + tmp_buf += line + # final flush of buffer + if min_length <= tmp_size <= max_length or (min_length <= tmp_size and max_length == 0): + output_handle.write(tmp_buf.rstrip('\r\n')) + at_least_one = 1 + output_handle.close() if at_least_one == 0: print "There is no sequence that falls within your range." - -if __name__ == "__main__" : __main__() \ No newline at end of file +if __name__ == "__main__" : __main__() diff -r cd366f615220 -r 2177d6f7bf80 tools/fasta_tools/fasta_to_tabular.py --- a/tools/fasta_tools/fasta_to_tabular.py Thu Jul 23 11:11:41 2009 -0400 +++ b/tools/fasta_tools/fasta_to_tabular.py Thu Jul 23 12:16:44 2009 -0400 @@ -15,30 +15,23 @@ outfile = sys.argv[2] keep_first = int( sys.argv[3] ) fasta_title = fasta_seq = '' - if keep_first == 0: keep_first = None else: keep_first += 1 - out = open( outfile, 'w' ) - for i, line in enumerate( open( infile ) ): line = line.rstrip( '\r\n' ) if not line or line.startswith( '#' ): continue if line.startswith( '>' ): - if fasta_seq: - out.write( "%s\t%s\n" %( fasta_title[ 1:keep_first ], fasta_seq ) ) - fasta_title = line - fasta_seq = '' + if i: + out.write('\n') + out.write(line[1:keep_first]) + out.write('\t') else: - if line: - fasta_seq = "%s%s" % ( fasta_seq, line ) - - if fasta_seq: - out.write( "%s\t%s\n" %( fasta_title[ 1:keep_first ], fasta_seq ) ) - + out.write(line) out.close() -if __name__ == "__main__" : __main__() \ No newline at end of file +if __name__ == "__main__" : __main__() + \ No newline at end of file