[galaxy-dev] [hg] galaxy 1713: remove the usage of hash tables in the fasta-t...

22 Jan 2009

details:   http://www.bx.psu.edu/hg/galaxy/rev/6d849785ff86
changeset: 1713:6d849785ff86
user:      wychung
date:      Mon Jan 19 15:47:02 2009 -0500
description:
remove the usage of hash tables in the fasta-tools

8 file(s) affected in this change:

test-data/fasta_to_tabular_out1.tabular
test-data/fasta_to_tabular_out3.tabular
test-data/fasta_tool_compute_length_1.out
test-data/fasta_tool_compute_length_3.out
test-data/fasta_tool_filter_length_1.out
tools/fasta_tools/fasta_compute_length.py
tools/fasta_tools/fasta_filter_by_length.py
tools/fasta_tools/fasta_to_tabular.py

diffs (305 lines):

diff -r e76a153769d4 -r 6d849785ff86 test-data/fasta_to_tabular_out1.tabular

--- a/test-data/fasta_to_tabular_out1.tabular	Mon Jan 19 11:47:29 2009 -0500
+++ b/test-data/fasta_to_tabular_out1.tabular	Mon Jan 19 15:47:02 2009 -0500
@@ -14,5 +14,5 @@
 EYKX4VC01BB4QL length=57 xy=0431_0363 region=1 run=R_2007_11_07_16_15_57_	GGGGAGGAGCTAATAATATGCTCTTGGGGAGGAGCTAATTATATGCTCTTGGGGAGG
 EYKX4VC01BJ37M length=64 xy=0522_0192 region=1 run=R_2007_11_07_16_15_57_	TCGAGTATGTATCAAGGACTACATACAAATTTGCCAAAAGAGATTATGCACTATCCCGACTTCC
 EYKX4VC01BV9R8 length=54 xy=0660_2038 region=1 run=R_2007_11_07_16_15_57_	AAAACTCGGAGAAACTATTCAGCAGCACTGCGTTTCGCTGAATTTTAGACCGTT
+EYKX4VC01CEPP8 length=60 xy=0870_2350 region=1 run=R_2007_11_07_16_15_57_	CTGGGTGGGTGCACTACAGGAACGTCATTTGTTCAATCCTCACGTTGTTGTTAGTGTCAG
 EYKX4VC01BTLME length=78 xy=0630_0292 region=1 run=R_2007_11_07_16_15_57_	TTATCCACACGCTGTCCGGATCCAGCGCCAGGCGCCGACGCTGGACTTCCGCCGCCTGCGCCCAGTTGCCCTGACTTC
-EYKX4VC01CEPP8 length=60 xy=0870_2350 region=1 run=R_2007_11_07_16_15_57_	CTGGGTGGGTGCACTACAGGAACGTCATTTGTTCAATCCTCACGTTGTTGTTAGTGTCAG
diff -r e76a153769d4 -r 6d849785ff86 test-data/fasta_to_tabular_out3.tabular
--- a/test-data/fasta_to_tabular_out3.tabular	Mon Jan 19 11:47:29 2009 -0500
+++ b/test-data/fasta_to_tabular_out3.tabular	Mon Jan 19 15:47:02 2009 -0500
@@ -14,5 +14,5 @@
 EYKX4VC01BB4QL	GGGGAGGAGCTAATAATATGCTCTTGGGGAGGAGCTAATTATATGCTCTTGGGGAGG
 EYKX4VC01BJ37M	TCGAGTATGTATCAAGGACTACATACAAATTTGCCAAAAGAGATTATGCACTATCCCGACTTCC
 EYKX4VC01BV9R8	AAAACTCGGAGAAACTATTCAGCAGCACTGCGTTTCGCTGAATTTTAGACCGTT
+EYKX4VC01CEPP8	CTGGGTGGGTGCACTACAGGAACGTCATTTGTTCAATCCTCACGTTGTTGTTAGTGTCAG
 EYKX4VC01BTLME	TTATCCACACGCTGTCCGGATCCAGCGCCAGGCGCCGACGCTGGACTTCCGCCGCCTGCGCCCAGTTGCCCTGACTTC
-EYKX4VC01CEPP8	CTGGGTGGGTGCACTACAGGAACGTCATTTGTTCAATCCTCACGTTGTTGTTAGTGTCAG
diff -r e76a153769d4 -r 6d849785ff86 test-data/fasta_tool_compute_length_1.out
--- a/test-data/fasta_tool_compute_length_1.out	Mon Jan 19 11:47:29 2009 -0500
+++ b/test-data/fasta_tool_compute_length_1.out	Mon Jan 19 15:47:02 2009 -0500
@@ -14,5 +14,5 @@
 EYKX4VC01BB4QL length=57 xy=0431_0363 region=1 run=R_2007_11_07_16_15_57_	57
 EYKX4VC01BJ37M length=64 xy=0522_0192 region=1 run=R_2007_11_07_16_15_57_	64
 EYKX4VC01BV9R8 length=54 xy=0660_2038 region=1 run=R_2007_11_07_16_15_57_	54
+EYKX4VC01CEPP8 length=60 xy=0870_2350 region=1 run=R_2007_11_07_16_15_57_	60
 EYKX4VC01BTLME length=78 xy=0630_0292 region=1 run=R_2007_11_07_16_15_57_	78
-EYKX4VC01CEPP8 length=60 xy=0870_2350 region=1 run=R_2007_11_07_16_15_57_	60
diff -r e76a153769d4 -r 6d849785ff86 test-data/fasta_tool_compute_length_3.out
--- a/test-data/fasta_tool_compute_length_3.out	Mon Jan 19 11:47:29 2009 -0500
+++ b/test-data/fasta_tool_compute_length_3.out	Mon Jan 19 15:47:02 2009 -0500
@@ -14,5 +14,5 @@
 EYKX4VC01BB4QL	57
 EYKX4VC01BJ37M	64
 EYKX4VC01BV9R8	54
+EYKX4VC01CEPP8	60
 EYKX4VC01BTLME	78
-EYKX4VC01CEPP8	60
diff -r e76a153769d4 -r 6d849785ff86 test-data/fasta_tool_filter_length_1.out
--- a/test-data/fasta_tool_filter_length_1.out	Mon Jan 19 11:47:29 2009 -0500
+++ b/test-data/fasta_tool_filter_length_1.out	Mon Jan 19 15:47:02 2009 -0500
@@ -53,9 +53,9 @@
...
 >EYKX4VC01BV9R8 length=54 xy=0660_2038 region=1 run=R_2007_11_07_16_15_57_
 AAAACTCGGAGAAACTATTCAGCAGCACTGCGTTTCGCTGAATTTTAGAC
 CGTT
+>EYKX4VC01CEPP8 length=60 xy=0870_2350 region=1 run=R_2007_11_07_16_15_57_
+CTGGGTGGGTGCACTACAGGAACGTCATTTGTTCAATCCTCACGTTGTTG
+TTAGTGTCAG
 >EYKX4VC01BTLME length=78 xy=0630_0292 region=1 run=R_2007_11_07_16_15_57_
 TTATCCACACGCTGTCCGGATCCAGCGCCAGGCGCCGACGCTGGACTTCC
 GCCGCCTGCGCCCAGTTGCCCTGACTTC
->EYKX4VC01CEPP8 length=60 xy=0870_2350 region=1 run=R_2007_11_07_16_15_57_
-CTGGGTGGGTGCACTACAGGAACGTCATTTGTTCAATCCTCACGTTGTTG
-TTAGTGTCAG
diff -r e76a153769d4 -r 6d849785ff86 tools/fasta_tools/fasta_compute_length.py
--- a/tools/fasta_tools/fasta_compute_length.py	Mon Jan 19 11:47:29 2009 -0500
+++ b/tools/fasta_tools/fasta_compute_length.py	Mon Jan 19 15:47:02 2009 -0500
@@ -1,8 +1,8 @@
 #! /usr/bin/python
 """
-Input: fasta, minimal length, maximal length
-Output: fasta
-Return sequences whose lengths are within the range.
+Input: fasta, int
+Output: tabular
+Return titles with lengths of corresponding seq
 """
 import sys, os
@@ -10,41 +10,37 @@
 assert sys.version_info[:2] >= ( 2, 4 )
 
 def __main__():
-    input_filename = sys.argv[1]
-    output_filename = sys.argv[2]
+    
+    infile = sys.argv[1]
+    outfile = sys.argv[2]
     keep_first = int( sys.argv[3] )
-    tmp_title = tmp_seq = ''
-    tmp_seq_count = 0
-    seq_hash = {}
+    
+    fasta_title = fasta_seq = ''
 
+    # number of char to keep in the title
     if keep_first == 0:
         keep_first = None
     else:
         keep_first += 1    
 
-    for i, line in enumerate( file( input_filename ) ):
+    out = open(outfile, 'w')
+    
+    for i, line in enumerate( file( infile ) ):
         line = line.rstrip( '\r\n' )
         if not line or line.startswith( '#' ):
             continue
         if line[0] == '>':
-            if len( tmp_seq ) > 0:
-                tmp_seq_count += 1
-                seq_hash[ ( tmp_seq_count, tmp_title ) ] = tmp_seq
-            tmp_title = line
-            tmp_seq = ''
+            if len( fasta_seq ) > 0 :
+                out.write( "%s\t%d\n" % ( fasta_title[ 1:keep_first ], len( fasta_seq ) ) )
+            fasta_title = line
+            fasta_seq = ''
         else:
-            tmp_seq = "%s%s" % ( tmp_seq, line )
-            if line.split() and line.split()[0].isdigit():
-                tmp_seq = "%s " % tmp_seq
-    if len( tmp_seq ) > 0:
-        seq_hash[ ( tmp_seq_count, tmp_title ) ] = tmp_seq
-    
-    title_keys = seq_hash.keys()
-    title_keys.sort()
-    output_handle = open( output_filename, 'w' )
-    for i, fasta_title in title_keys:
-        tmp_seq = seq_hash[ ( i, fasta_title ) ]
-        output_handle.write( "%s\t%d\n" % ( fasta_title[ 1:keep_first ], len( tmp_seq ) ) )
-    output_handle.close()
+            fasta_seq = "%s%s" % ( fasta_seq, line )
+            
+    # check the last sequence
+    if len( fasta_seq ) > 0:
+        out.write( "%s\t%d\n" % ( fasta_title[ 1:keep_first ], len( fasta_seq ) ) )
+            
+    out.close()
 
 if __name__ == "__main__" : __main__()
\ No newline at end of file
diff -r e76a153769d4 -r 6d849785ff86 tools/fasta_tools/fasta_filter_by_length.py
--- a/tools/fasta_tools/fasta_filter_by_length.py	Mon Jan 19 11:47:29 2009 -0500
+++ b/tools/fasta_tools/fasta_filter_by_length.py	Mon Jan 19 15:47:02 2009 -0500
@@ -15,7 +15,7 @@
 
 def __main__():
     
-    input_filename = sys.argv[1]
+    infile = sys.argv[1]
     try:
         min_length = int( sys.argv[2] )
     except:
@@ -24,49 +24,62 @@
         max_length = int( sys.argv[3] )
     except:
         stop_err( "Maximum length of the return sequence requires a numerical value." )
-    output_filename = sys.argv[4]
-    tmp_title = tmp_seq = ''
-    tmp_seq_count = 0
-    seq_hash = {}
+    outfile = sys.argv[4]
+    fasta_title = fasta_seq = ''
+    at_least_one = 0
+    
+    out = open( outfile, 'w' )
 
-    for i, line in enumerate( file( input_filename ) ):
+    for i, line in enumerate( file( infile ) ):
         line = line.rstrip( '\r\n' )
         if not line or line.startswith( '#' ):
             continue
+        
         if line[0] == '>':
-            if len( tmp_seq ) > 0:
-                tmp_seq_count += 1
-                seq_hash[ ( tmp_seq_count, tmp_title ) ] = tmp_seq
-            tmp_title = line
-            tmp_seq = ''
+            if len( fasta_seq ) > 0:
+
+                if max_length <= 0: 
+                    compare_max_length = len( fasta_seq ) + 1
+                else:
+                    compare_max_length = max_length
+                    
+                l = len( fasta_seq )
+                
+                if l >= min_length and l <= compare_max_length:
+                    at_least_one += 1
+                    out.write( "%s\n" % fasta_title )
+                    c = 0
+                    s = fasta_seq
+                    while c < l:
+                        b = min( c + 50, l )
+                        out.write( "%s\n" % s[ c:b ] )   
+                        c = b
+                                        
+            fasta_title = line
+            fasta_seq = ''
         else:
-            tmp_seq = "%s%s" % ( tmp_seq, line ) 
-            if line.split()[0].isdigit():
-                tmp_seq = "%s " % tmp_seq
-    if len( tmp_seq ) > 0:
-        seq_hash[ ( tmp_seq_count, tmp_title ) ] = tmp_seq
+            fasta_seq = "%s%s" % ( fasta_seq, line ) 
     
-    title_keys = seq_hash.keys()
-    title_keys.sort()
-    output_handle = open( output_filename, 'w' )
-    at_least_one = 0
-    for i, fasta_title in title_keys:
-        tmp_seq = seq_hash[ ( i, fasta_title ) ]
+    if len( fasta_seq ) > 0:
+                
         if max_length <= 0: 
-            compare_max_length = len( tmp_seq ) + 1
+            compare_max_length = len( fasta_seq ) + 1
         else:
             compare_max_length = max_length
-        l = len( tmp_seq )
+            
+        l = len( fasta_seq )
+        
         if l >= min_length and l <= compare_max_length:
             at_least_one += 1
-            output_handle.write( "%s\n" % fasta_title )
+            out.write( "%s\n" % fasta_title )
             c = 0
-            s = tmp_seq
+            s = fasta_seq
             while c < l:
                 b = min( c + 50, l )
-                output_handle.write( "%s\n" % s[ c:b ] )   
+                out.write( "%s\n" % s[ c:b ] )   
                 c = b
-    output_handle.close()
+
+    out.close()
 
     if at_least_one == 0:
         print "There is no sequence that falls within your range."
diff -r e76a153769d4 -r 6d849785ff86 tools/fasta_tools/fasta_to_tabular.py
--- a/tools/fasta_tools/fasta_to_tabular.py	Mon Jan 19 11:47:29 2009 -0500
+++ b/tools/fasta_tools/fasta_to_tabular.py	Mon Jan 19 15:47:02 2009 -0500
@@ -1,9 +1,9 @@
 #! /usr/bin/python
 # This code exists in 2 places: ~/datatypes/converters and ~/tools/fasta_tools
 """
-Input: fasta, minimal length, maximal length
-Output: fasta
-Return sequences whose lengths are within the range.
+Input: fasta, int
+Output: tabular
+format convert: fasta to tabular
 """
 
 import sys, os
@@ -14,38 +14,31 @@
     infile = sys.argv[1]
     outfile = sys.argv[2]
     keep_first = int( sys.argv[3] )
-    title = ''
-    sequence = ''
-    sequence_count = 0
+    fasta_title = fasta_seq = ''
     
     if keep_first == 0:
         keep_first = None
     else:
         keep_first += 1
 
+    out = open( outfile, 'w' )
+    
     for i, line in enumerate( open( infile ) ):
         line = line.rstrip( '\r\n' )
         if not line or line.startswith( '#' ):
             continue
         if line.startswith( '>' ):
-            if sequence:
-                sequence_count += 1
-                seq_hash[( sequence_count, title )] = sequence
-            title = line
-            sequence = ''
+            if fasta_seq:
+                out.write( "%s\t%s\n" %( fasta_title[ 1:keep_first ], fasta_seq ) )
+            fasta_title = line
+            fasta_seq = ''
         else:
-            sequence = "%s%s" % ( sequence, line )
-            if line.split() and line.split()[0].isdigit():
-                sequence += ' '
-    if sequence:
-        seq_hash[( sequence_count, title )] = sequence
-    # return only those lengths are in the range
-    title_keys = seq_hash.keys()
-    title_keys.sort()
-    out = open( outfile, 'w' )
-    for i, fasta_title in title_keys:
-        sequence = seq_hash[( i, fasta_title )]
-        out.write( "%s\t%s\n" %( fasta_title[ 1:keep_first ], sequence ) )
+            if line:
+                fasta_seq = "%s%s" % ( fasta_seq, line )
+
+    if fasta_seq:
+        out.write( "%s\t%s\n" %( fasta_title[ 1:keep_first ], fasta_seq ) )
+                
     out.close()
 
 if __name__ == "__main__" : __main__()
\ No newline at end of file

    

[galaxy-dev] [hg] galaxy 1713: remove the usage of hash tables in the fasta-t...

Greg Von Kuster