commit/galaxy-central: kanwei: [Peter Cock] Filtering tool: Only cast used columns. Fix valid lines and skipped info lines. [Kanwei] Add new testcases to test this behavior. Fixes #537. Fixes #535

27 May 2011

1 new changeset in galaxy-central:

http://bitbucket.org/galaxy/galaxy-central/changeset/cc686fadddbc/
changeset:   cc686fadddbc
branches:    
user:        kanwei
date:        2011-05-27 10:47:17
summary:     [Peter Cock] Filtering tool: Only cast used columns. Fix valid lines and skipped info lines. [Kanwei] Add new testcases to test this behavior. Fixes #537. Fixes #535
affected #:  4 files (364 bytes)

--- a/tools/stats/filtering.py	Thu May 26 13:29:00 2011 -0400
+++ b/tools/stats/filtering.py	Fri May 27 10:47:17 2011 +0200
@@ -32,7 +32,7 @@
 cond_text = sys.argv[3]
 try:
     in_columns = int( sys.argv[4] )
-    assert sys.argv[5]  #check to see that the column types varaible isn't null
+    assert sys.argv[5]  #check to see that the column types variable isn't null
     in_column_types = sys.argv[5].split( ',' )
 except:
     stop_err( "Data does not appear to be tabular.  This tool can only be used with tab-delimited data." )
@@ -60,22 +60,25 @@
     except:
         if operand in secured:
             stop_err( "Illegal value '%s' in condition '%s'" % ( operand, cond_text ) )
-            
-# Find the largest column used in the filter.
-largest_col_index = -1
-for match in re.finditer( 'c(\d)+', cond_text ):
-    col_index = int( match.group()[1:] )
-    if col_index > largest_col_index:
-        largest_col_index = col_index
+
+# Work out which columns are used in the filter (save using 1 based counting)
+used_cols = sorted(set(int(match.group()[1:]) \
+                   for match in re.finditer('c(\d)+', cond_text))) 
+largest_col_index = max(used_cols)
 
 # Prepare the column variable names and wrappers for column data types. Only 
-# prepare columns up to largest column in condition.
+# cast columns used in the filter.
 cols, type_casts = [], []
 for col in range( 1, largest_col_index + 1 ):
     col_name = "c%d" % col
     cols.append( col_name )
     col_type = in_column_types[ col - 1 ]
-    type_cast = "%s(%s)" % ( col_type, col_name )
+    if col in used_cols:
+        type_cast = "%s(%s)" % ( col_type, col_name )
+    else:
+        #If we don't use this column, don't cast it.
+        #Otherwise we get errors on things like optional integer columns.
+        type_cast = col_name
     type_casts.append( type_cast )
  
 col_str = ', '.join( cols )    # 'c1, c2, c3, c4'
@@ -83,6 +86,7 @@
 assign = "%s, = line.split( '\\t' )[:%i]" % ( col_str, largest_col_index )
 wrap = "%s = %s" % ( col_str, type_cast_str )
 skipped_lines = 0
+invalid_lines = 0
 first_invalid_line = 0
 invalid_line = None
 lines_kept = 0
@@ -96,9 +100,6 @@
     line = line.rstrip( '\\r\\n' )
     if not line or line.startswith( '#' ):
         skipped_lines += 1
-        if not invalid_line:
-            first_invalid_line = i + 1
-            invalid_line = line
         continue
     try:
         %s
@@ -107,7 +108,7 @@
             lines_kept += 1
             print >> out, line
     except:
-        skipped_lines += 1
+        invalid_lines += 1
         if not invalid_line:
             first_invalid_line = i + 1
             invalid_line = line
@@ -132,5 +133,7 @@
         print 'kept %4.2f%% of %d lines.' % ( 100.0*lines_kept/valid_lines, total_lines )
     else:
         print 'Possible invalid filter condition "%s" or non-existent column referenced. See tool tips, syntax and examples.' % cond_text
-    if skipped_lines > 0:
-        print 'Skipped %d invalid lines starting at line #%d: "%s"' % ( skipped_lines, first_invalid_line, invalid_line )
+    if invalid_lines:
+        print 'Skipped %d invalid line(s) starting at line #%d: "%s"' % ( invalid_lines, first_invalid_line, invalid_line )
+    if skipped_lines:
+        print 'Skipped %i comment (starting with #) or blank line(s)' % skipped_lines


--- a/tools/stats/filtering.xml	Thu May 26 13:29:00 2011 -0400
+++ b/tools/stats/filtering.xml	Fri May 27 10:47:17 2011 +0200
@@ -1,4 +1,4 @@
-<tool id="Filter1" name="Filter" version="1.0.1">
+<tool id="Filter1" name="Filter" version="1.1.0">
   <description>data on any column using simple expressions</description>
   <command interpreter="python">
     filtering.py $input $out_file1 "$cond" ${input.metadata.columns} "${input.metadata.column_types}"
@@ -29,7 +29,11 @@
       <param name="cond" value="c3=='chr1' and c5>5"/>
       <output name="out_file1" file="filter1_test3.sam"/>
     </test>
-    
+    <test>
+      <param name="input" value="filter1_inbad.bed"/>
+      <param name="cond" value="c1=='chr22'"/>
+      <output name="out_file1" file="filter1_test4.bed"/>
+    </test>
   </tests>
   <help>

Repository URL: https://bitbucket.org/galaxy/galaxy-central/

--

This is a commit notification from bitbucket.org. You are receiving
this because you have the service enabled, addressing the recipient of
this email.

    

Bitbucket

tags

participants (1)