commit/galaxy-central: kanwei: [Peter Cock] Filtering tool: Only cast used columns. Fix valid lines and skipped info lines. [Kanwei] Add new testcases to test this behavior. Fixes #537. Fixes #535
1 new changeset in galaxy-central:

http://bitbucket.org/galaxy/galaxy-central/changeset/cc686fadddbc/
changeset: cc686fadddbc
branches:
user: kanwei
date: 2011-05-27 10:47:17
summary: [Peter Cock] Filtering tool: Only cast used columns. Fix valid lines and skipped info lines. [Kanwei] Add new testcases to test this behavior. Fixes #537. Fixes #535
affected #: 4 files (364 bytes)

--- a/tools/stats/filtering.py Thu May 26 13:29:00 2011 -0400
+++ b/tools/stats/filtering.py Fri May 27 10:47:17 2011 +0200
@@ -32,7 +32,7 @@
 cond_text = sys.argv[3]
 try:
     in_columns = int( sys.argv[4] )
-    assert sys.argv[5] #check to see that the column types varaible isn't null
+    assert sys.argv[5] #check to see that the column types variable isn't null
     in_column_types = sys.argv[5].split( ',' )
 except:
     stop_err( "Data does not appear to be tabular. This tool can only be used with tab-delimited data." )
@@ -60,22 +60,25 @@
     except:
         if operand in secured:
             stop_err( "Illegal value '%s' in condition '%s'" % ( operand, cond_text ) )
-
-# Find the largest column used in the filter.
-largest_col_index = -1
-for match in re.finditer( 'c(\d)+', cond_text ):
-    col_index = int( match.group()[1:] )
-    if col_index > largest_col_index:
-        largest_col_index = col_index
+
+# Work out which columns are used in the filter (save using 1 based counting)
+used_cols = sorted(set(int(match.group()[1:]) \
+                   for match in re.finditer('c(\d)+', cond_text)))
+largest_col_index = max(used_cols)
 
 # Prepare the column variable names and wrappers for column data types. Only
-# prepare columns up to largest column in condition.
+# cast columns used in the filter.
 cols, type_casts = [], []
 for col in range( 1, largest_col_index + 1 ):
     col_name = "c%d" % col
     cols.append( col_name )
     col_type = in_column_types[ col - 1 ]
-    type_cast = "%s(%s)" % ( col_type, col_name )
+    if col in used_cols:
+        type_cast = "%s(%s)" % ( col_type, col_name )
+    else:
+        #If we don't use this column, don't cast it.
+        #Otherwise we get errors on things like optional integer columns.
+        type_cast = col_name
     type_casts.append( type_cast )
 
 col_str = ', '.join( cols ) # 'c1, c2, c3, c4'
@@ -83,6 +86,7 @@
 assign = "%s, = line.split( '\\t' )[:%i]" % ( col_str, largest_col_index )
 wrap = "%s = %s" % ( col_str, type_cast_str )
 skipped_lines = 0
+invalid_lines = 0
 first_invalid_line = 0
 invalid_line = None
 lines_kept = 0
@@ -96,9 +100,6 @@
     line = line.rstrip( '\\r\\n' )
     if not line or line.startswith( '#' ):
         skipped_lines += 1
-        if not invalid_line:
-            first_invalid_line = i + 1
-            invalid_line = line
         continue
     try:
         %s
@@ -107,7 +108,7 @@
             lines_kept += 1
             print >> out, line
     except:
-        skipped_lines += 1
+        invalid_lines += 1
         if not invalid_line:
             first_invalid_line = i + 1
             invalid_line = line
@@ -132,5 +133,7 @@
         print 'kept %4.2f%% of %d lines.' % ( 100.0*lines_kept/valid_lines, total_lines )
     else:
         print 'Possible invalid filter condition "%s" or non-existent column referenced. See tool tips, syntax and examples.' % cond_text
-    if skipped_lines > 0:
-        print 'Skipped %d invalid lines starting at line #%d: "%s"' % ( skipped_lines, first_invalid_line, invalid_line )
+    if invalid_lines:
+        print 'Skipped %d invalid line(s) starting at line #%d: "%s"' % ( invalid_lines, first_invalid_line, invalid_line )
+    if skipped_lines:
+        print 'Skipped %i comment (starting with #) or blank line(s)' % skipped_lines
--- a/tools/stats/filtering.xml Thu May 26 13:29:00 2011 -0400
+++ b/tools/stats/filtering.xml Fri May 27 10:47:17 2011 +0200
@@ -1,4 +1,4 @@
-<tool id="Filter1" name="Filter" version="1.0.1">
+<tool id="Filter1" name="Filter" version="1.1.0">
   <description>data on any column using simple expressions</description>
   <command interpreter="python">
     filtering.py $input $out_file1 "$cond" ${input.metadata.columns} "${input.metadata.column_types}"
@@ -29,7 +29,11 @@
       <param name="cond" value="c3=='chr1' and c5>5"/>
       <output name="out_file1" file="filter1_test3.sam"/>
     </test>
-
+    <test>
+      <param name="input" value="filter1_inbad.bed"/>
+      <param name="cond" value="c1=='chr22'"/>
+      <output name="out_file1" file="filter1_test4.bed"/>
+    </test>
   </tests>
   <help>
 

Repository URL: https://bitbucket.org/galaxy/galaxy-central/

--

This is a commit notification from bitbucket.org. You are receiving this because you have the service enabled, addressing the recipient of this email.
participants (1)
-
Bitbucket