[hg] galaxy 2410: Add the ability for the tabular join tool to fill in empty columns.
details: http://www.bx.psu.edu/hg/galaxy/rev/a8d4ab6b6dd3 changeset: 2410:a8d4ab6b6dd3 user: Dan Blankenberg <dan@bx.psu.edu> date: Thu May 14 12:55:14 2009 -0400 description: Add the ability for the tabular join tool to fill in empty columns. Working tests are needed. 2 file(s) affected in this change: tools/filters/join.py tools/filters/joiner.xml diffs (235 lines): diff -r 25d736f5cbf7 -r a8d4ab6b6dd3 tools/filters/join.py --- a/tools/filters/join.py Wed May 13 21:54:45 2009 -0400 +++ b/tools/filters/join.py Thu May 14 12:55:14 2009 -0400 @@ -8,8 +8,21 @@ """ -import optparse, os, sys, tempfile, struct +import optparse, os, sys, tempfile, struct import psyco_full + +try: + simple_json_exception = None + from galaxy import eggs + from galaxy.util.bunch import Bunch + from galaxy.util import stringify_dictionary_keys + import pkg_resources + pkg_resources.require("simplejson") + import simplejson +except Exception, e: + simplejson_exception = e + simplejson = None + class OffsetList: def __init__( self, filesize = 0, fmt = None ): @@ -235,7 +248,22 @@ for offset in self.buffered_offsets[identifier]: yield self.index.get_line_by_offset( offset ) -def join_files( filename1, column1, filename2, column2, out_filename, split = None, buffer = 1000000, keep_unmatched = False, keep_partial = False, index_depth = 3 ): + +def fill_empty_columns( line, split, fill_values ): + if not fill_values: + return line + filled_columns = [] + for i, field in enumerate( line.split( split ) ): + if field or i >= len( fill_values ): + filled_columns.append( field ) + else: + filled_columns.append( fill_values[i] ) + if len( fill_values ) > len( filled_columns ): + filled_columns.extend( fill_values[ len( filled_columns ) : ] ) + return split.join( filled_columns ) + + +def join_files( filename1, column1, filename2, column2, out_filename, split = None, buffer = 1000000, keep_unmatched = False, keep_partial = False, index_depth = 3, fill_options = None ): #return identifier based upon line 
def get_identifier_by_line( line, column, split = None ): if isinstance( line, str ): @@ -243,6 +271,8 @@ if column < len( fields ): return fields[column] return None + if fill_options is None: + fill_options = Bunch( fill_unjoined_only = True, file1_columns = None, file2_columns = None ) out = open( out_filename, 'w+b' ) index = BufferedIndex( filename2, column2, split, buffer, index_depth ) for line1 in open( filename1, 'rb' ): @@ -250,12 +280,21 @@ if identifier: written = False for line2 in index.get_lines_by_identifier( identifier ): - out.write( "%s%s%s\n" % ( line1.rstrip( '\r\n' ), split, line2.rstrip( '\r\n' ) ) ) + if not fill_options.fill_unjoined_only: + out.write( "%s%s%s\n" % ( fill_empty_columns( line1.rstrip( '\r\n' ), split, fill_options.file1_columns ), split, fill_empty_columns( line2.rstrip( '\r\n' ), split, fill_options.file2_columns ) ) ) + else: + out.write( "%s%s%s\n" % ( line1.rstrip( '\r\n' ), split, line2.rstrip( '\r\n' ) ) ) written = True if not written and keep_unmatched: - out.write( "%s\n" % ( line1.rstrip( '\r\n' ) ) ) + out.write( fill_empty_columns( line1.rstrip( '\r\n' ), split, fill_options.file1_columns ) ) + if fill_options: + out.write( fill_empty_columns( "", split, fill_options.file2_columns ) ) + out.write( "\n" ) elif keep_partial: - out.write( "%s\n" % ( line1.rstrip( '\r\n' ) ) ) + out.write( fill_empty_columns( line1.rstrip( '\r\n' ), split, fill_options.file1_columns ) ) + if fill_options: + out.write( fill_empty_columns( "", split, fill_options.file2_columns ) ) + out.write( "\n" ) out.close() def main(): @@ -284,8 +323,32 @@ dest='keep_unmatched', default=False, help='Keep rows in first input which are not joined with the second input.') + parser.add_option( + '-f','--fill_options_file', + dest='fill_options_file', + type='str',default=None, + help='Fill empty columns with a values from a JSONified file.') + options, args = parser.parse_args() + + fill_options = None + if options.fill_options_file is not None: + 
try: + if simplejson is None: + raise simplejson_exception + fill_options = Bunch( **stringify_dictionary_keys( simplejson.load( open( options.fill_options_file ) ) ) ) #simplejson.load( open( options.fill_options_file ) ) + except Exception, e: + print "Warning: Ignoring fill options due to simplejson error (%s)." % e + if fill_options is None: + fill_options = Bunch() + if 'fill_unjoined_only' not in fill_options: + fill_options.fill_unjoined_only = True + if 'file1_columns' not in fill_options: + fill_options.file1_columns = None + if 'file2_columns' not in fill_options: + fill_options.file2_columns = None + try: filename1 = args[0] @@ -300,6 +363,6 @@ #Character for splitting fields and joining lines split = "\t" - return join_files( filename1, column1, filename2, column2, out_filename, split, options.buffer, options.keep_unmatched, options.keep_partial, options.index_depth ) + return join_files( filename1, column1, filename2, column2, out_filename, split, options.buffer, options.keep_unmatched, options.keep_partial, options.index_depth, fill_options = fill_options ) if __name__ == "__main__": main() diff -r 25d736f5cbf7 -r a8d4ab6b6dd3 tools/filters/joiner.xml --- a/tools/filters/joiner.xml Wed May 13 21:54:45 2009 -0400 +++ b/tools/filters/joiner.xml Thu May 14 12:55:14 2009 -0400 @@ -1,6 +1,6 @@ -<tool id="join1" name="Join two Queries" version="2.0.0"> +<tool id="join1" name="Join two Queries" version="2.0.1"> <description>side by side on a specified field</description> - <command interpreter="python">join.py $input1 $input2 $field1 $field2 $out_file1 $unmatched $partial --index_depth=3 --buffer=50000000</command> + <command interpreter="python">join.py $input1 $input2 $field1 $field2 $out_file1 $unmatched $partial --index_depth=3 --buffer=50000000 --fill_options_file=$fill_options_file</command> <inputs> <param format="tabular" name="input1" type="data" label="Join"/> <param name="field1" label="using column" type="data_column" data_ref="input1" /> @@ 
-14,7 +14,66 @@ <option value="-p">Yes</option> <option value="" selected="true">No</option> </param> + <conditional name="fill_empty_columns"> + <param name="fill_empty_columns_switch" type="select" label="Fill empty columns"> + <option value="no_fill" selected="True">No</option> + <option value="fill_empty">Yes</option> + </param> + <when value="no_fill"> + <!-- do nothing --> + </when> + <when value="fill_empty"> + <param type="select" name="fill_columns_by" label="Only fill unjoined rows"> + <option value="fill_unjoined_only" selected="True">Yes</option> + <option value="fill_all">No</option> + </param> + <conditional name="do_fill_empty_columns"> + <param name="column_fill_type" type="select" label="Fill Columns by"> + <option value="single_fill_value" selected="True">Single fill value</option> + <option value="fill_value_by_column">Values by column</option> + </param> + <when value="single_fill_value"> + <param type="text" name="fill_value" label="Fill value" value="."/> + </when> + <when value="fill_value_by_column"> + <repeat name="column_fill1" title="Fill Column for Input 1"> + <param name="column_number1" label="Column" type="data_column" data_ref="input1" /> + <param type="text" name="fill_value1" value="."/> + </repeat> + <repeat name="column_fill2" title="Fill Column for Input 2"> + <param name="column_number2" label="Column" type="data_column" data_ref="input2" /> + <param type="text" name="fill_value2" value="."/> + </repeat> + </when> + </conditional> + </when> + </conditional> </inputs> + <configfiles> + <configfile name="fill_options_file"><% +import simplejson +%>#set $__fill_options = {} +#if $fill_empty_columns['fill_empty_columns_switch'] == 'fill_empty': +#set $__fill_options['fill_unjoined_only'] = $fill_empty_columns['fill_columns_by'].value == 'fill_unjoined_only' +#if $fill_empty_columns['do_fill_empty_columns']['column_fill_type'] == 'single_fill_value': +#set $__start_fill = 
$fill_empty_columns['do_fill_empty_columns']['fill_value'].value +#else: +#set $__start_fill = "" +#end if +#set $__fill_options['file1_columns'] = [ $__start_fill for i in range( int( $input1.metadata.columns ) ) ] +#set $__fill_options['file2_columns'] = [ $__start_fill for i in range( int( $input2.metadata.columns ) ) ] +#if $fill_empty_columns['do_fill_empty_columns']['column_fill_type'] == 'fill_value_by_column': +#for column_fill1 in $fill_empty_columns['do_fill_empty_columns']['column_fill1']: +#set $__fill_options['file1_columns'][ int( column_fill1['column_number1'].value ) - 1 ] = column_fill1['fill_value1'].value +#end for +#for column_fill2 in $fill_empty_columns['do_fill_empty_columns']['column_fill2']: +#set $__fill_options['file2_columns'][ int( column_fill2['column_number2'].value ) - 1 ] = column_fill2['fill_value2'].value +#end for +#end if +#end if +${simplejson.dumps( __fill_options )} + </configfile> + </configfiles> <outputs> <data format="input" name="out_file1" metadata_source="input1" /> </outputs> @@ -26,6 +85,7 @@ <param name="field2" value="2"/> <param name="unmatched" value=""/> <param name="partial" value=""/> + <param name="fill_empty_columns_switch" value="no_fill"/> <output name="out_file1" file="joiner_out1.bed"/> </test> <test> @@ -35,8 +95,22 @@ <param name="field2" value="2"/> <param name="unmatched" value="Yes"/> <param name="partial" value="Yes"/> + <param name="fill_empty_columns_switch" value="no_fill"/> <output name="out_file1" file="joiner_out2.bed"/> </test> +<!--this throws an exception in the test framework - we need tests for the fill empty columns enhancements <test> + <param name="input1" value="1.bed"/> + <param name="input2" value="2.bed"/> + <param name="field1" value="2"/> + <param name="field2" value="2"/> + <param name="unmatched" value="Yes"/> + <param name="partial" value="Yes"/> + <param name="fill_empty_columns_switch" value="fill_empty"/> + <param name="fill_columns_by" value="fill_all"/> + <param 
name="column_fill_type" value="single_fill_value"/> + <param name="fill_value" value="~"/> + <output name="out_file1" file="joiner_out3.bed"/> + </test>--> </tests> <help>
participants (1)
-
Nate Coraor