#!/usr/bin/env python
# tools/new_operations/column_join.py, added in Galaxy changeset 3568:5bbd895570d4.

"""
Join two or more tab-delimited files (e.g. pileups) on their left-most
"hinge" columns, writing the selected data columns of every input side by
side on one output line per hinge value.

Every input is first sorted by reference name and position. Lines whose
first two columns are not a reference name followed by an integer position
are skipped.

usage: %prog output input1 input2 hinge column1[,column2[,column3[,...]]] [other_input1 [other_input2 [other_input3 ...]]]
    output: the output pileup
    input1: the pileup file to start with
    input2: the second pileup file to join
    hinge: the number of left-most columns to be used for matching
    columns: the columns that should appear in the output
    other_inputs: the other input files to join

"""

import os
import re
import sys
import tempfile

USAGE = ('usage: column_join.py output input1 input2 hinge '
         'column1[,column2[,...]] [other_input1 [other_input2 ...]]\n')

# Leading non-digit text followed by an optional run of digits ('chr' + '10').
# Both parts are optional, so this pattern matches every string.
_REF_PATTERN = re.compile(r'(?P<text>\D*)(?P<number>\d+)?')


def stop_err(msg):
    """Write ``msg`` to stderr and terminate with a non-zero exit status."""
    sys.stderr.write(msg)
    sys.exit(1)


def _ref_sort_key(ref):
    """
    Return a sort key for a reference name: text part, then number part.

    Names without a number sort before names with one ('chr' < 'chr1'), and
    the raw name is the final tie-breaker so that only identical names
    compare equal ('chr1' != 'chr1_random').
    """
    match = _REF_PATTERN.match(ref)
    text = match.group('text').strip()
    number = match.group('number')
    if number is None:
        return (text, 0, 0, ref)
    return (text, 1, int(number), ref)


def ref_compare(ref1, ref2):
    """
    Compare items like 'chr10' and 'chrM' or 'scaffold2' and 'scaffold10' so
    that the first part is handled as text but the last part as a number.

    Returns -1, 0 or 1 (cmp-style). An empty name sorts first, a name
    without a number sorts before the same text with a number, and 0 is
    returned only for identical names.
    """
    key1 = _ref_sort_key(ref1)
    key2 = _ref_sort_key(ref2)
    return (key1 > key2) - (key1 < key2)


def ref_pos_sort(infile, outfile):
    """
    Sort ``infile`` logically (reference as text + number, then numeric
    position) into ``outfile``.

    Only the byte offset of each line is kept in memory; lines are re-read
    from ``infile`` in sorted order. Blank lines are ignored. Lines without
    two columns or with a non-integer second column are skipped. If a
    (reference, position) pair occurs more than once, the last occurrence
    wins. Every written line is newline-terminated.

    Returns the number of skipped (malformed) lines.
    """
    ref_locs = {}
    skipped = 0
    with open(infile, 'rb') as fin:
        while True:
            # Record the offset *before* reading, so it is exact whatever
            # the line ending or surrounding whitespace looks like.
            offset = fin.tell()
            line = fin.readline()
            if not line:
                break
            if not line.strip():
                continue
            fields = line.split(b'\t')
            try:
                ref_seq, ref_loc = fields[0], int(fields[1])
            except (IndexError, ValueError):
                skipped += 1
                continue
            ref_locs.setdefault(ref_seq, {})[ref_loc] = offset

        # latin-1 maps every byte to one code point, so decoding never fails.
        ordered_refs = sorted(
            ref_locs, key=lambda ref: _ref_sort_key(ref.decode('latin-1')))
        with open(outfile, 'wb') as fout:
            for ref_seq in ordered_refs:
                offsets = ref_locs[ref_seq]
                for ref_loc in sorted(offsets):
                    fin.seek(offsets[ref_loc])
                    line = fin.readline()
                    if not line.endswith(b'\n'):
                        # Final input line had no newline; do not let it
                        # fuse with the next line written.
                        line += b'\n'
                    fout.write(line)
    return skipped


def min_chr_pos(chr_pos):
    """
    Given the hinge strings ('ref<TAB>pos') of the current line of every
    file, identify the 'smallest' one, scanning from left to right.

    Blank entries (exhausted files) are ignored and ties go to the leftmost
    entry. Returns ``(hinge, index)`` where ``hinge`` is 'ref<TAB>pos' with
    the position normalised through ``int``. If there is no non-blank entry
    the result is ``('', len(chr_pos))``.
    """
    best = None
    min_loc = len(chr_pos)
    for loc, c_pos in enumerate(chr_pos):
        if not c_pos.strip():
            continue
        ref, pos = c_pos.split('\t')[:2]
        pos = int(pos)
        if best is not None:
            ref_comp = ref_compare(ref, best[0])
            if ref_comp > 0 or (ref_comp == 0 and pos >= best[1]):
                continue
        best = (ref, pos)
        min_loc = loc
    if best is None:
        return '', min_loc
    return '%s\t%s' % best, min_loc


def _join_sorted(sorted_files, fout, hinge, data_cols):
    """
    Merge already-sorted open files into ``fout``, one row per hinge value.

    ``data_cols`` are the 1-based columns copied from each file. A file that
    has no line for a hinge value contributes empty cells only when a later
    file does have one, which keeps the columns vertically aligned.
    """
    current_lines = [f.readline() for f in sorted_files]
    old_chr_pos = None
    last_loc = -1
    while ''.join(current_lines).strip():
        # The "minimum" hinge comes first; loc is the file it came from.
        hinges = ['\t'.join(line.split('\t')[:hinge])
                  for line in current_lines]
        chr_pos, loc = min_chr_pos(hinges)
        if loc >= len(current_lines):
            # No parsable hinge left: stop rather than loop forever.
            break
        new_row = chr_pos != old_chr_pos
        if new_row:
            last_loc = -1
        # Empty cells for the files skipped since the previous write.
        row = [''] * ((loc - last_loc - 1) * len(data_cols))
        fields = current_lines[loc].rstrip('\r\n').split('\t')
        for col in data_cols:
            row.append(fields[col - 1] if col <= len(fields) else '')
        current_lines[loc] = sorted_files[loc].readline()
        if new_row:
            if old_chr_pos is not None:
                fout.write('\n')
            fout.write('%s\t%s' % (chr_pos, '\t'.join(row)))
        else:
            fout.write('\t%s' % '\t'.join(row))
        old_chr_pos = chr_pos
        last_loc = loc
    fout.write('\n')


def __main__():
    """Parse ``sys.argv``, sort every input and write the joined output."""
    if len(sys.argv) < 6:
        stop_err(USAGE)
    output, input1, input2 = sys.argv[1:4]
    try:
        hinge = int(sys.argv[4])
        cols = [int(c) for c in sys.argv[5].split(',')]
    except ValueError:
        stop_err('The hinge and the columns must be integers\n')
    if hinge < 2:
        # Matching needs a reference column and a numeric position column.
        stop_err('The hinge must cover at least the first two columns\n')
    data_cols = [col for col in cols if col > hinge]
    if not data_cols:
        stop_err('You need to select at least one column in addition to the first two\n')

    # Make sure all files are sorted in the same way, ascending.
    sorted_names = []
    sorted_files = []
    try:
        for in_file in [input1, input2] + sys.argv[6:]:
            handle, tmp_name = tempfile.mkstemp()
            os.close(handle)
            sorted_names.append(tmp_name)
            ref_pos_sort(in_file, tmp_name)
        for tmp_name in sorted_names:
            sorted_files.append(open(tmp_name, 'r'))
        with open(output, 'w') as fout:
            _join_sorted(sorted_files, fout, hinge, data_cols)
    finally:
        for tmp_file in sorted_files:
            tmp_file.close()
        for tmp_name in sorted_names:
            os.unlink(tmp_name)


if __name__ == "__main__":
    __main__()
Select 2 for pileup" /> + <param name="input2" type="data" format="tabular" label="Choose the second file for the join" /> + <repeat name="file_chooser" title="Additional Input"> + <param name="input" label="Additional input file" type="data" format="tabular" /> + </repeat> + </inputs> + <outputs> + <data name="output" format="pileup" /> + </outputs> + <tests> + <test> + <param name="input1" value="column_join_in1.pileup" ftype="pileup" /> + <param name="columns" value="1,2,3,4,5,7" /> + <param name="hinge" value="1,2" /> + <param name="input2" value="column_join_in2.pileup" ftype="pileup" /> + <param name="input" value="column_join_in3.pileup" ftype="pileup" /> + <output name="output" file="column_join_out1.pileup" ftype="pileup" /> + </test> + <test> + <param name="input1" value="column_join_in4.pileup" ftype="pileup" /> + <param name="columns" value="1,2,3,4" /> + <param name="hinge" value="1,2" /> + <param name="input2" value="column_join_in5.pileup" ftype="pileup" /> + <param name="input" value="column_join_in6.pileup" ftype="pileup" /> + <output name="output" file="column_join_out2.pileup" ftype="pileup" /> + </test> + <test> + <param name="input1" value="column_join_in7.pileup" ftype="pileup" /> + <param name="columns" value="1,2,3,4,5" /> + <param name="hinge" value="1,2" /> + <param name="input2" value="column_join_in8.pileup" ftype="pileup" /> + <param name="input" value="column_join_in9.pileup" ftype="pileup" /> + <output name="output" file="column_join_out3.pileup" ftype="pileup" /> + </test> + </tests> + <help> +**What it does** + +This tool allows you to join several files with the same column structure into one file, removing certain columns if necessary. The user needs to select a 'hinge', which is the number of left-most columns to match on. They also need to select the columns to include in the join, which should include the hinge columns, too. 
+ +----- + +**Example** + +Given the following files:: + + FILE 1 + chr2 1 T 6 .C..., I$$III + chr2 2 G 6 ..N.., III@II + chr2 3 C 7 ..C..., I$IIIII + chr2 4 G 7 .G...., I#IIIII + chr2 5 G 7 ...N.., IIII#BI + chr2 6 A 7 ..T..., I$IDIII + chr1 1 C 1 ^:. I + chr1 2 G 2 .^:. $I + chr1 3 A 2 .. I% + chr1 4 C 2 .. I$ + chr1 5 T 3 ..^:. I#I + chr1 6 G 3 ..^:, I#I + + FILE 2 + chr1 3 T 1 ^:. I + chr1 4 G 2 .^:. $I + chr1 5 T 2 .. I% + chr1 6 C 3 ..^:. III + chr1 7 G 3 ..^:. I#I + chr1 8 T 4 ...^:, I#II + chr2 77 C 6 .G..., I$$III + chr2 78 G 6 ..N.., III@II + chr2 79 T 7 ..N..., I$IIIII + chr2 80 C 7 .G...., I#IIIII + chr2 81 G 7 ...A.., IIII#BI + chr2 82 A 8 ...G..., I$IDIIII + chr2 83 T 8 .A.....N IIIIIIII + chr2 84 A 9 ......T. I$IIIIIII + + FILE 3 + chr1 1 A 1 . I + chr1 2 T 2 G. I$ + chr1 3 C 2 ., I@ + chr1 4 C 3 ..N III + chr1 42 C 5 ...N^:. III@I + chr1 43 C 5 .N..^:. IIIII + chr1 44 T 5 .A.., IA@II + chr1 45 A 6 .N...^:. IIIII$ + chr1 46 G 6 .GN..^:. I@IIII + chr1 47 A 7 ....^:.., IIIII$I + chr2 73 T 5 .N.., II$II + chr2 74 A 5 ...., IIIII + chr2 75 T 5 ...., IIIII + chr2 76 T 5 ...., IIIII + chr2 77 C 5 ...., IIIBI + chr2 78 T 5 ...., IDIII + +To join on columns 3 and 4 combining on columns 1 and 2, columns 1-4 should be selected for the 'Include these columns' option, and column 2 selected for the 'hinge'. With these settings, the following would be output:: + + chr1 1 C 1 A 1 + chr1 2 G 2 T 2 + chr1 3 A 2 T 1 C 2 + chr1 4 C 2 G 2 C 3 + chr1 5 T 3 T 2 + chr1 6 G 3 C 3 + chr1 7 G 3 + chr1 8 T 4 + chr1 42 C 5 + chr1 43 C 5 + chr1 44 T 5 + chr1 45 A 6 + chr1 46 G 6 + chr1 47 A 7 + chr2 1 T 6 + chr2 2 G 6 + chr2 3 C 7 + chr2 4 G 7 + chr2 5 G 7 + chr2 6 A 7 + chr2 73 T 5 + chr2 74 A 5 + chr2 75 T 5 + chr2 76 T 5 + chr2 77 C 6 C 5 + chr2 78 G 6 T 5 + chr2 79 T 7 + chr2 80 C 7 + chr2 81 G 7 + chr2 82 A 8 + chr2 83 T 8 + chr2 84 A 9 + + + </help> +</tool> + +