[hg] galaxy 3630: Fixed a bug in the new joiner tool
details: http://www.bx.psu.edu/hg/galaxy/rev/fdf5c92e7fae changeset: 3630:fdf5c92e7fae user: Kelly Vincent <kpvincent@bx.psu.edu> date: Mon Apr 12 19:11:36 2010 -0400 description: Fixed a bug in the new joiner tool diffstat: tools/new_operations/column_join.py | 97 +++++++++++++++++++++++------------ tools/new_operations/column_join.xml | 8 +- 2 files changed, 68 insertions(+), 37 deletions(-) diffs (213 lines): diff -r 6b93e705c8a4 -r fdf5c92e7fae tools/new_operations/column_join.py --- a/tools/new_operations/column_join.py Mon Apr 12 17:24:17 2010 -0400 +++ b/tools/new_operations/column_join.py Mon Apr 12 19:11:36 2010 -0400 @@ -19,6 +19,32 @@ sys.stderr.write( msg ) sys.exit() +def hinge_compare( hinge1, hinge2 ): + """ + Compares items like 'chr10' and 'chrM' or 'scaffold2' and scaffold10' so that + first part handled as text but last part as number + """ + pat = re.compile( '(?P<text>\D*)(?P<number>\d+)?' ) + split_hinge1 = hinge1.split( '\t' ) + split_hinge2 = hinge2.split( '\t' ) + for i in range( len( split_hinge1 ) ): + if split_hinge1[ i ] == split_hinge2[ i ]: + continue + try: + if int( split_hinge1[ i ] ) > int( split_hinge2[ i ] ): + return 1 + else: + return -1 + except ValueError: + try: + if float( split_hinge1[ i ] ) > float( split_hinge2[ i ] ): + return 1 + else: + return -1 + except ValueError: + return ref_compare( split_hinge1[ i ], split_hinge2[ i ]) + return 0 + def ref_compare( ref1, ref2 ): """ Compares items like 'chr10' and 'chrM' or 'scaffold2' and scaffold10' so that @@ -52,35 +78,31 @@ elif text1 < text2: return -1 -def ref_pos_sort( infile, outfile ): +def hinge_sort( infile, outfile, hinge ): """Given input file name, sorts logically (text vs. 
numeric) into provided output file name.""" - ref_locs = {} + hinge_locs = {} bad_lines = [] fin = open( infile, 'rb' ) line = fin.readline() while line.strip(): - if True: + try: + hinge_parts = line.split( '\t' )[ :hinge ] try: - ref_seq, ref_loc = line.split( '\t' )[:2] - try: - ref_locs[ ref_seq ][ long( ref_loc ) ] = fin.tell() - len( line.strip() ) - 1 - except KeyError: - ref_locs[ ref_seq ] = { long( ref_loc ): fin.tell() - len( line.strip() ) - 1 } - except ValueError: - bad_lines.append( line ) - except ValueError: - bad_line.append( line ) + hinge_locs[ '\t'.join( hinge_parts ) ].append( fin.tell() - len( line.strip() ) - 1 ) + except KeyError: + hinge_locs[ '\t'.join( hinge_parts ) ] = [ fin.tell() - len( line.strip() ) - 1 ] + except ValueError: + bad_line.append( line ) line = fin.readline() fin.close() - refs = ref_locs.keys() - refs.sort( ref_compare ) fin = open( infile, 'rb' ) fout = open( outfile, 'wb' ) - for ref in refs: - locs = ref_locs[ ref ].keys() - locs.sort() + hinge_locs_sorted = hinge_locs.keys() + hinge_locs_sorted.sort( hinge_compare ) + for hinge_loc in hinge_locs_sorted: + locs = hinge_locs[ hinge_loc ] for loc in locs: - fin.seek( ref_locs[ ref ][ loc ] ) + fin.seek( loc ) fout.write( fin.readline() ) fout.close() fin.close() @@ -90,14 +112,17 @@ if len( chr_pos ) == 0 and ''.join( chr_pos ): return '' min_loc = len( chr_pos ) - min_ref_pos = [] + min_hinge = [] loc = 0 for c_pos in chr_pos: if c_pos.strip(): + split_c = c_pos.split( '\t' ) + + ref, pos = c_pos.split( '\t' )[:2] pos = int( pos ) - if not min_ref_pos: - min_ref_pos = [ ref, pos ] + if not min_hinge: + min_hinge = split_c min_loc = loc else: ref_comp = ref_compare( ref, min_ref_pos[0] ) @@ -126,25 +151,33 @@ tmp_file = tempfile.NamedTemporaryFile() tmp_file_name = tmp_file.name tmp_file.close() - ref_pos_sort( in_file, tmp_file_name ) + hinge_sort( in_file, tmp_file_name, hinge ) tmp_file = open( tmp_file_name, 'rb' ) tmp_input_files.append( tmp_file ) # cycle 
through files, getting smallest line of all files one at a time # also have to keep track of vertical position of extra columns fout = file( output, 'w' ) - old_current_chr_pos = '' + old_current = '' first_line = True current_lines = [ f.readline() for f in tmp_input_files ] last_lines = ''.join( current_lines ).strip() last_loc = -1 + i = 0 while last_lines: # get the "minimum" hinge, which should come first, and the file location in list - current_chr_pos, loc = min_chr_pos( [ '\t'.join( line.split( '\t' )[ :hinge ] ) for line in current_lines ] ) + hinges = [ '\t'.join( line.split( '\t' )[ :hinge ] ) for line in current_lines ] + hinge_dict = {} + for i in range( len( hinges ) ): + if not hinge_dict.has_key( hinges[ i ] ): + hinge_dict[ hinges[ i ] ] = i + hinges.sort( hinge_compare ) + hinges = [ h for h in hinges if h ] + current, loc = hinges[0], hinge_dict[ hinges[0] ] # first output empty columns for vertical alignment (account for "missing" files) - if current_chr_pos != old_current_chr_pos: + if current != old_current: last_loc = -1 if loc - last_loc > 1: - current_data = [ '' for col in range( ( loc - last_loc - 1 ) * len( cols[ hinge: ] ) ) ] + current_data = [ '' for col in range( ( loc - last_loc - 1 ) * len( [ col for col in cols if col > hinge ] ) ) ] else: current_data = [] # now output actual data @@ -154,22 +187,20 @@ if col > hinge: current_data.append( split_line[ col - 1 ] ) current_lines[ loc ] = tmp_input_files[ loc ].readline() - if current_chr_pos == old_current_chr_pos: - fout.write( '\t%s' % '\t'.join( current_data ) ) + if current == old_current: + if current_data: + fout.write( '\t%s' % '\t'.join( current_data ) ) else: if not first_line: fout.write( '\n' ) - fout.write( '%s\t%s' % ( current_chr_pos, '\t'.join( current_data ) ) ) + fout.write( '%s\t%s' % ( current, '\t'.join( current_data ) ) ) first_line = False - old_current_chr_pos = current_chr_pos - if last_lines == ''.join( current_lines ).strip(): - break + old_current = 
current last_lines = ''.join( current_lines ).strip() last_loc = loc fout.write( '\n' ) for f in tmp_input_files: os.unlink( f.name ) fout.close() -# sys.stderr.write('******************\n'+file(fout.name, 'r').read()+'\n******************\n') if __name__ == "__main__" : __main__() diff -r 6b93e705c8a4 -r fdf5c92e7fae tools/new_operations/column_join.xml --- a/tools/new_operations/column_join.xml Mon Apr 12 17:24:17 2010 -0400 +++ b/tools/new_operations/column_join.xml Mon Apr 12 19:11:36 2010 -0400 @@ -21,7 +21,7 @@ </repeat> </inputs> <outputs> - <data name="output" format="pileup" /> + <data name="output" format="tabular" /> </outputs> <tests> <test> @@ -30,7 +30,7 @@ <param name="hinge" value="1,2" /> <param name="input2" value="column_join_in2.pileup" ftype="pileup" /> <param name="input" value="column_join_in3.pileup" ftype="pileup" /> - <output name="output" file="column_join_out1.pileup" ftype="pileup" /> + <output name="output" file="column_join_out1.pileup" ftype="tabular" /> </test> <test> <param name="input1" value="column_join_in4.pileup" ftype="pileup" /> @@ -38,7 +38,7 @@ <param name="hinge" value="1,2" /> <param name="input2" value="column_join_in5.pileup" ftype="pileup" /> <param name="input" value="column_join_in6.pileup" ftype="pileup" /> - <output name="output" file="column_join_out2.pileup" ftype="pileup" /> + <output name="output" file="column_join_out2.pileup" ftype="tabular" /> </test> <test> <param name="input1" value="column_join_in7.pileup" ftype="pileup" /> @@ -46,7 +46,7 @@ <param name="hinge" value="1,2" /> <param name="input2" value="column_join_in8.pileup" ftype="pileup" /> <param name="input" value="column_join_in9.pileup" ftype="pileup" /> - <output name="output" file="column_join_out3.pileup" ftype="pileup" /> + <output name="output" file="column_join_out3.pileup" ftype="tabular" /> </test> </tests> <help>
participants (1)
- Greg Von Kuster