commit/galaxy-central: kanwei: Refactor Grouping tool and fix bug where data after a blank value in the group_by column returns the wrong data.
1 new changeset in galaxy-central: http://bitbucket.org/galaxy/galaxy-central/changeset/4d521da04318/ changeset: r5247:4d521da04318 user: kanwei date: 2011-03-22 22:49:09 summary: Refactor Grouping tool and fix bug where data after a blank value in the group_by column returns the wrong data. affected #: 2 files (5.1 KB) --- a/tools/stats/grouping.py Tue Mar 22 14:08:32 2011 -0400 +++ b/tools/stats/grouping.py Tue Mar 22 17:49:09 2011 -0400 @@ -1,10 +1,12 @@ #!/usr/bin/env python -#Guruprasad Ananda +# Guruprasad Ananda +# Refactored 2011, Kanwei Li """ This tool provides the SQL "group by" functionality. """ import sys, string, re, commands, tempfile, random from rpy import * +from itertools import groupby def stop_err(msg): sys.stderr.write(msg) @@ -53,10 +55,10 @@ stop_err( "Group column not specified." ) str_ops = ['c', 'length', 'unique', 'random', 'cuniq', 'Mode'] #ops that can handle string/non-numeric inputs - for k,col in enumerate(cols): + for k, col in enumerate(cols): col = int(col)-1 if ops[k] not in str_ops: - # We'll get here only if the user didn't choose 'Concatenate' or 'Count' or 'Count Distinct' or 'pick randmly', which are the + # We'll get here only if the user didn't choose 'Concatenate' or 'Count' or 'Count Distinct' or 'pick randomly', which are the # only aggregation functions that can be used on columns containing strings. try: float( elems[col] ) @@ -90,191 +92,94 @@ if error_code != 0: stop_err( "Sorting input dataset resulted in error: %s: %s" %( error_code, stdout )) - prev_item = "" - prev_vals = [] + prev_item = None skipped_lines = 0 - first_invalid_line = 0 - invalid_line = '' + first_invalid_line = None invalid_value = '' invalid_column = 0 fout = open(sys.argv[1], "w") - for ii, line in enumerate( file( tmpfile.name )): - if line and not line.startswith( '#' ): - line = line.rstrip( '\r\n' ) - try: - fields = line.split("\t") - item = fields[group_col] - if ignorecase == 1: - item = item.lower() - if prev_item != "": - # At this level, we're grouping on values (item and prev_item) in group_col - if item == prev_item: - # Keep iterating and storing values until a new value is encountered. - for i, col in enumerate(cols): - col = int(col)-1 - valid = True - # Before appending the current value, make sure it is numeric if the - # operation for the column requires it. - if ops[i] not in str_ops: - try: - float( fields[col].strip()) - except: - valid = False - skipped_lines += 1 - if not first_invalid_line: - first_invalid_line = ii+1 - invalid_value = fields[col] - invalid_column = col+1 - if valid: - prev_vals[i].append(fields[col].strip()) - else: - """ - When a new value is encountered, write the previous value and the - corresponding aggregate values into the output file. This works - due to the sort on group_col we've applied to the data above. - """ - out_str = prev_item - multiple_modes = False - mode_index = None - for i, op in enumerate( ops ): - if op == 'cuniq': - rfunc = "r.c" - else: - rfunc = "r." + op - if op not in str_ops: - for j, elem in enumerate( prev_vals[i] ): - prev_vals[i][j] = float( elem ) - rout = eval( rfunc )( prev_vals[i] ) - if rounds[i] == 'yes': - rout = int(round(float(rout))) - else: - rout = '%g' %(float(rout)) - else: - if op != 'random': - rout = eval( rfunc )( prev_vals[i] ) - else: - try: - rand_index = random.randint(0,len(prev_vals[i])-1) #if the two inputs to randint are equal, it seems to throw a ValueError. This can't be reproduced with the python interpreter in its interactive mode. - except Exception, ValueError: - rand_index = 0 - rout = prev_vals[i][rand_index] - - if op == 'Mode' and rout == '>1 mode': - multiple_modes = True - mode_index = i - if op == 'unique': - rfunc = "r.length" - rout = eval( rfunc )( rout ) - if op in ['c', 'cuniq']: - if op == 'c': - if type(rout) == type([]): - out_str += "\t" + ','.join(rout) - else: - out_str += "\t" + str(rout) - else: - if type(rout) == type([]): - out_str += "\t" + ','.join(list(set(rout))) - else: - out_str += "\t" + str(rout) - else: - out_str += "\t" + str(rout) - if multiple_modes and mode_index != None: - out_str_list = out_str.split('\t') - for val in prev_vals[mode_index]: - out_str = '\t'.join(out_str_list[:mode_index+1]) + '\t' + str(val) + '\t' + '\t'.join(out_str_list[mode_index+2:]) - print >>fout, out_str.rstrip('\t') - else: - print >>fout, out_str - - prev_item = item - prev_vals = [] - for col in cols: - col = int(col)-1 - val_list = [] - val_list.append(fields[col].strip()) - prev_vals.append(val_list) - else: - # This only occurs once, right at the start of the iteration. - prev_item = item - for col in cols: - col = int(col)-1 - val_list = [] - val_list.append(fields[col].strip()) - prev_vals.append(val_list) - - except Exception, exc: - skipped_lines += 1 - if not first_invalid_line: - first_invalid_line = ii+1 - else: - skipped_lines += 1 - if not first_invalid_line: - first_invalid_line = ii+1 - - # Handle the last grouped value - out_str = prev_item - multiple_modes = False - mode_index = None - for i, op in enumerate(ops): - if op == 'cuniq': - rfunc = "r.c" - else: - rfunc = "r." + op - try: + def is_new_item(line): + item = line.strip().split("\t")[group_col] + if ignorecase == 1: + item = item.lower() + return item + + for key, line_list in groupby(tmpfile, key=is_new_item): + op_vals = [ [] for op in cols ] + out_str = key + multiple_modes = False + mode_index = None + + for line in line_list: + fields = line.strip().split("\t") + for i, col in enumerate(cols): + col = int(col)-1 # cXX from galaxy is 1-based + val = fields[col].strip() + # Before appending the current value, make sure it is numeric if the + # operation for the column requires it. + if ops[i] not in str_ops: + try: + float(val) + except ValueError: + skipped_lines += 1 + if first_invalid_line is None: + first_invalid_line = i+1 + invalid_value = fields[col] + invalid_column = col+1 + break + + op_vals[i].append(val) + + for i, op in enumerate( ops ): + if op == 'cuniq': + rfunc = "r.c" + else: + rfunc = "r." + op if op not in str_ops: - for j, elem in enumerate( prev_vals[i] ): - prev_vals[i][j] = float( elem ) - rout = eval( rfunc )( prev_vals[i] ) + for j, elem in enumerate( op_vals[i] ): + op_vals[i][j] = float( elem ) + rout = eval( rfunc )( op_vals[i] ) if rounds[i] == 'yes': - rout = int(round(float(rout))) + rout = round(float(rout)) else: rout = '%g' %(float(rout)) else: if op != 'random': - rout = eval( rfunc )( prev_vals[i] ) + rout = eval( rfunc )( op_vals[i] ) else: try: - rand_index = random.randint(0,len(prev_vals[i])-1) #if the two inputs to randint are equal, it seems to throw a ValueError. This can't be reproduced with the python interpreter in its interactive mode. - except Exception, ValueError: + rand_index = random.randint(0,len(op_vals[i])-1) #if the two inputs to randint are equal, it seems to throw a ValueError. This can't be reproduced with the python interpreter in its interactive mode. + except: rand_index = 0 - rout = prev_vals[i][rand_index] + rout = op_vals[i][rand_index] if op == 'Mode' and rout == '>1 mode': multiple_modes = True - mode_index = i + mode_index = i if op == 'unique': rfunc = "r.length" - rout = eval( rfunc )( rout ) - if op in ['c','cuniq']: - if op == 'c': - if type(rout) == type([]): - out_str += "\t" + ','.join(rout) - else: - out_str += "\t" + str(rout) + rout = eval( rfunc )( rout ) + if op in ['c', 'cuniq']: + if isinstance(rout, list): + if op == 'cuniq': + rout = set(rout) + out_str += "\t" + ','.join(rout) else: - if type(rout) == type([]): - out_str += "\t" + ','.join(list(set(rout))) - else: - out_str += "\t" + str(rout) + out_str += "\t" + str(rout) else: - out_str += "\t" + str( rout ) - except: - skipped_lines += 1 - if not first_invalid_line: - first_invalid_line = ii+1 - - if multiple_modes and mode_index != None: - out_str_list = out_str.split('\t') - for val in prev_vals[mode_index]: - out_str = '\t'.join(out_str_list[:mode_index+1]) + '\t' + str(val) + '\t' + '\t'.join(out_str_list[mode_index+2:]) - print >>fout, out_str.rstrip('\t') - else: - print >>fout, out_str + out_str += "\t" + str(rout) + if multiple_modes and mode_index != None: + out_str_list = out_str.split('\t') + for val in op_vals[mode_index]: + out_str = '\t'.join(out_str_list[:mode_index+1]) + '\t' + str(val) + '\t' + '\t'.join(out_str_list[mode_index+2:]) + fout.write(out_str.rstrip('\t') + "\n") + else: + fout.write(out_str + "\n") # Generate a useful info message. msg = "--Group by c%d: " %(group_col+1) - for i,op in enumerate(ops): + for i, op in enumerate(ops): if op == 'c': op = 'concat' elif op == 'length': --- a/tools/stats/grouping.xml Tue Mar 22 14:08:32 2011 -0400 +++ b/tools/stats/grouping.xml Tue Mar 22 17:49:09 2011 -0400 @@ -1,4 +1,4 @@ -<tool id="Grouping1" name="Group" version="1.9.3"> +<tool id="Grouping1" name="Group" version="1.9.4"><description>data by a column and perform aggregate operation on other columns.</description><command interpreter="python"> grouping.py Repository URL: https://bitbucket.org/galaxy/galaxy-central/ -- This is a commit notification from bitbucket.org. You are receiving this because you have the service enabled, addressing the recipient of this email.
participants (1)
-
Bitbucket