commit/galaxy-central: kanwei: Grouping tool:

1 new changeset in galaxy-central: http://bitbucket.org/galaxy/galaxy-central/changeset/7ee052a1f4f2/ changeset: 7ee052a1f4f2 branches: user: kanwei date: 2011-06-03 22:08:26 summary: Grouping tool: - Converted to use numpy, which is included with Galaxy, instead of rpy, which is not. - Don't simply skip lines if a numeric parameter encounters non-numeric data, return descriptive error with line contents instead - General refactoring - Added new comprehensive test, disabled until test framework supports multiple repeat inputs affected #: 3 files (3.3 KB) --- a/tools/stats/grouping.py Fri Jun 03 15:36:59 2011 -0400 +++ b/tools/stats/grouping.py Fri Jun 03 16:08:26 2011 -0400 @@ -1,73 +1,58 @@ #!/usr/bin/env python # Guruprasad Ananda # Refactored 2011, Kanwei Li +# Refactored to use numpy instead of rpy """ This tool provides the SQL "group by" functionality. """ -import sys, string, re, commands, tempfile, random -from rpy import * +import sys, commands, tempfile, random +import pkg_resources +pkg_resources.require( "numpy" ) +import numpy + from itertools import groupby def stop_err(msg): sys.stderr.write(msg) sys.exit() +def mode(data): + counts = {} + for x in data: + counts[x] = counts.get(x,0) + 1 + maxcount = max(counts.values()) + modelist = [] + for x in counts: + if counts[x] == maxcount: + modelist.append( str(x) ) + return ','.join(modelist) + def main(): inputfile = sys.argv[2] ignorecase = int(sys.argv[4]) ops = [] cols = [] - rounds = [] - elems = [] + round_val = [] + data_ary = [] for var in sys.argv[5:]: - ops.append(var.split()[0]) - cols.append(var.split()[1]) - rounds.append(var.split()[2]) - - if 'Mode' in ops: - try: - r.library('prettyR') - except: - stop_err('R package prettyR could not be loaded. 
Please make sure it is installed.') - + op, col, do_round = var.split() + ops.append(op) + cols.append(col) + round_val.append(do_round) """ At this point, ops, cols and rounds will look something like this: ops: ['mean', 'min', 'c'] cols: ['1', '3', '4'] - rounds: ['no', 'yes' 'no'] + round_val: ['no', 'yes' 'no'] """ - - for i, line in enumerate( file ( inputfile )): - line = line.rstrip('\r\n') - if len( line )>0 and not line.startswith( '#' ): - elems = line.split( '\t' ) - break - if i == 30: - break # Hopefully we'll never get here... - - if len( elems )<1: - stop_err( "The data in your input dataset is either missing or not formatted properly." ) - + try: group_col = int( sys.argv[3] )-1 except: stop_err( "Group column not specified." ) str_ops = ['c', 'length', 'unique', 'random', 'cuniq', 'Mode'] #ops that can handle string/non-numeric inputs - for k, col in enumerate(cols): - col = int(col)-1 - if ops[k] not in str_ops: - # We'll get here only if the user didn't choose 'Concatenate' or 'Count' or 'Count Distinct' or 'pick randomly', which are the - # only aggregation functions that can be used on columns containing strings. - try: - float( elems[col] ) - except: - try: - msg = "Operation '%s' cannot be performed on non-numeric column %d containing value '%s'." %( ops[k], col+1, elems[col] ) - except: - msg = "Operation '%s' cannot be performed on non-numeric data." 
%ops[k] - stop_err( msg ) tmpfile = tempfile.NamedTemporaryFile() @@ -83,7 +68,7 @@ case = '' if ignorecase == 1: case = '-f' - command_line = "sort -t ' ' " + case + " -k" + str(group_col+1) +"," + str(group_col+1) + " -o " + tmpfile.name + " " + inputfile + command_line = "sort -t ' ' %s -k%s,%s -o %s %s" % (case, group_col+1, group_col+1, tmpfile.name, inputfile) except Exception, exc: stop_err( 'Initialization error -> %s' %str(exc) ) @@ -92,21 +77,16 @@ if error_code != 0: stop_err( "Sorting input dataset resulted in error: %s: %s" %( error_code, stdout )) - prev_item = None - skipped_lines = 0 - first_invalid_line = None - invalid_value = '' - invalid_column = 0 fout = open(sys.argv[1], "w") def is_new_item(line): item = line.strip().split("\t")[group_col] if ignorecase == 1: - item = item.lower() + return item.lower() return item for key, line_list in groupby(tmpfile, key=is_new_item): - op_vals = [ [] for op in cols ] + op_vals = [ [] for op in ops ] out_str = key multiple_modes = False mode_index = None @@ -115,86 +95,65 @@ fields = line.strip().split("\t") for i, col in enumerate(cols): col = int(col)-1 # cXX from galaxy is 1-based - val = fields[col].strip() - # Before appending the current value, make sure it is numeric if the - # operation for the column requires it. - if ops[i] not in str_ops: - try: - float(val) - except ValueError: - skipped_lines += 1 - if first_invalid_line is None: - first_invalid_line = i+1 - invalid_value = fields[col] - invalid_column = col+1 - break + try: + val = fields[col].strip() + op_vals[i].append(val) + except IndexError: + sys.stderr.write( 'Could not access the value for column %s on line: "%s". 
Make sure file is tab-delimited.\n' % (col+1, line) ) + sys.exit( 1 ) - op_vals[i].append(val) + # Generate string for each op for this group + for i, op in enumerate( ops ): + data = op_vals[i] + rval = "" + if op == "mode": + rval = mode( data ) + elif op == "length": + rval = len( data ) + elif op == "random": + rval = random.choice(data) + elif op in ['cat', 'cat_uniq']: + if op == 'cat_uniq': + data = numpy.unique(data) + rval = ','.join(data) + elif op == "unique": + rval = len( numpy.unique(data) ) + else: + # some kind of numpy fn + try: + data = map(float, data) + except ValueError: + sys.stderr.write( "Operation %s expected number values but got %s instead.\n" % (op, data) ) + sys.exit( 1 ) + rval = getattr(numpy, op)( data ) + if round_val[i] == 'yes': + rval = round(rval) + else: + rval = '%g' % rval + + out_str += "\t%s" % rval - for i, op in enumerate( ops ): - if op == 'cuniq': - rfunc = "r.c" - else: - rfunc = "r." + op - if op not in str_ops: - for j, elem in enumerate( op_vals[i] ): - op_vals[i][j] = float( elem ) - rout = eval( rfunc )( op_vals[i] ) - if rounds[i] == 'yes': - rout = round(float(rout)) - else: - rout = '%g' %(float(rout)) - else: - if op != 'random': - rout = eval( rfunc )( op_vals[i] ) - else: - try: - rand_index = random.randint(0,len(op_vals[i])-1) #if the two inputs to randint are equal, it seems to throw a ValueError. This can't be reproduced with the python interpreter in its interactive mode. 
- except: - rand_index = 0 - rout = op_vals[i][rand_index] - - if op == 'Mode' and rout == '>1 mode': - multiple_modes = True - mode_index = i - if op == 'unique': - rfunc = "r.length" - rout = eval( rfunc )( rout ) - if op in ['c', 'cuniq']: - if isinstance(rout, list): - if op == 'cuniq': - rout = set(rout) - out_str += "\t" + ','.join(rout) - else: - out_str += "\t" + str(rout) - else: - out_str += "\t" + str(rout) - if multiple_modes and mode_index != None: - out_str_list = out_str.split('\t') - for val in op_vals[mode_index]: - out_str = '\t'.join(out_str_list[:mode_index+1]) + '\t' + str(val) + '\t' + '\t'.join(out_str_list[mode_index+2:]) - fout.write(out_str.rstrip('\t') + "\n") - else: - fout.write(out_str + "\n") + fout.write(out_str + "\n") # Generate a useful info message. msg = "--Group by c%d: " %(group_col+1) for i, op in enumerate(ops): - if op == 'c': + if op == 'cat': op = 'concat' + elif op == 'cat_uniq': + op = 'concat_distinct' elif op == 'length': op = 'count' elif op == 'unique': op = 'count_distinct' elif op == 'random': op = 'randomly_pick' - elif op == 'cuniq': - op = 'concat_distinct' + msg += op + "[c" + cols[i] + "] " - if skipped_lines > 0: - msg+= "--skipped %d invalid lines starting with line %d. Value '%s' in column %d is not numeric." 
% ( skipped_lines, first_invalid_line, invalid_value, invalid_column ) print msg + fout.close() + tmpfile.close() if __name__ == "__main__": main() --- a/tools/stats/grouping.xml Fri Jun 03 15:36:59 2011 -0400 +++ b/tools/stats/grouping.xml Fri Jun 03 16:08:26 2011 -0400 @@ -1,4 +1,4 @@ -<tool id="Grouping1" name="Group" version="1.9.4"> +<tool id="Grouping1" name="Group" version="2.0.0"><description>data by a column and perform aggregate operation on other columns.</description><command interpreter="python"> grouping.py @@ -22,16 +22,16 @@ <param name="optype" type="select" label="Type"><option value="mean">Mean</option><option value="median">Median</option> - <option value="Mode">Mode</option> + <option value="mode">Mode</option><option value="max">Maximum</option><option value="min">Minimum</option><option value="sum">Sum</option><option value="length">Count</option><option value="unique">Count Distinct</option> - <option value="c">Concatenate</option> - <option value="cuniq">Concatenate Distinct</option> + <option value="cat">Concatenate</option> + <option value="cat_uniq">Concatenate Distinct</option><option value="random">Randomly pick</option> - <option value="sd">Standard deviation</option> + <option value="std">Standard deviation</option></param><param name="opcol" label="On column" type="data_column" data_ref="input1" /><param name="opround" type="select" label="Round result to nearest integer?"> @@ -44,7 +44,7 @@ <data format="tabular" name="out_file1" /></outputs><requirements> - <requirement type="python-module">rpy</requirement> + <requirement type="python-module">numpy</requirement></requirements><tests><!-- Test valid data --> @@ -57,8 +57,16 @@ <param name="opround" value="no"/><output name="out_file1" file="groupby_out1.dat"/></test> - - <!-- Test data with an invalid value in a column --> + <!-- Long case but test framework doesn't allow yet + <test> + <param name="input1" value="1.bed"/> + <param name="groupcol" value="1"/> + <param 
name="ignorecase" value="false"/> + <param name="operations" value='[{"opcol": "2", "__index__": 0, "optype": "mean", "opround": "no"}, {"opcol": "2", "__index__": 1, "optype": "median", "opround": "no"}, {"opcol": "6", "__index__": 2, "optype": "mode", "opround": "no"}, {"opcol": "2", "__index__": 3, "optype": "max", "opround": "no"}, {"opcol": "2", "__index__": 4, "optype": "min", "opround": "no"}, {"opcol": "2", "__index__": 5, "optype": "sum", "opround": "no"}, {"opcol": "1", "__index__": 6, "optype": "length", "opround": "no"}, {"opcol": "1", "__index__": 7, "optype": "unique", "opround": "no"}, {"opcol": "1", "__index__": 8, "optype": "cat", "opround": "no"}, {"opcol": "6", "__index__": 9, "optype": "cat_uniq", "opround": "no"}, {"opcol": "2", "__index__": 10, "optype": "random", "opround": "no"}, {"opcol": "2", "__index__": 11, "optype": "std", "opround": "no"}]'/> + <output name="out_file1" file="groupby_out3.tabular"/> + </test> + --> + <!-- Test data with an invalid value in a column. Can't do it because test framework doesn't allow testing of errors <test><param name="input1" value="1.tabular"/><param name="groupcol" value="1"/> @@ -68,6 +76,7 @@ <param name="opround" value="no"/><output name="out_file1" file="groupby_out2.dat"/></test> + --></tests><help> @@ -79,9 +88,7 @@ **Syntax** -This tool allows you to group the input dataset by a particular column and perform aggregate functions like Mean, Median, Mode, Sum, Max, Min, Count, Random draw and Concatenate on other columns. - -- All invalid, blank and comment lines are skipped when performing the aggregate functions. The number of skipped lines is displayed in the resulting history item. +This tool allows you to group the input dataset by a particular column and perform aggregate functions: Mean, Median, Mode, Sum, Max, Min, Count, Randomly pick, and Concatenate on any column. - If multiple modes are present, all are reported. 
Repository URL: https://bitbucket.org/galaxy/galaxy-central/

--

This is a commit notification from bitbucket.org. You are receiving this because you have the service enabled, addressing the recipient of this email.
participants (1)
- Bitbucket