[hg] galaxy 1688: Added a new option to the grouping tool to facilitate picking of random entries from columns not chosen for either grouping or aggregate operations.
details: http://www.bx.psu.edu/hg/galaxy/rev/e0162b7bf0ba changeset: 1688:e0162b7bf0ba user: guru date: Tue Jan 06 11:11:24 2009 -0500 description: Added a new option to the grouping tool to facilitate picking of random entries from columns not chosen for either grouping or aggregate operations. 2 file(s) affected in this change: tools/stats/grouping.py tools/stats/grouping.xml diffs (230 lines): diff -r 091573fd6e27 -r e0162b7bf0ba tools/stats/grouping.py --- a/tools/stats/grouping.py Mon Jan 05 15:26:17 2009 -0500 +++ b/tools/stats/grouping.py Tue Jan 06 11:11:24 2009 -0500 @@ -12,13 +12,15 @@ def main(): inputfile = sys.argv[2] + in_columns = int( sys.argv[5] ) + show_remaining_cols = sys.argv[4] ops = [] cols = [] rounds = [] elems = [] - for var in sys.argv[4:]: + for var in sys.argv[6:]: ops.append(var.split()[0]) cols.append(var.split()[1]) rounds.append(var.split()[2]) @@ -79,9 +81,19 @@ if error_code != 0: stop_err( "Sorting input dataset resulted in error: %s: %s" %( error_code, stdout )) - + + if show_remaining_cols == 'yes': + show_cols_list = [1]*in_columns + show_cols_list[group_col] = 0 + for c in cols: + c = int(c)-1 + show_cols_list[c] = 0 + #at the end of this, only the indices of the remaining columns will be set to 1 + remaining_cols = [j for j,k in enumerate(show_cols_list) if k==1] #this is the list of remaining column indices + prev_item = "" prev_vals = [] + remaining_vals = [] skipped_lines = 0 first_invalid_line = 0 invalid_line = '' @@ -116,22 +128,30 @@ invalid_column = col+1 if valid: prev_vals[i].append(fields[col].strip()) + #Store values from all the remaning columns + if show_remaining_cols == 'yes': + for j, index in enumerate(remaining_cols): + remaining_vals[j].append(fields[index].strip()) else: """ When a new value is encountered, write the previous value and the corresponding aggregate values into the output file. This works due to the sort on group_col we've applied to the data above. 
""" - out_str = prev_item - + out_list = ['']*in_columns + out_list[group_col] = str(prev_item) + for i, op in enumerate( ops ): rfunc = "r." + op if op not in ['c','length','unique','random']: for j, elem in enumerate( prev_vals[i] ): prev_vals[i][j] = float( elem ) - rout = "%g" %( eval( rfunc )( prev_vals[i] )) if rounds[i] == 'yes': + rout = "%f" %( eval( rfunc )( prev_vals[i] )) rout = int(round(float(rout))) + else: + rout = "%g" %( eval( rfunc )( prev_vals[i] )) + else: if op != 'random': rout = eval( rfunc )( prev_vals[i] ) @@ -142,9 +162,21 @@ if op == 'unique': rfunc = "r.length" rout = eval( rfunc )( rout ) - out_str += "\t" + str(rout) - - print >>fout, out_str + + out_list[int(cols[i])-1] = str(rout) + + if show_remaining_cols == 'yes': + for index,el in enumerate(remaining_cols): + if index == 0: + try: + random_index = random.randint(0,len(remaining_vals[index])-1) + except: + random_index = 0 + #pick a random value from each of the remaning columns + rand_out = remaining_vals[index][random_index] + out_list[el] = str(rand_out) + + print >>fout, '\t'.join([elem for elem in out_list if elem != '']) prev_item = item prev_vals = [] @@ -153,6 +185,14 @@ val_list = [] val_list.append(fields[col].strip()) prev_vals.append(val_list) + + if show_remaining_cols == 'yes': + remaining_vals = [] + for index in remaining_cols: + remaining_val_list = [] + remaining_val_list.append(fields[index].strip()) + remaining_vals.append(remaining_val_list) + else: # This only occurs once, right at the start of the iteration. 
prev_item = item @@ -161,8 +201,15 @@ val_list = [] val_list.append(fields[col].strip()) prev_vals.append(val_list) + + if show_remaining_cols == 'yes': + remaining_vals = [] + for index in remaining_cols: + remaining_val_list = [] + remaining_val_list.append(fields[index].strip()) + remaining_vals.append(remaining_val_list) - except Exception, exc: + except Exception: skipped_lines += 1 if not first_invalid_line: first_invalid_line = ii+1 @@ -172,7 +219,8 @@ first_invalid_line = ii+1 # Handle the last grouped value - out_str = prev_item + out_list = ['']*in_columns + out_list[group_col] = str(prev_item) for i, op in enumerate(ops): rfunc = "r." + op @@ -180,9 +228,11 @@ if op not in ['c','length','unique','random']: for j, elem in enumerate( prev_vals[i] ): prev_vals[i][j] = float( elem ) - rout = '%g' %( eval( rfunc )( prev_vals[i] )) if rounds[i] == 'yes': + rout = '%f' %( eval( rfunc )( prev_vals[i] )) rout = int(round(float(rout))) + else: + rout = '%g' %( eval( rfunc )( prev_vals[i] )) else: if op != 'random': rout = eval( rfunc )( prev_vals[i] ) @@ -193,13 +243,22 @@ if op == 'unique': rfunc = "r.length" rout = eval( rfunc )( rout ) - out_str += "\t" + str( rout ) + out_list[int(cols[i])-1] = str(rout) except: skipped_lines += 1 if not first_invalid_line: first_invalid_line = ii+1 + if show_remaining_cols == 'yes': + for index,el in enumerate(remaining_cols): + if index == 0: + try: + random_index = random.randint(0,len(remaining_vals[index])-1) + except: + random_index = 0 + rand_out = remaining_vals[index][random_index] + out_list[el] = str(rand_out) - print >>fout, out_str + print >>fout, '\t'.join([elem for elem in out_list if elem != '']) # Generate a useful info message. 
msg = "--Group by c%d: " %(group_col+1) diff -r 091573fd6e27 -r e0162b7bf0ba tools/stats/grouping.xml --- a/tools/stats/grouping.xml Mon Jan 05 15:26:17 2009 -0500 +++ b/tools/stats/grouping.xml Tue Jan 06 11:11:24 2009 -0500 @@ -1,10 +1,12 @@ -<tool id="Grouping1" name="Group" version="1.4.0"> +<tool id="Grouping1" name="Group" version="1.5.0"> <description>data by a column and perform aggregate operation on other columns.</description> <command interpreter="python"> grouping.py $out_file1 $input1 - $groupcol + $groupcol + $othercols + ${input1.metadata.columns} #for $op in $operations '${op.optype} ${op.opcol} @@ -30,7 +32,11 @@ <option value="no">NO</option> <option value="yes">YES</option> </param> - </repeat> + </repeat> + <param name="othercols" type="select" label="Randomly pick an entry from each of the remaining columns (besides the columns chosen for group and aggregate operations above) ?"> + <option value="no">NO</option> + <option value="yes">YES</option> + </param> </inputs> <outputs> <data format="input" name="out_file1" metadata_source="input1" /> @@ -46,9 +52,9 @@ <param name="optype" value="mean"/> <param name="opcol" value="2"/> <param name="opround" value="no"/> + <param name="othercols" value="no"/> <output name="out_file1" file="groupby_out1.dat"/> </test> - <!-- Test data with an invalid value in a column --> <test> <param name="input1" value="1.tabular"/> @@ -56,6 +62,7 @@ <param name="optype" value="mean"/> <param name="opcol" value="2"/> <param name="opround" value="no"/> + <param name="othercols" value="no"/> <output name="out_file1" file="groupby_out2.dat"/> </test> </tests>
participants (1)
-
Greg Von Kuster