details: http://www.bx.psu.edu/hg/galaxy/rev/2a36ccdb2a38 changeset: 1711:2a36ccdb2a38 user: guru date: Fri Jan 16 13:30:01 2009 -0500 description: Removed an option from grouping tool, which is now part of the LCA tool under Taxonomy. 2 file(s) affected in this change: tools/stats/grouping.py tools/stats/grouping.xml diffs (354 lines): diff -r c565de071f7f -r 2a36ccdb2a38 tools/stats/grouping.py --- a/tools/stats/grouping.py Thu Jan 15 14:23:53 2009 -0500 +++ b/tools/stats/grouping.py Fri Jan 16 13:30:01 2009 -0500 @@ -12,15 +12,13 @@ def main(): inputfile = sys.argv[2] - in_columns = int( sys.argv[5] ) - show_remaining_cols = sys.argv[4] ops = [] cols = [] rounds = [] elems = [] - for var in sys.argv[6:]: + for var in sys.argv[4:]: ops.append(var.split()[0]) cols.append(var.split()[1]) rounds.append(var.split()[2]) @@ -81,19 +79,9 @@ if error_code != 0: stop_err( "Sorting input dataset resulted in error: %s: %s" %( error_code, stdout )) - - if show_remaining_cols == 'yes': - show_cols_list = [1]*in_columns - show_cols_list[group_col] = 0 - for c in cols: - c = int(c)-1 - show_cols_list[c] = 0 - #at the end of this, only the indices of the remaining columns will be set to 1 - remaining_cols = [j for j,k in enumerate(show_cols_list) if k==1] #this is the list of remaining column indices - + prev_item = "" prev_vals = [] - remaining_vals = [] skipped_lines = 0 first_invalid_line = 0 invalid_line = '' @@ -128,30 +116,22 @@ invalid_column = col+1 if valid: prev_vals[i].append(fields[col].strip()) - #Store values from all the remaning columns - if show_remaining_cols == 'yes': - for j, index in enumerate(remaining_cols): - remaining_vals[j].append(fields[index].strip()) else: """ When a new value is encountered, write the previous value and the corresponding aggregate values into the output file. This works due to the sort on group_col we've applied to the data above. """ - out_list = ['']*in_columns - out_list[group_col] = str(prev_item) - + out_str = prev_item + for i, op in enumerate( ops ): rfunc = "r." + op if op not in ['c','length','unique','random']: for j, elem in enumerate( prev_vals[i] ): prev_vals[i][j] = float( elem ) + rout = "%g" %( eval( rfunc )( prev_vals[i] )) if rounds[i] == 'yes': - rout = "%f" %( eval( rfunc )( prev_vals[i] )) rout = int(round(float(rout))) - else: - rout = "%g" %( eval( rfunc )( prev_vals[i] )) - else: if op != 'random': rout = eval( rfunc )( prev_vals[i] ) @@ -162,21 +142,9 @@ if op == 'unique': rfunc = "r.length" rout = eval( rfunc )( rout ) - - out_list[int(cols[i])-1] = str(rout) - - if show_remaining_cols == 'yes': - for index,el in enumerate(remaining_cols): - if index == 0: - try: - random_index = random.randint(0,len(remaining_vals[index])-1) - except: - random_index = 0 - #pick a random value from each of the remaning columns - rand_out = remaining_vals[index][random_index] - out_list[el] = str(rand_out) - - print >>fout, '\t'.join([elem for elem in out_list if elem != '']) + out_str += "\t" + str(rout) + + print >>fout, out_str prev_item = item prev_vals = [] @@ -185,14 +153,6 @@ val_list = [] val_list.append(fields[col].strip()) prev_vals.append(val_list) - - if show_remaining_cols == 'yes': - remaining_vals = [] - for index in remaining_cols: - remaining_val_list = [] - remaining_val_list.append(fields[index].strip()) - remaining_vals.append(remaining_val_list) - else: # This only occurs once, right at the start of the iteration. prev_item = item @@ -201,15 +161,8 @@ val_list = [] val_list.append(fields[col].strip()) prev_vals.append(val_list) - - if show_remaining_cols == 'yes': - remaining_vals = [] - for index in remaining_cols: - remaining_val_list = [] - remaining_val_list.append(fields[index].strip()) - remaining_vals.append(remaining_val_list) - except Exception: + except Exception, exc: skipped_lines += 1 if not first_invalid_line: first_invalid_line = ii+1 @@ -219,8 +172,7 @@ first_invalid_line = ii+1 # Handle the last grouped value - out_list = ['']*in_columns - out_list[group_col] = str(prev_item) + out_str = prev_item for i, op in enumerate(ops): rfunc = "r." + op @@ -228,11 +180,9 @@ if op not in ['c','length','unique','random']: for j, elem in enumerate( prev_vals[i] ): prev_vals[i][j] = float( elem ) + rout = '%g' %( eval( rfunc )( prev_vals[i] )) if rounds[i] == 'yes': - rout = '%f' %( eval( rfunc )( prev_vals[i] )) rout = int(round(float(rout))) - else: - rout = '%g' %( eval( rfunc )( prev_vals[i] )) else: if op != 'random': rout = eval( rfunc )( prev_vals[i] ) @@ -243,22 +193,13 @@ if op == 'unique': rfunc = "r.length" rout = eval( rfunc )( rout ) - out_list[int(cols[i])-1] = str(rout) + out_str += "\t" + str( rout ) except: skipped_lines += 1 if not first_invalid_line: first_invalid_line = ii+1 - if show_remaining_cols == 'yes': - for index,el in enumerate(remaining_cols): - if index == 0: - try: - random_index = random.randint(0,len(remaining_vals[index])-1) - except: - random_index = 0 - rand_out = remaining_vals[index][random_index] - out_list[el] = str(rand_out) - print >>fout, '\t'.join([elem for elem in out_list if elem != '']) + print >>fout, out_str # Generate a useful info message. msg = "--Group by c%d: " %(group_col+1) diff -r c565de071f7f -r 2a36ccdb2a38 tools/stats/grouping.xml --- a/tools/stats/grouping.xml Thu Jan 15 14:23:53 2009 -0500 +++ b/tools/stats/grouping.xml Fri Jan 16 13:30:01 2009 -0500 @@ -1,49 +1,43 @@ -<tool id="Grouping1" name="Group" version="1.5.0"> - <description>data by a column and perform aggregate operation on other columns.</description> - <command interpreter="python"> - grouping.py - $out_file1 - $input1 +<tool id="Grouping1" name="Group" version="1.4.0"> + <description>data by a column and perform aggregate operation on other columns.</description> + <command interpreter="python"> + grouping.py + $out_file1 + $input1 $groupcol - $othercols - ${input1.metadata.columns} - #for $op in $operations - '${op.optype} - ${op.opcol} - ${op.opround}' - #end for - </command> - <inputs> - <param format="tabular" name="input1" type="data" label="Select data" help="Query missing? See TIP below."/> - <param name="groupcol" label="Group by column" type="data_column" data_ref="input1" /> - <repeat name="operations" title="Operation"> - <param name="optype" type="select" label="Type"> - <option value="mean">Mean</option> - <option value="max">Maximum</option> - <option value="min">Minimum</option> - <option value="sum">Sum</option> - <option value="length">Count</option> - <option value="unique">Count Distinct</option> + #for $op in $operations + '${op.optype} + ${op.opcol} + ${op.opround}' + #end for + </command> + <inputs> + <param format="tabular" name="input1" type="data" label="Select data" help="Query missing? See TIP below."/> + <param name="groupcol" label="Group by column" type="data_column" data_ref="input1" /> + <repeat name="operations" title="Operation"> + <param name="optype" type="select" label="Type"> + <option value="mean">Mean</option> + <option value="max">Maximum</option> + <option value="min">Minimum</option> + <option value="sum">Sum</option> + <option value="length">Count</option> + <option value="unique">Count Distinct</option> <option value="c">Concatenate</option> - <option value="random">Randomly pick</option> - </param> - <param name="opcol" label="On column" type="data_column" data_ref="input1" /> - <param name="opround" type="select" label="Round result to nearest integer?"> - <option value="no">NO</option> - <option value="yes">YES</option> - </param> - </repeat> - <param name="othercols" type="select" label="Randomly pick an entry from each of the remaining columns (besides the columns chosen for group and aggregate operations above) ?"> + <option value="random">Randomly pick</option> + </param> + <param name="opcol" label="On column" type="data_column" data_ref="input1" /> + <param name="opround" type="select" label="Round result to nearest integer?"> <option value="no">NO</option> <option value="yes">YES</option> - </param> - </inputs> - <outputs> - <data format="input" name="out_file1" metadata_source="input1" /> - </outputs> - <requirements> - <requirement type="python-module">rpy</requirement> - </requirements> + </param> + </repeat> + </inputs> + <outputs> + <data format="input" name="out_file1" metadata_source="input1" /> + </outputs> + <requirements> + <requirement type="python-module">rpy</requirement> + </requirements> <tests> <!-- Test valid data --> <test> @@ -52,9 +46,9 @@ <param name="optype" value="mean"/> <param name="opcol" value="2"/> <param name="opround" value="no"/> - <param name="othercols" value="no"/> <output name="out_file1" file="groupby_out1.dat"/> </test> + <!-- Test data with an invalid value in a column --> <test> <param name="input1" value="1.tabular"/> @@ -62,39 +56,38 @@ <param name="optype" value="mean"/> <param name="opcol" value="2"/> <param name="opround" value="no"/> - <param name="othercols" value="no"/> <output name="out_file1" file="groupby_out2.dat"/> - </test> - </tests> - <help> - -.. class:: infomark - -**TIP:** If your data is not TAB delimited, use *Text Manipulation->Convert* - ------ - -**Syntax** - -This tool allows you to group the input dataset by a particular column and perform aggregate functions like Mean, Sum, Max, Min and Concatenate on other columns. - -- All invalid, blank and comment lines are skipped when performing the aggregate functions. The number of skipped lines is displayed in the resulting history item. - ------ - -**Example** - -- For the following input:: - - chr22 1000 NM_17 - chr22 2000 NM_18 - chr10 2200 NM_10 - chr10 1200 NM_11 - chr22 1600 NM_19 - -- running this tool with **Group by column 1**, Operations **Mean on column 2** and **Concatenate on column 3** will return:: - - chr10 1700.00 ['NM_11', 'NM_10'] - chr22 1533.33 ['NM_17', 'NM_19', 'NM_18'] - </help> -</tool> + </test> + </tests> + <help> + +.. class:: infomark + +**TIP:** If your data is not TAB delimited, use *Text Manipulation->Convert* + +----- + +**Syntax** + +This tool allows you to group the input dataset by a particular column and perform aggregate functions like Mean, Sum, Max, Min and Concatenate on other columns. + +- All invalid, blank and comment lines are skipped when performing the aggregate functions. The number of skipped lines is displayed in the resulting history item. + +----- + +**Example** + +- For the following input:: + + chr22 1000 NM_17 + chr22 2000 NM_18 + chr10 2200 NM_10 + chr10 1200 NM_11 + chr22 1600 NM_19 + +- running this tool with **Group by column 1**, Operations **Mean on column 2** and **Concatenate on column 3** will return:: + + chr10 1700.00 ['NM_11', 'NM_10'] + chr22 1533.33 ['NM_17', 'NM_19', 'NM_18'] + </help> +</tool>