[hg] galaxy 1711: Removed an option from grouping tool, which is now part of the LCA tool under Taxonomy.

19 Jan 2009

details:   http://www.bx.psu.edu/hg/galaxy/rev/2a36ccdb2a38
changeset: 1711:2a36ccdb2a38
user:      guru
date:      Fri Jan 16 13:30:01 2009 -0500
description:
Removed an option from grouping tool, which is now part of the LCA tool under Taxonomy.

2 file(s) affected in this change:

tools/stats/grouping.py
tools/stats/grouping.xml

diffs (354 lines):

diff -r c565de071f7f -r 2a36ccdb2a38 tools/stats/grouping.py

--- a/tools/stats/grouping.py	Thu Jan 15 14:23:53 2009 -0500
+++ b/tools/stats/grouping.py	Fri Jan 16 13:30:01 2009 -0500
@@ -12,15 +12,13 @@
 
 def main():
     inputfile = sys.argv[2]
-    in_columns = int( sys.argv[5] )
-    show_remaining_cols = sys.argv[4]
     
     ops = []
     cols = []
     rounds = []
     elems = []
     
-    for var in sys.argv[6:]:
+    for var in sys.argv[4:]:
         ops.append(var.split()[0])
         cols.append(var.split()[1])
         rounds.append(var.split()[2])
@@ -81,19 +79,9 @@
     
     if error_code != 0:
         stop_err( "Sorting input dataset resulted in error: %s: %s" %( error_code, stdout ))
-    
-    if show_remaining_cols == 'yes':
-        show_cols_list = [1]*in_columns
-        show_cols_list[group_col] = 0
-        for c in cols:
-            c = int(c)-1
-            show_cols_list[c] = 0
-        #at the end of this, only the indices of the remaining columns will be set to 1
-        remaining_cols = [j for j,k in enumerate(show_cols_list) if k==1] #this is the list of remaining column indices
-      
+        
     prev_item = ""
     prev_vals = []
-    remaining_vals = []
     skipped_lines = 0
     first_invalid_line = 0
     invalid_line = ''
@@ -128,30 +116,22 @@
                                         invalid_column = col+1
                             if valid:
                                 prev_vals[i].append(fields[col].strip())
-                        #Store values from all the remaning columns
-                        if show_remaining_cols == 'yes':
-                            for j, index in enumerate(remaining_cols):
-                                remaining_vals[j].append(fields[index].strip())
                     else:   
                         """
                         When a new value is encountered, write the previous value and the 
                         corresponding aggregate values into the output file.  This works 
                         due to the sort on group_col we've applied to the data above.
                         """
-                        out_list = ['']*in_columns
-                        out_list[group_col] = str(prev_item)
-                        
+                        out_str = prev_item
+    
                         for i, op in enumerate( ops ):
                             rfunc = "r." + op 
                             if op not in ['c','length','unique','random']:
                                 for j, elem in enumerate( prev_vals[i] ):
                                     prev_vals[i][j] = float( elem )
+                                rout = "%g" %( eval( rfunc )( prev_vals[i] ))
                                 if rounds[i] == 'yes':
-                                    rout = "%f" %( eval( rfunc )( prev_vals[i] ))
                                     rout = int(round(float(rout)))
-                                else:
-                                    rout = "%g" %( eval( rfunc )( prev_vals[i] ))
-                                
                             else:
                                 if op != 'random':
                                     rout = eval( rfunc )( prev_vals[i] )
@@ -162,21 +142,9 @@
                             if op == 'unique':
                                 rfunc = "r.length" 
                                 rout = eval( rfunc )( rout )
-
-                            out_list[int(cols[i])-1] = str(rout)
-                        
-                        if show_remaining_cols == 'yes':
-                            for index,el in enumerate(remaining_cols):
-                                if index == 0:
-                                    try:
-                                        random_index = random.randint(0,len(remaining_vals[index])-1)
-                                    except:
-                                        random_index = 0
-                                #pick a random value from each of the remaning columns 
-                                rand_out = remaining_vals[index][random_index]
-                                out_list[el] = str(rand_out)
-                            
-                        print >>fout, '\t'.join([elem for elem in out_list if elem != ''])
+                            out_str += "\t" + str(rout)
+    
+                        print >>fout, out_str
     
                         prev_item = item   
                         prev_vals = [] 
@@ -185,14 +153,6 @@
                             val_list = []
                             val_list.append(fields[col].strip())
                             prev_vals.append(val_list)
-                        
-                        if show_remaining_cols == 'yes':
-                            remaining_vals = []
-                            for index in remaining_cols:
-                                remaining_val_list = []
-                                remaining_val_list.append(fields[index].strip())
-                                remaining_vals.append(remaining_val_list)
-                        
                 else:
                     # This only occurs once, right at the start of the iteration.
                     prev_item = item
@@ -201,15 +161,8 @@
                         val_list = []
                         val_list.append(fields[col].strip())
                         prev_vals.append(val_list)
-                    
-                    if show_remaining_cols == 'yes':
-                        remaining_vals = []
-                        for index in remaining_cols:
-                            remaining_val_list = []
-                            remaining_val_list.append(fields[index].strip())
-                            remaining_vals.append(remaining_val_list)
     
-            except Exception:
+            except Exception, exc:
                 skipped_lines += 1
                 if not first_invalid_line:
                     first_invalid_line = ii+1
@@ -219,8 +172,7 @@
                 first_invalid_line = ii+1
     
     # Handle the last grouped value
-    out_list = ['']*in_columns
-    out_list[group_col] = str(prev_item)
+    out_str = prev_item
     
     for i, op in enumerate(ops):
         rfunc = "r." + op 
@@ -228,11 +180,9 @@
             if op not in ['c','length','unique','random']:
                 for j, elem in enumerate( prev_vals[i] ):
                     prev_vals[i][j] = float( elem )
+                rout = '%g' %( eval( rfunc )( prev_vals[i] ))
                 if rounds[i] == 'yes':
-                    rout = '%f' %( eval( rfunc )( prev_vals[i] ))
                     rout = int(round(float(rout)))
-                else:
-                    rout = '%g' %( eval( rfunc )( prev_vals[i] ))
             else:
                 if op != 'random':
                     rout = eval( rfunc )( prev_vals[i] )
@@ -243,22 +193,13 @@
             if op == 'unique':
                 rfunc = "r.length" 
                 rout = eval( rfunc )( rout )    
-            out_list[int(cols[i])-1] = str(rout)
+            out_str += "\t" + str( rout )
         except:
             skipped_lines += 1
             if not first_invalid_line:
                 first_invalid_line = ii+1
-    if show_remaining_cols == 'yes':
-        for index,el in enumerate(remaining_cols):
-            if index == 0:
-                try:
-                    random_index = random.randint(0,len(remaining_vals[index])-1)
-                except:
-                    random_index = 0
-            rand_out = remaining_vals[index][random_index]
-            out_list[el] = str(rand_out)
     
-    print >>fout, '\t'.join([elem for elem in out_list if elem != ''])
+    print >>fout, out_str
     
     # Generate a useful info message.
     msg = "--Group by c%d: " %(group_col+1)
diff -r c565de071f7f -r 2a36ccdb2a38 tools/stats/grouping.xml
--- a/tools/stats/grouping.xml	Thu Jan 15 14:23:53 2009 -0500
+++ b/tools/stats/grouping.xml	Fri Jan 16 13:30:01 2009 -0500
@@ -1,49 +1,43 @@
-<tool id="Grouping1" name="Group" version="1.5.0">
-  <description>data by a column and perform aggregate operation on other columns.</description>
-  <command interpreter="python">
-    grouping.py 
-      $out_file1
-      $input1
+<tool id="Grouping1" name="Group" version="1.4.0">
+  <description>data by a column and perform aggregate operation on other columns.</description>
+  <command interpreter="python">
+    grouping.py 
+      $out_file1
+      $input1
       $groupcol
-      $othercols
-      ${input1.metadata.columns}
-      #for $op in $operations
-       '${op.optype}
-        ${op.opcol}
-        ${op.opround}'
-      #end for
-  </command>
-  <inputs>
-    <param format="tabular" name="input1" type="data" label="Select data" help="Query missing? See TIP below."/>
-    <param name="groupcol" label="Group by column" type="data_column" data_ref="input1" />
-    <repeat name="operations" title="Operation">
-      <param name="optype" type="select" label="Type">
-        <option value="mean">Mean</option>
-        <option value="max">Maximum</option>
-        <option value="min">Minimum</option>
-        <option value="sum">Sum</option>
-        <option value="length">Count</option>
-        <option value="unique">Count Distinct</option>
+      #for $op in $operations
+       '${op.optype}
+        ${op.opcol}
+        ${op.opround}'
+      #end for
+  </command>
+  <inputs>
+    <param format="tabular" name="input1" type="data" label="Select data" help="Query missing? See TIP below."/>
+    <param name="groupcol" label="Group by column" type="data_column" data_ref="input1" />
+    <repeat name="operations" title="Operation">
+      <param name="optype" type="select" label="Type">
+        <option value="mean">Mean</option>
+        <option value="max">Maximum</option>
+        <option value="min">Minimum</option>
+        <option value="sum">Sum</option>
+        <option value="length">Count</option>
+        <option value="unique">Count Distinct</option>
         <option value="c">Concatenate</option>
-        <option value="random">Randomly pick</option>
-      </param>
-      <param name="opcol" label="On column" type="data_column" data_ref="input1" />
-      <param name="opround" type="select" label="Round result to nearest integer?">
-	     <option value="no">NO</option>
-	     <option value="yes">YES</option>
-	   </param>
-  	</repeat>
-  	<param name="othercols" type="select" label="Randomly pick an entry from each of the remaining columns (besides the columns chosen for group and aggregate operations above) ?">
+        <option value="random">Randomly pick</option>
+      </param>
+      <param name="opcol" label="On column" type="data_column" data_ref="input1" />
+      <param name="opround" type="select" label="Round result to nearest integer?">
          <option value="no">NO</option>
          <option value="yes">YES</option>
-       </param>
-  </inputs>
-  <outputs>
-    <data format="input" name="out_file1" metadata_source="input1" />
-  </outputs>
-  <requirements>
-    <requirement type="python-module">rpy</requirement>
-  </requirements>
+       </param>
+    </repeat>
+  </inputs>
+  <outputs>
+    <data format="input" name="out_file1" metadata_source="input1" />
+  </outputs>
+  <requirements>
+    <requirement type="python-module">rpy</requirement>
+  </requirements>
   <tests>
     <!-- Test valid data -->
     <test>
@@ -52,9 +46,9 @@
       <param name="optype" value="mean"/>
       <param name="opcol" value="2"/>
       <param name="opround" value="no"/>
-      <param name="othercols" value="no"/>
       <output name="out_file1" file="groupby_out1.dat"/>
     </test>
+    
     <!-- Test data with an invalid value in a column -->
     <test>
       <param name="input1" value="1.tabular"/>
@@ -62,39 +56,38 @@
       <param name="optype" value="mean"/>
       <param name="opcol" value="2"/>
       <param name="opround" value="no"/>
-      <param name="othercols" value="no"/>
       <output name="out_file1" file="groupby_out2.dat"/>
-    </test>
-  </tests>
-  <help>
-
-.. class:: infomark
-
-**TIP:** If your data is not TAB delimited, use *Text Manipulation->Convert*
-
------
-
-**Syntax**
-
-This tool allows you to group the input dataset by a particular column and perform aggregate functions like Mean, Sum, Max, Min and Concatenate on other columns. 
-
-- All invalid, blank and comment lines are skipped when performing the aggregate functions.  The number of skipped lines is displayed in the resulting history item.
-
------
-
-**Example**
-
-- For the following input::
-
-   chr22  1000  NM_17
-   chr22  2000  NM_18
-   chr10  2200  NM_10
-   chr10  1200  NM_11
-   chr22  1600  NM_19
-
-- running this tool with **Group by column 1**, Operations **Mean on column 2** and **Concatenate on column 3** will return::
-
-   chr10	1700.00	['NM_11', 'NM_10']
-   chr22	1533.33	['NM_17', 'NM_19', 'NM_18']
-  </help>
-</tool>
+    </test>
+  </tests>
+  <help>
+
+.. class:: infomark
+
+**TIP:** If your data is not TAB delimited, use *Text Manipulation->Convert*
+
+-----
+
+**Syntax**
+
+This tool allows you to group the input dataset by a particular column and perform aggregate functions like Mean, Sum, Max, Min and Concatenate on other columns. 
+
+- All invalid, blank and comment lines are skipped when performing the aggregate functions.  The number of skipped lines is displayed in the resulting history item.
+
+-----
+
+**Example**
+
+- For the following input::
+
+   chr22  1000  NM_17
+   chr22  2000  NM_18
+   chr10  2200  NM_10
+   chr10  1200  NM_11
+   chr22  1600  NM_19
+
+- running this tool with **Group by column 1**, Operations **Mean on column 2** and **Concatenate on column 3** will return::
+
+   chr10    1700.00 ['NM_11', 'NM_10']
+   chr22    1533.33 ['NM_17', 'NM_19', 'NM_18']
+  </help>
+</tool>

    

Nate Coraor

tags

participants (1)