[hg] galaxy 1688: Added a new option to the grouping tool to facilitate picking of random entries from columns not chosen for either grouping or aggregate operations.

6 Jan 2009

details:   http://www.bx.psu.edu/hg/galaxy/rev/e0162b7bf0ba
changeset: 1688:e0162b7bf0ba
user:      guru
date:      Tue Jan 06 11:11:24 2009 -0500
description:
Added a new option to the grouping tool to facilitate picking of random entries from columns not chosen for either grouping or aggregate operations.

2 file(s) affected in this change:

tools/stats/grouping.py
tools/stats/grouping.xml

diffs (230 lines):

diff -r 091573fd6e27 -r e0162b7bf0ba tools/stats/grouping.py

--- a/tools/stats/grouping.py	Mon Jan 05 15:26:17 2009 -0500
+++ b/tools/stats/grouping.py	Tue Jan 06 11:11:24 2009 -0500
@@ -12,13 +12,15 @@
 
 def main():
     inputfile = sys.argv[2]
+    in_columns = int( sys.argv[5] )
+    show_remaining_cols = sys.argv[4]
     
     ops = []
     cols = []
     rounds = []
     elems = []
     
-    for var in sys.argv[4:]:
+    for var in sys.argv[6:]:
         ops.append(var.split()[0])
         cols.append(var.split()[1])
         rounds.append(var.split()[2])
@@ -79,9 +81,19 @@
     
     if error_code != 0:
         stop_err( "Sorting input dataset resulted in error: %s: %s" %( error_code, stdout ))
-        
+    
+    if show_remaining_cols == 'yes':
+        show_cols_list = [1]*in_columns
+        show_cols_list[group_col] = 0
+        for c in cols:
+            c = int(c)-1
+            show_cols_list[c] = 0
+        #at the end of this, only the indices of the remaining columns will be set to 1
+        remaining_cols = [j for j,k in enumerate(show_cols_list) if k==1] #this is the list of remaining column indices
+      
     prev_item = ""
     prev_vals = []
+    remaining_vals = []
     skipped_lines = 0
     first_invalid_line = 0
     invalid_line = ''
@@ -116,22 +128,30 @@
                                         invalid_column = col+1
                             if valid:
                                 prev_vals[i].append(fields[col].strip())
+                        #Store values from all the remaining columns
+                        if show_remaining_cols == 'yes':
+                            for j, index in enumerate(remaining_cols):
+                                remaining_vals[j].append(fields[index].strip())
                     else:   
                         """
                         When a new value is encountered, write the previous value and the 
                         corresponding aggregate values into the output file.  This works 
                         due to the sort on group_col we've applied to the data above.
                         """
-                        out_str = prev_item
-    
+                        out_list = ['']*in_columns
+                        out_list[group_col] = str(prev_item)
+                        
                         for i, op in enumerate( ops ):
                             rfunc = "r." + op 
                             if op not in ['c','length','unique','random']:
                                 for j, elem in enumerate( prev_vals[i] ):
                                     prev_vals[i][j] = float( elem )
-                                rout = "%g" %( eval( rfunc )( prev_vals[i] ))
                                 if rounds[i] == 'yes':
+                                    rout = "%f" %( eval( rfunc )( prev_vals[i] ))
                                     rout = int(round(float(rout)))
+                                else:
+                                    rout = "%g" %( eval( rfunc )( prev_vals[i] ))
+                                
                             else:
                                 if op != 'random':
                                     rout = eval( rfunc )( prev_vals[i] )
@@ -142,9 +162,21 @@
                             if op == 'unique':
                                 rfunc = "r.length" 
                                 rout = eval( rfunc )( rout )
-                            out_str += "\t" + str(rout)
-    
-                        print >>fout, out_str
+
+                            out_list[int(cols[i])-1] = str(rout)
+                        
+                        if show_remaining_cols == 'yes':
+                            for index,el in enumerate(remaining_cols):
+                                if index == 0:
+                                    try:
+                                        random_index = random.randint(0,len(remaining_vals[index])-1)
+                                    except:
+                                        random_index = 0
+                                #pick a random value from each of the remaining columns 
+                                rand_out = remaining_vals[index][random_index]
+                                out_list[el] = str(rand_out)
+                            
+                        print >>fout, '\t'.join([elem for elem in out_list if elem != ''])
     
                         prev_item = item   
                         prev_vals = [] 
@@ -153,6 +185,14 @@
                             val_list = []
                             val_list.append(fields[col].strip())
                             prev_vals.append(val_list)
+                        
+                        if show_remaining_cols == 'yes':
+                            remaining_vals = []
+                            for index in remaining_cols:
+                                remaining_val_list = []
+                                remaining_val_list.append(fields[index].strip())
+                                remaining_vals.append(remaining_val_list)
+                        
                 else:
                     # This only occurs once, right at the start of the iteration.
                     prev_item = item
@@ -161,8 +201,15 @@
                         val_list = []
                         val_list.append(fields[col].strip())
                         prev_vals.append(val_list)
+                    
+                    if show_remaining_cols == 'yes':
+                        remaining_vals = []
+                        for index in remaining_cols:
+                            remaining_val_list = []
+                            remaining_val_list.append(fields[index].strip())
+                            remaining_vals.append(remaining_val_list)
     
-            except Exception, exc:
+            except Exception:
                 skipped_lines += 1
                 if not first_invalid_line:
                     first_invalid_line = ii+1
@@ -172,7 +219,8 @@
                 first_invalid_line = ii+1
     
     # Handle the last grouped value
-    out_str = prev_item
+    out_list = ['']*in_columns
+    out_list[group_col] = str(prev_item)
     
     for i, op in enumerate(ops):
         rfunc = "r." + op 
@@ -180,9 +228,11 @@
             if op not in ['c','length','unique','random']:
                 for j, elem in enumerate( prev_vals[i] ):
                     prev_vals[i][j] = float( elem )
-                rout = '%g' %( eval( rfunc )( prev_vals[i] ))
                 if rounds[i] == 'yes':
+                    rout = '%f' %( eval( rfunc )( prev_vals[i] ))
                     rout = int(round(float(rout)))
+                else:
+                    rout = '%g' %( eval( rfunc )( prev_vals[i] ))
             else:
                 if op != 'random':
                     rout = eval( rfunc )( prev_vals[i] )
@@ -193,13 +243,22 @@
             if op == 'unique':
                 rfunc = "r.length" 
                 rout = eval( rfunc )( rout )    
-            out_str += "\t" + str( rout )
+            out_list[int(cols[i])-1] = str(rout)
         except:
             skipped_lines += 1
             if not first_invalid_line:
                 first_invalid_line = ii+1
+    if show_remaining_cols == 'yes':
+        for index,el in enumerate(remaining_cols):
+            if index == 0:
+                try:
+                    random_index = random.randint(0,len(remaining_vals[index])-1)
+                except:
+                    random_index = 0
+            rand_out = remaining_vals[index][random_index]
+            out_list[el] = str(rand_out)
     
-    print >>fout, out_str
+    print >>fout, '\t'.join([elem for elem in out_list if elem != ''])
     
     # Generate a useful info message.
     msg = "--Group by c%d: " %(group_col+1)
diff -r 091573fd6e27 -r e0162b7bf0ba tools/stats/grouping.xml
--- a/tools/stats/grouping.xml	Mon Jan 05 15:26:17 2009 -0500
+++ b/tools/stats/grouping.xml	Tue Jan 06 11:11:24 2009 -0500
@@ -1,10 +1,12 @@
-<tool id="Grouping1" name="Group" version="1.4.0">
+<tool id="Grouping1" name="Group" version="1.5.0">
   <description>data by a column and perform aggregate operation on other columns.</description>
   <command interpreter="python">
     grouping.py 
       $out_file1
       $input1
-      $groupcol
+      $groupcol
+      $othercols
+      ${input1.metadata.columns}
       #for $op in $operations
        '${op.optype}
         ${op.opcol}
@@ -30,7 +32,11 @@
 	     <option value="no">NO</option>
 	     <option value="yes">YES</option>
 	   </param>
-  	</repeat>
+  	</repeat>
+  	<param name="othercols" type="select" label="Randomly pick an entry from each of the remaining columns (besides the columns chosen for group and aggregate operations above) ?">
+         <option value="no">NO</option>
+         <option value="yes">YES</option>
+       </param>
   </inputs>
   <outputs>
     <data format="input" name="out_file1" metadata_source="input1" />
@@ -46,9 +52,9 @@
       <param name="optype" value="mean"/>
       <param name="opcol" value="2"/>
       <param name="opround" value="no"/>
+      <param name="othercols" value="no"/>
       <output name="out_file1" file="groupby_out1.dat"/>
     </test>
-    
     <!-- Test data with an invalid value in a column -->
     <test>
       <param name="input1" value="1.tabular"/>
@@ -56,6 +62,7 @@
       <param name="optype" value="mean"/>
       <param name="opcol" value="2"/>
       <param name="opround" value="no"/>
+      <param name="othercols" value="no"/>
       <output name="out_file1" file="groupby_out2.dat"/>
     </test>
   </tests>

    

Greg Von Kuster

tags

participants (1)