[hg] galaxy 2359: Bug fixed in linear regression tool, and new option added to grouping tool.

21 Apr 2009

details:   http://www.bx.psu.edu/hg/galaxy/rev/541ae0e5e599
changeset: 2359:541ae0e5e599
user:      guru
date:      Tue Apr 21 10:53:34 2009 -0400
description:
Bug fixed in linear regression tool, and new option added to grouping tool.

4 file(s) affected in this change:

tools/regVariation/linear_regression.py
tools/regVariation/linear_regression.xml
tools/stats/grouping.py
tools/stats/grouping.xml

diffs (186 lines):

diff -r e9971a098d14 -r 541ae0e5e599 tools/regVariation/linear_regression.py

--- a/tools/regVariation/linear_regression.py	Tue Apr 21 10:45:13 2009 -0400
+++ b/tools/regVariation/linear_regression.py	Tue Apr 21 10:53:34 2009 -0400
@@ -108,10 +108,15 @@
 r.pdf( outfile2, 8, 8 )
 if len(x_vals) == 1:    #Simple linear  regression case with 1 predictor variable
     sub_title =  "Slope = %s; Y-int = %s" %(slope,yintercept)
-    r.plot(x=x_vals[0], y=y_vals, xlab="X", ylab="Y", sub=sub_title, main="Scatterplot with regression")
-    r.abline(a=yintercept, b=slope, col="red")
+    try:
+        r.plot(x=x_vals[0], y=y_vals, xlab="X", ylab="Y", sub=sub_title, main="Scatterplot with regression")
+        r.abline(a=yintercept, b=slope, col="red")
+    except:
+        pass
 else:
     r.pairs(dat, main="Scatterplot Matrix", col="blue")
-
-r.plot(linear_model)
+try:
+    r.plot(linear_model)
+except:
+    pass
 r.dev_off()
diff -r e9971a098d14 -r 541ae0e5e599 tools/regVariation/linear_regression.xml
--- a/tools/regVariation/linear_regression.xml	Tue Apr 21 10:45:13 2009 -0400
+++ b/tools/regVariation/linear_regression.xml	Tue Apr 21 10:53:34 2009 -0400
@@ -1,4 +1,4 @@
-<tool id="LinearRegression1" name="Perform Linear Regression">
+<tool id="LinearRegression1" name="Perform Linear Regression" version="1.0.1">
   <description> </description>
   <command interpreter="python">
     linear_regression.py 
@@ -48,7 +48,7 @@
 
 **Note**
 
-- This tool currently treats all predictor and response variables as continuous variables. 
+- This tool currently treats all predictor and response variables as continuous variables. Running the tool on categorical variables might result in incorrect results.
 
 - Rows containing non-numeric (or missing) data in any of the chosen columns will be skipped from the analysis.
 
diff -r e9971a098d14 -r 541ae0e5e599 tools/stats/grouping.py
--- a/tools/stats/grouping.py	Tue Apr 21 10:45:13 2009 -0400
+++ b/tools/stats/grouping.py	Tue Apr 21 10:53:34 2009 -0400
@@ -3,7 +3,7 @@
 """
 This tool provides the SQL "group by" functionality.
 """
-import sys, string, re, commands, tempfile, random
+import sys, string, re, commands, tempfile, random, sets
 from rpy import *
 
 def stop_err(msg):
@@ -48,7 +48,7 @@
     
     for k,col in enumerate(cols):
         col = int(col)-1
-        if ops[k] not in ['c', 'length', 'unique', 'random']:
+        if ops[k] not in ['c', 'length', 'unique', 'random', 'cuniq']:
             # We'll get here only if the user didn't choose 'Concatenate' or 'Count' or 'Count Distinct' or 'pick randmly', which are the
             # only aggregation functions that can be used on columns containing strings.
             try:
@@ -104,7 +104,7 @@
                             valid = True
                             # Before appending the current value, make sure it is numeric if the
                             # operation for the column requires it.
-                            if ops[i] not in ['c','length', 'unique','random']:
+                            if ops[i] not in ['c','length', 'unique','random','cuniq']:
                                 try:
                                     float( fields[col].strip())
                                 except:
@@ -125,13 +125,18 @@
                         out_str = prev_item
     
                         for i, op in enumerate( ops ):
-                            rfunc = "r." + op 
-                            if op not in ['c','length','unique','random']:
+                            if op == 'cuniq':
+                                rfunc = "r.c"
+                            else:
+                                rfunc = "r." + op 
+                            if op not in ['c','length','unique','random','cuniq']:
                                 for j, elem in enumerate( prev_vals[i] ):
                                     prev_vals[i][j] = float( elem )
-                                rout = "%g" %( eval( rfunc )( prev_vals[i] ))
+                                rout = eval( rfunc )( prev_vals[i] )
                                 if rounds[i] == 'yes':
                                     rout = int(round(float(rout)))
+                                else:
+                                    rout = '%g' %(float(rout))
                             else:
                                 if op != 'random':
                                     rout = eval( rfunc )( prev_vals[i] )
@@ -142,8 +147,14 @@
                             if op == 'unique':
                                 rfunc = "r.length" 
                                 rout = eval( rfunc )( rout )
-                            out_str += "\t" + str(rout)
-    
+                            if op in ['c', 'cuniq']:
+                                if op == 'c':
+                                    out_str += "\t" + ','.join(rout)
+                                else:
+                                    out_str += "\t" + ','.join(list(set(rout)))
+                            else:
+                                out_str += "\t" + str(rout)
+                                
                         print >>fout, out_str
     
                         prev_item = item   
@@ -175,14 +186,19 @@
     out_str = prev_item
     
     for i, op in enumerate(ops):
-        rfunc = "r." + op 
+        if op == 'cuniq':
+            rfunc = "r.c"
+        else:
+            rfunc = "r." + op 
         try:
-            if op not in ['c','length','unique','random']:
+            if op not in ['c','length','unique','random','cuniq']:
                 for j, elem in enumerate( prev_vals[i] ):
                     prev_vals[i][j] = float( elem )
-                rout = '%g' %( eval( rfunc )( prev_vals[i] ))
+                rout = eval( rfunc )( prev_vals[i] )
                 if rounds[i] == 'yes':
                     rout = int(round(float(rout)))
+                else:
+                    rout = '%g' %(float(rout))
             else:
                 if op != 'random':
                     rout = eval( rfunc )( prev_vals[i] )
@@ -192,8 +208,14 @@
                     
             if op == 'unique':
                 rfunc = "r.length" 
-                rout = eval( rfunc )( rout )    
-            out_str += "\t" + str( rout )
+                rout = eval( rfunc )( rout )  
+            if op in ['c','cuniq']:
+                if op == 'c':
+                    out_str += "\t" + ','.join(rout)
+                else:
+                    out_str += "\t" + ','.join(list(set(rout)))
+            else:
+                out_str += "\t" + str( rout )
         except:
             skipped_lines += 1
             if not first_invalid_line:
@@ -212,6 +234,8 @@
             op = 'count_distinct'
         elif op == 'random':
             op = 'randomly_pick'
+        elif op == 'cuniq':
+            op = 'concat_distinct'
         msg += op + "[c" + cols[i] + "] "
     if skipped_lines > 0:
         msg+= "--skipped %d invalid lines starting with line %d.  Value '%s' in column %d is not numeric." % ( skipped_lines, first_invalid_line, invalid_value, invalid_column )
diff -r e9971a098d14 -r 541ae0e5e599 tools/stats/grouping.xml
--- a/tools/stats/grouping.xml	Tue Apr 21 10:45:13 2009 -0400
+++ b/tools/stats/grouping.xml	Tue Apr 21 10:53:34 2009 -0400
@@ -1,4 +1,4 @@
-<tool id="Grouping1" name="Group" version="1.5.0">
+<tool id="Grouping1" name="Group" version="1.6.0">
   <description>data by a column and perform aggregate operation on other columns.</description>
   <command interpreter="python">
     grouping.py 
@@ -23,6 +23,7 @@
         <option value="length">Count</option>
         <option value="unique">Count Distinct</option>
         <option value="c">Concatenate</option>
+        <option value="cuniq">Concatenate Distinct</option>
         <option value="random">Randomly pick</option>
       </param>
       <param name="opcol" label="On column" type="data_column" data_ref="input1" />
@@ -87,7 +88,7 @@
 
 - running this tool with **Group by column 1**, Operations **Mean on column 2** and **Concatenate on column 3** will return::
 
-   chr10    1700.00 ['NM_11', 'NM_10']
-   chr22    1533.33 ['NM_17', 'NM_19', 'NM_18']
+   chr10    1700.00 NM_11,NM_10
+   chr22    1533.33 NM_17,NM_19,NM_18
   </help>
 </tool>

    

gua110＠scofield.bx.psu.edu

tags

participants (1)