details:
http://www.bx.psu.edu/hg/galaxy/rev/541ae0e5e599
changeset: 2359:541ae0e5e599
user: guru
date: Tue Apr 21 10:53:34 2009 -0400
description:
Bug fixed in linear regression tool, and new option added to grouping tool.
4 file(s) affected in this change:
tools/regVariation/linear_regression.py
tools/regVariation/linear_regression.xml
tools/stats/grouping.py
tools/stats/grouping.xml
diffs (186 lines):
diff -r e9971a098d14 -r 541ae0e5e599 tools/regVariation/linear_regression.py
--- a/tools/regVariation/linear_regression.py Tue Apr 21 10:45:13 2009 -0400
+++ b/tools/regVariation/linear_regression.py Tue Apr 21 10:53:34 2009 -0400
@@ -108,10 +108,15 @@
r.pdf( outfile2, 8, 8 )
if len(x_vals) == 1: #Simple linear regression case with 1 predictor variable
sub_title = "Slope = %s; Y-int = %s" %(slope,yintercept)
- r.plot(x=x_vals[0], y=y_vals, xlab="X", ylab="Y", sub=sub_title, main="Scatterplot with regression")
- r.abline(a=yintercept, b=slope, col="red")
+ try:
+ r.plot(x=x_vals[0], y=y_vals, xlab="X", ylab="Y", sub=sub_title, main="Scatterplot with regression")
+ r.abline(a=yintercept, b=slope, col="red")
+ except:
+ pass
else:
r.pairs(dat, main="Scatterplot Matrix", col="blue")
-
-r.plot(linear_model)
+try:
+ r.plot(linear_model)
+except:
+ pass
r.dev_off()
diff -r e9971a098d14 -r 541ae0e5e599 tools/regVariation/linear_regression.xml
--- a/tools/regVariation/linear_regression.xml Tue Apr 21 10:45:13 2009 -0400
+++ b/tools/regVariation/linear_regression.xml Tue Apr 21 10:53:34 2009 -0400
@@ -1,4 +1,4 @@
-<tool id="LinearRegression1" name="Perform Linear Regression">
+<tool id="LinearRegression1" name="Perform Linear Regression" version="1.0.1">
<description> </description>
<command interpreter="python">
linear_regression.py
@@ -48,7 +48,7 @@
**Note**
-- This tool currently treats all predictor and response variables as continuous variables.
+- This tool currently treats all predictor and response variables as continuous variables. Running the tool on categorical variables might result in incorrect results.
 - Rows containing non-numeric (or missing) data in any of the chosen columns will be skipped from the analysis.
diff -r e9971a098d14 -r 541ae0e5e599 tools/stats/grouping.py
--- a/tools/stats/grouping.py Tue Apr 21 10:45:13 2009 -0400
+++ b/tools/stats/grouping.py Tue Apr 21 10:53:34 2009 -0400
@@ -3,7 +3,7 @@
"""
This tool provides the SQL "group by" functionality.
"""
-import sys, string, re, commands, tempfile, random
+import sys, string, re, commands, tempfile, random, sets
from rpy import *
def stop_err(msg):
@@ -48,7 +48,7 @@
for k,col in enumerate(cols):
col = int(col)-1
- if ops[k] not in ['c', 'length', 'unique', 'random']:
+ if ops[k] not in ['c', 'length', 'unique', 'random', 'cuniq']:
# We'll get here only if the user didn't choose 'Concatenate' or 'Count' or 'Count Distinct' or 'pick randmly', which are the
# only aggregation functions that can be used on columns containing strings.
try:
@@ -104,7 +104,7 @@
valid = True
# Before appending the current value, make sure it is numeric if the
# operation for the column requires it.
- if ops[i] not in ['c','length', 'unique','random']:
+ if ops[i] not in ['c','length', 'unique','random','cuniq']:
try:
float( fields[col].strip())
except:
@@ -125,13 +125,18 @@
out_str = prev_item
for i, op in enumerate( ops ):
- rfunc = "r." + op
- if op not in ['c','length','unique','random']:
+ if op == 'cuniq':
+ rfunc = "r.c"
+ else:
+ rfunc = "r." + op
+ if op not in ['c','length','unique','random','cuniq']:
for j, elem in enumerate( prev_vals[i] ):
prev_vals[i][j] = float( elem )
- rout = "%g" %( eval( rfunc )( prev_vals[i] ))
+ rout = eval( rfunc )( prev_vals[i] )
if rounds[i] == 'yes':
rout = int(round(float(rout)))
+ else:
+ rout = '%g' %(float(rout))
else:
if op != 'random':
rout = eval( rfunc )( prev_vals[i] )
@@ -142,8 +147,14 @@
if op == 'unique':
rfunc = "r.length"
rout = eval( rfunc )( rout )
- out_str += "\t" + str(rout)
-
+ if op in ['c', 'cuniq']:
+ if op == 'c':
+ out_str += "\t" + ','.join(rout)
+ else:
+ out_str += "\t" + ','.join(list(set(rout)))
+ else:
+ out_str += "\t" + str(rout)
+
print >>fout, out_str
prev_item = item
@@ -175,14 +186,19 @@
out_str = prev_item
for i, op in enumerate(ops):
- rfunc = "r." + op
+ if op == 'cuniq':
+ rfunc = "r.c"
+ else:
+ rfunc = "r." + op
try:
- if op not in ['c','length','unique','random']:
+ if op not in ['c','length','unique','random','cuniq']:
for j, elem in enumerate( prev_vals[i] ):
prev_vals[i][j] = float( elem )
- rout = '%g' %( eval( rfunc )( prev_vals[i] ))
+ rout = eval( rfunc )( prev_vals[i] )
if rounds[i] == 'yes':
rout = int(round(float(rout)))
+ else:
+ rout = '%g' %(float(rout))
else:
if op != 'random':
rout = eval( rfunc )( prev_vals[i] )
@@ -192,8 +208,14 @@
if op == 'unique':
rfunc = "r.length"
- rout = eval( rfunc )( rout )
- out_str += "\t" + str( rout )
+ rout = eval( rfunc )( rout )
+ if op in ['c','cuniq']:
+ if op == 'c':
+ out_str += "\t" + ','.join(rout)
+ else:
+ out_str += "\t" + ','.join(list(set(rout)))
+ else:
+ out_str += "\t" + str( rout )
except:
skipped_lines += 1
if not first_invalid_line:
@@ -212,6 +234,8 @@
op = 'count_distinct'
elif op == 'random':
op = 'randomly_pick'
+ elif op == 'cuniq':
+ op = 'concat_distinct'
msg += op + "[c" + cols[i] + "] "
if skipped_lines > 0:
msg+= "--skipped %d invalid lines starting with line %d. Value '%s' in column %d is not numeric." % ( skipped_lines, first_invalid_line, invalid_value, invalid_column )
diff -r e9971a098d14 -r 541ae0e5e599 tools/stats/grouping.xml
--- a/tools/stats/grouping.xml Tue Apr 21 10:45:13 2009 -0400
+++ b/tools/stats/grouping.xml Tue Apr 21 10:53:34 2009 -0400
@@ -1,4 +1,4 @@
-<tool id="Grouping1" name="Group" version="1.5.0">
+<tool id="Grouping1" name="Group" version="1.6.0">
<description>data by a column and perform aggregate operation on other columns.</description>
<command interpreter="python">
grouping.py
@@ -23,6 +23,7 @@
<option value="length">Count</option>
<option value="unique">Count Distinct</option>
<option value="c">Concatenate</option>
+ <option value="cuniq">Concatenate Distinct</option>
<option value="random">Randomly pick</option>
</param>
<param name="opcol" label="On column" type="data_column" data_ref="input1" />
@@ -87,7 +88,7 @@
 - running this tool with **Group by column 1**, Operations **Mean on column 2** and **Concatenate on column 3** will return::
- chr10 1700.00 ['NM_11', 'NM_10']
- chr22 1533.33 ['NM_17', 'NM_19', 'NM_18']
+ chr10 1700.00 NM_11,NM_10
+ chr22 1533.33 NM_17,NM_19,NM_18
</help>
</tool>