[galaxy-dev] [hg] galaxy 1522: Adding a new set of tools to perform multiple...

22 Sep 2008

details:   http://www.bx.psu.edu/hg/galaxy/rev/05974294cbf1
changeset: 1522:05974294cbf1
user:      guru
date:      Sat Sep 20 18:14:24 2008 -0400
description:
Adding a new set of tools to perform multiple linear regression analysis.

9 file(s) affected in this change:

test-data/rcve_out.dat
test-data/reg_inp.tab
tool_conf.xml.sample
tools/regVariation/best_regression_subsets.py
tools/regVariation/best_regression_subsets.xml
tools/regVariation/linear_regression.py
tools/regVariation/linear_regression.xml
tools/regVariation/rcve.py
tools/regVariation/rcve.xml

diffs (700 lines):

diff -r 618210a97e62 -r 05974294cbf1 test-data/rcve_out.dat

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/rcve_out.dat	Sat Sep 20 18:14:24 2008 -0400
@@ -0,0 +1,8 @@
+#Model	R-sq	RCVE_Terms	RCVE_Value
+2 3 4 	0.3997	-	-
+3 4 	0.3319	2 	0.1697
+2 4 	0.2974	3 	0.2561
+2 3 	0.3985	4 	0.0031
+4 	0.1226	2 3 	0.6934
+3 	0.2733	2 4 	0.3164
+2 	0.2972	3 4 	0.2564
diff -r 618210a97e62 -r 05974294cbf1 test-data/reg_inp.tab
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/reg_inp.tab	Sat Sep 20 18:14:24 2008 -0400
@@ -0,0 +1,100 @@
+2.04	2.01	1070	 5
+2.56	3.40	1254	 6
+3.75	3.68	1466	 6
+1.10	1.54	 706	 4
+3.00	3.32	1160	 5
+0.05	0.33	 756	 3
+1.38	0.36	1058	 2
+1.50	1.97	1008	 7
+1.38	2.03	1104	 4
+4.01	2.05	1200	 7
+1.50	2.13	 896	 7
+1.29	1.34	 848	 3
+1.90	1.51	 958	 5
+3.11	3.12	1246	 6
+1.92	2.14	1106	 4
+0.81	2.60	 790	 5
+1.01	1.90	 954	 4
+3.66	3.06	1500	 6
+2.00	1.60	1046	 5
+2.05	1.96	1054	 4
+2.60	1.96	1198	 6
+2.55	1.56	 940	 3
+0.38	1.60	 456	 6
+2.48	1.92	1150	 7
+2.74	3.09	 636	 6
+1.77	0.78	 744	 5
+1.61	2.12	 644	 5
+0.99	1.85	 842	 3
+1.62	1.78	 852	 5
+2.03	1.03	1170	 3
+3.50	3.44	1034	10
+3.18	2.42	1202	 5
+2.39	1.74	1018	 5
+1.48	1.89	1180	 5
+1.54	1.43	 952	 3
+1.57	1.64	1038	 4
+2.46	2.69	1090	 6
+2.42	1.79	 694	 5
+2.11	2.72	1096	 6
+2.04	2.15	1114	 5
+1.68	2.22	1256	 6
+1.64	1.55	1208	 5
+2.41	2.34	 820	 6
+2.10	2.92	1222	 4
+1.40	2.10	1120	 5
+2.03	1.64	 886	 4
+1.99	2.83	1126	 7
+2.24	1.76	1158	 4
+0.45	1.81	 676	 6
+2.31	2.68	1214	 7
+2.41	2.55	1136	 6
+2.56	2.70	1264	 6
+2.50	1.66	1116	 3
+2.92	2.23	1292	 4
+2.35	2.01	 604	 5
+2.82	1.24	 854	 6
+1.80	1.95	 814	 6
+1.29	1.73	 778	 3
+1.68	1.08	 800	 2
+3.44	3.46	1424	 7
+1.90	3.01	 950	 6
+2.06	0.54	1056	 3
+3.30	3.20	 956	 8
+1.80	1.50	1352	 5
+2.00	1.71	 852	 5
+1.68	1.99	1168	 5
+1.94	2.76	 970	 6
+0.97	1.56	 776	 4
+1.12	1.78	 854	 6
+1.31	1.32	1232	 5
+1.68	0.87	1140	 6
+3.09	1.75	1084	 4
+1.87	1.41	 954	 2
+2.00	2.77	1000	 4
+2.39	1.78	1084	 4
+1.50	1.34	1058	 4
+1.82	1.52	 816	 5
+1.80	2.97	1146	 7
+2.01	1.75	1000	 6
+1.88	1.64	 856	 4
+1.64	1.80	 798	 4
+2.42	3.37	1324	 6
+0.22	1.15	 704	 6
+2.31	1.72	1222	 5
+0.95	2.27	 948	 6
+1.99	2.85	1182	 8
+1.86	2.21	1000	 6
+1.79	1.94	 910	 6
+3.02	4.25	1374	 9
+1.85	1.83	1014	 6
+1.98	2.75	1420	 7
+2.15	1.71	 400	 6
+1.46	2.20	 998	 7
+2.29	2.13	 776	 6
+2.39	2.38	1134	 7
+1.80	1.64	 772	 4
+2.64	1.87	1304	 6
+2.08	2.53	1212	 4
+0.70	1.78	 818	 6
+0.89	1.20	 864	 2
\ No newline at end of file
diff -r 618210a97e62 -r 05974294cbf1 tool_conf.xml.sample
--- a/tool_conf.xml.sample	Fri Sep 19 12:34:51 2008 -0400
+++ b/tool_conf.xml.sample	Sat Sep 20 18:14:24 2008 -0400
@@ -128,6 +128,11 @@
     <tool file="regVariation/getIndels_2way.xml" />
     <tool file="regVariation/getIndels_3way.xml" />
     <tool file="regVariation/getIndelRates_3way.xml" />
+  </section>
+  <section name="Multiple regression" id="multReg">
+    <tool file="regVariation/linear_regression.xml" />
+    <tool file="regVariation/best_regression_subsets.xml" />
+    <tool file="regVariation/rcve.xml" />
   </section>
  <section name="Evolution: HyPhy" id="hyphy">
     <tool file="hyphy/hyphy_branch_lengths_wrapper.xml" />
diff -r 618210a97e62 -r 05974294cbf1 tools/regVariation/best_regression_subsets.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/regVariation/best_regression_subsets.py	Sat Sep 20 18:14:24 2008 -0400
@@ -0,0 +1,90 @@
+#!/usr/bin/env python
+
+from galaxy import eggs
+
+import sys, string
+from rpy import *
+import numpy
+
+def stop_err(msg):    # Report a fatal problem: write msg to stderr, then end the script.
+    sys.stderr.write(msg)
+    sys.exit()    # called without a status argument, so the process exit code is 0
+
+infile = sys.argv[1]
+y_col = int(sys.argv[2])-1
+x_cols = sys.argv[3].split(',')
+outfile = sys.argv[4]
+outfile2 = sys.argv[5]
+print "Predictor columns: %s; Response column: %d" %(x_cols,y_col+1)
+fout = open(outfile,'w')
+
+for i, line in enumerate( file ( infile )):
+    line = line.rstrip('\r\n')
+    if len( line )>0 and not line.startswith( '#' ):
+        elems = line.split( '\t' )
+        break 
+    if i == 30:
+        break # Hopefully we'll never get here...
+
+if len( elems )<1:
+    stop_err( "The data in your input dataset is either missing or not formatted properly." )
+
+y_vals = []
+x_vals = []
+
+for k,col in enumerate(x_cols):
+    x_cols[k] = int(col)-1
+    x_vals.append([])
+    
+NA = 'NA'
+for ind,line in enumerate( file( infile )):
+    if line and not line.startswith( '#' ):
+        try:
+            fields = line.split("\t")
+            try:
+                yval = float(fields[y_col])
+            except Exception, ey:
+                yval = r('NA')
+            y_vals.append(yval)
+            for k,col in enumerate(x_cols):
+                try:
+                    xval = float(fields[col])
+                except Exception, ex:
+                    xval = r('NA')
+                x_vals[k].append(xval)
+        except:
+            pass
+
+response_term = ""
+
+x_vals1 = numpy.asarray(x_vals).transpose()
+
+dat= r.list(x=array(x_vals1), y=y_vals)
+
+r.library("leaps")
+ 
+set_default_mode(NO_CONVERSION)
+try:
+    leaps = r.regsubsets(r("y ~ x"), data= r.na_exclude(dat))
+except RException, rex:
+    stop_err("Error performing linear regression on the input data.\nEither the response column or one of the predictor columns contain no numeric values.")
+set_default_mode(BASIC_CONVERSION)
+
+summary = r.summary(leaps)
+tot = len(x_vals)
+pattern = "["
+for i in range(tot):
+    pattern = pattern + 'c' + str(int(x_cols[int(i)]) + 1) + ' '
+pattern = pattern.strip() + ']'  
+print >>fout, "#Vars\t%s\tR-sq\tAdj. R-sq\tC-p\tbic" %(pattern)
+for ind,item in enumerate(summary['outmat']):
+    print >>fout, "%s\t%s\t%s\t%s\t%s\t%s" %(str(item).count('*'), item, summary['rsq'][ind], summary['adjr2'][ind], summary['cp'][ind], summary['bic'][ind])
+
+
+r.pdf( outfile2, 8, 8 )
+r.plot(leaps, scale="Cp", main="Best subsets using Cp Criterion")
+r.plot(leaps, scale="r2", main="Best subsets using R-sq Criterion")
+r.plot(leaps, scale="adjr2", main="Best subsets using Adjusted R-sq Criterion")
+r.plot(leaps, scale="bic", main="Best subsets using bic Criterion")
+
+r.dev_off()
diff -r 618210a97e62 -r 05974294cbf1 tools/regVariation/best_regression_subsets.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/regVariation/best_regression_subsets.xml	Sat Sep 20 18:14:24 2008 -0400
@@ -0,0 +1,64 @@
+<tool id="BestSubsetsRegression1" name="Perform Best-subsets Regression">
+  <description> </description>
+  <command interpreter="python">
+    best_regression_subsets.py 
+      $input1
+      $response_col
+      $predictor_cols
+      $out_file1
+      $out_file2
+      1>/dev/null
+      2>/dev/null
+  </command>
+  <inputs>
+    <param format="tabular" name="input1" type="data" label="Select data" help="Query missing? See TIP below."/>
+    <param name="response_col" label="Response column (Y)" type="data_column" data_ref="input1" />
+    <param name="predictor_cols" label="Predictor columns (X)" type="data_column" data_ref="input1" multiple="true" />
+  </inputs>
+  <outputs>
+    <data format="input" name="out_file1" metadata_source="input1" />
+    <data format="pdf" name="out_file2" />
+  </outputs>
+  <requirements>
+    <requirement type="python-module">rpy</requirement>
+  </requirements>
+  <tests>
+    <!-- Testing this tool will not be possible because this tool produces a pdf output file.
+    -->
+  </tests>
+  <help>
+
+.. class:: infomark
+
+**TIP:** If your data is not TAB delimited, use *Edit Queries->Convert characters*
+
+-----
+
+.. class:: infomark
+
+**What it does**
+
+This tool uses the 'regsubsets' function from the R statistical package for regression subset selection. It outputs two files, one containing a table with the best subsets and the corresponding summary statistics, and the other containing the graphical representation of the results.
+
+-----
+
+.. class:: warningmark
+
+**Note**
+
+- This tool currently treats all predictor and response variables as continuous variables. 
+
+- Rows containing non-numeric (or missing) data in any of the chosen columns will be skipped from the analysis.
+
+- The 6 columns in the output are described below:
+
+  - Column 1 (Vars): denotes the number of variables in the model
+  - Column 2 ([c2 c3 c4...]): represents a list of the user-selected predictor variables (full model). An asterisk denotes the presence of the corresponding predictor variable in the selected model.
+  - Column 3 (R-sq): the fraction of variance explained by the model
+  - Column 4 (Adj. R-sq): the above R-squared statistic adjusted, penalizing for higher number of predictors (p)
+  - Column 5 (C-p): Mallows' Cp statistic
+  - Column 6 (bic): Bayesian Information Criterion. 
+
+
+  </help>
+</tool>
diff -r 618210a97e62 -r 05974294cbf1 tools/regVariation/linear_regression.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/regVariation/linear_regression.py	Sat Sep 20 18:14:24 2008 -0400
@@ -0,0 +1,117 @@
+#!/usr/bin/env python
+
+from galaxy import eggs
+import sys, string
+from rpy import *
+import numpy
+
+def stop_err(msg):    # Report a fatal problem: write msg to stderr, then end the script.
+    sys.stderr.write(msg)
+    sys.exit()    # called without a status argument, so the process exit code is 0
+
+infile = sys.argv[1]
+y_col = int(sys.argv[2])-1
+x_cols = sys.argv[3].split(',')
+outfile = sys.argv[4]
+outfile2 = sys.argv[5]
+
+print "Predictor columns: %s; Response column: %d" %(x_cols,y_col+1)
+fout = open(outfile,'w')
+
+for i, line in enumerate( file ( infile )):
+    line = line.rstrip('\r\n')
+    if len( line )>0 and not line.startswith( '#' ):
+        elems = line.split( '\t' )
+        break 
+    if i == 30:
+        break # Hopefully we'll never get here...
+
+if len( elems )<1:
+    stop_err( "The data in your input dataset is either missing or not formatted properly." )
+
+y_vals = []
+x_vals = []
+
+for k,col in enumerate(x_cols):
+    x_cols[k] = int(col)-1
+    x_vals.append([])
+
+NA = 'NA'
+for ind,line in enumerate( file( infile )):
+    if line and not line.startswith( '#' ):
+        try:
+            fields = line.split("\t")
+            try:
+                yval = float(fields[y_col])
+            except:
+                yval = r('NA')
+            y_vals.append(yval)
+            for k,col in enumerate(x_cols):
+                try:
+                    xval = float(fields[col])
+                except:
+                    xval = r('NA')
+                x_vals[k].append(xval)
+        except:
+            pass
+
+x_vals1 = numpy.asarray(x_vals).transpose()
+
+dat= r.list(x=array(x_vals1), y=y_vals)
+
+set_default_mode(NO_CONVERSION)
+try:
+    linear_model = r.lm(r("y ~ x"), data = r.na_exclude(dat))
+except RException, rex:
+    stop_err("Error performing linear regression on the input data.\nEither the response column or one of the predictor columns contain only non-numeric or invalid values.")
+set_default_mode(BASIC_CONVERSION)
+
+coeffs=linear_model.as_py()['coefficients']
+yintercept= coeffs['(Intercept)']
+print >>fout, "Y-intercept\t%s" %(yintercept)
+summary = r.summary(linear_model)
+
+co = summary.get('coefficients', 'NA')
+"""
+if len(co) != len(x_vals)+1:
+    stop_err("Stopped performing linear regression on the input data, since one of the predictor columns contains only non-numeric or invalid values.")
+"""
+print >>fout, "p-value (Y-intercept)\t%s" %(co[0][3])
+
+if len(x_vals) == 1:    #Simple linear  regression case with 1 predictor variable
+    try:
+        slope = coeffs['x']
+    except:
+        slope = 'NA'
+    try:
+        pval = co[1][3]
+    except:
+        pval = 'NA'
+    print >>fout, "Slope (c%d)\t%s" %(x_cols[0]+1,slope)
+    print >>fout, "p-value (c%d)\t%s" %(x_cols[0]+1,pval)
+else:    #Multiple regression case with >1 predictors
+    ind=1
+    while ind < len(coeffs.keys()):
+        print >>fout, "Slope (c%d)\t%s" %(x_cols[ind-1]+1,coeffs['x'+str(ind)])
+        try:
+            pval = co[ind][3]
+        except:
+            pval = 'NA'
+        print >>fout, "p-value (c%d)\t%s" %(x_cols[ind-1]+1,pval)
+        ind+=1
+
+print >>fout, "R-squared\t%s" %(summary.get('r.squared','NA'))
+print >>fout, "Adjusted R-squared\t%s" %(summary.get('adj.r.squared','NA'))
+print >>fout, "F-statistic\t%s" %(summary.get('fstatistic','NA'))
+print >>fout, "Sigma\t%s" %(summary.get('sigma','NA'))
+
+r.pdf( outfile2, 8, 8 )
+if len(x_vals) == 1:    #Simple linear  regression case with 1 predictor variable
+    sub_title =  "Slope = %s; Y-int = %s" %(slope,yintercept)
+    r.plot(x=x_vals[0], y=y_vals, xlab="X", ylab="Y", sub=sub_title, main="Scatterplot with regression")
+    r.abline(a=yintercept, b=slope, col="red")
+else:
+    r.pairs(dat, main="Scatterplot Matrix", col="blue")
+
+r.plot(linear_model)
+r.dev_off()
diff -r 618210a97e62 -r 05974294cbf1 tools/regVariation/linear_regression.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/regVariation/linear_regression.xml	Sat Sep 20 18:14:24 2008 -0400
@@ -0,0 +1,62 @@
+<tool id="LinearRegression1" name="Perform Linear Regression">
+  <description> </description>
+  <command interpreter="python">
+    linear_regression.py 
+      $input1
+      $response_col
+      $predictor_cols
+      $out_file1
+      $out_file2
+      1>/dev/null
+  </command>
+  <inputs>
+    <param format="tabular" name="input1" type="data" label="Select data" help="Query missing? See TIP below."/>
+    <param name="response_col" label="Response column (Y)" type="data_column" data_ref="input1" />
+    <param name="predictor_cols" label="Predictor columns (X)" type="data_column" data_ref="input1" multiple="true" />
+  </inputs>
+  <outputs>
+    <data format="input" name="out_file1" metadata_source="input1" />
+    <data format="pdf" name="out_file2" />
+  </outputs>
+  <requirements>
+    <requirement type="python-module">rpy</requirement>
+  </requirements>
+  <tests>
+  	<!-- Testing this tool will not be possible because this tool produces a pdf output file.
+    -->
+  </tests>
+  <help>
+
+
+.. class:: infomark
+
+**TIP:** If your data is not TAB delimited, use *Edit Queries->Convert characters*
+
+-----
+
+.. class:: infomark
+
+**What it does**
+
+This tool uses the 'lm' function from the R statistical package to perform linear regression on the input data. It outputs two files, one containing the summary statistics of the performed regression, and the other containing diagnostic plots to check whether model assumptions are satisfied.
+
+-----
+
+.. class:: warningmark
+
+**Note**
+
+- This tool currently treats all predictor and response variables as continuous variables. 
+
+- Rows containing non-numeric (or missing) data in any of the chosen columns will be skipped from the analysis.
+
+- The summary statistics in the output are described below:
+
+  - sigma: the square root of the estimated variance of the random error (standard error of the residuals)
+  - R-squared: the fraction of variance explained by the model
+  - Adjusted R-squared: the above R-squared statistic adjusted, penalizing for the number of the predictors (p)
+  - p-value: p-value for the t-test of the null hypothesis that the corresponding slope is equal to zero against the two-sided alternative.
+
+
+  </help>
+</tool>
diff -r 618210a97e62 -r 05974294cbf1 tools/regVariation/rcve.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/regVariation/rcve.py	Sat Sep 20 18:14:24 2008 -0400
@@ -0,0 +1,143 @@
+#!/usr/bin/env python
+
+from galaxy import eggs
+
+import sys, string
+from rpy import *
+import numpy
+
+def stop_err(msg):    # Report a fatal problem: write msg to stderr, then end the script.
+    sys.stderr.write(msg)
+    sys.exit()    # called without a status argument, so the process exit code is 0
+
+def sscombs(s):    # Return a list of every non-empty, order-preserving sub-combination of s (2**len(s) - 1 entries).
+    if len(s) == 1:    # NOTE(review): an empty s never reaches this base case, so the recursion would not terminate
+        return [s]    # a single element is its own only combination
+    else:
+        ssc = sscombs(s[1:])    # all combinations built from the elements after s[0]
+        return [s[0]] + [s[0]+comb for comb in ssc] + ssc    # s[0] alone, s[0] prepended to each tail combination, then the tail combinations
+
+
+infile = sys.argv[1]
+y_col = int(sys.argv[2])-1
+x_cols = sys.argv[3].split(',')
+outfile = sys.argv[4]
+
+print "Predictor columns: %s; Response column: %d" %(x_cols,y_col+1)
+fout = open(outfile,'w')
+
+for i, line in enumerate( file ( infile )):
+    line = line.rstrip('\r\n')
+    if len( line )>0 and not line.startswith( '#' ):
+        elems = line.split( '\t' )
+        break 
+    if i == 30:
+        break # Hopefully we'll never get here...
+
+if len( elems )<1:
+    stop_err( "The data in your input dataset is either missing or not formatted properly." )
+
+y_vals = []
+x_vals = []
+
+for k,col in enumerate(x_cols):
+    x_cols[k] = int(col)-1
+    x_vals.append([])
+    """
+    try:
+        float( elems[x_cols[k]] )
+    except:
+        try:
+            msg = "This operation cannot be performed on non-numeric column %d containing value '%s'." %( col, elems[x_cols[k]] )
+        except:
+            msg = "This operation cannot be performed on non-numeric data."
+        stop_err( msg )
+    """
+NA = 'NA'
+for ind,line in enumerate( file( infile )):
+    if line and not line.startswith( '#' ):
+        try:
+            fields = line.split("\t")
+            try:
+                yval = float(fields[y_col])
+            except Exception, ey:
+                yval = r('NA')
+                #print >>sys.stderr, "ey = %s" %ey
+            y_vals.append(yval)
+            for k,col in enumerate(x_cols):
+                try:
+                    xval = float(fields[col])
+                except Exception, ex:
+                    xval = r('NA')
+                    #print >>sys.stderr, "ex = %s" %ex
+                x_vals[k].append(xval)
+        except:
+            pass
+
+x_vals1 = numpy.asarray(x_vals).transpose()
+dat= r.list(x=array(x_vals1), y=y_vals)
+
+set_default_mode(NO_CONVERSION)
+try:
+    full = r.lm(r("y ~ x"), data= r.na_exclude(dat))    #full model includes all the predictor variables specified by the user
+except RException, rex:
+    stop_err("Error performing linear regression on the input data.\nEither the response column or one of the predictor columns contain no numeric values.")
+set_default_mode(BASIC_CONVERSION)
+
+summary = r.summary(full)
+fullr2 = summary.get('r.squared','NA')
+
+if fullr2 == 'NA':
+    stop_error("Error in linear regression")
+
+if len(x_vals) < 10:
+    s = ""
+    for ch in range(len(x_vals)):
+        s += str(ch)
+else:
+    stop_err("This tool only works with less than 10 predictors.")
+
+print >>fout, "#Model\tR-sq\tRCVE_Terms\tRCVE_Value"
+all_combos = sorted(sscombs(s), key=len)
+all_combos.reverse()
+for j,cols in enumerate(all_combos):
+    #if len(cols) == len(s):    #Same as the full model above
+    #    continue
+    if len(cols) == 1:
+        x_vals1 = x_vals[int(cols)]
+    else:
+        x_v = []
+        for col in cols:
+            x_v.append(x_vals[int(col)])
+        x_vals1 = numpy.asarray(x_v).transpose()
+    dat= r.list(x=array(x_vals1), y=y_vals)
+    set_default_mode(NO_CONVERSION)
+    red = r.lm(r("y ~ x"), data= dat)    #Reduced model
+    set_default_mode(BASIC_CONVERSION)
+    summary = r.summary(red)
+    redr2 = summary.get('r.squared','NA')
+    try:
+        rcve = (float(fullr2)-float(redr2))/float(fullr2)
+    except:
+        rcve = 'NA'
+    col_str = ""
+    for col in cols:
+        col_str = col_str + str(int(x_cols[int(col)]) + 1) + " "
+    col_str.strip()
+    rcve_col_str = ""
+    for col in s:
+        if col not in cols:
+            rcve_col_str = rcve_col_str + str(int(x_cols[int(col)]) + 1) + " "
+    rcve_col_str.strip()
+    if len(cols) == len(s):    #full model
+        rcve_col_str = "-"
+        rcve = "-"
+    try:
+        redr2 = "%.4f" %(float(redr2))
+    except:
+        pass
+    try:
+        rcve = "%.4f" %(float(rcve))
+    except:
+        pass
+    print >>fout, "%s\t%s\t%s\t%s" %(col_str,redr2,rcve_col_str,rcve)
diff -r 618210a97e62 -r 05974294cbf1 tools/regVariation/rcve.xml
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/regVariation/rcve.xml	Sat Sep 20 18:14:24 2008 -0400
@@ -0,0 +1,68 @@
+<tool id="rcve1" name="Compute RCVE" version="1.0.0">
+  <description> </description>
+  <command interpreter="python">
+    rcve.py 
+      $input1
+      $response_col
+      $predictor_cols
+      $out_file1
+      1>/dev/null
+  </command>
+  <inputs>
+    <param format="tabular" name="input1" type="data" label="Select data" help="Query missing? See TIP below."/>
+    <param name="response_col" label="Response column (Y)" type="data_column" data_ref="input1" />
+    <param name="predictor_cols" label="Predictor columns (X)" type="data_column" data_ref="input1" multiple="true" />
+  </inputs>
+  <outputs>
+    <data format="input" name="out_file1" metadata_source="input1" />
+  </outputs>
+  <requirements>
+    <requirement type="python-module">rpy</requirement>
+  </requirements>
+  <tests>
+    <!-- Test data with valid values -->
+  	<test>
+      <param name="input1" value="reg_inp.tab"/>
+      <param name="response_col" value="1"/>
+      <param name="predictor_cols" value="2,3,4"/>
+      <output name="out_file1" file="rcve_out.dat"/>
+    </test>
+    
+  </tests>
+  <help>
+
+.. class:: infomark
+
+**TIP:** If your data is not TAB delimited, use *Edit Queries->Convert characters*
+
+-----
+
+.. class:: infomark
+
+**What it does**
+
+This tool computes the RCVE (Relative Contribution to Variability Explained) for all possible variable subsets using the following formula:
+
+**RCVE(i) = [R-sq (full: 1,2,..,i..,p-1) - R-sq(without i: 1,2,...,p-1)] / R-sq (full: 1,2,..,i..,p-1)**,
+which denotes the case where the 'i'th predictor is dropped. 
+
+
+In general,
+**RCVE(X+) = [R-sq (full: {X,X+}) - R-sq(reduced: {X})] / R-sq (full: {X,X+})**,
+where,
+
+- {X,X+} denotes the set of all predictors, 
+- X+ is the set of predictors for which we compute RCVE (and therefore drop from the full model to obtain a reduced one), 
+- {X} is the set of the predictors that are left in the reduced model after excluding {X+} 
+
+
+The 4 columns in the output are described below:
+
+- Column 1 (Model): denotes the variables present in the model ({X})
+- Column 2 (R-sq): denotes the R-squared value corresponding to the model in Column 1
+- Column 3 (RCVE_Terms): denotes the variable/s for which RCVE is computed ({X+}). These are the variables that are absent in the reduced model in Column 1. A '-' in this column indicates that the model in Column 1 is the Full model.
+- Column 4 (RCVE_Value): denotes the RCVE value corresponding to the variable/s in Column 3. A '-' in this column indicates that the model in Column 1 is the Full model.
+  
+  
+  </help>
+</tool>

    

[galaxy-dev] [hg] galaxy 1522: Adding a new set of tools to perform multiple...

greg＠scofield.bx.psu.edu