[hg] galaxy 2359: Bug fixed in linear regression tool, and new o...
details: http://www.bx.psu.edu/hg/galaxy/rev/541ae0e5e599 changeset: 2359:541ae0e5e599 user: guru date: Tue Apr 21 10:53:34 2009 -0400 description: Bug fixed in linear regression tool, and new option added to grouping tool. 4 file(s) affected in this change: tools/regVariation/linear_regression.py tools/regVariation/linear_regression.xml tools/stats/grouping.py tools/stats/grouping.xml diffs (186 lines): diff -r e9971a098d14 -r 541ae0e5e599 tools/regVariation/linear_regression.py --- a/tools/regVariation/linear_regression.py Tue Apr 21 10:45:13 2009 -0400 +++ b/tools/regVariation/linear_regression.py Tue Apr 21 10:53:34 2009 -0400 @@ -108,10 +108,15 @@ r.pdf( outfile2, 8, 8 ) if len(x_vals) == 1: #Simple linear regression case with 1 predictor variable sub_title = "Slope = %s; Y-int = %s" %(slope,yintercept) - r.plot(x=x_vals[0], y=y_vals, xlab="X", ylab="Y", sub=sub_title, main="Scatterplot with regression") - r.abline(a=yintercept, b=slope, col="red") + try: + r.plot(x=x_vals[0], y=y_vals, xlab="X", ylab="Y", sub=sub_title, main="Scatterplot with regression") + r.abline(a=yintercept, b=slope, col="red") + except: + pass else: r.pairs(dat, main="Scatterplot Matrix", col="blue") - -r.plot(linear_model) +try: + r.plot(linear_model) +except: + pass r.dev_off() diff -r e9971a098d14 -r 541ae0e5e599 tools/regVariation/linear_regression.xml --- a/tools/regVariation/linear_regression.xml Tue Apr 21 10:45:13 2009 -0400 +++ b/tools/regVariation/linear_regression.xml Tue Apr 21 10:53:34 2009 -0400 @@ -1,4 +1,4 @@ -<tool id="LinearRegression1" name="Perform Linear Regression"> +<tool id="LinearRegression1" name="Perform Linear Regression" version="1.0.1"> <description> </description> <command interpreter="python"> linear_regression.py @@ -48,7 +48,7 @@ **Note** -- This tool currently treats all predictor and response variables as continuous variables. +- This tool currently treats all predictor and response variables as continuous variables. 
Running the tool on categorical variables might result in incorrect results. - Rows containing non-numeric (or missing) data in any of the chosen columns will be skipped from the analysis. diff -r e9971a098d14 -r 541ae0e5e599 tools/stats/grouping.py --- a/tools/stats/grouping.py Tue Apr 21 10:45:13 2009 -0400 +++ b/tools/stats/grouping.py Tue Apr 21 10:53:34 2009 -0400 @@ -3,7 +3,7 @@ """ This tool provides the SQL "group by" functionality. """ -import sys, string, re, commands, tempfile, random +import sys, string, re, commands, tempfile, random, sets from rpy import * def stop_err(msg): @@ -48,7 +48,7 @@ for k,col in enumerate(cols): col = int(col)-1 - if ops[k] not in ['c', 'length', 'unique', 'random']: + if ops[k] not in ['c', 'length', 'unique', 'random', 'cuniq']: # We'll get here only if the user didn't choose 'Concatenate' or 'Count' or 'Count Distinct' or 'pick randmly', which are the # only aggregation functions that can be used on columns containing strings. try: @@ -104,7 +104,7 @@ valid = True # Before appending the current value, make sure it is numeric if the # operation for the column requires it. - if ops[i] not in ['c','length', 'unique','random']: + if ops[i] not in ['c','length', 'unique','random','cuniq']: try: float( fields[col].strip()) except: @@ -125,13 +125,18 @@ out_str = prev_item for i, op in enumerate( ops ): - rfunc = "r." + op - if op not in ['c','length','unique','random']: + if op == 'cuniq': + rfunc = "r.c" + else: + rfunc = "r." 
+ op + if op not in ['c','length','unique','random','cuniq']: for j, elem in enumerate( prev_vals[i] ): prev_vals[i][j] = float( elem ) - rout = "%g" %( eval( rfunc )( prev_vals[i] )) + rout = eval( rfunc )( prev_vals[i] ) if rounds[i] == 'yes': rout = int(round(float(rout))) + else: + rout = '%g' %(float(rout)) else: if op != 'random': rout = eval( rfunc )( prev_vals[i] ) @@ -142,8 +147,14 @@ if op == 'unique': rfunc = "r.length" rout = eval( rfunc )( rout ) - out_str += "\t" + str(rout) - + if op in ['c', 'cuniq']: + if op == 'c': + out_str += "\t" + ','.join(rout) + else: + out_str += "\t" + ','.join(list(set(rout))) + else: + out_str += "\t" + str(rout) + print >>fout, out_str prev_item = item @@ -175,14 +186,19 @@ out_str = prev_item for i, op in enumerate(ops): - rfunc = "r." + op + if op == 'cuniq': + rfunc = "r.c" + else: + rfunc = "r." + op try: - if op not in ['c','length','unique','random']: + if op not in ['c','length','unique','random','cuniq']: for j, elem in enumerate( prev_vals[i] ): prev_vals[i][j] = float( elem ) - rout = '%g' %( eval( rfunc )( prev_vals[i] )) + rout = eval( rfunc )( prev_vals[i] ) if rounds[i] == 'yes': rout = int(round(float(rout))) + else: + rout = '%g' %(float(rout)) else: if op != 'random': rout = eval( rfunc )( prev_vals[i] ) @@ -192,8 +208,14 @@ if op == 'unique': rfunc = "r.length" - rout = eval( rfunc )( rout ) - out_str += "\t" + str( rout ) + rout = eval( rfunc )( rout ) + if op in ['c','cuniq']: + if op == 'c': + out_str += "\t" + ','.join(rout) + else: + out_str += "\t" + ','.join(list(set(rout))) + else: + out_str += "\t" + str( rout ) except: skipped_lines += 1 if not first_invalid_line: @@ -212,6 +234,8 @@ op = 'count_distinct' elif op == 'random': op = 'randomly_pick' + elif op == 'cuniq': + op = 'concat_distinct' msg += op + "[c" + cols[i] + "] " if skipped_lines > 0: msg+= "--skipped %d invalid lines starting with line %d. Value '%s' in column %d is not numeric." 
% ( skipped_lines, first_invalid_line, invalid_value, invalid_column ) diff -r e9971a098d14 -r 541ae0e5e599 tools/stats/grouping.xml --- a/tools/stats/grouping.xml Tue Apr 21 10:45:13 2009 -0400 +++ b/tools/stats/grouping.xml Tue Apr 21 10:53:34 2009 -0400 @@ -1,4 +1,4 @@ -<tool id="Grouping1" name="Group" version="1.5.0"> +<tool id="Grouping1" name="Group" version="1.6.0"> <description>data by a column and perform aggregate operation on other columns.</description> <command interpreter="python"> grouping.py @@ -23,6 +23,7 @@ <option value="length">Count</option> <option value="unique">Count Distinct</option> <option value="c">Concatenate</option> + <option value="cuniq">Concatenate Distinct</option> <option value="random">Randomly pick</option> </param> <param name="opcol" label="On column" type="data_column" data_ref="input1" /> @@ -87,7 +88,7 @@ - running this tool with **Group by column 1**, Operations **Mean on column 2** and **Concatenate on column 3** will return:: - chr10 1700.00 ['NM_11', 'NM_10'] - chr22 1533.33 ['NM_17', 'NM_19', 'NM_18'] + chr10 1700.00 NM_11,NM_10 + chr22 1533.33 NM_17,NM_19,NM_18 </help> </tool>
participants (1)
-
gua110@scofield.bx.psu.edu