galaxy-dev
Threads by month
- ----- 2024 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2023 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2022 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2021 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2020 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2019 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2018 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2017 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2016 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2015 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2014 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2013 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2012 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2011 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2010 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2009 -----
- December
- November
- October
- September
- August
- July
- June
- May
- April
- March
- February
- January
- ----- 2008 -----
- December
- November
- October
- September
- August
- 10007 discussions
details: http://www.bx.psu.edu/hg/galaxy/rev/b2a9827178e2
changeset: 1519:b2a9827178e2
user: Dan Blankenberg <dan(a)bx.psu.edu>
date: Fri Sep 19 12:27:20 2008 -0400
description:
Update GMAJ tool interface.
1 file(s) affected in this change:
tools/visualization/GMAJ.xml
diffs (117 lines):
diff -r 0f735b21dc12 -r b2a9827178e2 tools/visualization/GMAJ.xml
--- a/tools/visualization/GMAJ.xml Thu Sep 18 16:48:29 2008 -0400
+++ b/tools/visualization/GMAJ.xml Fri Sep 19 12:27:20 2008 -0400
@@ -3,7 +3,10 @@
<command interpreter="python">GMAJ.py $out_file1 $maf_input $gmaj_file $filenames_file</command>
<inputs>
<param name="maf_input" type="data" format="maf" label="Alignment File" optional="False"/>
- <param name="refseq" label="Reference Sequence" value="" type="text" help="Leave empty to allow interactive selection."/>
+ <param name="refseq" label="Reference Sequence" type="select">
+ <option value="first" selected="true">First sequence in each block</option>
+ <option value="any">Any sequence</option>
+ </param>
<repeat name="annotations" title="Annotations">
<conditional name="annotation_style">
<param name="style" type="select" label="Annotation Style" help="If your data is not in a style similar to what is available from Galaxy (and the UCSC table browser), choose 'Basic'.">
@@ -11,7 +14,7 @@
<option value="basic">Basic</option>
</param>
<when value="galaxy">
- <param name="species" type="select" label="Species of Annotation" multiple="False">
+ <param name="species" type="select" label="Species" multiple="False">
<options>
<filter type="data_meta" ref="maf_input" key="species" />
</options>
@@ -21,7 +24,6 @@
<param name="underlays_file" type="data" format="bed,gff" label="Underlays File" optional="True"/>
<param name="repeats_file" type="data" format="bed,gff" label="Repeats File" optional="True"/>
<param name="links_file" type="data" format="bed,gff" label="Links File" optional="True"/>
- <param name="offset" label="Offset" value="0" type="integer"/>
</when>
<when value="basic">
<param name="seq_name" label="Full Sequence Name" value="" type="text">
@@ -44,6 +46,7 @@
<option name="Skipping unsupported paragraph (maf_paragraph)" value="maf_paragraph"/>
<option name="Skipping all reconstruction scores: no species specified (recon_noseq)" value="recon_noseq"/>
<option name="Skipping reconstruction scores in blocks with missing row (recon_missing)" value="recon_missing"/>
+ <option name="The first row in some blocks is not the specified reference sequence (refseq_not_first)" value="refseq_not_first"/>
<option name="Skipping extra MAF File (unused_maf)" value="unused_maf"/>
</option>
<option name="Annotation Files" value="annotations">
@@ -71,12 +74,15 @@
</option>
<option name="Red Flags" value="red">
<option name="Sequence name in annotation file does not match name in MAF (seqname_mismatch)" value="seqname_mismatch"/>
- <option name="BED Start or end < 0 (bed_coord)" value="bed_coord"/>
- <option name="GFF Start or end < 1 (gff_coord)" value="gff_coord"/>
+ <option name="BED start or end < 0 (bed_coord)" value="bed_coord"/>
+ <option name="GFF start or end < 1 (gff_coord)" value="gff_coord"/>
<option name="Missing item name for URL substitution (url_subst)" value="url_subst"/>
</option>
</option>
<option name="Miscellaneous" value="miscellaneous">
+ <option name="No refseq specified; assuming 'first' (default_refseq)" value="default_refseq"/>
+ <option name="One or more bundle entries are not used in parameters file(unused_entry)" value="unused_entry"/>
+ <option name="Skipping blocks for export where reference sequence is hidden or all gaps (export_skip)" value="export_skip"/>
<option name="Possible parse error: token ends with an escaped quote (escaped_quote)" value="escaped_quote"/>
<option name="Draggable panel dividers will not be sticky (no_sticky)" value="no_sticky"/>
</option>
@@ -89,11 +95,7 @@
title = "Galaxy: $maf_input.name"
alignfile = input.maf
-#if $refseq.value:
refseq = $refseq
-#else:
-refseq = any
-#end if
tabext = .bed .gff .gtf
#if $nowarn.value:
nowarn = $nowarn
@@ -102,36 +104,35 @@
#set $seq_count = 0
#for $annotation_count, $annotation in $enumerate( $annotations ):
#if $annotation.annotation_style.style == "galaxy":
-#if $maf_input.metadata.species_chromosomes and $annotation.annotation_style['species'].value in $maf_input.metadata.species_chromosomes and $maf_input.metadata.species_chromosomes[$annotation.annotation_style['species'].value]:
-#set $seq_names = [ "%s.%s" % ( $annotation.annotation_style['species'].value, $chrom ) for $chrom in $maf_input.metadata.species_chromosomes[$annotation.annotation_style['species'].value]]
-#set $aliases = [ " %s" % $chrom for $chrom in $maf_input.metadata.species_chromosomes[$annotation.annotation_style['species'].value]]
+#if $maf_input.dataset.metadata.species_chromosomes and $annotation.annotation_style['species'].value in $maf_input.dataset.metadata.species_chromosomes and $maf_input.dataset.metadata.species_chromosomes[$annotation.annotation_style['species'].value]:
+#set $seq_names = [ "%s.%s" % ( $annotation.annotation_style['species'].value, $chrom ) for $chrom in $maf_input.dataset.metadata.species_chromosomes[$annotation.annotation_style['species'].value]]
#else:
#set $seq_names = [$annotation.annotation_style['species']]
-#set $aliases = [""]
#end if
#else:
#set $seq_names = [$annotation.annotation_style['seq_name']]
-#set $aliases = [""]
#end if
-#for $seq_name, $alias in $zip( $seq_names, $aliases ):
+#for $seq_name in $seq_names:
seq ${seq_count}:
seqname = $seq_name
#if $annotation.annotation_style['exons_file'].dataset:
-exons = ${annotation_count}.exons.${annotation.annotation_style['exons_file'].extension}$alias
+exons = ${annotation_count}.exons.${annotation.annotation_style['exons_file'].extension}
#end if
#if $annotation.annotation_style['repeats_file'].dataset:
-repeats = ${annotation_count}.repeats.${annotation.annotation_style['repeats_file'].extension}$alias
+repeats = ${annotation_count}.repeats.${annotation.annotation_style['repeats_file'].extension}
#end if
#if $annotation.annotation_style['links_file'].dataset:
-links = ${annotation_count}.links.${annotation.annotation_style['links_file'].extension}$alias
+links = ${annotation_count}.links.${annotation.annotation_style['links_file'].extension}
#end if
#if $annotation.annotation_style['underlays_file'].dataset:
-underlays = ${annotation_count}.underlays.${annotation.annotation_style['underlays_file'].extension}$alias
+underlays = ${annotation_count}.underlays.${annotation.annotation_style['underlays_file'].extension}
#end if
#if $annotation.annotation_style['highlights_file'].dataset:
-highlights = ${annotation_count}.highlights.${annotation.annotation_style['highlights_file'].extension}$alias
+highlights = ${annotation_count}.highlights.${annotation.annotation_style['highlights_file'].extension}
#end if
+#if $annotation.annotation_style.style == "basic":
offset = $annotation.annotation_style['offset']
+#end if
#set $seq_count = $seq_count + 1
#end for
1
0
[hg] galaxy 1521: Merge with b2a9827178e28d93e2a978f64033a556a72...
by greg(a)scofield.bx.psu.edu 22 Sep '08
by greg(a)scofield.bx.psu.edu 22 Sep '08
22 Sep '08
details: http://www.bx.psu.edu/hg/galaxy/rev/618210a97e62
changeset: 1521:618210a97e62
user: wychung
date: Fri Sep 19 12:34:51 2008 -0400
description:
Merge with b2a9827178e28d93e2a978f64033a556a72b4c51
0 file(s) affected in this change:
diffs (117 lines):
diff -r 9ef55e79068b -r 618210a97e62 tools/visualization/GMAJ.xml
--- a/tools/visualization/GMAJ.xml Fri Sep 19 12:02:13 2008 -0400
+++ b/tools/visualization/GMAJ.xml Fri Sep 19 12:34:51 2008 -0400
@@ -3,7 +3,10 @@
<command interpreter="python">GMAJ.py $out_file1 $maf_input $gmaj_file $filenames_file</command>
<inputs>
<param name="maf_input" type="data" format="maf" label="Alignment File" optional="False"/>
- <param name="refseq" label="Reference Sequence" value="" type="text" help="Leave empty to allow interactive selection."/>
+ <param name="refseq" label="Reference Sequence" type="select">
+ <option value="first" selected="true">First sequence in each block</option>
+ <option value="any">Any sequence</option>
+ </param>
<repeat name="annotations" title="Annotations">
<conditional name="annotation_style">
<param name="style" type="select" label="Annotation Style" help="If your data is not in a style similar to what is available from Galaxy (and the UCSC table browser), choose 'Basic'.">
@@ -11,7 +14,7 @@
<option value="basic">Basic</option>
</param>
<when value="galaxy">
- <param name="species" type="select" label="Species of Annotation" multiple="False">
+ <param name="species" type="select" label="Species" multiple="False">
<options>
<filter type="data_meta" ref="maf_input" key="species" />
</options>
@@ -21,7 +24,6 @@
<param name="underlays_file" type="data" format="bed,gff" label="Underlays File" optional="True"/>
<param name="repeats_file" type="data" format="bed,gff" label="Repeats File" optional="True"/>
<param name="links_file" type="data" format="bed,gff" label="Links File" optional="True"/>
- <param name="offset" label="Offset" value="0" type="integer"/>
</when>
<when value="basic">
<param name="seq_name" label="Full Sequence Name" value="" type="text">
@@ -44,6 +46,7 @@
<option name="Skipping unsupported paragraph (maf_paragraph)" value="maf_paragraph"/>
<option name="Skipping all reconstruction scores: no species specified (recon_noseq)" value="recon_noseq"/>
<option name="Skipping reconstruction scores in blocks with missing row (recon_missing)" value="recon_missing"/>
+ <option name="The first row in some blocks is not the specified reference sequence (refseq_not_first)" value="refseq_not_first"/>
<option name="Skipping extra MAF File (unused_maf)" value="unused_maf"/>
</option>
<option name="Annotation Files" value="annotations">
@@ -71,12 +74,15 @@
</option>
<option name="Red Flags" value="red">
<option name="Sequence name in annotation file does not match name in MAF (seqname_mismatch)" value="seqname_mismatch"/>
- <option name="BED Start or end < 0 (bed_coord)" value="bed_coord"/>
- <option name="GFF Start or end < 1 (gff_coord)" value="gff_coord"/>
+ <option name="BED start or end < 0 (bed_coord)" value="bed_coord"/>
+ <option name="GFF start or end < 1 (gff_coord)" value="gff_coord"/>
<option name="Missing item name for URL substitution (url_subst)" value="url_subst"/>
</option>
</option>
<option name="Miscellaneous" value="miscellaneous">
+ <option name="No refseq specified; assuming 'first' (default_refseq)" value="default_refseq"/>
+ <option name="One or more bundle entries are not used in parameters file(unused_entry)" value="unused_entry"/>
+ <option name="Skipping blocks for export where reference sequence is hidden or all gaps (export_skip)" value="export_skip"/>
<option name="Possible parse error: token ends with an escaped quote (escaped_quote)" value="escaped_quote"/>
<option name="Draggable panel dividers will not be sticky (no_sticky)" value="no_sticky"/>
</option>
@@ -89,11 +95,7 @@
title = "Galaxy: $maf_input.name"
alignfile = input.maf
-#if $refseq.value:
refseq = $refseq
-#else:
-refseq = any
-#end if
tabext = .bed .gff .gtf
#if $nowarn.value:
nowarn = $nowarn
@@ -102,36 +104,35 @@
#set $seq_count = 0
#for $annotation_count, $annotation in $enumerate( $annotations ):
#if $annotation.annotation_style.style == "galaxy":
-#if $maf_input.metadata.species_chromosomes and $annotation.annotation_style['species'].value in $maf_input.metadata.species_chromosomes and $maf_input.metadata.species_chromosomes[$annotation.annotation_style['species'].value]:
-#set $seq_names = [ "%s.%s" % ( $annotation.annotation_style['species'].value, $chrom ) for $chrom in $maf_input.metadata.species_chromosomes[$annotation.annotation_style['species'].value]]
-#set $aliases = [ " %s" % $chrom for $chrom in $maf_input.metadata.species_chromosomes[$annotation.annotation_style['species'].value]]
+#if $maf_input.dataset.metadata.species_chromosomes and $annotation.annotation_style['species'].value in $maf_input.dataset.metadata.species_chromosomes and $maf_input.dataset.metadata.species_chromosomes[$annotation.annotation_style['species'].value]:
+#set $seq_names = [ "%s.%s" % ( $annotation.annotation_style['species'].value, $chrom ) for $chrom in $maf_input.dataset.metadata.species_chromosomes[$annotation.annotation_style['species'].value]]
#else:
#set $seq_names = [$annotation.annotation_style['species']]
-#set $aliases = [""]
#end if
#else:
#set $seq_names = [$annotation.annotation_style['seq_name']]
-#set $aliases = [""]
#end if
-#for $seq_name, $alias in $zip( $seq_names, $aliases ):
+#for $seq_name in $seq_names:
seq ${seq_count}:
seqname = $seq_name
#if $annotation.annotation_style['exons_file'].dataset:
-exons = ${annotation_count}.exons.${annotation.annotation_style['exons_file'].extension}$alias
+exons = ${annotation_count}.exons.${annotation.annotation_style['exons_file'].extension}
#end if
#if $annotation.annotation_style['repeats_file'].dataset:
-repeats = ${annotation_count}.repeats.${annotation.annotation_style['repeats_file'].extension}$alias
+repeats = ${annotation_count}.repeats.${annotation.annotation_style['repeats_file'].extension}
#end if
#if $annotation.annotation_style['links_file'].dataset:
-links = ${annotation_count}.links.${annotation.annotation_style['links_file'].extension}$alias
+links = ${annotation_count}.links.${annotation.annotation_style['links_file'].extension}
#end if
#if $annotation.annotation_style['underlays_file'].dataset:
-underlays = ${annotation_count}.underlays.${annotation.annotation_style['underlays_file'].extension}$alias
+underlays = ${annotation_count}.underlays.${annotation.annotation_style['underlays_file'].extension}
#end if
#if $annotation.annotation_style['highlights_file'].dataset:
-highlights = ${annotation_count}.highlights.${annotation.annotation_style['highlights_file'].extension}$alias
+highlights = ${annotation_count}.highlights.${annotation.annotation_style['highlights_file'].extension}
#end if
+#if $annotation.annotation_style.style == "basic":
offset = $annotation.annotation_style['offset']
+#end if
#set $seq_count = $seq_count + 1
#end for
1
0
[hg] galaxy 1520: Fix a bug in shrimp_wrapper and add a tool for...
by greg(a)scofield.bx.psu.edu 22 Sep '08
by greg(a)scofield.bx.psu.edu 22 Sep '08
22 Sep '08
details: http://www.bx.psu.edu/hg/galaxy/rev/9ef55e79068b
changeset: 1520:9ef55e79068b
user: wychung
date: Fri Sep 19 12:02:13 2008 -0400
description:
Fix a bug in shrimp_wrapper and add a tool for splitting paired-end reads.
Update datatype/fastqsolexa so the number of sequences is correct.
7 file(s) affected in this change:
lib/galaxy/datatypes/sequence.py
test-data/split_paired_reads_test1.fastq
test-data/split_paired_reads_test1.out1
tool_conf.xml.sample
tools/metag_tools/shrimp_wrapper.py
tools/metag_tools/split_paired_reads.py
tools/metag_tools/split_paired_reads.xml
diffs (216 lines):
diff -r 0f735b21dc12 -r 9ef55e79068b lib/galaxy/datatypes/sequence.py
--- a/lib/galaxy/datatypes/sequence.py Thu Sep 18 16:48:29 2008 -0400
+++ b/lib/galaxy/datatypes/sequence.py Fri Sep 19 12:02:13 2008 -0400
@@ -98,8 +98,8 @@
dataset.peek = data.get_file_peek( dataset.file_name )
count = size = 0
bases_regexp = re.compile("^[NGTAC]*$")
- for line in file( dataset.file_name ):
- if line and line[0] == "@":
+ for i, line in enumerate(file( dataset.file_name )):
+ if line and line[0] == "@" and i % 4 == 0:
count += 1
elif bases_regexp.match(line):
line = line.strip()
diff -r 0f735b21dc12 -r 9ef55e79068b test-data/split_paired_reads_test1.fastq
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/split_paired_reads_test1.fastq Fri Sep 19 12:02:13 2008 -0400
@@ -0,0 +1,21 @@
+@HWI-EAS91_1_30788AAXX:7:21:1542:1758
+GTCAATTGTACTGGTCAATACTAAAAGAATAGGATCGCTCCTAGCATCTGGAGTCTCTATCACCTGAGCCCA
++HWI-EAS91_1_30788AAXX:7:21:1542:1758
+hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh`hfhhVZSWehR
+@HWI-EAS91_1_30788AAXX:7:22:1621:462
+ATAATGGCTATTATTGTGGGGGGGATGATGCTGGAAACTAGCCCCAATATCAATCCTATATCAAATCTCACC
++HWI-EAS91_1_30788AAXX:7:22:1621:462
+hhhhhhhhhhhhQAhh@hhhhNhhhfhMbCIScC?hhJhhhhChhhJhhhRhhKhePhc\KhhV\KhXhJhh
+@HWI-EAS91_1_30788AAXX:7:45:408:807
+TACCCGATTTTTTGCTTTCCACTTTATCCTACCCTTATGAGTGCTAGGATCAGGATGGAGAGGATTAGGGCT
++HWI-EAS91_1_30788AAXX:7:45:408:807
+hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh`hhhZh`hhhhhRXhhYh
+@HWI-EAS91_1_30788AAXX:7:49:654:1439
+CTAACTCTATTTATTGTATTTCAACTAAAAATCTCATAGGTTTATTGATAGTTGTGTTGTTGGTGTAAATGG
++HWI-EAS91_1_30788AAXX:7:49:654:1439
+hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhdhh_hG\XhU@
+@HWI-EAS91_1_30788AAXX:7:64:947:234
+TATCAAAAAAGAATATAATCTGAATCAACACTACAACCTATTAGTGTGTAGAATAGGAAGTAGAGGCCTGCG
++HWI-EAS91_1_30788AAXX:7:64:947:234
+hhhhhhhhhhhhhhhhhhhhhhhRhhehhahhhhhJhhhhhhhh^hPhWfhhhhThWUhhfhh_hhNIVPUd
+
diff -r 0f735b21dc12 -r 9ef55e79068b test-data/split_paired_reads_test1.out1
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/split_paired_reads_test1.out1 Fri Sep 19 12:02:13 2008 -0400
@@ -0,0 +1,20 @@
+@HWI-EAS91_1_30788AAXX:7:21:1542:1758/1
+GTCAATTGTACTGGTCAATACTAAAAGAATAGGATC
++HWI-EAS91_1_30788AAXX:7:21:1542:1758/1
+hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh
+@HWI-EAS91_1_30788AAXX:7:22:1621:462/1
+ATAATGGCTATTATTGTGGGGGGGATGATGCTGGAA
++HWI-EAS91_1_30788AAXX:7:22:1621:462/1
+hhhhhhhhhhhhQAhh@hhhhNhhhfhMbCIScC?h
+@HWI-EAS91_1_30788AAXX:7:45:408:807/1
+TACCCGATTTTTTGCTTTCCACTTTATCCTACCCTT
++HWI-EAS91_1_30788AAXX:7:45:408:807/1
+hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh
+@HWI-EAS91_1_30788AAXX:7:49:654:1439/1
+CTAACTCTATTTATTGTATTTCAACTAAAAATCTCA
++HWI-EAS91_1_30788AAXX:7:49:654:1439/1
+hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh
+@HWI-EAS91_1_30788AAXX:7:64:947:234/1
+TATCAAAAAAGAATATAATCTGAATCAACACTACAA
++HWI-EAS91_1_30788AAXX:7:64:947:234/1
+hhhhhhhhhhhhhhhhhhhhhhhRhhehhahhhhhJ
diff -r 0f735b21dc12 -r 9ef55e79068b tool_conf.xml.sample
--- a/tool_conf.xml.sample Thu Sep 18 16:48:29 2008 -0400
+++ b/tool_conf.xml.sample Fri Sep 19 12:02:13 2008 -0400
@@ -274,6 +274,7 @@
<tool file="metag_tools/short_reads_figure_high_quality_length.xml" />
<tool file="metag_tools/short_reads_trim_seq.xml" />
<tool file="metag_tools/blat_coverage_report.xml" />
+ <tool file="metag_tools/split_paired_reads.xml" />
</section>
<section name="Short Read Mapping" id="solexa_tools">
<tool file="metag_tools/shrimp_wrapper.xml" />
diff -r 0f735b21dc12 -r 9ef55e79068b tools/metag_tools/shrimp_wrapper.py
--- a/tools/metag_tools/shrimp_wrapper.py Thu Sep 18 16:48:29 2008 -0400
+++ b/tools/metag_tools/shrimp_wrapper.py Fri Sep 19 12:02:13 2008 -0400
@@ -162,6 +162,7 @@
readname, endindex = line[1:].split('/')
else:
score = line
+
if score: # the last one
if hits.has_key(readname):
if len(hits[readname]) == hit_per_read:
@@ -182,8 +183,9 @@
match_count = 0
if hit_per_read == 1:
- matches = [ hits[readkey]['1'] ]
- match_count = 1
+ if len(hits[readkey]['1']) == 1:
+ matches = [ hits[readkey]['1'] ]
+ match_count = 1
else:
end1_data = hits[readkey]['1']
end2_data = hits[readkey]['2']
@@ -591,6 +593,7 @@
if os.path.exists(query_qual_end2): os.remove(query_qual_end2)
if os.path.exists(shrimp_log): os.remove(shrimp_log)
+
if __name__ == '__main__': __main__()
diff -r 0f735b21dc12 -r 9ef55e79068b tools/metag_tools/split_paired_reads.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/metag_tools/split_paired_reads.py Fri Sep 19 12:02:13 2008 -0400
@@ -0,0 +1,46 @@
+#! /usr/bin/python
+
+"""
+Split Solexa paired end reads
+"""
+
+import os, sys
+
+if __name__ == '__main__':
+
+ infile = sys.argv[1]
+ outfile_end1 = open(sys.argv[2], 'w')
+ outfile_end2 = open(sys.argv[3], 'w')
+
+ for i, line in enumerate(file(infile)):
+ line = line.rstrip()
+ if not line or line.startswith('#'): continue
+
+ end1 = ''
+ end2 = ''
+
+ line_index = i % 4
+
+ if line_index == 0:
+ end1 = line + '/1'
+ end2 = line + '/2'
+
+ elif line_index == 1:
+ seq_len = len(line)/2
+ end1 = line[0:seq_len]
+ end2 = line[seq_len:]
+
+ elif line_index == 2:
+ end1 = line + '/1'
+ end2 = line + '/2'
+
+ else:
+ qual_len = len(line)/2
+ end1 = line[0:qual_len]
+ end2 = line[qual_len:]
+
+ outfile_end1.write('%s\n' %(end1))
+ outfile_end2.write('%s\n' %(end2))
+
+ outfile_end1.close()
+ outfile_end2.close()
\ No newline at end of file
diff -r 0f735b21dc12 -r 9ef55e79068b tools/metag_tools/split_paired_reads.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/metag_tools/split_paired_reads.xml Fri Sep 19 12:02:13 2008 -0400
@@ -0,0 +1,56 @@
+<tool id="split_paired_reads" name="Split" version="1.0.0">
+ <description>paired-end reads into two ends</description>
+ <command interpreter="python">
+ split_paired_reads.py $input $output1 $output2
+ </command>
+ <inputs>
+ <param name="input" type="data" format="fastqsolexa" label="Your paired-end file" />
+ </inputs>
+ <outputs>
+ <data name="output1" format="fastqsolexa"/>
+ <data name="output2" format="fastqsolexa"/>
+ </outputs>
+ <tests>
+ <test>
+ <param name="input" value="split_paired_reads_test1.fastq" ftype="fastqsolexa" />
+ <output name="output1" file="split_paired_reads_test1.out1" fype="fastqsolexa" />
+ </test>
+ </tests>
+<help>
+
+**What it does**
+
+This tool splits a single paired-end file in half and returns two files with each ends.
+
+-----
+
+**Input formats**
+
+A multiple-fastq file, for example::
+
+ @HWI-EAS91_1_30788AAXX:7:21:1542:1758
+ GTCAATTGTACTGGTCAATACTAAAAGAATAGGATCGCTCCTAGCATCTGGAGTCTCTATCACCTGAGCCCA
+ +HWI-EAS91_1_30788AAXX:7:21:1542:1758
+ hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh`hfhhVZSWehR
+
+
+-----
+
+**Outputs**
+
+One end::
+
+ @HWI-EAS91_1_30788AAXX:7:21:1542:1758/1
+ GTCAATTGTACTGGTCAATACTAAAAGAATAGGATC
+ +HWI-EAS91_1_30788AAXX:7:21:1542:1758/1
+ hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh
+
+The other end::
+
+ @HWI-EAS91_1_30788AAXX:7:21:1542:1758/2
+ GCTCCTAGCATCTGGAGTCTCTATCACCTGAGCCCA
+ +HWI-EAS91_1_30788AAXX:7:21:1542:1758/2
+ hhhhhhhhhhhhhhhhhhhhhhhh`hfhhVZSWehR
+
+</help>
+</tool>
1
0
[hg] galaxy 1522: Adding a new set of tools to perform multiple...
by greg(a)scofield.bx.psu.edu 22 Sep '08
by greg(a)scofield.bx.psu.edu 22 Sep '08
22 Sep '08
details: http://www.bx.psu.edu/hg/galaxy/rev/05974294cbf1
changeset: 1522:05974294cbf1
user: guru
date: Sat Sep 20 18:14:24 2008 -0400
description:
Adding a new set of tools to perform multiple linear regression analysis.
9 file(s) affected in this change:
test-data/rcve_out.dat
test-data/reg_inp.tab
tool_conf.xml.sample
tools/regVariation/best_regression_subsets.py
tools/regVariation/best_regression_subsets.xml
tools/regVariation/linear_regression.py
tools/regVariation/linear_regression.xml
tools/regVariation/rcve.py
tools/regVariation/rcve.xml
diffs (700 lines):
diff -r 618210a97e62 -r 05974294cbf1 test-data/rcve_out.dat
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/rcve_out.dat Sat Sep 20 18:14:24 2008 -0400
@@ -0,0 +1,8 @@
+#Model R-sq RCVE_Terms RCVE_Value
+2 3 4 0.3997 - -
+3 4 0.3319 2 0.1697
+2 4 0.2974 3 0.2561
+2 3 0.3985 4 0.0031
+4 0.1226 2 3 0.6934
+3 0.2733 2 4 0.3164
+2 0.2972 3 4 0.2564
diff -r 618210a97e62 -r 05974294cbf1 test-data/reg_inp.tab
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/test-data/reg_inp.tab Sat Sep 20 18:14:24 2008 -0400
@@ -0,0 +1,100 @@
+2.04 2.01 1070 5
+2.56 3.40 1254 6
+3.75 3.68 1466 6
+1.10 1.54 706 4
+3.00 3.32 1160 5
+0.05 0.33 756 3
+1.38 0.36 1058 2
+1.50 1.97 1008 7
+1.38 2.03 1104 4
+4.01 2.05 1200 7
+1.50 2.13 896 7
+1.29 1.34 848 3
+1.90 1.51 958 5
+3.11 3.12 1246 6
+1.92 2.14 1106 4
+0.81 2.60 790 5
+1.01 1.90 954 4
+3.66 3.06 1500 6
+2.00 1.60 1046 5
+2.05 1.96 1054 4
+2.60 1.96 1198 6
+2.55 1.56 940 3
+0.38 1.60 456 6
+2.48 1.92 1150 7
+2.74 3.09 636 6
+1.77 0.78 744 5
+1.61 2.12 644 5
+0.99 1.85 842 3
+1.62 1.78 852 5
+2.03 1.03 1170 3
+3.50 3.44 1034 10
+3.18 2.42 1202 5
+2.39 1.74 1018 5
+1.48 1.89 1180 5
+1.54 1.43 952 3
+1.57 1.64 1038 4
+2.46 2.69 1090 6
+2.42 1.79 694 5
+2.11 2.72 1096 6
+2.04 2.15 1114 5
+1.68 2.22 1256 6
+1.64 1.55 1208 5
+2.41 2.34 820 6
+2.10 2.92 1222 4
+1.40 2.10 1120 5
+2.03 1.64 886 4
+1.99 2.83 1126 7
+2.24 1.76 1158 4
+0.45 1.81 676 6
+2.31 2.68 1214 7
+2.41 2.55 1136 6
+2.56 2.70 1264 6
+2.50 1.66 1116 3
+2.92 2.23 1292 4
+2.35 2.01 604 5
+2.82 1.24 854 6
+1.80 1.95 814 6
+1.29 1.73 778 3
+1.68 1.08 800 2
+3.44 3.46 1424 7
+1.90 3.01 950 6
+2.06 0.54 1056 3
+3.30 3.20 956 8
+1.80 1.50 1352 5
+2.00 1.71 852 5
+1.68 1.99 1168 5
+1.94 2.76 970 6
+0.97 1.56 776 4
+1.12 1.78 854 6
+1.31 1.32 1232 5
+1.68 0.87 1140 6
+3.09 1.75 1084 4
+1.87 1.41 954 2
+2.00 2.77 1000 4
+2.39 1.78 1084 4
+1.50 1.34 1058 4
+1.82 1.52 816 5
+1.80 2.97 1146 7
+2.01 1.75 1000 6
+1.88 1.64 856 4
+1.64 1.80 798 4
+2.42 3.37 1324 6
+0.22 1.15 704 6
+2.31 1.72 1222 5
+0.95 2.27 948 6
+1.99 2.85 1182 8
+1.86 2.21 1000 6
+1.79 1.94 910 6
+3.02 4.25 1374 9
+1.85 1.83 1014 6
+1.98 2.75 1420 7
+2.15 1.71 400 6
+1.46 2.20 998 7
+2.29 2.13 776 6
+2.39 2.38 1134 7
+1.80 1.64 772 4
+2.64 1.87 1304 6
+2.08 2.53 1212 4
+0.70 1.78 818 6
+0.89 1.20 864 2
\ No newline at end of file
diff -r 618210a97e62 -r 05974294cbf1 tool_conf.xml.sample
--- a/tool_conf.xml.sample Fri Sep 19 12:34:51 2008 -0400
+++ b/tool_conf.xml.sample Sat Sep 20 18:14:24 2008 -0400
@@ -128,6 +128,11 @@
<tool file="regVariation/getIndels_2way.xml" />
<tool file="regVariation/getIndels_3way.xml" />
<tool file="regVariation/getIndelRates_3way.xml" />
+ </section>
+ <section name="Multiple regression" id="multReg">
+ <tool file="regVariation/linear_regression.xml" />
+ <tool file="regVariation/best_regression_subsets.xml" />
+ <tool file="regVariation/rcve.xml" />
</section>
<section name="Evolution: HyPhy" id="hyphy">
<tool file="hyphy/hyphy_branch_lengths_wrapper.xml" />
diff -r 618210a97e62 -r 05974294cbf1 tools/regVariation/best_regression_subsets.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/regVariation/best_regression_subsets.py Sat Sep 20 18:14:24 2008 -0400
@@ -0,0 +1,90 @@
+#!/usr/bin/env python
+
+from galaxy import eggs
+
+import sys, string
+from rpy import *
+import numpy
+
+def stop_err(msg):
+ sys.stderr.write(msg)
+ sys.exit()
+
+infile = sys.argv[1]
+y_col = int(sys.argv[2])-1
+x_cols = sys.argv[3].split(',')
+outfile = sys.argv[4]
+outfile2 = sys.argv[5]
+print "Predictor columns: %s; Response column: %d" %(x_cols,y_col+1)
+fout = open(outfile,'w')
+
+for i, line in enumerate( file ( infile )):
+ line = line.rstrip('\r\n')
+ if len( line )>0 and not line.startswith( '#' ):
+ elems = line.split( '\t' )
+ break
+ if i == 30:
+ break # Hopefully we'll never get here...
+
+if len( elems )<1:
+ stop_err( "The data in your input dataset is either missing or not formatted properly." )
+
+y_vals = []
+x_vals = []
+
+for k,col in enumerate(x_cols):
+ x_cols[k] = int(col)-1
+ x_vals.append([])
+
+NA = 'NA'
+for ind,line in enumerate( file( infile )):
+ if line and not line.startswith( '#' ):
+ try:
+ fields = line.split("\t")
+ try:
+ yval = float(fields[y_col])
+ except Exception, ey:
+ yval = r('NA')
+ y_vals.append(yval)
+ for k,col in enumerate(x_cols):
+ try:
+ xval = float(fields[col])
+ except Exception, ex:
+ xval = r('NA')
+ x_vals[k].append(xval)
+ except:
+ pass
+
+response_term = ""
+
+x_vals1 = numpy.asarray(x_vals).transpose()
+
+dat= r.list(x=array(x_vals1), y=y_vals)
+
+r.library("leaps")
+
+set_default_mode(NO_CONVERSION)
+try:
+ leaps = r.regsubsets(r("y ~ x"), data= r.na_exclude(dat))
+except RException, rex:
+ stop_err("Error performing linear regression on the input data.\nEither the response column or one of the predictor columns contain no numeric values.")
+set_default_mode(BASIC_CONVERSION)
+
+summary = r.summary(leaps)
+tot = len(x_vals)
+pattern = "["
+for i in range(tot):
+ pattern = pattern + 'c' + str(int(x_cols[int(i)]) + 1) + ' '
+pattern = pattern.strip() + ']'
+print >>fout, "#Vars\t%s\tR-sq\tAdj. R-sq\tC-p\tbic" %(pattern)
+for ind,item in enumerate(summary['outmat']):
+ print >>fout, "%s\t%s\t%s\t%s\t%s\t%s" %(str(item).count('*'), item, summary['rsq'][ind], summary['adjr2'][ind], summary['cp'][ind], summary['bic'][ind])
+
+
+r.pdf( outfile2, 8, 8 )
+r.plot(leaps, scale="Cp", main="Best subsets using Cp Criterion")
+r.plot(leaps, scale="r2", main="Best subsets using R-sq Criterion")
+r.plot(leaps, scale="adjr2", main="Best subsets using Adjusted R-sq Criterion")
+r.plot(leaps, scale="bic", main="Best subsets using bic Criterion")
+
+r.dev_off()
diff -r 618210a97e62 -r 05974294cbf1 tools/regVariation/best_regression_subsets.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/regVariation/best_regression_subsets.xml Sat Sep 20 18:14:24 2008 -0400
@@ -0,0 +1,64 @@
+<tool id="BestSubsetsRegression1" name="Perform Best-subsets Regression">
+ <description> </description>
+ <command interpreter="python">
+ best_regression_subsets.py
+ $input1
+ $response_col
+ $predictor_cols
+ $out_file1
+ $out_file2
+ 1>/dev/null
+ 2>/dev/null
+ </command>
+ <inputs>
+ <param format="tabular" name="input1" type="data" label="Select data" help="Query missing? See TIP below."/>
+ <param name="response_col" label="Response column (Y)" type="data_column" data_ref="input1" />
+ <param name="predictor_cols" label="Predictor columns (X)" type="data_column" data_ref="input1" multiple="true" />
+ </inputs>
+ <outputs>
+ <data format="input" name="out_file1" metadata_source="input1" />
+ <data format="pdf" name="out_file2" />
+ </outputs>
+ <requirements>
+ <requirement type="python-module">rpy</requirement>
+ </requirements>
+ <tests>
+ <!-- Testing this tool will not be possible because this tool produces a pdf output file.
+ -->
+ </tests>
+ <help>
+
+.. class:: infomark
+
+**TIP:** If your data is not TAB delimited, use *Edit Queries->Convert characters*
+
+-----
+
+.. class:: infomark
+
+**What it does**
+
+This tool uses the 'regsubsets' function from R statistical package for regression subset selection. It outputs two files, one containing a table with the best subsets and the corresponding summary statistics, and the other containing the graphical representation of the results.
+
+-----
+
+.. class:: warningmark
+
+**Note**
+
+- This tool currently treats all predictor and response variables as continuous variables.
+
+- Rows containing non-numeric (or missing) data in any of the chosen columns will be skipped from the analysis.
+
+- The 6 columns in the output are described below:
+
+ - Column 1 (Vars): denotes the number of variables in the model
+ - Column 2 ([c2 c3 c4...]): represents a list of the user-selected predictor variables (full model). An asterix denotes the presence of the corresponding predictor variable in the selected model.
+ - Column 3 (R-sq): the fraction of variance explained by the model
+ - Column 4 (Adj. R-sq): the above R-squared statistic adjusted, penalizing for higher number of predictors (p)
+ - Column 5 (Cp): Mallow's Cp statistics
+ - Column 6 (bic): Bayesian Information Criterion.
+
+
+ </help>
+</tool>
diff -r 618210a97e62 -r 05974294cbf1 tools/regVariation/linear_regression.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/regVariation/linear_regression.py Sat Sep 20 18:14:24 2008 -0400
@@ -0,0 +1,117 @@
+#!/usr/bin/env python
+
+from galaxy import eggs
+import sys, string
+from rpy import *
+import numpy
+
+def stop_err(msg):
+ sys.stderr.write(msg)
+ sys.exit()
+
+infile = sys.argv[1]
+y_col = int(sys.argv[2])-1
+x_cols = sys.argv[3].split(',')
+outfile = sys.argv[4]
+outfile2 = sys.argv[5]
+
+print "Predictor columns: %s; Response column: %d" %(x_cols,y_col+1)
+fout = open(outfile,'w')
+
+for i, line in enumerate( file ( infile )):
+ line = line.rstrip('\r\n')
+ if len( line )>0 and not line.startswith( '#' ):
+ elems = line.split( '\t' )
+ break
+ if i == 30:
+ break # Hopefully we'll never get here...
+
+if len( elems )<1:
+ stop_err( "The data in your input dataset is either missing or not formatted properly." )
+
+y_vals = []
+x_vals = []
+
+for k,col in enumerate(x_cols):
+ x_cols[k] = int(col)-1
+ x_vals.append([])
+
+NA = 'NA'
+for ind,line in enumerate( file( infile )):
+ if line and not line.startswith( '#' ):
+ try:
+ fields = line.split("\t")
+ try:
+ yval = float(fields[y_col])
+ except:
+ yval = r('NA')
+ y_vals.append(yval)
+ for k,col in enumerate(x_cols):
+ try:
+ xval = float(fields[col])
+ except:
+ xval = r('NA')
+ x_vals[k].append(xval)
+ except:
+ pass
+
+x_vals1 = numpy.asarray(x_vals).transpose()
+
+dat= r.list(x=array(x_vals1), y=y_vals)
+
+set_default_mode(NO_CONVERSION)
+try:
+ linear_model = r.lm(r("y ~ x"), data = r.na_exclude(dat))
+except RException, rex:
+ stop_err("Error performing linear regression on the input data.\nEither the response column or one of the predictor columns contain only non-numeric or invalid values.")
+set_default_mode(BASIC_CONVERSION)
+
+coeffs=linear_model.as_py()['coefficients']
+yintercept= coeffs['(Intercept)']
+print >>fout, "Y-intercept\t%s" %(yintercept)
+summary = r.summary(linear_model)
+
+co = summary.get('coefficients', 'NA')
+"""
+if len(co) != len(x_vals)+1:
+ stop_err("Stopped performing linear regression on the input data, since one of the predictor columns contains only non-numeric or invalid values.")
+"""
+print >>fout, "p-value (Y-intercept)\t%s" %(co[0][3])
+
+if len(x_vals) == 1: #Simple linear regression case with 1 predictor variable
+ try:
+ slope = coeffs['x']
+ except:
+ slope = 'NA'
+ try:
+ pval = co[1][3]
+ except:
+ pval = 'NA'
+ print >>fout, "Slope (c%d)\t%s" %(x_cols[0]+1,slope)
+ print >>fout, "p-value (c%d)\t%s" %(x_cols[0]+1,pval)
+else: #Multiple regression case with >1 predictors
+ ind=1
+ while ind < len(coeffs.keys()):
+ print >>fout, "Slope (c%d)\t%s" %(x_cols[ind-1]+1,coeffs['x'+str(ind)])
+ try:
+ pval = co[ind][3]
+ except:
+ pval = 'NA'
+ print >>fout, "p-value (c%d)\t%s" %(x_cols[ind-1]+1,pval)
+ ind+=1
+
+print >>fout, "R-squared\t%s" %(summary.get('r.squared','NA'))
+print >>fout, "Adjusted R-squared\t%s" %(summary.get('adj.r.squared','NA'))
+print >>fout, "F-statistic\t%s" %(summary.get('fstatistic','NA'))
+print >>fout, "Sigma\t%s" %(summary.get('sigma','NA'))
+
+r.pdf( outfile2, 8, 8 )
+if len(x_vals) == 1: #Simple linear regression case with 1 predictor variable
+ sub_title = "Slope = %s; Y-int = %s" %(slope,yintercept)
+ r.plot(x=x_vals[0], y=y_vals, xlab="X", ylab="Y", sub=sub_title, main="Scatterplot with regression")
+ r.abline(a=yintercept, b=slope, col="red")
+else:
+ r.pairs(dat, main="Scatterplot Matrix", col="blue")
+
+r.plot(linear_model)
+r.dev_off()
diff -r 618210a97e62 -r 05974294cbf1 tools/regVariation/linear_regression.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/regVariation/linear_regression.xml Sat Sep 20 18:14:24 2008 -0400
@@ -0,0 +1,62 @@
+<tool id="LinearRegression1" name="Perform Linear Regression">
+ <description> </description>
+ <command interpreter="python">
+ linear_regression.py
+ $input1
+ $response_col
+ $predictor_cols
+ $out_file1
+ $out_file2
+ 1>/dev/null
+ </command>
+ <inputs>
+ <param format="tabular" name="input1" type="data" label="Select data" help="Query missing? See TIP below."/>
+ <param name="response_col" label="Response column (Y)" type="data_column" data_ref="input1" />
+ <param name="predictor_cols" label="Predictor columns (X)" type="data_column" data_ref="input1" multiple="true" />
+ </inputs>
+ <outputs>
+ <data format="input" name="out_file1" metadata_source="input1" />
+ <data format="pdf" name="out_file2" />
+ </outputs>
+ <requirements>
+ <requirement type="python-module">rpy</requirement>
+ </requirements>
+ <tests>
+ <!-- Testing this tool will not be possible because this tool produces a pdf output file.
+ -->
+ </tests>
+ <help>
+
+
+.. class:: infomark
+
+**TIP:** If your data is not TAB delimited, use *Edit Queries->Convert characters*
+
+-----
+
+.. class:: infomark
+
+**What it does**
+
+This tool uses the 'lm' function from R statistical package to perform linear regression on the input data. It outputs two files, one containing the summary statistics of the performed regression, and the other containing diagnostic plots to check whether model assumptions are satisfied.
+
+-----
+
+.. class:: warningmark
+
+**Note**
+
+- This tool currently treats all predictor and response variables as continuous variables.
+
+- Rows containing non-numeric (or missing) data in any of the chosen columns will be skipped from the analysis.
+
+- The summary statistics in the output are described below:
+
+ - sigma: the square root of the estimated variance of the random error (standard error of the residuals)
+ - R-squared: the fraction of variance explained by the model
+ - Adjusted R-squared: the above R-squared statistic adjusted, penalizing for the number of the predictors (p)
+ - p-value: p-value for the t-test of the null hypothesis that the corresponding slope is equal to zero against the two-sided alternative.
+
+
+ </help>
+</tool>
diff -r 618210a97e62 -r 05974294cbf1 tools/regVariation/rcve.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/regVariation/rcve.py Sat Sep 20 18:14:24 2008 -0400
@@ -0,0 +1,143 @@
+#!/usr/bin/env python
+
+from galaxy import eggs
+
+import sys, string
+from rpy import *
+import numpy
+
+def stop_err(msg):
+ sys.stderr.write(msg)
+ sys.exit()
+
+def sscombs(s):
+ if len(s) == 1:
+ return [s]
+ else:
+ ssc = sscombs(s[1:])
+ return [s[0]] + [s[0]+comb for comb in ssc] + ssc
+
+
+infile = sys.argv[1]
+y_col = int(sys.argv[2])-1
+x_cols = sys.argv[3].split(',')
+outfile = sys.argv[4]
+
+print "Predictor columns: %s; Response column: %d" %(x_cols,y_col+1)
+fout = open(outfile,'w')
+
+for i, line in enumerate( file ( infile )):
+ line = line.rstrip('\r\n')
+ if len( line )>0 and not line.startswith( '#' ):
+ elems = line.split( '\t' )
+ break
+ if i == 30:
+ break # Hopefully we'll never get here...
+
+if len( elems )<1:
+ stop_err( "The data in your input dataset is either missing or not formatted properly." )
+
+y_vals = []
+x_vals = []
+
+for k,col in enumerate(x_cols):
+ x_cols[k] = int(col)-1
+ x_vals.append([])
+ """
+ try:
+ float( elems[x_cols[k]] )
+ except:
+ try:
+ msg = "This operation cannot be performed on non-numeric column %d containing value '%s'." %( col, elems[x_cols[k]] )
+ except:
+ msg = "This operation cannot be performed on non-numeric data."
+ stop_err( msg )
+ """
+NA = 'NA'
+for ind,line in enumerate( file( infile )):
+ if line and not line.startswith( '#' ):
+ try:
+ fields = line.split("\t")
+ try:
+ yval = float(fields[y_col])
+ except Exception, ey:
+ yval = r('NA')
+ #print >>sys.stderr, "ey = %s" %ey
+ y_vals.append(yval)
+ for k,col in enumerate(x_cols):
+ try:
+ xval = float(fields[col])
+ except Exception, ex:
+ xval = r('NA')
+ #print >>sys.stderr, "ex = %s" %ex
+ x_vals[k].append(xval)
+ except:
+ pass
+
+x_vals1 = numpy.asarray(x_vals).transpose()
+dat= r.list(x=array(x_vals1), y=y_vals)
+
+set_default_mode(NO_CONVERSION)
+try:
+ full = r.lm(r("y ~ x"), data= r.na_exclude(dat)) #full model includes all the predictor variables specified by the user
+except RException, rex:
+ stop_err("Error performing linear regression on the input data.\nEither the response column or one of the predictor columns contain no numeric values.")
+set_default_mode(BASIC_CONVERSION)
+
+summary = r.summary(full)
+fullr2 = summary.get('r.squared','NA')
+
+if fullr2 == 'NA':
+ stop_error("Error in linear regression")
+
+if len(x_vals) < 10:
+ s = ""
+ for ch in range(len(x_vals)):
+ s += str(ch)
+else:
+ stop_err("This tool only works with less than 10 predictors.")
+
+print >>fout, "#Model\tR-sq\tRCVE_Terms\tRCVE_Value"
+all_combos = sorted(sscombs(s), key=len)
+all_combos.reverse()
+for j,cols in enumerate(all_combos):
+ #if len(cols) == len(s): #Same as the full model above
+ # continue
+ if len(cols) == 1:
+ x_vals1 = x_vals[int(cols)]
+ else:
+ x_v = []
+ for col in cols:
+ x_v.append(x_vals[int(col)])
+ x_vals1 = numpy.asarray(x_v).transpose()
+ dat= r.list(x=array(x_vals1), y=y_vals)
+ set_default_mode(NO_CONVERSION)
+ red = r.lm(r("y ~ x"), data= dat) #Reduced model
+ set_default_mode(BASIC_CONVERSION)
+ summary = r.summary(red)
+ redr2 = summary.get('r.squared','NA')
+ try:
+ rcve = (float(fullr2)-float(redr2))/float(fullr2)
+ except:
+ rcve = 'NA'
+ col_str = ""
+ for col in cols:
+ col_str = col_str + str(int(x_cols[int(col)]) + 1) + " "
+ col_str.strip()
+ rcve_col_str = ""
+ for col in s:
+ if col not in cols:
+ rcve_col_str = rcve_col_str + str(int(x_cols[int(col)]) + 1) + " "
+ rcve_col_str.strip()
+ if len(cols) == len(s): #full model
+ rcve_col_str = "-"
+ rcve = "-"
+ try:
+ redr2 = "%.4f" %(float(redr2))
+ except:
+ pass
+ try:
+ rcve = "%.4f" %(float(rcve))
+ except:
+ pass
+ print >>fout, "%s\t%s\t%s\t%s" %(col_str,redr2,rcve_col_str,rcve)
diff -r 618210a97e62 -r 05974294cbf1 tools/regVariation/rcve.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/regVariation/rcve.xml Sat Sep 20 18:14:24 2008 -0400
@@ -0,0 +1,68 @@
+<tool id="rcve1" name="Compute RCVE" version="1.0.0">
+ <description> </description>
+ <command interpreter="python">
+ rcve.py
+ $input1
+ $response_col
+ $predictor_cols
+ $out_file1
+ 1>/dev/null
+ </command>
+ <inputs>
+ <param format="tabular" name="input1" type="data" label="Select data" help="Query missing? See TIP below."/>
+ <param name="response_col" label="Response column (Y)" type="data_column" data_ref="input1" />
+ <param name="predictor_cols" label="Predictor columns (X)" type="data_column" data_ref="input1" multiple="true" />
+ </inputs>
+ <outputs>
+ <data format="input" name="out_file1" metadata_source="input1" />
+ </outputs>
+ <requirements>
+ <requirement type="python-module">rpy</requirement>
+ </requirements>
+ <tests>
+ <!-- Test data with valid values -->
+ <test>
+ <param name="input1" value="reg_inp.tab"/>
+ <param name="response_col" value="1"/>
+ <param name="predictor_cols" value="2,3,4"/>
+ <output name="out_file1" file="rcve_out.dat"/>
+ </test>
+
+ </tests>
+ <help>
+
+.. class:: infomark
+
+**TIP:** If your data is not TAB delimited, use *Edit Queries->Convert characters*
+
+-----
+
+.. class:: infomark
+
+**What it does**
+
+This tool computes the RCVE (Relative Contribution to Variance) for all possible variable subsets using the following formula:
+
+**RCVE(i) = [R-sq (full: 1,2,..,i..,p-1) - R-sq(without i: 1,2,...,p-1)] / R-sq (full: 1,2,..,i..,p-1)**,
+which denotes the case where the 'i'th predictor is dropped.
+
+
+In general,
+**RCVE(X+) = [R-sq (full: {X,X+}) - R-sq(reduced: {X})] / R-sq (full: {X,X+})**,
+where,
+
+- {X,X+} denotes the set of all predictors,
+- X+ is the set of predictors for which we compute RCVE (and therefore drop from the full model to obtain a reduced one),
+- {X} is the set of the predictors that are left in the reduced model after excluding {X+}
+
+
+The 4 columns in the output are described below:
+
+- Column 1 (Model): denotes the variables present in the model ({X})
+- Column 2 (R-sq): denotes the R-squared value corresponding to the model in Column 1
+- Column 3 (RCVE_Terms): denotes the variable/s for which RCVE is computed ({X+}). These are the variables that are absent in the reduced model in Column 1. A '-' in this column indicates that the model in Column 1 is the Full model.
+- Column 4 (RCVE): denotes the RCVE value corresponding to the variable/s in Column 3. A '-' in this column indicates that the model in Column 1 is the Full model.
+
+
+ </help>
+</tool>
1
0
details: http://www.bx.psu.edu/hg/galaxy/rev/f1da9b95549b
changeset: 1516:f1da9b95549b
user: Dan Blankenberg <dan(a)bx.psu.edu>
date: Thu Sep 18 15:24:51 2008 -0400
description:
Update to latest gmaj.
1 file(s) affected in this change:
static/gmaj/gmaj.jar
diffs (2 lines):
diff -r 280e8b68f845 -r f1da9b95549b static/gmaj/gmaj.jar
Binary file static/gmaj/gmaj.jar has changed
1
0
[hg] galaxy 1510: Strip whitespace from columns in file for data...
by greg at scofield.bx.psu.edu 22 Sep '08
by greg at scofield.bx.psu.edu 22 Sep '08
22 Sep '08
details: http://www.bx.psu.edu/hg/galaxy/rev/f8e3770c23f6
changeset: 1510:f8e3770c23f6
user: Dan Blankenberg <dan(a)bx.psu.edu>
date: Tue Sep 16 14:10:53 2008 -0400
description:
Strip whitespace from columns in file for dataset_metadata_in_file validator.
1 file(s) affected in this change:
lib/galaxy/tools/parameters/validation.py
diffs (12 lines):
diff -r ec547440ec97 -r f8e3770c23f6 lib/galaxy/tools/parameters/validation.py
--- a/lib/galaxy/tools/parameters/validation.py Tue Sep 16 13:25:42 2008 -0400
+++ b/lib/galaxy/tools/parameters/validation.py Tue Sep 16 14:10:53 2008 -0400
@@ -247,7 +247,7 @@
if line_startswith is None or line.startswith( line_startswith ):
fields = line.split( '\t' )
if metadata_column < len( fields ):
- self.valid_values.append( fields[metadata_column] )
+ self.valid_values.append( fields[metadata_column].strip() )
def validate( self, value, history = None ):
if not value: return
if hasattr( value, "metadata" ):
1
0
details: http://www.bx.psu.edu/hg/galaxy/rev/1d326855ba89
changeset: 1517:1d326855ba89
user: wychung
date: Thu Sep 18 15:41:23 2008 -0400
description:
update shrimp_wrapper.
2 file(s) affected in this change:
tools/metag_tools/shrimp_wrapper.py
tools/metag_tools/shrimp_wrapper.xml
diffs (621 lines):
diff -r f1da9b95549b -r 1d326855ba89 tools/metag_tools/shrimp_wrapper.py
--- a/tools/metag_tools/shrimp_wrapper.py Thu Sep 18 15:24:51 2008 -0400
+++ b/tools/metag_tools/shrimp_wrapper.py Thu Sep 18 15:41:23 2008 -0400
@@ -61,17 +61,13 @@
reversed_s.reverse()
return "".join(reversed_s)
-def generate_sub_table(result_file, ref_file, score_files, table_outfile, hit_per_read):
+def generate_sub_table(result_file, ref_file, score_files, table_outfile, hit_per_read, insertion_size):
- """
- TODO: the cross-over error has not been addressed yet.
- """
+ invalid_editstring_char = 0
- insertion_size = 600
+ all_score_file = score_files.split(',')
- all_score_file = score_files.split('&')
-
- if len(all_score_file) != hit_per_read: stop_err('Un-equal number of files!')
+ if len(all_score_file) != hit_per_read: stop_err('One or more query files is missing. Please check your dataset.')
temp_table_name = tempfile.NamedTemporaryFile().name
temp_table = open(temp_table_name, 'w')
@@ -178,7 +174,7 @@
hits_score[readname] = {}
hits_score[readname][endindex] = score
- # mutation call to all mappings
+ # call to all mappings
for readkey in hits.keys():
if len(hits[readkey]) != hit_per_read: continue
@@ -211,6 +207,7 @@
match_count += 1
if match_count == 1:
+
for x, end_data in enumerate(matches[0]):
end_strand, end_editstring, end_chr_start, end_chr_end, end_read_start, end_chrom = end_data
@@ -226,20 +223,26 @@
gap_read = 0
while editindex < len(end_editstring):
+
editchr = end_editstring[editindex]
chrA = ''
chrB = ''
locIndex = []
+
if editchr.isdigit():
editcode = ''
+
while editchr.isdigit() and editindex < len(end_editstring):
editcode += editchr
editindex += 1
if editindex < len(end_editstring): editchr = end_editstring[editindex]
+
for baseIndex in range(int(editcode)):
chrA += refsegment[match_len+baseIndex]
chrB = chrA
+
match_len += int(editcode)
+
elif editchr == 'x':
# crossover: inserted between the appropriate two bases
# Two sequencing errors: 4x15x6 (25 matches with 2 crossovers)
@@ -263,18 +266,21 @@
elif editchr == '(':
editcode = ''
+
while editchr != ')' and editindex < len(end_editstring):
if editindex < len(end_editstring): editchr = end_editstring[editindex]
editcode += editchr
editindex += 1
+
editcode = editcode[1:-1]
chrA = '-'*len(editcode)
chrB = editcode
else:
- print 'Warning! Unknown symbols', editchr
-
+ invalid_editstring_char += 1
+
if end_strand == '-':
+
chrA = reverse_complement(chrA)
chrB = reverse_complement(chrB)
@@ -288,9 +294,12 @@
chrBx = chrB[mappingIndex]
if chrAx and chrBx and chrBx.upper() != 'N':
+
if end_strand == '+':
+
chrom_loc = end_chr_start+match_len-len(chrA)+mappingIndex
read_loc = end_read_start+match_len-len(chrA)+mappingIndex-gap_read
+
if chrAx == '-': chrom_loc -= 1
if chrBx == '-':
@@ -300,9 +309,12 @@
# 1-based on chrom_loc and read_loc
pos_line = pos_line + '\t'.join([end_chrom, str(chrom_loc+1), readkey+'/'+str(x+1), str(read_loc+1), chrAx, chrBx, scoreBx]) + '\n'
+
else:
+
chrom_loc = end_chr_end-match_len+mappingIndex
read_loc = end_read_start+match_len-1-mappingIndex-gap_read
+
if chrAx == '-': chrom_loc -= 1
if chrBx == '-':
@@ -314,11 +326,14 @@
rev_line = '\t'.join([end_chrom, str(chrom_loc+1), readkey+'/'+str(x+1), str(read_loc+1), chrAx, chrBx, scoreBx]) +'\n' + rev_line
if chrom_cov.has_key(end_chrom):
+
if chrom_cov[end_chrom].has_key(chrom_loc):
chrom_cov[end_chrom][chrom_loc] += 1
else:
chrom_cov[end_chrom][chrom_loc] = 1
+
else:
+
chrom_cov[end_chrom] = {}
chrom_cov[end_chrom][chrom_loc] = 1
@@ -329,6 +344,7 @@
# chrom-wide coverage
for i, line in enumerate(open(temp_table_name)):
+
line = line.rstrip()
if not line or line.startswith('#'): continue
@@ -348,6 +364,9 @@
outfile.close()
if os.path.exists(temp_table_name): os.remove(temp_table_name)
+
+ if invalid_editstring_char:
+ print 'Skip ', invalid_editstring_char, ' invalid characters in editstrings'
return True
@@ -359,7 +378,7 @@
seq_title_startswith = ''
qual_title_startswith = ''
- default_coding_value = 64
+ default_coding_value = 64 # Solexa ascii-code
fastq_block_lines = 0
for i, line in enumerate( file( infile_name ) ):
@@ -448,16 +467,63 @@
def __main__():
+ # SHRiMP path
+ shrimp = 'rmapper-ls'
+
# I/O
- type_of_reads = sys.argv[1] # single or paired
- input_target = sys.argv[2] # fasta
- shrimp_outfile = sys.argv[3] # shrimp output
- table_outfile = sys.argv[4] # table output
-
- # SHRiMP parameters: total = 15
- # TODO: put threshold on each of these parameters
- if len(sys.argv) == 21 or len(sys.argv) == 22:
- spaced_seed = sys.argv[5]
+ input_target_file = sys.argv[1] # fasta
+ shrimp_outfile = sys.argv[2] # shrimp output
+ table_outfile = sys.argv[3] # table output
+ single_or_paired = sys.argv[4].split(',')
+
+ insertion_size = 600
+
+ if len(single_or_paired) == 1: # single or paired
+ type_of_reads = 'single'
+ hit_per_read = 1
+ input_query = single_or_paired[0]
+ query_fasta = tempfile.NamedTemporaryFile().name
+ query_qual = tempfile.NamedTemporaryFile().name
+
+ else: # paired-end
+ type_of_reads = 'paired'
+ hit_per_read = 2
+ input_query_end1 = single_or_paired[0]
+ input_query_end2 = single_or_paired[1]
+ insertion_size = int(single_or_paired[2])
+ query_fasta_end1 = tempfile.NamedTemporaryFile().name
+ query_fasta_end2 = tempfile.NamedTemporaryFile().name
+ query_qual_end1 = tempfile.NamedTemporaryFile().name
+ query_qual_end2 = tempfile.NamedTemporaryFile().name
+
+ # SHRiMP parameters: total = 15, default values
+ spaced_seed = '111111011111'
+ seed_matches_per_window = '2'
+ seed_hit_taboo_length = '4'
+ seed_generation_taboo_length = '0'
+ seed_window_length = '115.0'
+ max_hits_per_read = '100'
+ max_read_length = '1000'
+ kmer = '-1'
+ sw_match_value = '100'
+ sw_mismatch_value = '-150'
+ sw_gap_open_ref = '-400'
+ sw_gap_open_query = '-400'
+ sw_gap_ext_ref = '-70'
+ sw_gap_ext_query = '-70'
+ sw_hit_threshold = '68.0'
+
+ # TODO: put the threshold on each of these parameters
+ if len(sys.argv) > 5:
+
+ try:
+ if sys.argv[5].isdigit():
+ spaced_seed = sys.argv[5]
+ else:
+ stop_err('Error in assigning parameter: Spaced seed.')
+ except:
+ stop_err('Spaced seed must be a combination of 1s and 0s.')
+
seed_matches_per_window = sys.argv[6]
seed_hit_taboo_length = sys.argv[7]
seed_generation_taboo_length = sys.argv[8]
@@ -473,53 +539,6 @@
sw_gap_ext_query = sys.argv[18]
sw_hit_threshold = sys.argv[19]
- # Single-end parameters
- if type_of_reads == 'single':
- input_query = sys.argv[20] # single-end
- hit_per_read = 1
- query_fasta = tempfile.NamedTemporaryFile().name
- query_qual = tempfile.NamedTemporaryFile().name
- else: # Paired-end parameters
- input_query_end1 = sys.argv[20] # paired-end
- input_query_end2 = sys.argv[21]
- hit_per_read = 2
- query_fasta_end1 = tempfile.NamedTemporaryFile().name
- query_fasta_end2 = tempfile.NamedTemporaryFile().name
- query_qual_end1 = tempfile.NamedTemporaryFile().name
- query_qual_end2 = tempfile.NamedTemporaryFile().name
- else:
- spaced_seed = '111111011111'
- seed_matches_per_window = '2'
- seed_hit_taboo_length = '4'
- seed_generation_taboo_length = '0'
- seed_window_length = '115.0'
- max_hits_per_read = '100'
- max_read_length = '1000'
- kmer = '-1'
- sw_match_value = '100'
- sw_mismatch_value = '-150'
- sw_gap_open_ref = '-400'
- sw_gap_open_query = '-400'
- sw_gap_ext_ref = '-70'
- sw_gap_ext_query = '-70'
- sw_hit_threshold = '68.0'
-
- # Single-end parameters
- if type_of_reads == 'single':
- input_query = sys.argv[5] # single-end
- hit_per_read = 1
- query_fasta = tempfile.NamedTemporaryFile().name
- query_qual = tempfile.NamedTemporaryFile().name
- else: # Paired-end parameters
- input_query_end1 = sys.argv[5] # paired-end
- input_query_end2 = sys.argv[6]
- hit_per_read = 2
- query_fasta_end1 = tempfile.NamedTemporaryFile().name
- query_fasta_end2 = tempfile.NamedTemporaryFile().name
- query_qual_end1 = tempfile.NamedTemporaryFile().name
- query_qual_end2 = tempfile.NamedTemporaryFile().name
-
-
# temp file for shrimp log file
shrimp_log = tempfile.NamedTemporaryFile().name
@@ -532,7 +551,7 @@
# SHRiMP command
if type_of_reads == 'single':
- command = ' '.join(['rmapper-ls', '-s', spaced_seed, '-n', seed_matches_per_window, '-t', seed_hit_taboo_length, '-9', seed_generation_taboo_length, '-w', seed_window_length, '-o', max_hits_per_read, '-r', max_read_length, '-d', kmer, '-m', sw_match_value, '-i', sw_mismatch_value, '-g', sw_gap_open_ref, '-q', sw_gap_open_query, '-e', sw_gap_ext_ref, '-f', sw_gap_ext_query, '-h', sw_hit_threshold, query_fasta, input_target, '>', shrimp_outfile, '2>', shrimp_log])
+ command = ' '.join([shrimp, '-s', spaced_seed, '-n', seed_matches_per_window, '-t', seed_hit_taboo_length, '-9', seed_generation_taboo_length, '-w', seed_window_length, '-o', max_hits_per_read, '-r', max_read_length, '-d', kmer, '-m', sw_match_value, '-i', sw_mismatch_value, '-g', sw_gap_open_ref, '-q', sw_gap_open_query, '-e', sw_gap_ext_ref, '-f', sw_gap_ext_query, '-h', sw_hit_threshold, query_fasta, input_target_file, '>', shrimp_outfile, '2>', shrimp_log])
try:
os.system(command)
@@ -541,9 +560,9 @@
if os.path.exists(query_qual): os.remove(query_qual)
stop_err(str(e))
- else:
- command_end1 = ' '.join(['rmapper-ls', '-s', spaced_seed, '-n', seed_matches_per_window, '-t', seed_hit_taboo_length, '-9', seed_generation_taboo_length, '-w', seed_window_length, '-o', max_hits_per_read, '-r', max_read_length, '-d', kmer, '-m', sw_match_value, '-i', sw_mismatch_value, '-g', sw_gap_open_ref, '-q', sw_gap_open_query, '-e', sw_gap_ext_ref, '-f', sw_gap_ext_query, '-h', sw_hit_threshold, query_fasta_end1, input_target, '>', shrimp_outfile, '2>', shrimp_log])
- command_end2 = ' '.join(['rmapper-ls', '-s', spaced_seed, '-n', seed_matches_per_window, '-t', seed_hit_taboo_length, '-9', seed_generation_taboo_length, '-w', seed_window_length, '-o', max_hits_per_read, '-r', max_read_length, '-d', kmer, '-m', sw_match_value, '-i', sw_mismatch_value, '-g', sw_gap_open_ref, '-q', sw_gap_open_query, '-e', sw_gap_ext_ref, '-f', sw_gap_ext_query, '-h', sw_hit_threshold, query_fasta_end2, input_target, '>>', shrimp_outfile, '2>>', shrimp_log])
+ else: # paired
+ command_end1 = ' '.join([shrimp, '-s', spaced_seed, '-n', seed_matches_per_window, '-t', seed_hit_taboo_length, '-9', seed_generation_taboo_length, '-w', seed_window_length, '-o', max_hits_per_read, '-r', max_read_length, '-d', kmer, '-m', sw_match_value, '-i', sw_mismatch_value, '-g', sw_gap_open_ref, '-q', sw_gap_open_query, '-e', sw_gap_ext_ref, '-f', sw_gap_ext_query, '-h', sw_hit_threshold, query_fasta_end1, input_target_file, '>', shrimp_outfile, '2>', shrimp_log])
+ command_end2 = ' '.join([shrimp, '-s', spaced_seed, '-n', seed_matches_per_window, '-t', seed_hit_taboo_length, '-9', seed_generation_taboo_length, '-w', seed_window_length, '-o', max_hits_per_read, '-r', max_read_length, '-d', kmer, '-m', sw_match_value, '-i', sw_mismatch_value, '-g', sw_gap_open_ref, '-q', sw_gap_open_query, '-e', sw_gap_ext_ref, '-f', sw_gap_ext_query, '-h', sw_hit_threshold, query_fasta_end2, input_target_file, '>>', shrimp_outfile, '2>>', shrimp_log])
try:
os.system(command_end1)
@@ -557,9 +576,9 @@
# convert to table
if type_of_reads == 'single':
- return_value = generate_sub_table(shrimp_outfile, input_target, query_qual, table_outfile, hit_per_read)
+ return_value = generate_sub_table(shrimp_outfile, input_target_file, query_qual, table_outfile, hit_per_read, insertion_size)
else:
- return_value = generate_sub_table(shrimp_outfile, input_target, query_qual_end1+'&'+query_qual_end2, table_outfile, hit_per_read)
+ return_value = generate_sub_table(shrimp_outfile, input_target_file, query_qual_end1+','+query_qual_end2, table_outfile, hit_per_read, insertion_size)
# remove temp. files
if type_of_reads == 'single':
diff -r f1da9b95549b -r 1d326855ba89 tools/metag_tools/shrimp_wrapper.xml
--- a/tools/metag_tools/shrimp_wrapper.xml Thu Sep 18 15:24:51 2008 -0400
+++ b/tools/metag_tools/shrimp_wrapper.xml Thu Sep 18 15:41:23 2008 -0400
@@ -1,50 +1,51 @@
<tool id="shrimp_wrapper" name="SHRiMP" version="1.0.0">
<description>SHort Read Mapping Package</description>
<command interpreter="python">
- #if ($type_of_reads.single_or_paired=="single" and $param.skip_or_full=="skip"):#shrimp_wrapper.py $type_of_reads.single_or_paired $input_target $output1 $output2 $input_query
- #elif ($type_of_reads.single_or_paired=="paired" and $param.skip_or_full=="skip"):#shrimp_wrapper.py $type_of_reads.single_or_paired $input_target $output1 $output2 ${type_of_reads.input1} ${type_of_reads.input2}
- #elif ($type_of_reads.single_or_paired=="single" and $param.skip_or_full=="full"):#shrimp_wrapper.py $type_of_reads.single_or_paired $input_target $output1 $output2 $param.spaced_seed $param.seed_matches_per_window $param.seed_hit_taboo_length $param.seed_generation_taboo_length $param.seed_window_length $param.max_hits_per_read $param.max_read_length $param.kmer $param.sw_match_value $param.sw_mismatch_value $param.sw_gap_open_ref $param.sw_gap_open_query $param.sw_gap_ext_ref $param.sw_gap_ext_query $param.sw_hit_threshold $input_query
- #elif ($type_of_reads.single_or_paired=="paired" and $param.skip_or_full=="full"):#shrimp_wrapper.py $type_of_reads.single_or_paired $input_target $output1 $output2 $param.spaced_seed $param.seed_matches_per_window $param.seed_hit_taboo_length $param.seed_generation_taboo_length $param.seed_window_length $param.max_hits_per_read $param.max_read_length $param.kmer $param.sw_match_value $param.sw_mismatch_value $param.sw_gap_open_ref $param.sw_gap_open_query $param.sw_gap_ext_ref $param.sw_gap_ext_query $param.sw_hit_threshold ${type_of_reads.input1} ${type_of_reads.input2}
+ #if ($type_of_reads.single_or_paired=="single" and $param.skip_or_full=="skip"):#shrimp_wrapper.py $input_target $output1 $output2 $input_query
+ #elif ($type_of_reads.single_or_paired=="paired" and $param.skip_or_full=="skip"):#shrimp_wrapper.py $input_target $output1 $output2 $type_of_reads.input1,$type_of_reads.input2,$type_of_reads.insertion_size
+ #elif ($type_of_reads.single_or_paired=="single" and $param.skip_or_full=="full"):#shrimp_wrapper.py $input_target $output1 $output2 $input_query $param.spaced_seed $param.seed_matches_per_window $param.seed_hit_taboo_length $param.seed_generation_taboo_length $param.seed_window_length $param.max_hits_per_read $param.max_read_length $param.kmer $param.sw_match_value $param.sw_mismatch_value $param.sw_gap_open_ref $param.sw_gap_open_query $param.sw_gap_ext_ref $param.sw_gap_ext_query $param.sw_hit_threshold
+ #elif ($type_of_reads.single_or_paired=="paired" and $param.skip_or_full=="full"):#shrimp_wrapper.py $input_target $output1 $output2 $type_of_reads.input1,$type_of_reads.input2,$type_of_reads.insertion_size $param.spaced_seed $param.seed_matches_per_window $param.seed_hit_taboo_length $param.seed_generation_taboo_length $param.seed_window_length $param.max_hits_per_read $param.max_read_length $param.kmer $param.sw_match_value $param.sw_mismatch_value $param.sw_gap_open_ref $param.sw_gap_open_query $param.sw_gap_ext_ref $param.sw_gap_ext_query $param.sw_hit_threshold
#end if
</command>
<inputs>
<page>
- <param name="input_target" type="data" format="fasta" label="Reference sequence" />
<conditional name="type_of_reads">
<param name="single_or_paired" type="select" label="Single- or Paired-ends">
<option value="single">Single-end</option>
<option value="paired">Paired-end</option>
</param>
<when value="single">
- <param name="input_query" type="data" format="fastqsolexa" label="Sequence file" />
+ <param name="input_query" type="data" format="fastqsolexa" label="Align sequencing reads" />
</when>
<when value="paired">
- <param name="input1" type="data" format="fastqsolexa" label="One end" />
- <param name="input2" type="data" format="fastqsolexa" label="The other end" />
+ <param name="insertion_size" type="integer" size="5" value="600" label="Insertion length between two ends" help="bp" />
+ <param name="input1" type="data" format="fastqsolexa" label="Align sequencing reads, one end" />
+ <param name="input2" type="data" format="fastqsolexa" label="and the other end" />
</when>
</conditional>
+ <param name="input_target" type="data" format="fasta" label="against reference" />
<conditional name="param">
- <param name="skip_or_full" type="select" label="SHRiMP parameter selection">
- <option value="skip">Default setting</option>
- <option value="full">Full list</option>
+ <param name="skip_or_full" type="select" label="SHRiMP settings to use" help="For most mapping needs use Commonly used settings. If you want full control use Full List">
+ <option value="skip">Commonly used</option>
+ <option value="full">Full Parameter List</option>
</param>
<when value="skip" />
<when value="full">
- <param name="spaced_seed" type="text" size="30" value="111111011111" label="Spaced Seed" />
- <param name="seed_matches_per_window" type="integer" size="5" value="2" label="Seed Matches per Window" />
- <param name="seed_hit_taboo_length" type="integer" size="5" value="4" label="Seed Hit Taboo Length" />
- <param name="seed_generation_taboo_length" type="integer" size="5" value="0" label="Seed Generation Taboo Length" />
- <param name="seed_window_length" type="float" size="10" value="115.0" label="Seed Window Length" help="in percentage"/>
- <param name="max_hits_per_read" type="integer" size="10" value="100" label="Maximum Hits per Read" />
- <param name="max_read_length" type="integer" size="10" value="1000" label="Maximum Read Length" />
- <param name="kmer" type="integer" size="10" value="-1" label="Kmer Std. Deviation Limit" help="-1 as None"/>
- <param name="sw_match_value" type="integer" size="10" value="100" label="S-W Match Value" />
- <param name="sw_mismatch_value" type="integer" size="10" value="-150" label="S-W Mismatch Value" />
- <param name="sw_gap_open_ref" type="integer" size="10" value="-400" label="S-W Gap Open Penalty (Reference)" />
- <param name="sw_gap_open_query" type="integer" size="10" value="-400" label="S-W Gap Open Penalty (Query)" />
- <param name="sw_gap_ext_ref" type="integer" size="10" value="-70" label="S-W Gap Extend Penalty (Reference)" />
- <param name="sw_gap_ext_query" type="integer" size="10" value="-70" label="S-W Gap Extend Penalty (Query)" />
- <param name="sw_hit_threshold" type="float" size="10" value="68.0" label="S-W Hit Threshold" help="in percentage"/>
+ <param name="spaced_seed" type="text" size="30" value="111111011111" label="Spaced Seed" />
+ <param name="seed_matches_per_window" type="integer" size="5" value="2" label="Seed Matches per Window" />
+ <param name="seed_hit_taboo_length" type="integer" size="5" value="4" label="Seed Hit Taboo Length" />
+ <param name="seed_generation_taboo_length" type="integer" size="5" value="0" label="Seed Generation Taboo Length" />
+ <param name="seed_window_length" type="float" size="10" value="115.0" label="Seed Window Length" help="in percentage"/>
+ <param name="max_hits_per_read" type="integer" size="10" value="100" label="Maximum Hits per Read" />
+ <param name="max_read_length" type="integer" size="10" value="1000" label="Maximum Read Length" />
+ <param name="kmer" type="integer" size="10" value="-1" label="Kmer Std. Deviation Limit" help="-1 as None"/>
+ <param name="sw_match_value" type="integer" size="10" value="100" label="S-W Match Value" />
+ <param name="sw_mismatch_value" type="integer" size="10" value="-150" label="S-W Mismatch Value" />
+ <param name="sw_gap_open_ref" type="integer" size="10" value="-400" label="S-W Gap Open Penalty (Reference)" />
+ <param name="sw_gap_open_query" type="integer" size="10" value="-400" label="S-W Gap Open Penalty (Query)" />
+ <param name="sw_gap_ext_ref" type="integer" size="10" value="-70" label="S-W Gap Extend Penalty (Reference)" />
+ <param name="sw_gap_ext_query" type="integer" size="10" value="-70" label="S-W Gap Extend Penalty (Query)" />
+ <param name="sw_hit_threshold" type="float" size="10" value="68.0" label="S-W Hit Threshold" help="in percentage"/>
</when>
</conditional>
</page>
@@ -54,7 +55,7 @@
<data name="output2" format="tabular"/>
</outputs>
<requirements>
- <requirement type="binary">SHRiMP_rmapper</requirement>
+ <requirement type="binary">rmapper-ls</requirement>
</requirements>
<tests>
<test>
@@ -64,13 +65,14 @@
<param name="input_query" value="shrimp_wrapper_test1.fastq" ftype="fastqsolexa"/>
<output name="output1" file="shrimp_wrapper_test1.out1" />
</test>
- <!--
+ <!--
<test>
- <param name="input1" value="shrimp_wrapper_test2_end1.fastq" ftype="fastqsolexa" />
- <param name="input2" value="shrimp_wrapper_test2_end2.fastq" ftype="fastqsolexa" />
<param name="single_or_paired" value="paired" />
<param name="skip_or_full" value="skip" />
<param name="input_target" value="shrimp_eca_chrMT.fa" ftype="fasta" />
+ <param name="input1" value="shrimp_wrapper_test2_end1.fastq" ftype="fastqsolexa" />
+ <param name="input2" value="shrimp_wrapper_test2_end2.fastq" ftype="fastqsolexa" />
+ <param name="insertion_size" value="600" />
<output name="output1" file="shrimp_wrapper_test2.out1" />
</test>
<test>
@@ -116,6 +118,7 @@
<param name="sw_hit_threshold" value="68.0" />
<param name="input1" value="shrimp_wrapper_test2_end1.fastq" ftype="fastqsolexa"/>
<param name="input2" value="shrimp_wrapper_test2_end2.fastq" ftype="fastqsolexa"/>
+ <param name="insertion_size" value="600" />
<output name="output1" file="shrimp_wrapper_test2.out1" />
</test>
-->
@@ -124,67 +127,146 @@
.. class:: warningmark
-Only nucleotide sequences as query.
+Please note that only **nucleotide** sequences (letter-space) can be used as query.
-----
**What it does**
-Run SHRiMP on letter-space reads.
-
+SHRiMP (SHort Read Mapping Package) is a software package for aligning genomic reads against a target genome.
+
+This wrapper post-processes the default SHRiMP/rmapper-ls output and generates a table with all information from reads and reference for the mapping. The tool takes single- or paired-end reads. For single-end reads, only uniquely mapped alignments are considered. For paired-end reads, only pairs that meet the following criteria will be used to generate the table: (1) the ends fall within the insertion size; (2) the ends are mapped in opposite directions. If there are still multiple mappings after applying the criteria, this paired-end read will be discarded.
+
+
-----
-
-**Example**
-
-- Input a multiple-fastq file like the following::
+
+**Input formats**
+
+A multiple-fastq file, for example::
@seq1
TACCCGATTTTTTGCTTTCCACTTTATCCTACCCTT
- +seq2
+ +seq1
hhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhhh
-- Use default settings (for detail explanations, please see **Parameters** section)
-
-- Search against your own uploaded file, result will be in the following format::
-
- +-------+-------+--------+----------+----------+---------+--------+--------+-------+------------+
- | id | chrom | strand | t.start | t.end | q.start | q.end | length | score | editstring |
- +-------+-------+--------+----------+----------+---------+--------+--------+-------+------------+
- | >seq1 | chrMT | + | 14712 | 14747 | 1 | 36 | 36 | 3350 | 24T11 |
- +-------+-------+--------+----------+----------+---------+--------+--------+-------+------------+
-
-- The result will be formatted Table::
-
- +-------+---------+---------+----------+---------+----------+---------+----------+
- | chrom | ref_loc | read_id | read_loc | ref_nuc | read_nuc | quality | coverage |
- +-------+---------+---------+----------+---------+----------+---------+----------+
- | chrMT | 14711 | seq1 | 0 | T | T | 40 | 1 |
- | chrMT | 14712 | seq1 | 1 | A | A | 40 | 1 |
- | chrMT | 14713 | seq1 | 2 | C | C | 40 | 1 |
- +-------+---------+---------+----------+---------+----------+---------+----------+
-----
-**Parameters**
+**Outputs**
-Parameter list with default value settings::
+The tool gives two outputs.
+
+**Table output**
+
+Table output contains 8 columns::
+
+ 1 2 3 4 5 6 7 8
+ ----------------------------------------------------
+ chrM 14711 seq1 0 T A 40 1
+ chrM 14712 seq1 1 T T 40 1
+
+where::
+
+ 1. (chrM) - Reference sequence id
+ 2. (14711) - Position of the mapping in the reference
+ 3. (seq1) - Read id
+ 4. (0) - Position of the mapping in the read
+ 5. (T) - Nucleotide in the reference
+ 6. (A) - Nucleotide in the read
+ 7. (40) - Quality score for the nucleotide in the position of the read
+ 8. (1) - The number of times this position is covered by reads
+
+
+**SHRiMP output**
+
+This is the default output from SHRiMP/rmapper-ls::
+
+ 1 2 3 4 5 6 7 8 9 10
+ -------------------------------------------------------------------
+ seq1 chrM + 3644 3679 1 36 36 3600 36
+
+where::
+
+ 1. (seq1) - Read id
+ 2. (chrM) - Reference sequence id
+ 3. (+) - Strand of the read
+  4. (3644)  - Start position of the alignment in the reference
+ 5. (3679) - End position of the alignment in the reference
+ 6. (1) - Start position of the alignment in the read
+ 7. (36) - End position of the alignment in the read
+ 8. (36) - Length of the read
+ 9. (3600) - Score
+ 10. (36) - Edit string
+
+
+-----
+
+**SHRiMP parameter list**
+
+The commonly used parameters with default value setting::
-s Spaced Seed (default: 111111011111)
+ The spaced seed is a single contiguous string of 0's and 1's.
+ 0's represent wildcards, or positions which will always be
+ considered as matching, whereas 1's dictate positions that
+ must match. A string of all 1's will result in a simple kmer scan.
-n Seed Matches per Window (default: 2)
+ The number of seed matches per window dictates how many seeds
+ must match within some window length of the genome before that
+ region is considered for Smith-Waterman alignment. A lower
+ value will increase sensitivity while drastically increasing
+ running time. Higher values will have the opposite effect.
-t Seed Hit Taboo Length (default: 4)
+ The seed taboo length specifies how many target genome bases
+ or colours must exist prior to a previous seed match in order
+ to count another seed match as a hit.
-9 Seed Generation Taboo Length (default: 0)
+
-w Seed Window Length (default: 115.00%)
+ This parameter specifies the genomic span in bases (or colours)
+ in which *seed_matches_per_window* must exist before the read
+        is given consideration by the Smith-Waterman alignment machinery.
-o Maximum Hits per Read (default: 100)
+ This parameter specifies how many hits to remember for each read.
+ If more hits are encountered, ones with lower scores are dropped
+ to make room.
-r Maximum Read Length (default: 1000)
+ This parameter specifies the maximum length of reads that will
+ be encountered in the dataset. If larger reads than the default
+ are used, an appropriate value must be passed to *rmapper*.
-d Kmer Std. Deviation Limit (default: -1 [None])
+ This option permits pruning read kmers, which occur with
+ frequencies greater than *kmer_std_dev_limit* standard
+ deviations above the average. This can shorten running
+ time at the cost of some sensitivity.
+ *Note*: A negative value disables this option.
+ -m S-W Match Value (default: 100)
+ The value applied to matches during the Smith-Waterman score calculation.
+ -i S-W Mismatch Value (default: -150)
+ The value applied to mismatches during the Smith-Waterman
+ score calculation.
+ -g S-W Gap Open Penalty (Reference) (default: -400)
+ The value applied to gap opens along the reference sequence
+ during the Smith-Waterman score calculation.
+ *Note*: Note that for backward compatibility, if -g is set
+ and -q is not set, the gap open penalty for the query will
+ be set to the same value as specified for the reference.
+ -q S-W Gap Open Penalty (Query) (default: -400)
+ The value applied to gap opens along the query sequence during
+ the Smith-Waterman score calculation.
+ -e S-W Gap Extend Penalty (Reference) (default: -70)
+ The value applied to gap extends during the Smith-Waterman score calculation.
+ *Note*: Note that for backward compatibility, if -e is set
+        and -f is not set, the gap extend penalty for the query will
+ be set to the same value as specified for the reference.
+ -f S-W Gap Extend Penalty (Query) (default: -70)
+ The value applied to gap extends during the Smith-Waterman score calculation.
+ -h S-W Hit Threshold (default: 68.00%)
+ In letter-space, this parameter determines the threshold
+ score for both vectored and full Smith-Waterman alignments.
+        Any values less than this quantity will be thrown away.
+        *Note*: This option differs slightly in meaning between letter-space and colour-space.
- -m S-W Match Value (default: 100)
- -i S-W Mismatch Value (default: -150)
- -g S-W Gap Open Penalty (Reference) (default: -400)
- -q S-W Gap Open Penalty (Query) (default: -400)
- -e S-W Gap Extend Penalty (Reference) (default: -70)
- -f S-W Gap Extend Penalty (Query) (default: -70)
- -h S-W Hit Threshold (default: 68.00%)
-----
1
0
[hg] galaxy 1514: New tool to format short read mapping data as ...
by gregļ¼ scofield.bx.psu.edu 22 Sep '08
by gregļ¼ scofield.bx.psu.edu 22 Sep '08
22 Sep '08
details: http://www.bx.psu.edu/hg/galaxy/rev/33e06a98b6d8
changeset: 1514:33e06a98b6d8
user: guru
date: Wed Sep 17 16:42:08 2008 -0400
description:
New tool to format short read mapping data as a UCSC custom track.
2 file(s) affected in this change:
tools/metag_tools/mapping_to_ucsc.py
tools/metag_tools/mapping_to_ucsc.xml
diffs (415 lines):
diff -r cf17b5a16eff -r 33e06a98b6d8 tools/metag_tools/mapping_to_ucsc.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/metag_tools/mapping_to_ucsc.py Wed Sep 17 16:42:08 2008 -0400
@@ -0,0 +1,204 @@
+#! /usr/bin/python
+
+from galaxy import eggs
+import sys, tempfile, os
+
+assert sys.version_info[:2] >= (2.4)
+
+def stop_err(msg):
+ sys.stderr.write(msg)
+ sys.exit()
+
+def main():
+
+ out_fname = sys.argv[1]
+ in_fname = sys.argv[2]
+ chr_col = int(sys.argv[3])-1
+ coord_col = int(sys.argv[4])-1
+ track_type = sys.argv[5]
+ if track_type == 'coverage' or track_type == 'both':
+ coverage_col = int(sys.argv[6])-1
+ cname = sys.argv[7]
+ cdescription = sys.argv[8]
+ ccolor = sys.argv[9].replace('-',',')
+ cvisibility = sys.argv[10]
+ if track_type == 'snp' or track_type == 'both':
+ if track_type == 'both':
+ j = 5
+ else:
+ j = 0
+ #sname = sys.argv[7+j]
+ sdescription = sys.argv[6+j]
+ svisibility = sys.argv[7+j]
+ #ref_col = int(sys.argv[10+j])-1
+ read_col = int(sys.argv[8+j])-1
+
+
+ # Sort the input file based on chromosome (alphabetically) and start co-ordinates (numerically)
+ sorted_infile = tempfile.NamedTemporaryFile()
+ try:
+ os.system("sort -k %d,%d -k %dn -o %s %s" %(chr_col+1,chr_col+1,coord_col+1,sorted_infile.name,in_fname))
+ except Exception, exc:
+ stop_err( 'Initialization error -> %s' %str(exc) )
+
+ #generate chr list
+ sorted_infile.seek(0)
+ chr_vals = []
+ for line in file( sorted_infile.name ):
+ line = line.strip()
+ if not(line):
+ continue
+ try:
+ fields = line.split('\t')
+ chr = fields[chr_col]
+ if chr not in chr_vals:
+ chr_vals.append(chr)
+ except:
+ pass
+ if not(chr_vals):
+ stop_err("Skipped all lines as invalid.")
+
+ if track_type == 'coverage' or track_type == 'both':
+ if track_type == 'coverage':
+ fout = open( out_fname, "w" )
+ else:
+ fout = tempfile.NamedTemporaryFile()
+ fout.write('''track type=wiggle_0 name="%s" description="%s" color=%s visibility=%s\n''' \
+ % ( cname, cdescription, ccolor, cvisibility ))
+ if track_type == 'snp' or track_type == 'both':
+ fout_a = tempfile.NamedTemporaryFile()
+ fout_t = tempfile.NamedTemporaryFile()
+ fout_g = tempfile.NamedTemporaryFile()
+ fout_c = tempfile.NamedTemporaryFile()
+ fout_ref = tempfile.NamedTemporaryFile()
+
+ fout_a.write('''track type=wiggle_0 name="%s" description="%s" color=%s visibility=%s\n''' \
+ % ( "Track A", sdescription, '255,0,0', svisibility ))
+ fout_t.write('''track type=wiggle_0 name="%s" description="%s" color=%s visibility=%s\n''' \
+ % ( "Track T", sdescription, '0,255,0', svisibility ))
+ fout_g.write('''track type=wiggle_0 name="%s" description="%s" color=%s visibility=%s\n''' \
+ % ( "Track G", sdescription, '0,0,255', svisibility ))
+ fout_c.write('''track type=wiggle_0 name="%s" description="%s" color=%s visibility=%s\n''' \
+ % ( "Track C", sdescription, '255,0,255', svisibility ))
+
+
+ sorted_infile.seek(0)
+ for line in file( sorted_infile.name ):
+ line = line.strip()
+ if not(line):
+ continue
+ try:
+ fields = line.split('\t')
+ chr = fields[chr_col]
+ start = int(fields[coord_col])
+ assert start > 0
+ except:
+ continue
+ try:
+ ind = chr_vals.index(chr) #encountered chr for the 1st time
+ del chr_vals[ind]
+ prev_start = ''
+ header = "variableStep chrom=%s\n" %(chr)
+ if track_type == 'coverage' or track_type == 'both':
+ coverage = int(fields[coverage_col])
+ line1 = "%s\t%s\n" %(start,coverage)
+ fout.write("%s%s" %(header,line1))
+ if track_type == 'snp' or track_type == 'both':
+ a = t = g = c = 0
+ fout_a.write("%s" %(header))
+ fout_t.write("%s" %(header))
+ fout_g.write("%s" %(header))
+ fout_c.write("%s" %(header))
+ try:
+ #ref_nt = fields[ref_col].capitalize()
+ read_nt = fields[read_col].capitalize()
+ try:
+ nt_ind = ['A','T','G','C'].index(read_nt)
+ if nt_ind == 0:
+ a+=1
+ elif nt_ind == 1:
+ t+=1
+ elif nt_ind == 2:
+ g+=1
+ else:
+ c+=1
+ except ValueError:
+ pass
+ except:
+ pass
+ prev_start = start
+ except ValueError:
+ if start != prev_start:
+ if track_type == 'coverage' or track_type == 'both':
+ coverage = int(fields[coverage_col])
+ fout.write("%s\t%s\n" %(start,coverage))
+ if track_type == 'snp' or track_type == 'both':
+ if a:
+ fout_a.write("%s\t%s\n" %(prev_start,a))
+ if t:
+ fout_t.write("%s\t%s\n" %(prev_start,t))
+ if g:
+ fout_g.write("%s\t%s\n" %(prev_start,g))
+ if c:
+ fout_c.write("%s\t%s\n" %(prev_start,c))
+ a = t = g = c = 0
+ try:
+ #ref_nt = fields[ref_col].capitalize()
+ read_nt = fields[read_col].capitalize()
+ try:
+ nt_ind = ['A','T','G','C'].index(read_nt)
+ if nt_ind == 0:
+ a+=1
+ elif nt_ind == 1:
+ t+=1
+ elif nt_ind == 2:
+ g+=1
+ else:
+ c+=1
+ except ValueError:
+ pass
+ except:
+ pass
+ prev_start = start
+ else:
+ if track_type == 'snp' or track_type == 'both':
+ try:
+ #ref_nt = fields[ref_col].capitalize()
+ read_nt = fields[read_col].capitalize()
+ try:
+ nt_ind = ['A','T','G','C'].index(read_nt)
+ if nt_ind == 0:
+ a+=1
+ elif nt_ind == 1:
+ t+=1
+ elif nt_ind == 2:
+ g+=1
+ else:
+ c+=1
+ except ValueError:
+ pass
+ except:
+ pass
+
+ if track_type == 'snp' or track_type == 'both':
+ if a:
+ fout_a.write("%s\t%s\n" %(prev_start,a))
+ if t:
+ fout_t.write("%s\t%s\n" %(prev_start,t))
+ if g:
+ fout_g.write("%s\t%s\n" %(prev_start,g))
+ if c:
+ fout_c.write("%s\t%s\n" %(prev_start,c))
+
+ fout_a.seek(0)
+ fout_g.seek(0)
+ fout_t.seek(0)
+ fout_c.seek(0)
+
+ if track_type == 'snp':
+ os.system("cat %s %s %s %s >> %s" %(fout_a.name,fout_t.name,fout_g.name,fout_c.name,out_fname))
+ elif track_type == 'both':
+ fout.seek(0)
+ os.system("cat %s %s %s %s %s | cat > %s" %(fout.name,fout_a.name,fout_t.name,fout_g.name,fout_c.name,out_fname))
+if __name__ == "__main__":
+ main()
\ No newline at end of file
diff -r cf17b5a16eff -r 33e06a98b6d8 tools/metag_tools/mapping_to_ucsc.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/metag_tools/mapping_to_ucsc.xml Wed Sep 17 16:42:08 2008 -0400
@@ -0,0 +1,202 @@
+<tool id="mapToUCSC" name="Format mapping data" version="1.0.0">
+ <description> as UCSC custom track</description>
+ <command interpreter="python">
+ mapping_to_ucsc.py
+ $out_file1
+ $input
+ $chr_col
+ $coord_col
+ $track.track_type
+ #if $track.track_type == "coverage" or $track.track_type == "both"
+ $track.coverage_col
+ "${track.cname}"
+ "${track.cdescription}"
+ "${track.ccolor}"
+ "${track.cvisibility}"
+ #end if
+ #if $track.track_type == "snp" or $track.track_type == "both"
+ "${track.sdescription}"
+ "${track.svisibility}"
+ $track.col2
+ #end if
+ </command>
+ <inputs>
+ <param format="tabular" name="input" type="data" label="Select mapping data"/>
+ <param name="chr_col" type="data_column" data_ref="input" label="Column for reference chromosome" />
+ <param name="coord_col" type="data_column" data_ref="input" numerical="True" label="Numerical column for reference co-ordinate" />
+ <conditional name="track">
+ <param name="track_type" type="select" label="Display">
+ <option value="snp" selected="true">SNPs</option>
+ <option value="coverage">Read coverage</option>
+ <option value="both">Both</option>
+ </param>
+ <when value = "coverage">
+ <param name="coverage_col" type="data_column" data_ref="input" numerical="True" label="Numerical column for read coverage" />
+ <param name="cname" type="text" size="15" value="User Track" label="Coverage track name">
+ <validator type="length" max="15"/>
+ </param>
+ <param name="cdescription" type="text" value="User Supplied Coverage Track (from Galaxy)" label="Coverage track description">
+ <validator type="length" max="60" size="15"/>
+ </param>
+ <param label="Coverage track Color" name="ccolor" type="select">
+ <option selected="yes" value="0-0-0">Black</option>
+ <option value="255-0-0">Red</option>
+ <option value="0-255-0">Green</option>
+ <option value="0-0-255">Blue</option>
+ <option value="255-0-255">Magenta</option>
+ <option value="0-255-255">Cyan</option>
+ <option value="255-215-0">Gold</option>
+ <option value="160-32-240">Purple</option>
+ <option value="255-140-0">Orange</option>
+ <option value="255-20-147">Pink</option>
+ <option value="92-51-23">Dark Chocolate</option>
+ <option value="85-107-47">Olive green</option>
+ </param>
+ <param label="Coverage track Visibility" name="cvisibility" type="select">
+ <option selected="yes" value="1">Dense</option>
+ <option value="2">Full</option>
+ <option value="3">Pack</option>
+ <option value="4">Squish</option>
+ <option value="0">Hide</option>
+ </param>
+ </when>
+
+ <when value = "snp">
+ <!--
+ <param name="col1" type="data_column" data_ref="input" label="Column containing the reference nucleotide" />
+ -->
+ <param name="col2" type="data_column" data_ref="input" label="Column containing the read nucleotide" />
+ <!--
+ <param name="sname" type="text" size="15" value="User Track-2" label="SNP track name">
+ <validator type="length" max="15"/>
+ </param>
+ -->
+ <param name="sdescription" type="text" value="User Supplied Track (from Galaxy)" label="SNP track description">
+ <validator type="length" max="60" size="15"/>
+ </param>
+ <param label="SNP track Visibility" name="svisibility" type="select">
+ <option selected="yes" value="1">Dense</option>
+ <option value="2">Full</option>
+ <option value="3">Pack</option>
+ <option value="4">Squish</option>
+ <option value="0">Hide</option>
+ </param>
+ </when>
+
+ <when value = "both">
+ <param name="coverage_col" type="data_column" data_ref="input" numerical="True" label="Numerical column for read coverage" />
+ <param name="cname" type="text" size="15" value="User Track" label="Coverage track name">
+ <validator type="length" max="15"/>
+ </param>
+ <param name="cdescription" type="text" size="15" value="User Supplied Track (from Galaxy)" label="Coverage track description">
+ <validator type="length" max="60"/>
+ </param>
+ <param label="Coverage track Color" name="ccolor" type="select">
+ <option selected="yes" value="0-0-0">Black</option>
+ <option value="255-0-0">Red</option>
+ <option value="0-255-0">Green</option>
+ <option value="0-0-255">Blue</option>
+ <option value="255-0-255">Magenta</option>
+ <option value="0-255-255">Cyan</option>
+ <option value="255-215-0">Gold</option>
+ <option value="160-32-240">Purple</option>
+ <option value="255-140-0">Orange</option>
+ <option value="255-20-147">Pink</option>
+ <option value="92-51-23">Dark Chocolate</option>
+ <option value="85-107-47">Olive green</option>
+ </param>
+ <param label="Coverage track Visibility" name="cvisibility" type="select">
+ <option selected="yes" value="1">Dense</option>
+ <option value="2">Full</option>
+ <option value="3">Pack</option>
+ <option value="4">Squish</option>
+ <option value="0">Hide</option>
+ </param>
+ <!--
+ <param name="col1" type="data_column" data_ref="input" label="Column containing the reference nucleotide" />
+ -->
+ <param name="col2" type="data_column" data_ref="input" label="Column containing the read nucleotide" />
+ <!--
+ <param name="sname" type="text" size="15" value="User Track-2" label="SNP track name">
+ <validator type="length" max="15"/>
+ </param>
+ -->
+ <param name="sdescription" type="text" size="15" value="User Supplied Track (from Galaxy)" label="SNP track description">
+ <validator type="length" max="60"/>
+ </param>
+ <param label="SNP track Visibility" name="svisibility" type="select">
+ <option selected="yes" value="1">Dense</option>
+ <option value="2">Full</option>
+ <option value="3">Pack</option>
+ <option value="4">Squish</option>
+ <option value="0">Hide</option>
+ </param>
+ </when>
+ </conditional>
+ </inputs>
+ <outputs>
+ <data format="customtrack" name="out_file1"/>
+ </outputs>
+
+
+ <help>
+
+.. class:: infomark
+
+**What it does**
+
+This tool formats mapping data generated by short read mappers as a custom track that can be displayed in the UCSC genome browser.
+
+-----
+
+.. class:: warningmark
+
+**Note**
+
+This tool requires the mapping data to contain at least the following information:
+
+chromosome, genome coordinate, read nucleotide (if option to display is SNPs), read coverage (if option to display is Read coverage).
+
+-----
+
+**Example**
+
+For the following Mapping data::
+
+ #chr g_start read_id read_coord g_nt read_nt qual read_coverage
+ chrM 1 1:29:1672:1127/1 11 G G 40 134
+ chrM 1 1:32:93:933/1 4 G A 40 134
+ chrM 1 1:34:116:2032/1 11 G A 40 134
+ chrM 1 1:39:207:964/1 1 G G 40 134
+ chrM 2 1:3:359:848/1 1 G C 40 234
+ chrM 2 1:40:1435:1013/1 1 G G 40 234
+ chrM 3 1:40:730:972/1 9 G G 40 334
+ chrM 4 1:42:1712:921/2 31 G T 35 434
+ chrM 4 1:44:1649:493/1 4 G G 40 434
+
+running this tool to display both SNPs and Read coverage will return the following tracks, containing aggregated data per genome co-ordinate::
+
+ track type=wiggle_0 name="Coverage Track" description="User Supplied Track (from Galaxy)" color=0,0,0 visibility=1
+ variableStep chrom=chrM
+ 1 134
+ 2 234
+ 3 334
+ 4 434
+ track type=wiggle_0 name="Track A" description="User Supplied SNP Track (from Galaxy)" color=255,0,0 visibility=1
+ variableStep chrom=chrM
+ 1 2
+ track type=wiggle_0 name="Track T" description="User Supplied SNP Track (from Galaxy)" color=0,255,0 visibility=1
+ variableStep chrom=chrM
+ 4 1
+ track type=wiggle_0 name="Track G" description="User Supplied SNP Track (from Galaxy)" color=0,0,255 visibility=1
+ variableStep chrom=chrM
+ 1 2
+ 2 1
+ 3 1
+ 4 1
+ track type=wiggle_0 name="Track C" description="User Supplied SNP Track (from Galaxy)" color=255,0,255 visibility=1
+ variableStep chrom=chrM
+ 2 1
+
+ </help>
+</tool>
1
0
22 Sep '08
details: http://www.bx.psu.edu/hg/galaxy/rev/ec547440ec97
changeset: 1508:ec547440ec97
user: Dan Blankenberg <dan(a)bx.psu.edu>
date: Tue Sep 16 13:25:42 2008 -0400
description:
Small update for maf stats tool.
2 file(s) affected in this change:
lib/galaxy/tools/util/maf_utilities.py
tools/maf/maf_stats.py
diffs (99 lines):
diff -r 842f1883cf53 -r ec547440ec97 lib/galaxy/tools/util/maf_utilities.py
--- a/lib/galaxy/tools/util/maf_utilities.py Mon Sep 15 15:04:41 2008 -0400
+++ b/lib/galaxy/tools/util/maf_utilities.py Tue Sep 16 13:25:42 2008 -0400
@@ -199,7 +199,7 @@
yield block
def get_chopped_blocks_with_index_offset_for_region( index, src, region, species = None, mincols = 0, force_strand = None ):
for block, idx, offset in index.get_as_iterator_with_index_and_offset( src, region.start, region.end ):
- block = chop_block_by_region( block, src, region, species, mincols )
+ block = chop_block_by_region( block, src, region, species, mincols, force_strand )
if block is not None:
yield block, idx, offset
@@ -209,6 +209,25 @@
else: alignment = RegionAlignment( end - start, primary_species )
return fill_region_alignment( alignment, index, primary_species, chrom, start, end, strand, species, mincols )
+#reduces a block to only positions existing in the src provided
+def reduce_block_by_primary_genome( block, species, chromosome, region_start ):
+ #returns ( startIndex, {species:texts}
+ #where texts' contents are reduced to only positions existing in the primary genome
+ src = "%s.%s" % ( species, chromosome )
+ ref = block.get_component_by_src( src )
+ start_offset = ref.start - region_start
+ species_texts = {}
+ for c in block.components:
+ species_texts[ c.src.split( '.' )[0] ] = list( c.text )
+ #remove locations which are gaps in the primary species, starting from the downstream end
+ for i in range( len( species_texts[ species ] ) - 1, -1, -1 ):
+ if species_texts[ species ][i] == '-':
+ for text in species_texts.values():
+ text.pop( i )
+ for spec, text in species_texts.items():
+ species_texts[spec] = ''.join( text )
+ return ( start_offset, species_texts )
+
#fills a region alignment
def fill_region_alignment( alignment, index, primary_species, chrom, start, end, strand = '+', species = None, mincols = 0 ):
region = bx.intervals.Interval( start, end )
@@ -216,22 +235,7 @@
region.strand = strand
primary_src = "%s.%s" % ( primary_species, chrom )
- def reduce_block_by_primary_genome( block ):
- #returns ( startIndex, {species:texts}
- #where texts' contents are reduced to only positions existing in the primary genome
- ref = block.get_component_by_src( primary_src )
- start_offset = ref.start - start
- species_texts = {}
- for c in block.components:
- species_texts[ c.src.split( '.' )[0] ] = list( c.text )
- #remove locations which are gaps in the primary species, starting from the downstream end
- for i in range( len( species_texts[ primary_species ] ) - 1, -1, -1 ):
- if species_texts[ primary_species ][i] == '-':
- for text in species_texts.values():
- text.pop( i )
- for spec, text in species_texts.items():
- species_texts[spec] = ''.join( text )
- return ( start_offset, species_texts )
+
#Order blocks overlaping this position by score, lowest first
blocks = []
@@ -248,7 +252,7 @@
for block_dict in blocks:
block = chop_block_by_region( block_dict[1].get_at_offset( block_dict[2] ), primary_src, region, species, mincols, strand )
if block is None: continue
- start_offset, species_texts = reduce_block_by_primary_genome( block )
+ start_offset, species_texts = reduce_block_by_primary_genome( block, primary_species, chrom, start )
for spec, text in species_texts.items():
try:
alignment.set_range( start_offset, spec, text )
diff -r 842f1883cf53 -r ec547440ec97 tools/maf/maf_stats.py
--- a/tools/maf/maf_stats.py Mon Sep 15 15:04:41 2008 -0400
+++ b/tools/maf/maf_stats.py Tue Sep 16 13:25:42 2008 -0400
@@ -64,19 +64,11 @@
for c in block.components:
spec = c.src.split( '.' )[0]
if spec not in coverage: coverage[spec] = zeros( region.end - region.start, dtype = bool )
- ref = block.get_component_by_src( src )
- #skip gap locations due to insertions in secondary species relative to primary species
- start_offset = ref.start - region.start
- num_gaps = 0
- for i in range( len( ref.text.rstrip().rstrip( "-" ) ) ):
- if ref.text[i] in ["-"]:
- num_gaps += 1
- continue
- #Toggle base if covered
- for comp in block.components:
- spec = comp.src.split( '.' )[0]
- if comp.text and comp.text[i] not in ['-']:
- coverage[spec][start_offset + i - num_gaps] = True
+ start_offset, alignment = maf_utilities.reduce_block_by_primary_genome( block, dbkey, region.chrom, region.start )
+ for i in range( len( alignment[dbkey] ) ):
+ for spec, text in alignment.items():
+ if text[i] != '-':
+ coverage[spec][start_offset + i] = True
if summary:
#record summary
for key in coverage.keys():
1
0
[hg] galaxy 1509: Rewrote "Compare two queries" tool in Python.
by gregļ¼ scofield.bx.psu.edu 22 Sep '08
by gregļ¼ scofield.bx.psu.edu 22 Sep '08
22 Sep '08
details: http://www.bx.psu.edu/hg/galaxy/rev/eb941905fd70
changeset: 1509:eb941905fd70
user: guru
date: Tue Sep 16 14:09:16 2008 -0400
description:
Rewrote "Compare two queries" tool in Python.
2 file(s) affected in this change:
tools/filters/compare.xml
tools/filters/joinWrapper.py
diffs (68 lines):
diff -r ec547440ec97 -r eb941905fd70 tools/filters/compare.xml
--- a/tools/filters/compare.xml Tue Sep 16 13:25:42 2008 -0400
+++ b/tools/filters/compare.xml Tue Sep 16 14:09:16 2008 -0400
@@ -1,6 +1,6 @@
<tool id="comp1" name="Compare two Queries">
<description>to find common or distinct rows</description>
- <command interpreter="perl">joinWrapper.pl $input1 $input2 $field1 $field2 $mode "Y" $out_file1</command>
+ <command interpreter="python">joinWrapper.py $input1 $input2 $field1 $field2 $mode $out_file1</command>
<inputs>
<param format="tabular" name="input1" type="data" label="Compare"/>
<param name="field1" label="Using column" type="data_column" data_ref="input1" />
diff -r ec547440ec97 -r eb941905fd70 tools/filters/joinWrapper.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/tools/filters/joinWrapper.py Tue Sep 16 14:09:16 2008 -0400
@@ -0,0 +1,53 @@
+#!/usr/bin/env python
+#Guruprasad Ananda
+"""
+This tool provides the UNIX "join" functionality.
+"""
+import sys, os, tempfile
+
+def stop_err(msg):
+ sys.stderr.write(msg)
+ sys.exit()
+
+def main():
+ infile1 = sys.argv[1]
+ infile2 = sys.argv[2]
+ field1 = int(sys.argv[3])
+ field2 = int(sys.argv[4])
+ mode =sys.argv[5]
+ outfile = sys.argv[6]
+
+ tmpfile1 = tempfile.NamedTemporaryFile()
+ tmpfile2 = tempfile.NamedTemporaryFile()
+
+ try:
+ #Sort the two files based on specified fields
+ os.system("sort -k %d -o %s %s" %(field1, tmpfile1.name, infile1))
+ os.system("sort -k %d -o %s %s" %(field2, tmpfile2.name, infile2))
+ except Exception, exc:
+ stop_err( 'Initialization error -> %s' %str(exc) )
+
+ option = ""
+ for line in file(tmpfile1.name):
+ line = line.strip()
+ if line:
+ elems = line.split('\t')
+ for j in range(1,len(elems)+1):
+ if j == 1:
+ option = "1.1"
+ else:
+ option = option + ",1." + str(j)
+ break
+
+ if mode == "V":
+ cmdline = 'join -v 1 -o %s -1 %d -2 %d %s %s | tr " " "\t" > %s' %(option, field1, field2, tmpfile1.name, tmpfile2.name, outfile)
+ else:
+ cmdline = 'join -o %s -1 %d -2 %d %s %s | tr " " "\t" > %s' %(option, field1, field2, tmpfile1.name, tmpfile2.name, outfile)
+
+ try:
+ os.system(cmdline)
+ except Exception, exj:
+ stop_err('Error joining the two datasets -> %s' %str(exj))
+
+if __name__ == "__main__":
+ main()
1
0