2 new changesets in galaxy-central:

http://bitbucket.org/galaxy/galaxy-central/changeset/c25c8055e608/
changeset:   r5274:c25c8055e608
user:        fubar
date:        2011-03-25 18:12:21
summary:     Revised rgenetics tools to remove post job hooks
affected #:  35 files (21.8 KB)

--- a/test-data/rgCaCotest1_CaCo_topTable.gff Thu Mar 24 20:29:40 2011 -0400
+++ b/test-data/rgCaCotest1_CaCo_topTable.gff Fri Mar 25 13:12:21 2011 -0400
@@ -1,26 +1,26 @@
-track name=rgGLM_TopTable description="rgCaCotest1" visibility=2 useScore=1 color=0,60,120
-chr22 rgGLM variation 21784622 21784822 0 . . rs2283802 logp=-0.00
-chr22 rgGLM variation 21785266 21785466 42 . . rs2267000 logp=0.05
-chr22 rgGLM variation 21794654 21794854 249 . . rs16997606 logp=0.29
-chr22 rgGLM variation 21794710 21794910 493 . . rs4820537 logp=0.58
-chr22 rgGLM variation 21797704 21797904 237 . . rs3788347 logp=0.28
-chr22 rgGLM variation 21799818 21800018 61 . . rs756632 logp=0.07
-chr22 rgGLM variation 21807870 21808070 153 . . rs4820539 logp=0.18
-chr22 rgGLM variation 21820235 21820435 44 . . rs2283804 logp=0.05
-chr22 rgGLM variation 21820890 21821090 44 . . rs2267006 logp=0.05
-chr22 rgGLM variation 21820900 21821100 184 . . rs4822363 logp=0.22
-chr22 rgGLM variation 21827574 21827774 317 . . rs5751592 logp=0.37
-chr22 rgGLM variation 21832608 21832808 0 . . rs5759608 logp=-0.00
-chr22 rgGLM variation 21833070 21833270 0 . . rs5759612 logp=-0.00
-chr22 rgGLM variation 21860068 21860268 0 . . rs2267009 logp=-0.00
-chr22 rgGLM variation 21864266 21864466 117 . . rs2267010 logp=0.14
-chr22 rgGLM variation 21868598 21868798 571 . . rs5759636 logp=0.67
-chr22 rgGLM variation 21871388 21871588 0 . . rs2071436 logp=-0.00
-chr22 rgGLM variation 21875779 21875979 249 . . rs2267013 logp=0.29
-chr22 rgGLM variation 21889706 21889906 378 . . rs6003566 logp=0.44
-chr22 rgGLM variation 21892791 21892991 249 . . rs2256725 logp=0.29
-chr22 rgGLM variation 21892825 21893025 193 . . rs12160770 logp=0.23
-chr22 rgGLM variation 21895919 21896119 42 . . rs5751611 logp=0.05
-chr22 rgGLM variation 21898758 21898958 40 . . rs762601 logp=0.05
-chr22 rgGLM variation 21898963 21899163 40 . . rs2156921 logp=0.05
-chr22 rgGLM variation 21905542 21905742 40 . . rs4822375 logp=0.05
+track name=rgCaCo_TopTable description="rgCaCotest1" visibility=2 useScore=1 color=0,60,120
+chr22 rgCaCo variation 21784622 21784822 0 . . rs2283802 logp=-0.00
+chr22 rgCaCo variation 21785266 21785466 42 . . rs2267000 logp=0.05
+chr22 rgCaCo variation 21794654 21794854 249 . . rs16997606 logp=0.29
+chr22 rgCaCo variation 21794710 21794910 493 . . rs4820537 logp=0.58
+chr22 rgCaCo variation 21797704 21797904 237 . . rs3788347 logp=0.28
+chr22 rgCaCo variation 21799818 21800018 61 . . rs756632 logp=0.07
+chr22 rgCaCo variation 21807870 21808070 153 . . rs4820539 logp=0.18
+chr22 rgCaCo variation 21820235 21820435 44 . . rs2283804 logp=0.05
+chr22 rgCaCo variation 21820890 21821090 44 . . rs2267006 logp=0.05
+chr22 rgCaCo variation 21820900 21821100 184 . . rs4822363 logp=0.22
+chr22 rgCaCo variation 21827574 21827774 317 . . rs5751592 logp=0.37
+chr22 rgCaCo variation 21832608 21832808 0 . . rs5759608 logp=-0.00
+chr22 rgCaCo variation 21833070 21833270 0 . . rs5759612 logp=-0.00
+chr22 rgCaCo variation 21860068 21860268 0 . . rs2267009 logp=-0.00
+chr22 rgCaCo variation 21864266 21864466 117 . . rs2267010 logp=0.14
+chr22 rgCaCo variation 21868598 21868798 571 . . rs5759636 logp=0.67
+chr22 rgCaCo variation 21871388 21871588 0 . .
rs2071436 logp=-0.00 +chr22 rgCaCo variation 21875779 21875979 249 . . rs2267013 logp=0.29 +chr22 rgCaCo variation 21889706 21889906 378 . . rs6003566 logp=0.44 +chr22 rgCaCo variation 21892791 21892991 249 . . rs2256725 logp=0.29 +chr22 rgCaCo variation 21892825 21893025 193 . . rs12160770 logp=0.23 +chr22 rgCaCo variation 21895919 21896119 42 . . rs5751611 logp=0.05 +chr22 rgCaCo variation 21898758 21898958 40 . . rs762601 logp=0.05 +chr22 rgCaCo variation 21898963 21899163 40 . . rs2156921 logp=0.05 +chr22 rgCaCo variation 21905542 21905742 40 . . rs4822375 logp=0.05 --- a/test-data/rgTDTtest1_TDT_topTable.gff Thu Mar 24 20:29:40 2011 -0400 +++ b/test-data/rgTDTtest1_TDT_topTable.gff Fri Mar 25 13:12:21 2011 -0400 @@ -1,25 +1,25 @@ track name=rgTDT_Top_Table description="rgTDTtest1" visibility=2 useScore=1 color=0,60,120 -chr22 rgGLM variation 21784622 21784822 0 . . rs2283802 logp=-0.00 -chr22 rgGLM variation 21785266 21785466 120 . . rs2267000 logp=0.13 -chr22 rgGLM variation 21794654 21794854 0 . . rs16997606 logp=-0.00 -chr22 rgGLM variation 21794710 21794910 457 . . rs4820537 logp=0.50 -chr22 rgGLM variation 21797704 21797904 120 . . rs3788347 logp=0.13 -chr22 rgGLM variation 21799818 21800018 228 . . rs756632 logp=0.25 -chr22 rgGLM variation 21807870 21808070 120 . . rs4820539 logp=0.13 -chr22 rgGLM variation 21820235 21820435 292 . . rs2283804 logp=0.32 -chr22 rgGLM variation 21820890 21821090 292 . . rs2267006 logp=0.32 -chr22 rgGLM variation 21820900 21821100 0 . . rs4822363 logp=-0.00 -chr22 rgGLM variation 21827574 21827774 541 . . rs5751592 logp=0.59 -chr22 rgGLM variation 21832608 21832808 400 . . rs5759608 logp=0.44 -chr22 rgGLM variation 21833070 21833270 400 . . rs5759612 logp=0.44 -chr22 rgGLM variation 21860068 21860268 0 . . rs2267009 logp=-0.00 -chr22 rgGLM variation 21864266 21864466 0 . . rs2267010 logp=-0.00 -chr22 rgGLM variation 21868598 21868798 457 . . rs5759636 logp=0.50 -chr22 rgGLM variation 21871388 21871588 0 . . rs2071436 logp=-0.00 -chr22 rgGLM variation 21875779 21875979 457 . . rs2267013 logp=0.50 -chr22 rgGLM variation 21889706 21889906 168 . . rs6003566 logp=0.18 -chr22 rgGLM variation 21892791 21892991 457 . . rs2256725 logp=0.50 -chr22 rgGLM variation 21895919 21896119 359 . . rs5751611 logp=0.39 -chr22 rgGLM variation 21898758 21898958 400 . . rs762601 logp=0.44 -chr22 rgGLM variation 21898963 21899163 400 . . rs2156921 logp=0.44 -chr22 rgGLM variation 21905542 21905742 400 . . rs4822375 logp=0.44 +chr22 rgTDT variation 21784622 21784822 0 . . rs2283802 logp=-0.00 +chr22 rgTDT variation 21785266 21785466 120 . . rs2267000 logp=0.13 +chr22 rgTDT variation 21794654 21794854 0 . . rs16997606 logp=-0.00 +chr22 rgTDT variation 21794710 21794910 457 . . rs4820537 logp=0.50 +chr22 rgTDT variation 21797704 21797904 120 . . rs3788347 logp=0.13 +chr22 rgTDT variation 21799818 21800018 228 . . rs756632 logp=0.25 +chr22 rgTDT variation 21807870 21808070 120 . . rs4820539 logp=0.13 +chr22 rgTDT variation 21820235 21820435 292 . . rs2283804 logp=0.32 +chr22 rgTDT variation 21820890 21821090 292 . . rs2267006 logp=0.32 +chr22 rgTDT variation 21820900 21821100 0 . . rs4822363 logp=-0.00 +chr22 rgTDT variation 21827574 21827774 541 . . rs5751592 logp=0.59 +chr22 rgTDT variation 21832608 21832808 400 . . rs5759608 logp=0.44 +chr22 rgTDT variation 21833070 21833270 400 . . rs5759612 logp=0.44 +chr22 rgTDT variation 21860068 21860268 0 . . rs2267009 logp=-0.00 +chr22 rgTDT variation 21864266 21864466 0 . . 
rs2267010 logp=-0.00 +chr22 rgTDT variation 21868598 21868798 457 . . rs5759636 logp=0.50 +chr22 rgTDT variation 21871388 21871588 0 . . rs2071436 logp=-0.00 +chr22 rgTDT variation 21875779 21875979 457 . . rs2267013 logp=0.50 +chr22 rgTDT variation 21889706 21889906 168 . . rs6003566 logp=0.18 +chr22 rgTDT variation 21892791 21892991 457 . . rs2256725 logp=0.50 +chr22 rgTDT variation 21895919 21896119 359 . . rs5751611 logp=0.39 +chr22 rgTDT variation 21898758 21898958 400 . . rs762601 logp=0.44 +chr22 rgTDT variation 21898963 21899163 400 . . rs2156921 logp=0.44 +chr22 rgTDT variation 21905542 21905742 400 . . rs4822375 logp=0.44 --- a/tools/rgenetics/rgCaCo.py Thu Mar 24 20:29:40 2011 -0400 +++ b/tools/rgenetics/rgCaCo.py Fri Mar 25 13:12:21 2011 -0400 @@ -20,7 +20,6 @@ import sys,math,shutil,subprocess,os,time,tempfile,string from os.path import abspath from rgutils import timenow, plinke - imagedir = '/static/rg' # if needed for images myversion = 'V000.1 April 2007' verbose = False @@ -82,7 +81,7 @@ outres = [] # need to resort into chrom offset order for i,lrow in enumerate(resfl): chrom,snp,offset,p, = [lrow[x] for x in wewant] - gff = ('chr%s' % chrom,'rgGLM','variation','%d' % (int(offset)-halfwidth), + gff = ('chr%s' % chrom,'rgCaCo','variation','%d' % (int(offset)-halfwidth), '%d' % (int(offset)+halfwidth),p,'.','.','%s logp=%1.2f' % (snp,pvals[i])) outres.append(gff) outres = [(x[0],int(x[3]),x) for x in outres] # decorate @@ -94,7 +93,6 @@ outf.close() - def plink_assocToGG(plinkout="hm",tag='test'): """ plink --assoc output looks like this # CHR SNP A1 F_A F_U A2 CHISQ P OR @@ -170,14 +168,8 @@ else: headl = headl.split() delim = None - chrpos = headl.index('CHR') - rspos = headl.index('SNP') - testpos = headl.index('TEST') - naffpos = headl.index('AFF') - nuaffpos = headl.index('UNAFF') - chisqpos = headl.index('CHISQ') - ppos = headl.index('P') - wewant = [chrpos,rspos,testpos,naffpos,nuaffpos,chisqpos,ppos] + whatwewant = ['CHR','SNP','TEST','AFF','UNAFF','CHISQ','P'] + wewant = [headl.index(x) for x in whatwewant] llen = len(headl) lnum = anum = 0 lastsnp = None # so we know when to write out a gg line @@ -273,7 +265,7 @@ retval = p.wait() resf = '%s.model' % tname # plink output is here we hope xformModel(bfname,resf,outfname,name,amapf,flog) # leaves the desired summary file - makeGFF(resf=outfname,outfname=gffout,logf=flog,twd=twd,name='rgGLM_TopTable',description=name,topn=topn) + makeGFF(resf=outfname,outfname=gffout,logf=flog,twd=twd,name='rgCaCo_TopTable',description=name,topn=topn) flog.write('\n'.join(logme)) flog.close() # close the log used #shutil.copytree(twd,logoutdir) --- a/tools/rgenetics/rgCaCo.xml Thu Mar 24 20:29:40 2011 -0400 +++ b/tools/rgenetics/rgCaCo.xml Fri Mar 25 13:12:21 2011 -0400 @@ -1,25 +1,19 @@ <tool id="rgCaCo1" name="Case Control:"> - <code file="listFiles.py"/> - <code file="rgCaCo_code.py"/> - <description>for unrelated subjects</description> - <command interpreter="python"> - rgCaCo.py '$i.extra_files_path/$i.metadata.base_name' "$name" - '$out_file1' '$logf' '$logf.files_path' '$gffout' + rgCaCo.py '$i.extra_files_path/$i.metadata.base_name' "$title" '$out_file1' '$logf' '$logf.files_path' '$gffout' </command> - <inputs><param name="i" type="data" label="RGenetics genotype data from your current history" format="pbed" /> - <param name='name' type='text' size="132" value='CaseControl' label="Title for this job"/> + <param name='title' type='text' size="132" value='CaseControl' label="Title for this job"/></inputs><outputs> - <data 
format="tabular" name="out_file1" /> - <data format="txt" name="logf" /> - <data format="gff" name="gffout" /> + <data format="tabular" name="out_file1" label="${title}_rgCaCo.xls" /> + <data format="txt" name="logf" label="${title}_rgCaCo.log"/> + <data format="gff" name="gffout" label="${title}_rgCaCoTop.gff" /></outputs><tests><test> @@ -30,7 +24,7 @@ <composite_data value='tinywga.fam' /><edit_attributes type='name' value='tinywga' /></param> - <param name='name' value='rgCaCotest1' /> + <param name='title' value='rgCaCotest1' /><output name='out_file1' file='rgCaCotest1_CaCo.xls' ftype='tabular' compare='diff' /><output name='logf' file='rgCaCotest1_CaCo_log.txt' ftype='txt' compare='diff' lines_diff='20' /><output name='gffout' file='rgCaCotest1_CaCo_topTable.gff' ftype='gff' compare='diff' /> @@ -79,24 +73,31 @@ spreadsheet data, choose the .xls tab delimited format. It's not a stupid binary excel file. Just a plain old tab delimited one with a header. Fortunately excel is dumb enough to open these without much protest. + ----- +.. class:: infomark + **Attribution** -When you click 'execute', this Galaxy tool will run Plink (from Shaun Purcell) for you (which is GPL, so this must be too I guess) for calculations. For full Plink -attribution, source code and documentation, please see http://pngu.mgh.harvard.edu/~purcell/plink/ +This Galaxy tool relies on Plink (see Plinksrc_) to test Casae Control association models. -The Plink output files will be adjusted into UCSC compatible tracks - gg or wig, or else as tab delimited -for you spreadsheet junkies out there. Python is used for all glue and data format yoga. -Originally designed and written for the Rgenetics Galaxy tools by -ross lazarus (ross spot lazarus ate gmail spot com), who didn't write -either Galaxy or Plink but wishes he had. +So, we rely on the author (Shaun Purcell) for the documentation you need specific to those settings - they are very nicely documented - see +DOC_ -copyright Ross Lazarus 2007 -Licensed under the terms of the LGPL as documented http://www.gnu.org/licenses/lgpl.html -but is about as useful as a chocolate teapot without Plink which is GPL. +Tool and Galaxy datatypes originally designed and written for the Rgenetics +series of whole genome scale statistical genetics tools by ross lazarus (ross.lazarus@gmail.com) + +Copyright Ross Lazarus March 2007 +This Galaxy wrapper is released licensed under the LGPL_ but is about as useful as a chocolate teapot without Plink which is GPL. I'm no lawyer, but it looks like you got GPL if you use this software. Good luck. +.. _Plinksrc: http://pngu.mgh.harvard.edu/~purcell/plink/ + +.. _LGPL: http://www.gnu.org/copyleft/lesser.html + +.. 
_DOC: http://pngu.mgh.harvard.edu/~purcell/plink/anal.shtml#cc + </help></tool> --- a/tools/rgenetics/rgCaCo_code.py Thu Mar 24 20:29:40 2011 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,45 +0,0 @@ -# for caco -# called as plinkCaCo.py $i $name $test $outformat $out_file1 $logf $map -from galaxy import datatypes -import time,string,sys - -def timenow(): - """return current time as a string - """ - return time.strftime('%d/%m/%Y %H:%M:%S', time.localtime(time.time())) - -def exec_after_process(app, inp_data, out_data, param_dict, tool, stdout, stderr): - """Sets the name of the data""" - job_name = param_dict.get( 'name', 'RgCaCo' ).encode() - killme = string.punctuation + string.whitespace - trantab = string.maketrans(killme,'_'*len(killme)) - title = job_name.translate(trantab) - indatname = inp_data['i'].name - outxls = ['tabular','%s_CaCo.xls' % job_name] - logtxt = ['txt','%s_CaCo_log.txt' % job_name] - ggout = ['gff','%s_CaCo_topTable.gff' % job_name] - lookup={} - lookup['out_file1'] = outxls - lookup['logf'] = logtxt - lookup['gffout'] = ggout - info = '%s rgCaCo from %s at %s' % (job_name,indatname,timenow()) - for name in lookup.keys(): - data = out_data[name] - data_type,newname = lookup[name] - data.name = newname - data.info = info - out_data[name] = data - app.model.context.flush() - - -def get_test_opts(): - """return test options""" - dat = [('Armitage test for trend chisq. Does NOT assume HWE!','TREND',True), - ('Allelic (A vs a) chisq. assumes HWE','ALLELIC',False), - ('Genotype (AA vs Aa vs aa)chisq. assumes HWE','GENO',False), - ('Dominant model (AA/Aa vs aa) chisq. assumesWE','DOM',False), - ('Recessive (AA vs Aa/aa) chisq. assumes HWE','REC',False)] - dat.reverse() - return dat - - --- a/tools/rgenetics/rgClean.xml Thu Mar 24 20:29:40 2011 -0400 +++ b/tools/rgenetics/rgClean.xml Fri Mar 25 13:12:21 2011 -0400 @@ -1,6 +1,4 @@ <tool id="rgClean1" name="Clean genotypes:"> - <code file="rgClean_code.py"/> - <description>filter markers, subjects</description><command interpreter="python"> @@ -47,7 +45,7 @@ </inputs><outputs> - <data format="pbed" name="out_file1" metadata_source="input_file" /> + <data format="pbed" name="out_file1" metadata_source="input_file" label="${title}_rgClean.pbed" /></outputs><tests> --- a/tools/rgenetics/rgClean_code.py Thu Mar 24 20:29:40 2011 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,28 +0,0 @@ -from galaxy import app -import os, string, time - -def timenow(): - """return current time as a string - """ - return time.strftime('%d/%m/%Y %H:%M:%S', time.localtime(time.time())) - - - -def exec_after_process(app, inp_data, out_data, param_dict, tool, stdout, stderr): - name,data = out_data.items()[0] - basename = param_dict['title'] - basename = basename.encode() - killme = string.punctuation + string.whitespace - trantab = string.maketrans(killme,'_'*len(killme)) - title = basename.translate(trantab) - info = '%s filtered by rgClean.py at %s' % (title,timenow()) - data.change_datatype('pbed') - #data.file_name = data.file_name - data.metadata.base_name = title - data.name = '%s.pbed' % title - data.info = info - data.readonly = True - app.model.context.flush() - - - --- a/tools/rgenetics/rgEigPCA.xml Thu Mar 24 20:29:40 2011 -0400 +++ b/tools/rgenetics/rgEigPCA.xml Fri Mar 25 13:12:21 2011 -0400 @@ -1,5 +1,4 @@ <tool id="rgEigPCA1" name="Eigensoft:"> - <code file="rgEigPCA_code.py"/><description>PCA Ancestry using SNP</description><command interpreter="python"> @@ -27,12 +26,10 @@ </inputs><outputs> - <data 
name="out_file1" format="html" /> - <data name="pca" format="txt" /> + <data name="out_file1" format="html" label="${title}_rgEig.html"/> + <data name="pca" format="txt" label="${title}_rgEig.txt"/></outputs> -<!-- python $TOOLPATH/$TOOL.py "$INPATH/tinywga" "$NPRE" ${OUTPATH}/${NPRE}.html $OUTPATH 4 2 2 2 $OUTPATH/pca.out $BINPATH --> - <tests><test><param name='i' value='tinywga' ftype='ldindep' > @@ -59,17 +56,45 @@ **Syntax** -- **Genotype data** is the input genotype file chosen from available library files. -- **Title** is used to name the output files -- **Tuning parameters** documented in the Eigensoft documentation - see below +- **Genotype data** is an input genotype dataset in Plink lped (http://pngu.mgh.harvard.edu/~purcell/plink/data.shtml) format. See below for notes +- **Title** is used to name the output files so you can remember what the outputs are for +- **Tuning parameters** are documented in the Eigensoft (http://genepath.med.harvard.edu/~reich/Software.htm) documentation - see below -(Note that you may need to convert an existing genotype file into that format to use this tool) ----- **Summary** +Eigensoft requires ld-reduced genotype data. +Galaxy has an automatic converter for genotype data in Plink linkage pedigree (lped) format. +For details of this generic genotype format, please see the Plink documentation at +http://pngu.mgh.harvard.edu/~purcell/plink/data.shtml + +Reading that documentation, you'll see that the linkage pedigree format is really two related files with the same +file base name - a map and ped file - eg 'mygeno.ped' and 'mygeno.map'. +The map file has the chromosome, offset, genetic offset and snp name corresponding to each +genotype stored as separate alleles in the ped file. The ped file has family id, individual id, father id (or 0), mother id +(or 0), gender (1=male, 2=female, 0=unknown) and affection (1=unaffected, 2=affected, 0=unknown), +then two separate allele columns for each genotype. + +Once you have your data in the right format, you can upload those into your Galaxy history using the "upload" tool. + +To upload your lped data in the upload tool, choose 'lped' as the 'file format'. The tool form will change to +allow you to navigate to and select each member of the pair of ped and map files stored on your local computer +(or available at a public URL for Galaxy to grab). +Give the dataset a meaningful name (replace rgeneticsData with something more useful!) and click execute. + +When the upload is done, your new lped format dataset will appear in your history and then, +when you choose the ancestry tool, that history dataset will be available as input. + +**Warning for the Impatient** + +When you execute the tool, it will look like it has not started running for a while as the automatic converter +reduces the amount of LD - otherwise eigenstrat gives biased results. + + **Attribution** + This tool runs and relies on the work of many others, including the maintainers of the Eigensoft program, and the R and Bioconductor projects. For full attribution, source code and documentation, please see @@ -84,7 +109,7 @@ Licensed under the terms of the LGPL as documented http://www.gnu.org/licenses/lgpl.html but is about as useful as a sponge boat without EIGENSOFT pca code. 
-**README from eigensoft2** +**README from eigensoft2 distribution at http://genepath.med.harvard.edu/~reich/Software.htm** [rerla@beast eigensoft2]$ cat README EIGENSOFT version 2.0, January 2008 (for Linux only) --- a/tools/rgenetics/rgEigPCA_code.py Thu Mar 24 20:29:40 2011 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,42 +0,0 @@ -from galaxy import datatypes,model -import sys,time,string,shutil,os - -def timenow(): - """return current time as a string - """ - return time.strftime('%d/%m/%Y %H:%M:%S', time.localtime(time.time())) - -def exec_after_process(app, inp_data, out_data, param_dict, tool, stdout, stderr): - """Sets the name of the html output file - if we created a set of ldreduced files, we need to move them into the input files_path - so it doesn't need to be done again - """ - indat = inp_data['i'] - indatname = indat.name - base_name = indat.metadata.base_name - todir = indat.extra_files_path # where the ldreduced stuff should be - job_name = param_dict.get( 'title', 'Eigenstrat run' ) - job_name = job_name.encode() - killme = string.punctuation + string.whitespace - trantab = string.maketrans(killme,'_'*len(killme)) - job_name = job_name.translate(trantab) - info = '%s rgEigPCA2 on %s at %s' % (job_name,indatname,timenow()) - exts = ['html','txt'] - for i,ofname in enumerate(['out_file1','pca']): - data = out_data[ofname] - ext = exts[i] - newname = '%s.%s' % (job_name,ext) - data.name = newname - data.info = info - out_data[ofname] = data - if i == 0: - fromdir = data.extra_files_path - ldfname = '%s_INDEP_THIN' % base_name # we store ld reduced and thinned data - ldout = os.path.join(todir,ldfname) - ldin = os.path.join(fromdir,ldfname) - if os.path.exists('%s.bed' % ldin) and not os.path.exists('%s.bed' % ldout): # copy ldreduced to input for next time - for ext in ['bim','bed','fam']: - src = '%s.%s' % (ldin,ext) - dest = '%s.%s' % (ldout,ext) - shutil.copy(src,dest) - app.model.context.flush() --- a/tools/rgenetics/rgGLM.py Thu Mar 24 20:29:40 2011 -0400 +++ b/tools/rgenetics/rgGLM.py Fri Mar 25 13:12:21 2011 -0400 @@ -95,6 +95,7 @@ outf.close() + def xformQassoc(resf='',outfname='',logf=None,twd='.'): """ plink.assoc.linear to gg file from the docs --- a/tools/rgenetics/rgGLM.xml Thu Mar 24 20:29:40 2011 -0400 +++ b/tools/rgenetics/rgGLM.xml Fri Mar 25 13:12:21 2011 -0400 @@ -1,18 +1,15 @@ <tool id="rgGLM1" name="Linear Models:" version="0.2"> - <code file="listFiles.py"/> + <description>for genotype data</description><code file="rgGLM_code.py"/> - - <description>for genotype data</description> - <command interpreter="python"> rgGLM.py '$i.extra_files_path/$i.metadata.base_name' '$phef.extra_files_path/$phef.metadata.base_name' - "$title1" '$predvar' '$covar' '$out_file1' '$logf' '$i.metadata.base_name' + "$title" '$predvar' '$covar' '$out_file1' '$logf' '$i.metadata.base_name' '$inter' '$cond' '$gender' '$mind' '$geno' '$maf' '$logistic' '$gffout' </command><inputs><page> - <param name='title1' label='Title for outputs' type='text' value='GLM' size="80" /> + <param name='title' label='Title for outputs' type='text' value='GLM' size="80" /><param name="i" type="data" format="pbed" label="Genotype file" size="80" /><param name="phef" type="data" format="pphe" label="Phenotype file" size="80" help="Dependent variable and covariates will be chosen from this file on the next page"/> @@ -41,9 +38,9 @@ </inputs><outputs> - <data format="tabular" name="out_file1" /> - <data format="txt" name="logf" /> - <data format="gff" name="gffout" /> + <data format="tabular" 
name="out_file1" label="${title}_rgGLM.xls"/> + <data format="txt" name="logf" label="${title}_rgGLMlog.txt" /> + <data format="gff" name="gffout" label="${title}_rgGLM.gff"/></outputs><tests><test> @@ -59,7 +56,7 @@ <composite_data value='tinywga.pphe' /><edit_attributes type='name' value='tinywga' /></param> - <param name='title1' value='rgGLMtest1' /> + <param name='title' value='rgGLMtest1' /><param name='predvar' value='c1' /><param name='covar' value='None' /><param name='inter' value='0' /> @@ -124,18 +121,26 @@ **Attribution** -This tool allows you to control settings for models using Plink linear models. So, we rely on the author (Shaun Purcell) -for the documentation you need specific to those settings - they are very nicely documented at -http://pngu.mgh.harvard.edu/~purcell/plink/anal.shtml#glm +This Galaxy tool relies on Plink (see Plinksrc_) to test GLM models. + +So, we rely on the author (Shaun Purcell) for the documentation you need specific to those settings - they are very nicely documented - see +DOC_ Tool and Galaxy datatypes originally designed and written for the Rgenetics series of whole genome scale statistical genetics tools by ross lazarus (ross.lazarus@gmail.com) -supported by NIH grant -Shaun Purcell created and maintains Plink -Please acknowledge your use of this tool, Galaxy and Plink in your publications and let -us know so we can keep track. These tools all rely on highly competitive grant funding -so your letting us know about publications is important to our ongoing support. +Copyright Ross Lazarus March 2007 +This Galaxy wrapper is released licensed under the LGPL_ but is about as useful as a chocolate teapot without Plink which is GPL. + +I'm no lawyer, but it looks like you got GPL if you use this software. Good luck. + +.. _Plinksrc: http://pngu.mgh.harvard.edu/~purcell/plink/ + +.. _LGPL: http://www.gnu.org/copyleft/lesser.html + +.. 
_DOC: http://pngu.mgh.harvard.edu/~purcell/plink/anal.shtml#glm + </help></tool> + --- a/tools/rgenetics/rgGLM_code.py Thu Mar 24 20:29:40 2011 -0400 +++ b/tools/rgenetics/rgGLM_code.py Fri Mar 25 13:12:21 2011 -0400 @@ -2,17 +2,6 @@ import os,string,time from galaxy import datatypes -def timenow(): - """return current time as a string - """ - return time.strftime('%d/%m/%Y %H:%M:%S', time.localtime(time.time())) - -def get_out_formats(): - """return options for formats""" - dat = [['ucsc track','wig'],['ucsc genome graphs','gg'],['tab delimited','xls']] - dat = [(x[0],x[1],False) for x in dat] - dat.reverse() - return dat def get_phecols(phef='',selectOne=0): """return column names """ @@ -32,28 +21,3 @@ res = [('None','no phenotype columns found',False),] return res - -def exec_after_process(app, inp_data, out_data, param_dict, tool, stdout, stderr): - """Sets the name of the data""" - killme=string.punctuation+string.whitespace - trantab = string.maketrans(killme,'_'*len(killme)) - job_name = param_dict.get( 'title1', 'GLM' ) - job_name = job_name.encode().translate(trantab) - outxls = ['tabular','%s_GLM.xls' % job_name] - logtxt = ['txt','%s_GLM_log.txt' % job_name] - ggout = ['gg','%s_GLM_topTable.gff' % job_name] - lookup={} - lookup['out_file1'] = outxls - lookup['logf'] = logtxt - lookup['gffout'] = ggout - info = '%s GLM output by rgGLM created at %s' % (job_name,timenow()) - for name in lookup.keys(): - data = out_data[name] - data_type,newname = lookup.get(name,(None,None)) - if data_type <> None: - data.name = newname - data.info = info - out_data[name] = data - else: - print >> stdout,'no output matching %s in exec after hook for rgGLM' % name - app.model.context.flush() --- a/tools/rgenetics/rgGRR.xml Thu Mar 24 20:29:40 2011 -0400 +++ b/tools/rgenetics/rgGRR.xml Fri Mar 25 13:12:21 2011 -0400 @@ -1,20 +1,19 @@ <tool id="rgGRR1" name="GRR:"> - <code file="rgGRR_code.py"/><description>Pairwise Allele Sharing</description><command interpreter="python"> rgGRR.py $i.extra_files_path/$i.metadata.base_name "$i.metadata.base_name" - '$out_file1' '$out_file1.files_path' "$title1" '$n' '$Z' + '$out_file1' '$out_file1.files_path' "$title" '$n' '$Z' </command><inputs><param name="i" type="data" label="Genotype data file from your current history" format="ldindep" /> - <param name='title1' type='text' size="80" value='rgGRR' label="Title for this job"/> + <param name='title' type='text' size="80" value='rgGRR' label="Title for this job"/><param name="n" type="integer" label="N snps to use (0=all)" value="5000" /><param name="Z" type="float" label="Z score cutoff for outliers (eg 2)" value="6" help="2 works but for very large numbers of pairs, you might want to see less than 5%" /></inputs><outputs> - <data format="html" name="out_file1" /> + <data format="html" name="out_file1" label="${title}_rgGRR.html"/></outputs><tests> @@ -26,7 +25,7 @@ <composite_data value='tinywga.fam' /><edit_attributes type='name' value='tinywga' /></param> - <param name='title1' value='rgGRRtest1' /> + <param name='title' value='rgGRRtest1' /><param name='n' value='100' /><param name='Z' value='6' /><param name='force' value='true' /> --- a/tools/rgenetics/rgGRR_code.py Thu Mar 24 20:29:40 2011 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,37 +0,0 @@ -from galaxy import datatypes,model -import sys,time,string,os,shutil - -def timenow(): - """return current time as a string - """ - return time.strftime('%d/%m/%Y %H:%M:%S', time.localtime(time.time())) - -def exec_after_process(app, inp_data, 
out_data, param_dict, tool, stdout, stderr): - """Sets the name of the html output file - """ - killme = string.punctuation + string.whitespace - trantab = string.maketrans(killme,'_'*len(killme)) - job_name = param_dict.get( 'title1', 'rgGRR' ) - job_name = job_name.encode() - newname = '%s.html' % job_name.translate(trantab) - indatname = inp_data['i'].name - info = '%s Mean allele sharing on %s at %s' % (job_name,indatname,timenow()) - ofname = 'out_file1' - data = out_data[ofname] - data.name = newname - data.info = info - out_data[ofname] = data - fromdir = data.extra_files_path - indat = inp_data['i'] - indatname = indat.name - base_name = indat.metadata.base_name - todir = indat.extra_files_path # where the ldreduced stuff should be - ldfname = '%s_INDEP_THIN' % base_name # we store ld reduced and thinned data - ldout = os.path.join(todir,ldfname) - ldin = os.path.join(fromdir,ldfname) - if os.path.exists('%s.bed' % ldin) and not os.path.exists('%s.bed' % ldout): # copy ldreduced to input for next time - for ext in ['bim','bed','fam']: - src = '%s.%s' % (ldin,ext) - dest = '%s.%s' % (ldout,ext) - shutil.copy(src,dest) - app.model.context.flush() --- a/tools/rgenetics/rgGTOOL.xml Thu Mar 24 20:29:40 2011 -0400 +++ b/tools/rgenetics/rgGTOOL.xml Fri Mar 25 13:12:21 2011 -0400 @@ -1,6 +1,5 @@ <tool id="rgGTOOL1" name="Converter"> - <code file="listFiles.py"/> - <code file="rgGTOOL_code.py"/> + <description>from linkage format to SNPTEST Marchini files</description> --- a/tools/rgenetics/rgGTOOL_code.py Thu Mar 24 20:29:40 2011 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,19 +0,0 @@ -# for GTOOL linkage ped to marchini snptest format -from galaxy import datatypes - - -def exec_before_job(app, inp_data, out_data, param_dict, tool=None): - """Sets the name of the data""" - job_name = param_dict.get( 'o', 'Marchini' ) - log = ['txt','%s_rgGTOOL_report.log' % job_name,'txt'] - lookup={} - lookup['logf'] = log - for name in lookup.keys(): - data = out_data[name] - data_type,newname,ext = lookup[name] - data = app.datatypes_registry.change_datatype(data, data_type) - data.name = newname - out_data[name] = data - - - --- a/tools/rgenetics/rgHaploView.py Thu Mar 24 20:29:40 2011 -0400 +++ b/tools/rgenetics/rgHaploView.py Fri Mar 25 13:12:21 2011 -0400 @@ -105,15 +105,8 @@ self.lf = file(self.log_file,'w') s = 'PATH=%s\n' % os.environ.get('PATH','?') self.lf.write(s) - - def setupRegions(self): - """ - """ - chromosome = '' - spos = epos = -9 - rslist = [] - rsdict = {} + def getRs(self): if self.region > '': useRs = [] useRsdict={} @@ -138,6 +131,20 @@ else: useRs = self.orslist.split() # galaxy replaces newlines with XX - go figure useRsdict = dict(zip(useRs,useRs)) + return useRs, useRsdict + + + def setupRegions(self): + """ + This turns out to be complex because we allow the user + flexibility - paste a list of rs or give a region. + In most cases, some subset has to be generated correctly before running Haploview + """ + chromosome = '' + spos = epos = -9 + rslist = [] + rsdict = {} + useRs,useRsdict = self.getRs() self.useTemp = False try: dfile = open(self.DATA_FILE, 'r') @@ -284,7 +291,60 @@ self.lf.write(s) print >> sys.stdout,s sys.exit(1) - + + def run(self,vcl): + """ + """ + p=subprocess.Popen(vcl,shell=True,cwd=self.outfpath,stderr=self.lf,stdout=self.lf) + retval = p.wait() + self.lf.write('## executing %s returned %d\n' % (vcl,retval)) + + def plotHmPanels(self,ste): + """ + """ + sp = '%d' % (self.spos/1000.) # hapmap wants kb + ep = '%d' % (self.epos/1000.) 
+ fnum=0 + for panel in self.hmpanels: + if panel > '' and panel.lower() <> 'none': # in case someone checks that option too :) + ptran = panel.strip() + ptran = ptran.replace('+','_') + fnum += 1 # preserve an order or else we get sorted + vcl = [javabin,'-jar',self.hvbin,'-n','-memory','%d' % self.memSize, + '-chromosome',self.chromosome, '-panel',panel.strip(), + '-hapmapDownload','-startpos',sp,'-endpos',ep, + '-ldcolorscheme',self.ldType] + if self.minMaf: + vcl += ['-minMaf','%f' % self.minMaf] + if self.maxDist: + vcl += ['-maxDistance',self.maxDist] + if self.hiRes: + vcl.append('-png') + else: + vcl.append('-compressedpng') + if self.infotrack: + vcl.append('-infoTrack') + p=subprocess.Popen(' '.join(vcl),shell=True,cwd=self.outfpath,stderr=ste,stdout=self.lf) + retval = p.wait() + inpng = 'Chromosome%s%s.LD.PNG' % (self.chromosome,panel) + inpng = inpng.replace(' ','') # mysterious spaces! + outpng = '%d_HapMap_%s_%s.png' % (fnum,ptran,self.chromosome) + # hack for stupid chb+jpt + outpng = outpng.replace(' ','') + tmppng = '%s.tmp.png' % self.title + tmppng = tmppng.replace(' ','') + outpng = os.path.split(outpng)[-1] + vcl = [self.convert, '-resize 800x400!', inpng, tmppng] + self.run(' '.join(vcl)) + s = "text 10,300 'HapMap %s'" % ptran.strip() + vcl = [self.convert, '-pointsize 25','-fill maroon', + '-draw "%s"' % s, tmppng, outpng] + self.run(' '.join(vcl)) + try: + os.remove(os.path.join(self.outfpath,tmppng)) + except: + pass + def doPlots(self): """ """ @@ -308,15 +368,9 @@ vcl += ['-chromosome',self.chromosome] if self.infotrack: vcl.append('-infoTrack') - p=subprocess.Popen(' '.join(vcl),shell=True,cwd=self.outfpath,stderr=ste,stdout=self.lf) - retval = p.wait() - s = '## executing %s returned %d\n' % (' '.join(vcl),retval) - self.lf.write(s) + self.run(' '.join(vcl)) vcl = [self.mogrify, '-resize 800x400!', '*.PNG'] - p=subprocess.Popen(' '.join(vcl),shell=True,cwd=self.outfpath,stderr=self.lf,stdout=self.lf) - retval = p.wait() - s = '## executing %s returned %d\n' % (' '.join(vcl),retval) - self.lf.write(s) + self.run(' '.join(vcl)) inpng = '%s.LD.PNG' % DATA_FILE # stupid but necessary - can't control haploview name mangle inpng = inpng.replace(' ','') inpng = os.path.split(inpng)[-1] @@ -326,89 +380,29 @@ outpng = outpng.replace(' ','') outpng = os.path.split(outpng)[-1] vcl = [self.convert, '-resize 800x400!', inpng, tmppng] - p=subprocess.Popen(' '.join(vcl),shell=True,cwd=self.outfpath,stderr=self.lf,stdout=self.lf) - retval = p.wait() - s = '## executing %s returned %d\n' % (' '.join(vcl),retval) - self.lf.write(s) + self.run(' '.join(vcl)) s = "text 10,300 '%s'" % self.title[:40] vcl = [self.convert, '-pointsize 25','-fill maroon', '-draw "%s"' % s, tmppng, outpng] - p=subprocess.Popen(' '.join(vcl),shell=True,cwd=self.outfpath,stderr=self.lf,stdout=self.lf) - retval = p.wait() - s = '## executing %s returned %d\n' % (' '.join(vcl),retval) - self.lf.write(s) + self.run(' '.join(vcl)) try: os.remove(os.path.join(self.outfpath,tmppng)) except: pass # label all the plots then delete all the .PNG files before munging fnum=1 if self.hmpanels: - sp = '%d' % (self.spos/1000.) # hapmap wants kb - ep = '%d' % (self.epos/1000.) 
- for panel in self.hmpanels: - if panel > '' and panel.lower() <> 'none': # in case someone checks that option too :) - ptran = panel.strip() - ptran = ptran.replace('+','_') - fnum += 1 # preserve an order or else we get sorted - vcl = [javabin,'-jar',self.hvbin,'-n','-memory','%d' % self.memSize, - '-chromosome',self.chromosome, '-panel',panel.strip(), - '-hapmapDownload','-startpos',sp,'-endpos',ep, - '-ldcolorscheme',self.ldType] - if self.minMaf: - vcl += ['-minMaf','%f' % self.minMaf] - if self.maxDist: - vcl += ['-maxDistance',self.maxDist] - if self.hiRes: - vcl.append('-png') - else: - vcl.append('-compressedpng') - if self.infotrack: - vcl.append('-infoTrack') - p=subprocess.Popen(' '.join(vcl),shell=True,cwd=self.outfpath,stderr=ste,stdout=self.lf) - retval = p.wait() - inpng = 'Chromosome%s%s.LD.PNG' % (self.chromosome,panel) - inpng = inpng.replace(' ','') # mysterious spaces! - outpng = '%d_HapMap_%s_%s.png' % (fnum,ptran,self.chromosome) - # hack for stupid chb+jpt - outpng = outpng.replace(' ','') - tmppng = '%s.tmp.png' % self.title - tmppng = tmppng.replace(' ','') - outpng = os.path.split(outpng)[-1] - vcl = [self.convert, '-resize 800x400!', inpng, tmppng] - p=subprocess.Popen(' '.join(vcl),shell=True,cwd=self.outfpath,stderr=self.lf,stdout=self.lf) - retval = p.wait() - s = '## executing %s returned %d\n' % (' '.join(vcl),retval) - self.lf.write(s) - s = "text 10,300 'HapMap %s'" % ptran.strip() - vcl = [self.convert, '-pointsize 25','-fill maroon', - '-draw "%s"' % s, tmppng, outpng] - p=subprocess.Popen(' '.join(vcl),shell=True,cwd=self.outfpath,stderr=self.lf,stdout=self.lf) - retval = p.wait() - s = '## executing %s returned %d\n' % (' '.join(vcl),retval) - self.lf.write(s) - try: - os.remove(os.path.join(self.outfpath,tmppng)) - except: - pass + self.plotHmPanels(ste) nimages = len(glob.glob(os.path.join(self.outfpath,'*.png'))) # rely on HaploView shouting - PNG @! self.lf.write('### nimages=%d\n' % nimages) if nimages > 0: # haploview may fail? vcl = '%s -format pdf -resize 800x400! 
*.png' % self.mogrify - p=subprocess.Popen(vcl,shell=True,cwd=self.outfpath,stderr=self.lf,stdout=self.lf) - retval = p.wait() - self.lf.write('## executing %s returned %d\n' % (vcl,retval)) + self.run(vcl) vcl = '%s *.pdf --fitpaper true --outfile alljoin.pdf' % self.pdfjoin - p=subprocess.Popen(vcl,shell=True,cwd=self.outfpath,stderr=self.lf,stdout=self.lf) - retval = p.wait() - self.lf.write('## executing %s returned %d\n' % (vcl,retval)) + self.run(vcl) vcl = '%s alljoin.pdf --nup 1x%d --outfile allnup.pdf' % (self.pdfnup,nimages) - p=subprocess.Popen(vcl,shell=True,cwd=self.outfpath,stderr=self.lf,stdout=self.lf) - retval = p.wait() - self.lf.write('## executing %s returned %d\n' % (vcl,retval)) + self.run(vcl) vcl = '%s -resize x300 allnup.pdf allnup.png' % (self.convert) - p=subprocess.Popen(vcl,shell=True,cwd=self.outfpath,stderr=self.lf,stdout=self.lf) - retval = p.wait() - self.lf.write('## executing %s returned %d\n' % (vcl,retval)) + self.run(vcl) ste.close() # temp file used to catch haploview blather hblather = open(blog,'r').readlines() # to catch the blather os.unlink(blog) @@ -417,6 +411,10 @@ self.lf.write(''.join(hblather)) self.lf.write('\n') self.lf.close() + + def writeHtml(self): + """ + """ flist = glob.glob(os.path.join(self.outfpath, '*')) flist.sort() ts = '!"#$%&\'()*+,-/:;<=>?@[\\]^_`{|}~' + string.whitespace @@ -509,6 +507,7 @@ sys.exit(1) ld = ldPlot(argv = sys.argv) ld.doPlots() + ld.writeHtml() --- a/tools/rgenetics/rgHaploView_code.py Thu Mar 24 20:29:40 2011 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,84 +0,0 @@ -from galaxy import app -import galaxy.util,string,os,glob,time - -librepos = '/usr/local/galaxy/data/rg/library' -myrepos = '/home/rerla/galaxy' - - -#Provides Upload tool with access to list of available builds - -builds = [] -#Read build names and keys from galaxy.util -for dbkey, build_name in galaxy.util.dbnames: - builds.append((build_name,dbkey,False)) - -def timenow(): - """return current time as a string - """ - return time.strftime('%d/%m/%Y %H:%M:%S', time.localtime(time.time())) - -#Return available builds -def get_available_builds(defval='hg18'): - for i,x in enumerate(builds): - if x[1] == defval: - x = list(x) - x[2] = True - builds[i] = tuple(x) - return builds - -#return available datasets for build -def get_available_data( file_type_dir, build='hg18' ): - #we need to allow switching of builds, and properly set in post run hook - import_files = [('Use the History file chosen below','',True),] - flist = glob.glob(os.path.join(librepos,file_type_dir,'*.ped')) - maplist = glob.glob(os.path.join(librepos,file_type_dir,'*.map')) - maplist = [os.path.splitext(x)[0] for x in maplist] # get unique filenames - mapdict = dict(zip(maplist,maplist)) - flist = [os.path.splitext(x)[0] for x in flist] # get unique filenames - flist = list(set(flist)) # remove dupes - flist = [x for x in flist if mapdict.get(x,None)] - flist.sort() - for i, data in enumerate( flist ): - #import_files.append( (os.path.split(data)[-1], os.path.split(data)[-1], False) ) - import_files.append((data,data, False) ) - if len(import_files) < 1: - import_files = [('No %s data available - please choose a History file below' - % file_type_dir,'',True),] - return import_files - -def get_available_outexts(uid): - userId = uid - print 'userId=',userId - flib = os.path.join(librepos,userId,'fped') - plib = os.path.join(librepos,userId,'lped') - e = [('Fbat style (header row has marker names)',flib,False),('Linkage style (separate .map file)',plib,True)] - return e 
- -def getcols(fname="/usr/local/galaxy/data/camp2007/camp2007.xls"): - """return column names other than chr offset as a select list""" - head = open(fname,'r').next() - c = head.strip().split() - res = [(cname,'%d' % n,False) for n,cname in enumerate(c)] - for i in range(4): - x,y,z = res[i] - res[i] = (x,y,True) # set first couple as selected - return res - - -def exec_after_process(app, inp_data, out_data, param_dict, tool, stdout, stderr): - """Sets the name of the html output file - that's all we need - for a html file result set - """ - ts = '%s%s' % (string.punctuation,string.whitespace) - ptran = string.maketrans(ts,'_'*len(ts)) - job_name = param_dict.get( 'title', 'LD_Plot' ) - job_name = job_name.encode().translate(ptran) - ofname = 'out_file1' - data = out_data[ofname] - newname = job_name - data.name = '%s.html' % newname - info = '%s created by rgHaploView at %s' % (job_name,timenow()) - out_data[ofname] = data - app.model.context.flush() - - --- a/tools/rgenetics/rgLDIndep_code.py Thu Mar 24 20:29:40 2011 -0400 +++ b/tools/rgenetics/rgLDIndep_code.py Fri Mar 25 13:12:21 2011 -0400 @@ -15,7 +15,7 @@ trantab = string.maketrans(killme,'_'*len(killme)) title = basename.encode().translate(trantab) info = '%s filtered by rgLDIndep.py at %s' % (title,timenow()) - #data.file_name = data.file_name + data.file_name = data.file_name data.metadata.base_name = title data.name = '%s.pbed' % title data.info = info --- a/tools/rgenetics/rgManQQ.xml Thu Mar 24 20:29:40 2011 -0400 +++ b/tools/rgenetics/rgManQQ.xml Fri Mar 25 13:12:21 2011 -0400 @@ -44,9 +44,9 @@ <param name='grey' value='0' /><output name='out_html' file='rgtestouts/rgManQQ/rgManQQtest1.html' ftype='html' lines_diff='60'><extra_files type="file" name='Allelep_manhattan.png' value='rgtestouts/rgManQQ/Allelep_manhattan.png' compare="sim_size" - delta = "10000"/> + delta = "20000"/><extra_files type="file" name='Allelep_qqplot.png' value='rgtestouts/rgManQQ/Allelep_qqplot.png' compare="sim_size" - delta = "10000" /> + delta = "20000" /><extra_files type="file" name='rgManQQtest1.R' value='rgtestouts/rgManQQ/rgManQQtest1.R' compare="diff" lines_diff="160"/></output></test> --- a/tools/rgenetics/rgPedSub.xml Thu Mar 24 20:29:40 2011 -0400 +++ b/tools/rgenetics/rgPedSub.xml Fri Mar 25 13:12:21 2011 -0400 @@ -1,5 +1,4 @@ <tool id="rgPedSub1" name="Subset markers:"> - <code file="rgPedSub_code.py"/><description>region or rs list</description> @@ -44,7 +43,7 @@ </inputs><outputs> - <data format="lped" name="output1" metadata_source="input1"/> + <data format="lped" name="output1" metadata_source="input1" label="${title}.lped"/></outputs><configfiles> --- a/tools/rgenetics/rgPedSub_code.py Thu Mar 24 20:29:40 2011 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,78 +0,0 @@ -from galaxy import app -import galaxy.util,string,os,glob,time - -#Provides Upload tool with access to list of available builds - -builds = [] -#Read build names and keys from galaxy.util -for dbkey, build_name in galaxy.util.dbnames: - builds.append((build_name,dbkey,False)) - -def timenow(): - """return current time as a string - """ - return time.strftime('%d/%m/%Y %H:%M:%S', time.localtime(time.time())) - - -#Return available builds -def get_available_builds(defval='hg18'): - for i,x in enumerate(builds): - if x[1] == defval: - x = list(x) - x[2] = True - builds[i] = tuple(x) - return builds - -#return available datasets for build -def get_available_data( file_type_dir, build='hg18' ): - #we need to allow switching of builds, and properly set in post run 
hook - import_files = [] # [('Use the History file chosen below','',False),] - flist = glob.glob(os.path.join(librepos,file_type_dir,'*.ped')) - flist = [os.path.splitext(x)[0] for x in flist] # get unique filenames - flist = list(set(flist)) # remove dupes - flist.sort() - for i, data in enumerate( flist ): - #import_files.append( (os.path.split(data)[-1], os.path.split(data)[-1], False) ) - import_files.append((os.path.split(data)[-1],data, False) ) - if len(import_files) < 1: - import_files.append(('No %s data available - please choose a History file instead' - % file_type_dir,'',True)) - return import_files - -def get_available_outexts(uid): - userId = uid - print 'userId=',userId - flib = os.path.join(librepos,userId,'fped') - plib = os.path.join(librepos,userId,'lped') - e = [('Fbat style (header row has marker names)',flib,False),('Linkage style (separate .map file)',plib,True)] - return e - -def getcols(fname="/usr/local/galaxy/data/camp2007/camp2007.xls"): - """return column names other than chr offset as a select list""" - head = open(fname,'r').next() - c = head.strip().split() - res = [(cname,'%d' % n,False) for n,cname in enumerate(c)] - for i in range(4): - x,y,z = res[i] - res[i] = (x,y,True) # set first couple as selected - return res - - -def exec_after_process(app, inp_data, out_data, param_dict, tool, stdout, stderr): - """from rgConvert_code - """ - name,data = out_data.items()[0] - iname,idata = inp_data.items()[0] - basename = idata.metadata.base_name - title = param_dict.get( 'title', 'Lped Subset' ) - title = title.encode() # make str - unicode is evil here - killme = string.punctuation + string.whitespace - trantab = string.maketrans(killme,'_'*len(killme)) - title = title.translate(trantab) - info = '%s filtered by rgPedSub.py at %s' % (title,timenow()) - #data.file_name = data.file_name - data.metadata.base_name = basename - data.name = '%s.lped' % title - data.info = info - app.model.context.flush() - --- a/tools/rgenetics/rgQC.xml Thu Mar 24 20:29:40 2011 -0400 +++ b/tools/rgenetics/rgQC.xml Fri Mar 25 13:12:21 2011 -0400 @@ -1,21 +1,20 @@ <tool id="rgQC1" name="QC reports:"> - <code file="rgQC_code.py"/><description>Marker and Subject measures</description><command interpreter="python"> - rgQC.py -i '$input_file.extra_files_path/$input_file.metadata.base_name' -o "$out_prefix" + rgQC.py -i '$input_file.extra_files_path/$input_file.metadata.base_name' -o "$title" -s '$html_file' -p '$html_file.files_path' </command><inputs><param name="input_file" type="data" label="RGenetics genotype file in compressed Plink format" size="80" format="pbed" /> - <param name="out_prefix" size="80" type="text" value="RgQC report" label="Descriptive report title"/> + <param name="title" size="80" type="text" value="RgQC report" label="Descriptive report title"/></inputs><outputs> - <data format="html" name="html_file" metadata_source="input_file"/> + <data format="html" name="html_file" metadata_source="input_file" label="${title}.html"/></outputs><tests> @@ -27,9 +26,9 @@ <composite_data value='tinywga.fam' /><edit_attributes type='name' value='tinywga' /></param> - <param name='out_prefix' value='rgQCtest1' /> - <output name='html_file' file='rgtestouts/rgQC/rgQCtest1.html' ftype='html' lines_diff='280'> - <extra_files type="file" name='tinywga_All_Paged.pdf' value="rgtestouts/rgQC/tinywga_All_Paged.pdf" compare="sim_size" delta = "5000"/> + <param name='title' value='rgQCtest1' /> + <output name='html_file' file='rgtestouts/rgQC/rgQCtest1.html' ftype='html' lines_diff='300'> + 
<extra_files type="file" name='tinywga_All_Paged.pdf' value="rgtestouts/rgQC/tinywga_All_Paged.pdf" compare="sim_size" delta = "100000"/><extra_files type="file" name='tinywga.log' value="rgtestouts/rgQC/tinywga.log" compare="diff" lines_diff="15"/><extra_files type="file" name='tinywga.frq' value="rgtestouts/rgQC/tinywga.frq" compare="diff" /><extra_files type="file" name='tinywga.het' value="rgtestouts/rgQC/tinywga.het" compare="diff" lines_diff="90"/> @@ -38,7 +37,7 @@ <extra_files type="file" name='tinywga.imiss' value="rgtestouts/rgQC/tinywga.imiss" compare="diff" /><extra_files type="file" name='tinywga.lmendel' value="rgtestouts/rgQC/tinywga.lmendel" compare="diff" /><extra_files type="file" name='tinywga.lmiss' value="rgtestouts/rgQC/tinywga.lmiss" compare="diff" /> - <extra_files type="file" name='tinywga_All_3x3.pdf' value="rgtestouts/rgQC/tinywga_All_3x3.pdf" compare="sim_size" delta="5000"/> + <extra_files type="file" name='tinywga_All_3x3.pdf' value="rgtestouts/rgQC/tinywga_All_3x3.pdf" compare="sim_size" delta="100000"/><extra_files type="file" name='ldp_tinywga.bed' value="rgtestouts/rgQC/ldp_tinywga.bed" compare="diff" lines_diff="10" /><extra_files type="file" name='ldp_tinywga.bim' value="rgtestouts/rgQC/ldp_tinywga.bim" compare="sim_size" delta="1000" /><extra_files type="file" name='ldp_tinywga.fam' value="rgtestouts/rgQC/ldp_tinywga.fam" compare="diff" /> @@ -46,9 +45,9 @@ <extra_files type="file" name='Ranked_Marker_HWE.xls' value="rgtestouts/rgQC/Ranked_Marker_HWE.xls" compare="diff" /><extra_files type="file" name='Ranked_Marker_MAF.xls' value="rgtestouts/rgQC/Ranked_Marker_MAF.xls" compare="diff" /><extra_files type="file" name='Ranked_Marker_Missing_Genotype.xls' value="rgtestouts/rgQC/Ranked_Marker_Missing_Genotype.xls" compare="diff" lines_diff="5"/> - <extra_files type="file" name='Ranked_Subject_Missing_Genotype.xls' value="rgtestouts/rgQC/Ranked_Subject_Missing_Genotype.xls" compare="diff" lines_diff="10"/> + <extra_files type="file" name='Ranked_Subject_Missing_Genotype.xls' value="rgtestouts/rgQC/Ranked_Subject_Missing_Genotype.xls" compare="diff" lines_diff="40"/><extra_files type="file" name='tinywga_fracmiss_cum.jpg' value="rgtestouts/rgQC/tinywga_fracmiss_cum.jpg" compare="sim_size" delta = "20000"/> - <extra_files type="file" name='tinywga_fracmiss_cum.pdf' value="rgtestouts/rgQC/tinywga_fracmiss_cum.pdf" compare="sim_size" delta = "1000"/> + <extra_files type="file" name='tinywga_fracmiss_cum.pdf' value="rgtestouts/rgQC/tinywga_fracmiss_cum.pdf" compare="sim_size" delta = "100000"/></output></test></tests> --- a/tools/rgenetics/rgQC_code.py Thu Mar 24 20:29:40 2011 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,38 +0,0 @@ -""" -# after running the qc, need to rename various output files - <data format="html" name="html_file" /> - <data format="txt" name="log_file" parent="html_file" /> - <data format="tabular" name="marker_file" parent="html_file" /> - <data format="tabular" name="subject_file" parent="html_file" /> - <data format="tabular" name="freq_file" parent="html_file" /> - </outputs> -""" -from galaxy import datatypes,model -import sys,time - -def timenow(): - """return current time as a string - """ - return time.strftime('%d/%m/%Y %H:%M:%S', time.localtime(time.time())) - - -def exec_after_process(app, inp_data, out_data, param_dict, tool, stdout, stderr): - """Change data file names - - """ - job_name = param_dict.get( 'out_prefix', 'rgQCdefault' ) - html = ['html','%s.html' % job_name] - lookup={} - lookup['html_file'] = html - 
info = '%s QC report by rgQC at %s' % (job_name,timenow()) - for aname in lookup.keys(): - data = out_data[aname] - data_type,newname = lookup[aname] - data = app.datatypes_registry.change_datatype(data, data_type) - data.name = newname - data.info = info - out_data[aname] = data - app.model.context.flush() - - - --- a/tools/rgenetics/rgQQ.xml Thu Mar 24 20:29:40 2011 -0400 +++ b/tools/rgenetics/rgQQ.xml Fri Mar 25 13:12:21 2011 -0400 @@ -4,14 +4,14 @@ <description>for p values from an analysis </description><command interpreter="python"> - rgQQ.py "$input1" "$name" "$sample" "$cols" "$allqq" "$height" "$width" "$logtrans" "$allqq.id" "$__new_file_path__" + rgQQ.py "$input1" "$title" "$sample" "$cols" "$allqq" "$height" "$width" "$logtrans" "$allqq.id" "$__new_file_path__" </command><inputs><page><param name="input1" type="data" label="Choose the History dataset containing p values to QQ plot" size="80" format="tabular" help="Dataset missing? See Tip below" /> - <param name="name" type="text" size="80" label = "Descriptive title for QQ plot" value="QQ" /> + <param name="title" type="text" size="80" label = "Descriptive title for QQ plot" value="QQ" /><param name="logtrans" type="boolean" label = "Use a log scale - recommended for p values in range 0-1.0" truevalue="true" falsevalue="false"/> @@ -28,13 +28,13 @@ </inputs><outputs> - <data format="pdf" name="allqq" /> + <data format="pdf" name="allqq" label="${title}.html"/></outputs><tests><test><param name='input1' value='tinywga.pphe' /> - <param name='name' value="rgQQtest1" /> + <param name='title' value="rgQQtest1" /><param name='logtrans' value="false" /><param name='sample' value='1.0' /><param name='height' value='8' /> --- a/tools/rgenetics/rgQQ_code.py Thu Mar 24 20:29:40 2011 -0400 +++ b/tools/rgenetics/rgQQ_code.py Fri Mar 25 13:12:21 2011 -0400 @@ -48,25 +48,3 @@ else: columns = [('?','?',False),] return columns - - -def exec_after_process(app, inp_data, out_data, param_dict, tool, stdout, stderr): - """Sets the name of the data - <outputs> - <data format="pdf" name="allqq" /> - <data format="pdf" name="lowqq" parent="allqq"/> - </outputs> - """ - job_name = param_dict.get( 'name', 'My rgQQplot' ) - killme = string.punctuation + string.whitespace - trantab = string.maketrans(killme,'_'*len(killme)) - newname = '%s.pdf' % job_name.translate(trantab) - lookup={} - lookup['allqq'] = newname - for aname in lookup.keys(): - data = out_data[aname] - newname = lookup[aname] - data.name = newname - out_data[aname] = data - app.model.context.flush() - --- a/tools/rgenetics/rgRegion.xml Thu Mar 24 20:29:40 2011 -0400 +++ b/tools/rgenetics/rgRegion.xml Fri Mar 25 13:12:21 2011 -0400 @@ -1,24 +1,20 @@ <tool id="rgRegion" name="Subset:"> - <code file="rgRegion_code.py"/> - <code file="listFiles.py"/> - <description>genotypes from genomic region</description><command interpreter="python"> - rgRegion.py $infile $r $tag $out_file1 + rgRegion.py $infile $r $title $out_file1 </command><inputs><page><param name="infile" type="data" format="lped" label="Linkage ped genotype file name from current history" size="80"/> + <param name="title" type="text" size="80" label="Title for output files" optional="true" + help="Descriptive title for new genotype/map files" value="RGRegion" /><param name="r" type="text" label="Region" help="Cut and paste a UCSC browser region" size="80" value="chr9:119,506,000-122,518,000"/><param name="rslist" type="text" area="true" label="List of rs numbers" help="Type (or cut and paste) a space or newline separated list of 
rs numbers" size="5x20"/> - - <param name="tag" type="text" label="Output file name" value="My_favourite_region" size="80"/> - <param name="outformat" type="select" label="Output file format" dynamic_options="get_rgRegionOutFormats()" size="80"/> - <param name="dbkey" type="hidden" value="hg18" /> + <param name="outformat" type="select" label="Output file format" dynamic_options="get_rgRegionOutFormats()" size="80"/></page> @@ -26,7 +22,7 @@ </inputs><outputs> - <data format="lped" name="out_file1" /> + <data format="lped" name="out_file1" label="${title}.lped" metadata_source=infile /></outputs><help> --- a/tools/rgenetics/rgRegion_code.py Thu Mar 24 20:29:40 2011 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,47 +0,0 @@ -from galaxy import app -import galaxy.util,string - -librepos = '/usr/local/galaxy/data/rg' -myrepos = '/home/rerla/galaxy' -marchinirepos = '/usr/local/galaxy/data/rg/snptest' - -#Provides Upload tool with access to list of available builds - - -def get_rgRegionOutFormats(): - """return options for formats""" - dat = [['ucsc track','wig',False],['Strict genome graphs (rs+floats)','gg',True],['tab delimited','xls',False]] - dat = [(x[0],x[1],x[2]) for x in dat] - return dat - - -def get_phecols(phef): - """return column names """ - head = open(phef,'r').next() - c = head.strip().split() - res = [(cname,cname,False) for cname in c] - x,y,z = res[2] # 0,1 = fid,iid - res[2] = (x,y,True) # set second selected - return res - - -def getAllcols(fname="/usr/local/galaxy/data/camp2007/camp2007.xls",outformat='gg'): - """return column names other than chr offset as a select list""" - head = open(fname,'r').next() - c = head.strip().split() - res = [(cname,'%d' % n,True) for n,cname in enumerate(c)] - - return res - - -def exec_after_process(app, inp_data, out_data, param_dict, tool, stdout, stderr): - """Sets the name of the data""" - data_name = param_dict.get( 'tag', 'My region' ) - outformat = param_dict.get( 'outformat', 'gg' ) - outfile = param_dict.get( 'outfile1', 'lped' ) - for name, data in out_data.items(): - if name == 'tag': - data = app.datatypes_registry.change_datatype(data, outformat) - data.name = data_name - out_data[name] = data - --- a/tools/rgenetics/rgTDT.py Thu Mar 24 20:29:40 2011 -0400 +++ b/tools/rgenetics/rgTDT.py Fri Mar 25 13:12:21 2011 -0400 @@ -26,6 +26,7 @@ verbose = False + def makeGFF(resf='',outfname='',logf=None,twd='.',name='track name',description='track description',topn=1000): """ score must be scaled to 0-1000 @@ -84,7 +85,7 @@ outres = [] # need to resort into chrom offset order for i,lrow in enumerate(resfl): chrom,snp,offset,p, = [lrow[x] for x in wewant] - gff = ('chr%s' % chrom,'rgGLM','variation','%d' % (int(offset)-halfwidth), + gff = ('chr%s' % chrom,'rgTDT','variation','%d' % (int(offset)-halfwidth), '%d' % (int(offset)+halfwidth),p,'.','.','%s logp=%1.2f' % (snp,pvals[i])) outres.append(gff) outres = [(x[0],int(x[3]),x) for x in outres] # decorate --- a/tools/rgenetics/rgTDT.xml Thu Mar 24 20:29:40 2011 -0400 +++ b/tools/rgenetics/rgTDT.xml Fri Mar 25 13:12:21 2011 -0400 @@ -1,23 +1,21 @@ <tool id="rgTDT1" name="Transmission Distortion:"> - <code file="rgTDT_code.py"/> - <description>for family data</description><command interpreter="python"> - rgTDT.py -i '$i.extra_files_path/$i.metadata.base_name' -o '$title1' + rgTDT.py -i '$i.extra_files_path/$i.metadata.base_name' -o '$title' -r '$out_file1' -l '$logf' -g '$gffout' </command><inputs><param name="i" type="data" label="Genotypes for analysis from your current history 
datasets" size="132" format="pbed" /> - <param name='title1' type='text' value='rgTDT' size="80"/> + <param name='title' type='text' value='rgTDT' label="Title for the output to remind you what you did" size="80"/></inputs><outputs> - <data format="tabular" name="out_file1" /> - <data format="gff" name="gffout" /> - <data format="txt" name="logf" /> + <data format="tabular" name="out_file1" label="${title}_rgTDT.xls"/> + <data format="gff" name="gffout" label="${title}_rgTDT.gff"/> + <data format="txt" name="logf" label="${title}_rgTDTlog.txt"/></outputs><tests> @@ -29,7 +27,7 @@ <composite_data value='tinywga.fam' /><edit_attributes type='name' value='tinywga' /></param> - <param name='title1' value='rgTDTtest1' /> + <param name='title' value='rgTDTtest1' /><output name='out_file1' file='rgTDTtest1_TDT.xls' ftype='tabular' compare="diff"/><output name='gffout' file='rgTDTtest1_TDT_topTable.gff' ftype='gff' compare="diff" /><output name='logf' file='rgTDTtest1_TDT_log.txt' ftype='txt' lines_diff='79'/> @@ -81,5 +79,30 @@ one with a header. Fortunately excel is dumb enough to open these without much protest. +---- + +.. class:: infomark + +**Attribution** + +This Galaxy tool relies on Plink (see Plinksrc_) to test TDT models. + +So, we rely on the author (Shaun Purcell) for the documentation you need specific to those settings - they are very nicely documented - see +DOC_ + +Tool and Galaxy datatypes originally designed and written for the Rgenetics +series of whole genome scale statistical genetics tools by ross lazarus (ross.lazarus@gmail.com) + +Copyright Ross Lazarus March 2007 +This Galaxy wrapper is released licensed under the LGPL_ but is about as useful as a chocolate teapot without Plink which is GPL. + +I'm no lawyer, but it looks like you got GPL if you use this software. Good luck. + +.. _Plinksrc: http://pngu.mgh.harvard.edu/~purcell/plink/ + +.. _LGPL: http://www.gnu.org/copyleft/lesser.html + +.. 
_DOC: http://pngu.mgh.harvard.edu/~purcell/plink/anal.shtml#tdt + </help></tool> --- a/tools/rgenetics/rgTDT_code.py Thu Mar 24 20:29:40 2011 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,43 +0,0 @@ -# before running the qc, need to rename various output files -import time,string - -def timenow(): - """return current time as a string - """ - return time.strftime('%d/%m/%Y %H:%M:%S', time.localtime(time.time())) - -def get_out_formats(): - """return options for formats""" - dat = [['ucsc genome graphs','gg',True],['ucsc track','wig',False],['tab delimited','xls',False]] - dat = [(x[0],x[1],x[2]) for x in dat] - return dat - - -def exec_after_process(app, inp_data, out_data, param_dict, tool, stdout, stderr): - """Sets the name of the data - <command interpreter="python2.4"> - rgTDT.py -i $i.extra_files_path/$i.metadata.base_name -o $title -f $outformat -r $out_file1 -l $logf - </command> - - """ - dbk = param_dict.get('dbkey','hg18') - job_name = param_dict.get( 'title1', 'rgTDTtest1' ) - killme=string.punctuation+string.whitespace - trantab = string.maketrans(killme,'_'*len(killme)) - job_name = job_name.encode().translate(trantab) - outxls = ['tabular','%s_TDT.xls' % job_name] - logtxt = ['txt','%s_TDT_log.txt' % job_name] - ggout = ['gg','%s_TDT_topTable.gff' % job_name] - lookup={} - lookup['out_file1'] = outxls - lookup['logf'] = logtxt - lookup['gffout'] = ggout - info = 'rgTDT run at %s' % timenow() - for name in lookup.keys(): - data = out_data[name] - data_type,newname = lookup[name] - data.name = newname - data.info = info - data.dbkey = dbk - out_data[name] = data - app.model.context.flush() --- a/tools/rgenetics/rgfakePed.xml Thu Mar 24 20:29:40 2011 -0400 +++ b/tools/rgenetics/rgfakePed.xml Fri Mar 25 13:12:21 2011 -0400 @@ -1,14 +1,13 @@ <tool id="rgfakePed1" name="Null genotypes"><description>for testing</description> - <code file="rgfakePed_code.py"/> - <command interpreter="python">rgfakePed.py --title '$title1' + <command interpreter="python">rgfakePed.py --title '$title' -o '$out_file1' -p '$out_file1.files_path' -c '$ncases' -n '$ntotal' -s '$nsnp' -w '$lowmaf' -v '$missingValue' -l '$outFormat' -d '$mafdist' -m '$missingRate' -M '$mendelRate' </command><inputs><page> - <param name="title1" + <param name="title" type="text" help="Name for outputs from this job" label="Descriptive short name"/> @@ -60,11 +59,11 @@ </inputs><outputs> - <data format="lped" name="out_file1" /> + <data format="lped" name="out_file1" label="${title}.lped" /></outputs><tests><test> - <param name='title1' value='rgfakePedtest1' /> + <param name='title' value='rgfakePedtest1' /><param name="ntotal" value="40" /><param name="ncases" value="20" /><param name="nsnp" value="10" /> --- a/tools/rgenetics/rgfakePed_code.py Thu Mar 24 20:29:40 2011 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,53 +0,0 @@ -import os, string, time -from galaxy import datatypes -from galaxy import app -import galaxy.util - -#Provides Upload tool with access to list of available builds -repospath = '/usr/local/galaxy/data/rg' -builds = [] -#Read build names and keys from galaxy.util -for dbkey, build_name in galaxy.util.dbnames: - builds.append((build_name,dbkey,False)) - -def timenow(): - """return current time as a string - """ - return time.strftime('%d/%m/%Y %H:%M:%S', time.localtime(time.time())) - -#Return available builds -def get_available_builds(defval='hg18'): - for i,x in enumerate(builds): - if x[1] == defval: - x = list(x) - x[2] = True - builds[i] = tuple(x) - return builds - -# Create 
link to files here -def exec_after_process(app, inp_data, out_data, param_dict, tool, stdout, stderr): - base_name = param_dict.get( 'title1', 'Null_Genotypes' ) - base_name = base_name.encode() - outFormat = param_dict.get('outFormat','P') - s = string.whitespace + string.punctuation - ptran = string.maketrans(s,'_'*len(s)) - base_name = base_name.translate(ptran) - pname = "out_file1" - lookup = {'L':('linkped','lped'),'F':('FBAT','fped')} - # /usr/local/galaxy/data/rg/1/lped/ - info = 'Null phenotypes created by rgfakePhe at %s' % timenow() - data = out_data[pname] - ftype,pext = lookup[outFormat] - data.name = '%s.%s' % (base_name,pext) # that's how they've been named in rgfakePhe.py - data.change_datatype(pext) - #data.file_name = data.file_name - data.metadata.base_name = base_name - data.readonly = True - out_data[pname] = data - app.model.context.flush() - - - - - - --- a/tools/rgenetics/rgfakePhe_code.py Thu Mar 24 20:29:40 2011 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,46 +0,0 @@ -# before running the qc, need to rename various output files -import os, string, time -from galaxy import datatypes -from galaxy import app -import galaxy.util - -#Provides Upload tool with access to list of available builds -repospath = '/usr/local/galaxy/data/rg' -builds = [] -#Read build names and keys from galaxy.util -for dbkey, build_name in galaxy.util.dbnames: - builds.append((build_name,dbkey,False)) - -def timenow(): - """return current time as a string - """ - return time.strftime('%d/%m/%Y %H:%M:%S', time.localtime(time.time())) - -#Return available builds -def get_available_builds(defval='hg18'): - for i,x in enumerate(builds): - if x[1] == defval: - x = list(x) - x[2] = True - builds[i] = tuple(x) - return builds - -# Create link to files here -def exec_after_process(app, inp_data, out_data, param_dict, tool, stdout, stderr): - base_name = param_dict.get( 'title1', 'Null_Phenotype' ) - base_name = base_name.encode() - info = 'Null phenotypes created by rgfakePhe at %s' % timenow() - s = string.whitespace + string.punctuation - ptran = string.maketrans(s,'_'*len(s)) - base_name = base_name.translate(ptran) - data = out_data['ppheout'] - data.name = '%s.phe' % (base_name) # that's how they've been named in rgfakePhe.py - #data.file_name = data.file_name - data.metadata.base_name = base_name - out_data['pheout'] = data - app.model.context.flush() - - - - - --- a/tools/rgenetics/rgutils.py Thu Mar 24 20:29:40 2011 -0400 +++ b/tools/rgenetics/rgutils.py Fri Mar 25 13:12:21 2011 -0400 @@ -5,6 +5,7 @@ # import subprocess, os, sys, time, tempfile,string,plinkbinJZ +import datetime galhtmlprefix = """<?xml version="1.0" encoding="utf-8" ?><!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> @@ -30,6 +31,9 @@ """ return time.strftime('%d/%m/%Y %H:%M:%S', time.localtime(time.time())) +def timestamp(): + return datetime.datetime.now().strftime('%Y%m%d%H%M%S') + def fail( message ): print >> sys.stderr, message return -1 @@ -41,6 +45,401 @@ return os.path.join(path, program) return None +def makeGFF(resf='',outfname='',logf=None,twd='.',name='track name',description='track description',topn=1000,pname='LOG10ARMITAGEP',rgname='rgGLM'): + """ + score must be scaled to 0-1000 + + Want to make some wig tracks from each analysis + Best n -log10(p). Make top hit the window. 
+ we use our tab output which has + rs chrom offset ADD_stat ADD_p ADD_log10p + rs3094315 1 792429 1.151 0.2528 0.597223 + + """ + + def is_number(s): + try: + float(s) + return True + except ValueError: + return False + header = 'track name=%s description="%s" visibility=2 useScore=1 color=0,60,120\n' % (name,description) + column_names = [ 'Seqname', 'Source', 'Feature', 'Start', 'End', 'Score', 'Strand', 'Frame', 'Group' ] + halfwidth=100 + resfpath = os.path.join(twd,resf) + resf = open(resfpath,'r') + resfl = resf.readlines() # dumb but convenient for millions of rows + resfl = [x.split() for x in resfl] + headl = resfl[0] + resfl = resfl[1:] + headl = [x.strip().upper() for x in headl] + headIndex = dict(zip(headl,range(0,len(headl)))) + chrpos = headIndex.get('CHROM',None) + rspos = headIndex.get('RS',None) + offspos = headIndex.get('OFFSET',None) + ppos = headIndex.get(pname,None) + wewant = [chrpos,rspos,offspos,ppos] + if None in wewant: # missing something + logf.write('### Error missing a required header in makeGFF - headIndex=%s, wewant=%s\n' % (headIndex,wewant)) + return + resfl = [x for x in resfl if x[ppos] > ''] + resfl = [(float(x[ppos]),x) for x in resfl] # decorate + resfl.sort() + resfl.reverse() # using -log10 so larger is better + resfl = resfl[:topn] # truncate + pvals = [x[0] for x in resfl] # need to scale + resfl = [x[1] for x in resfl] # drop decoration + if len(pvals) == 0: + logf.write('### no pvalues found in resfl - %s' % (resfl[:3])) + sys.exit(1) + maxp = max(pvals) # need to scale + minp = min(pvals) + prange = abs(maxp-minp) + 0.5 # fudge + scalefact = 1000.0/prange + logf.write('###maxp=%f,minp=%f,prange=%f,scalefact=%f\n' % (maxp,minp,prange,scalefact)) + for i,row in enumerate(resfl): + row[ppos] = '%d' % (int(scalefact*pvals[i])) + resfl[i] = row # replace + outf = file(outfname,'w') + outf.write(header) + outres = [] # need to resort into chrom offset order + for i,lrow in enumerate(resfl): + chrom,snp,offset,p, = [lrow[x] for x in wewant] + gff = ('chr%s' % chrom,rgname,'variation','%d' % (int(offset)-halfwidth), + '%d' % (int(offset)+halfwidth),p,'.','.','%s logp=%1.2f' % (snp,pvals[i])) + outres.append(gff) + outres = [(x[0],int(x[3]),x) for x in outres] # decorate + outres.sort() # into chrom offset + outres=[x[2] for x in outres] # undecorate + outres = ['\t'.join(x) for x in outres] + outf.write('\n'.join(outres)) + outf.write('\n') + outf.close() + + +def bedToPicInterval(infile=None): + """ + Picard tools requiring targets want + a sam style header which incidentally, MUST be sorted in natural order - not lexicographic order: + + @SQ SN:chrM LN:16571 + @SQ SN:chr1 LN:247249719 + @SQ SN:chr2 LN:242951149 + @SQ SN:chr3 LN:199501827 + @SQ SN:chr4 LN:191273063 + added to the start of what looks like a bed style file + chr1 67052400 67052451 - CCDS635.1_cds_0_0_chr1_67052401_r + chr1 67060631 67060788 - CCDS635.1_cds_1_0_chr1_67060632_r + chr1 67065090 67065317 - CCDS635.1_cds_2_0_chr1_67065091_r + chr1 67066082 67066181 - CCDS635.1_cds_3_0_chr1_67066083_r + + see http://genome.ucsc.edu/FAQ/FAQtracks.html#tracks1 + we need to add 1 to start coordinates on the way through - but length calculations are easier + """ + # bedToPicard.py + # ross lazarus October 2010 + # LGPL + # for Rgenetics + + def getFlen(bedfname=None): + """ + find all features in a BED file and sum their lengths + """ + features = {} + try: + infile = open(bedfname,'r') + except: + print '###ERROR: getFlen unable to open bedfile %s' % bedfname + sys.exit(1) + for i,row in 
enumerate(infile): + if row[0] == '@': # shouldn't happen given a bed file! + print 'row %d=%s - should NOT start with @!' % (i,row) + sys.exit(1) + row = row.strip() + if len(row) > 0: + srow = row.split('\t') + f = srow[0] + spos = srow[1] # zero based from UCSC so no need to add 1 - eg 0-100 is 100 bases numbered 0-99 (!) + epos = srow[2] # see http://genome.ucsc.edu/FAQ/FAQtracks.html#tracks1 + flen = int(epos) - int(spos) + features.setdefault(f,0) + features[f] += flen + infile.close() + return features + + def keynat(string): + ''' + borrowed from http://code.activestate.com/recipes/285264-natural-string-sorting/ + A natural sort helper function for sort() and sorted() + without using regular expressions or exceptions. + + >>> items = ('Z', 'a', '10th', '1st', '9') + >>> sorted(items) + ['10th', '1st', '9', 'Z', 'a'] + >>> sorted(items, key=keynat) + ['1st', '9', '10th', 'a', 'Z'] + ''' + it = type(1) + r = [] + for c in string: + if c.isdigit(): + d = int(c) + if r and type( r[-1] ) == it: + r[-1] = r[-1] * 10 + d + else: + r.append(d) + else: + r.append(c.lower()) + return r + + def writePic(outfname=None,bedfname=None): + """ + collect header info and rewrite bed with header for picard + """ + featlen = getFlen(bedfname=bedfname) + try: + outf = open(outfname,'w') + except: + print '###ERROR: writePic unable to open output picard file %s' % outfname + sys.exit(1) + infile = open(bedfname,'r') # already tested in getFlen + k = featlen.keys() + fk = sorted(k, key=keynat) + header = ['@SQ\tSN:%s\tLN:%d' % (x,featlen[x]) for x in fk] + outf.write('\n'.join(header)) + outf.write('\n') + for row in infile: + row = row.strip() + if len(row) > 0: # convert zero based start coordinate to 1 based + srow = row.split('\t') + srow[1] = '%d' % (int(srow[1])+1) # see http://genome.ucsc.edu/FAQ/FAQtracks.html#tracks1 + outf.write('\t'.join(srow)) + outf.write('\n') + outf.close() + infile.close() + + + + # bedToPicInterval starts here + fd,outf = tempfile.mkstemp(prefix='rgPicardHsMetrics') + writePic(outfname=outf,bedfname=infile) + return outf + + +def getFileString(fpath, outpath): + """ + format a nice file size string + """ + size = '' + fp = os.path.join(outpath, fpath) + s = '? ?' 
+ if os.path.isfile(fp): + n = float(os.path.getsize(fp)) + if n > 2**20: + size = ' (%1.1f MB)' % (n/2**20) + elif n > 2**10: + size = ' (%1.1f KB)' % (n/2**10) + elif n > 0: + size = ' (%d B)' % (int(n)) + s = '%s %s' % (fpath, size) + return s + + +def fixPicardOutputs(tempout=None,output_dir=None,log_file=None,html_output=None,progname=None,cl=[],transpose=True): + """ + picard produces long hard to read tab header files + make them available but present them transposed for readability + """ + rstyle="""<style type="text/css"> + tr.d0 td {background-color: oldlace; color: black;} + tr.d1 td {background-color: aliceblue; color: black;} + </style>""" + cruft = [] + dat = [] + try: + r = open(tempout,'r').readlines() + except: + r = [] + for row in r: + if row.strip() > '': + srow = row.split('\t') + if row[0] == '#': + cruft.append(row.strip()) # want strings + else: + dat.append(srow) # want lists + + res = [rstyle,] + res.append(galhtmlprefix % progname) + res.append(galhtmlattr % (progname,timenow())) + flist = os.listdir(output_dir) + pdflist = [x for x in flist if os.path.splitext(x)[-1].lower() == '.pdf'] + if len(pdflist) > 0: # assumes all pdfs come with thumbnail .jpgs + for p in pdflist: + imghref = '%s.jpg' % os.path.splitext(p)[0] # removes .pdf + res.append('<table cellpadding="10"><tr><td>\n') + res.append('<a href="%s"><img src="%s" alt="Click thumbnail to download %s" hspace="10" align="middle"></a>\n' % (p,imghref,p)) + res.append('</tr></td></table>\n') + res.append('<b>Your job produced the following output files.</b><hr/>\n') + res.append('<table>\n') + for i,f in enumerate(flist): + fn = os.path.split(f)[-1] + res.append('<tr><td><a href="%s">%s</a></td></tr>\n' % (fn,fn)) + res.append('</table><p/>\n') + if len(cruft) + len(dat) > 0: + res.append('<b>Picard on line resources</b><ul>\n') + res.append('<li><a href="http://picard.sourceforge.net/index.shtml">Click here for Picard Documentation</a></li>\n') + res.append('<li><a href="http://picard.sourceforge.net/picard-metric-definitions.shtml">Click here for Picard Metrics definitions</a></li></ul><hr/>\n') + if transpose: + res.append('<b>Picard output (transposed for readability)</b><hr/>\n') + else: + res.append('<b>Picard output</b><hr/>\n') + res.append('<table cellpadding="3" >\n') + if len(cruft) > 0: + cres = ['<tr class="d%d"><td>%s</td></tr>' % (i % 2,x) for i,x in enumerate(cruft)] + res += cres + if len(dat) > 0: + maxrows = 100 + if transpose: + tdat = map(None,*dat) # transpose an arbitrary list of lists + missing = len(tdat) - maxrows + tdat = ['<tr class="d%d"><td>%s</td><td>%s</td></tr>\n' % ((i+len(cruft)) % 2,x[0],x[1]) for i,x in enumerate(tdat) if i < maxrows] + if len(tdat) > maxrows: + tdat.append('<tr><td colspan="2">...WARNING: %d rows deleted for sanity...see raw files for all rows</td></tr>' % missing) + else: + tdat = ['<tr class="d%d"><td>%s</td></tr>\n' % ((i+len(cruft)) % 2,x) for i,x in enumerate(dat) if i < maxrows] + if len(dat) > maxrows: + missing = len(dat) - maxrows + tdat.append('<tr><td>...WARNING: %d rows deleted for sanity...see raw files for all rows</td></tr>' % missing) + res += tdat + res.append('</table>\n') + else: + res.append('<b>No Picard output found - please consult the Picard log above for an explanation</b>') + l = open(log_file,'r').readlines() + if len(l) > 0: + res.append('<b>Picard log</b><hr/>\n') + rlog = ['<pre>',] + rlog += l + rlog.append('</pre>') + res += rlog + else: + res.append("Odd, Picard left no log file %s - must have really barfed badly?" 
% log_file) + res.append('<hr/>The freely available <a href="http://picard.sourceforge.net/command-line-overview.shtml">Picard software</a> \n') + res.append( 'generated all outputs reported here, using this command line:<br/>\n<pre>%s</pre>\n' % ''.join(cl)) + res.append(galhtmlpostfix) + outf = open(html_output,'w') + outf.write(''.join(res)) + outf.write('\n') + outf.close() + +def keynat(string): + ''' + borrowed from http://code.activestate.com/recipes/285264-natural-string-sorting/ + A natural sort helper function for sort() and sorted() + without using regular expressions or exceptions. + + >>> items = ('Z', 'a', '10th', '1st', '9') + >>> sorted(items) + ['10th', '1st', '9', 'Z', 'a'] + >>> sorted(items, key=keynat) + ['1st', '9', '10th', 'a', 'Z'] + ''' + it = type(1) + r = [] + for c in string: + if c.isdigit(): + d = int(c) + if r and type( r[-1] ) == it: + r[-1] = r[-1] * 10 + d + else: + r.append(d) + else: + r.append(c.lower()) + return r + +def getFlen(bedfname=None): + """ + find all features in a BED file and sum their lengths + """ + features = {} + otherHeaders = [] + try: + infile = open(bedfname,'r') + except: + print '###ERROR: getFlen unable to open bedfile %s' % bedfname + sys.exit(1) + for i,row in enumerate(infile): + if row.startswith('@'): # add to headers if not @SQ + if not row.startswith('@SQ'): + otherHeaders.append(row) + else: + row = row.strip() + if row.startswith('#') or row.lower().startswith('browser') or row.lower().startswith('track'): + continue # ignore headers + srow = row.split('\t') + if len(srow) > 3: + srow = row.split('\t') + f = srow[0] + spos = srow[1] # zero based from UCSC so no need to add 1 - eg 0-100 is 100 bases numbered 0-99 (!) + epos = srow[2] # see http://genome.ucsc.edu/FAQ/FAQtracks.html#tracks1 + flen = int(epos) - int(spos) + features.setdefault(f,0) + features[f] += flen + infile.close() + fk = features.keys() + fk = sorted(fk, key=keynat) + return features,fk,otherHeaders + +def bedToPicInterval(infile=None,outfile=None): + """ + Picard tools requiring targets want + a sam style header which incidentally, MUST be sorted in natural order - not lexicographic order: + + @SQ SN:chrM LN:16571 + @SQ SN:chr1 LN:247249719 + @SQ SN:chr2 LN:242951149 + @SQ SN:chr3 LN:199501827 + @SQ SN:chr4 LN:191273063 + added to the start of what looks like a bed style file + chr1 67052400 67052451 - CCDS635.1_cds_0_0_chr1_67052401_r + chr1 67060631 67060788 - CCDS635.1_cds_1_0_chr1_67060632_r + chr1 67065090 67065317 - CCDS635.1_cds_2_0_chr1_67065091_r + chr1 67066082 67066181 - CCDS635.1_cds_3_0_chr1_67066083_r + + see http://genome.ucsc.edu/FAQ/FAQtracks.html#tracks1 + we need to add 1 to start coordinates on the way through - but length calculations are easier + """ + # bedToPicard.py + # ross lazarus October 2010 + # LGPL + # for Rgenetics + """ + collect header info and rewrite bed with header for picard + """ + featlen,fk,otherHeaders = getFlen(bedfname=infile) + try: + outf = open(outfile,'w') + except: + print '###ERROR: writePic unable to open output picard file %s' % outfile + sys.exit(1) + inf = open(infile,'r') # already tested in getFlen + header = ['@SQ\tSN:%s\tLN:%d' % (x,featlen[x]) for x in fk] + if len(otherHeaders) > 0: + header += otherHeaders + outf.write('\n'.join(header)) + outf.write('\n') + for row in inf: + row = row.strip() + if len(row) > 0: # convert zero based start coordinate to 1 based + if row.startswith('@'): + continue + else: + srow = row.split('\t') + srow[1] = '%d' % (int(srow[1])+1) # see 
http://genome.ucsc.edu/FAQ/FAQtracks.html#tracks1 + outf.write('\t'.join(srow)) + outf.write('\n') + outf.close() + inf.close() + def oRRun(rcmd=[],outdir=None,title='myR',rexe='R'): """ @@ -201,9 +600,12 @@ The fine Plink docs at http://pngu.mgh.harvard.edu/~purcell/plink/summary.shtml#prune reproduced below -Sometimes it is useful to generate a pruned subset of SNPs that are in approximate linkage equilibrium with each other. This can be achieved via two commands: --indep which prunes based on the variance inflation factor (VIF), which recursively removes SNPs within a sliding window; second, --indep-pairwise which is similar, except it is based only on pairwise genotypic correlation. +Sometimes it is useful to generate a pruned subset of SNPs that are in approximate linkage equilibrium with each other. This can be achieved via two commands: +--indep which prunes based on the variance inflation factor (VIF), which recursively removes SNPs within a sliding window; second, --indep-pairwise which is +similar, except it is based only on pairwise genotypic correlation. -Hint The output of either of these commands is two lists of SNPs: those that are pruned out and those that are not. A separate command using the --extract or --exclude option is necessary to actually perform the pruning. +Hint The output of either of these commands is two lists of SNPs: those that are pruned out and those that are not. A separate command using the --extract or +--exclude option is necessary to actually perform the pruning. The VIF pruning routine is performed: plink --file data --indep 50 5 2 @@ -217,16 +619,23 @@ a --extract or --exclude command. The parameters for --indep are: window size in SNPs (e.g. 50), the number of SNPs to shift the -window at each step (e.g. 5), the VIF threshold. The VIF is 1/(1-R^2) where R^2 is the multiple correlation coefficient for a SNP being regressed on all other SNPs simultaneously. That is, this considers the correlations between SNPs but also between linear combinations of SNPs. A VIF of 10 is often taken to represent near collinearity problems in standard multiple regression analyses (i.e. implies R^2 of 0.9). A VIF of 1 would imply that the SNP is completely independent of all other SNPs. Practically, values between 1.5 and 2 should probably be used; particularly in small samples, if this threshold is too low and/or the window size is too large, too many SNPs may be removed. +window at each step (e.g. 5), the VIF threshold. The VIF is 1/(1-R^2) where R^2 is the multiple correlation coefficient for a SNP being regressed on all other +SNPs simultaneously. That is, this considers the correlations between SNPs but also between linear combinations of SNPs. A VIF of 10 is often taken to represent +near collinearity problems in standard multiple regression analyses (i.e. implies R^2 of 0.9). A VIF of 1 would imply that the SNP is completely independent of +all other SNPs. Practically, values between 1.5 and 2 should probably be used; particularly in small samples, if this threshold is too low and/or the window +size is too large, too many SNPs may be removed. The second procedure is performed: plink --file data --indep-pairwise 50 5 0.5 This generates the same output files as the first version; the only difference is that a -simple pairwise threshold is used. The first two parameters (50 and 5) are the same as above (window size and step); the third parameter represents the r^2 threshold. 
Note: this represents the pairwise SNP-SNP metric now, not the multiple correlation coefficient; also note, this is based on the genotypic correlation, i.e. it does not involve phasing. +simple pairwise threshold is used. The first two parameters (50 and 5) are the same as above (window size and step); the third parameter represents the r^2 +threshold. Note: this represents the pairwise SNP-SNP metric now, not the multiple correlation coefficient; also note, this is based on the genotypic +correlation, i.e. it does not involve phasing. To give a concrete example: the command above that specifies 50 5 0.5 would a) consider a -window of 50 SNPs, b) calculate LD between each pair of SNPs in the window, b) remove one of a pair of SNPs if the LD is greater than 0.5, c) shift the window 5 SNPs forward and repeat the procedure. +window of 50 SNPs, b) calculate LD between each pair of SNPs in the window, b) remove one of a pair of SNPs if the LD is greater than 0.5, c) shift the window 5 +SNPs forward and repeat the procedure. To make a new, pruned file, then use something like (in this example, we also convert the standard PED fileset to a binary one): @@ -276,3 +685,4 @@ mfile.close() return markers,snpcols,rslist,rsdict + http://bitbucket.org/galaxy/galaxy-central/changeset/a892beeb00b4/ changeset: r5275:a892beeb00b4 user: fubar date: 2011-03-25 18:21:13 summary: Remove bogus makeGFF from rgutils affected #: 1 file (2.9 KB) --- a/tools/rgenetics/rgutils.py Fri Mar 25 13:12:21 2011 -0400 +++ b/tools/rgenetics/rgutils.py Fri Mar 25 13:21:13 2011 -0400 @@ -45,77 +45,6 @@ return os.path.join(path, program) return None -def makeGFF(resf='',outfname='',logf=None,twd='.',name='track name',description='track description',topn=1000,pname='LOG10ARMITAGEP',rgname='rgGLM'): - """ - score must be scaled to 0-1000 - - Want to make some wig tracks from each analysis - Best n -log10(p). Make top hit the window. 
- we use our tab output which has - rs chrom offset ADD_stat ADD_p ADD_log10p - rs3094315 1 792429 1.151 0.2528 0.597223 - - """ - - def is_number(s): - try: - float(s) - return True - except ValueError: - return False - header = 'track name=%s description="%s" visibility=2 useScore=1 color=0,60,120\n' % (name,description) - column_names = [ 'Seqname', 'Source', 'Feature', 'Start', 'End', 'Score', 'Strand', 'Frame', 'Group' ] - halfwidth=100 - resfpath = os.path.join(twd,resf) - resf = open(resfpath,'r') - resfl = resf.readlines() # dumb but convenient for millions of rows - resfl = [x.split() for x in resfl] - headl = resfl[0] - resfl = resfl[1:] - headl = [x.strip().upper() for x in headl] - headIndex = dict(zip(headl,range(0,len(headl)))) - chrpos = headIndex.get('CHROM',None) - rspos = headIndex.get('RS',None) - offspos = headIndex.get('OFFSET',None) - ppos = headIndex.get(pname,None) - wewant = [chrpos,rspos,offspos,ppos] - if None in wewant: # missing something - logf.write('### Error missing a required header in makeGFF - headIndex=%s, wewant=%s\n' % (headIndex,wewant)) - return - resfl = [x for x in resfl if x[ppos] > ''] - resfl = [(float(x[ppos]),x) for x in resfl] # decorate - resfl.sort() - resfl.reverse() # using -log10 so larger is better - resfl = resfl[:topn] # truncate - pvals = [x[0] for x in resfl] # need to scale - resfl = [x[1] for x in resfl] # drop decoration - if len(pvals) == 0: - logf.write('### no pvalues found in resfl - %s' % (resfl[:3])) - sys.exit(1) - maxp = max(pvals) # need to scale - minp = min(pvals) - prange = abs(maxp-minp) + 0.5 # fudge - scalefact = 1000.0/prange - logf.write('###maxp=%f,minp=%f,prange=%f,scalefact=%f\n' % (maxp,minp,prange,scalefact)) - for i,row in enumerate(resfl): - row[ppos] = '%d' % (int(scalefact*pvals[i])) - resfl[i] = row # replace - outf = file(outfname,'w') - outf.write(header) - outres = [] # need to resort into chrom offset order - for i,lrow in enumerate(resfl): - chrom,snp,offset,p, = [lrow[x] for x in wewant] - gff = ('chr%s' % chrom,rgname,'variation','%d' % (int(offset)-halfwidth), - '%d' % (int(offset)+halfwidth),p,'.','.','%s logp=%1.2f' % (snp,pvals[i])) - outres.append(gff) - outres = [(x[0],int(x[3]),x) for x in outres] # decorate - outres.sort() # into chrom offset - outres=[x[2] for x in outres] # undecorate - outres = ['\t'.join(x) for x in outres] - outf.write('\n'.join(outres)) - outf.write('\n') - outf.close() - def bedToPicInterval(infile=None): """ Repository URL: https://bitbucket.org/galaxy/galaxy-central/ -- This is a commit notification from bitbucket.org. You are receiving this because you have the service enabled, addressing the recipient of this email.
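For reference, both changesets circle around the same GFF "top table" machinery: the rgTDT.py hunk above changes the hard-coded 'rgGLM' source column to 'rgTDT', and r5275 removes the duplicated makeGFF from rgutils.py, leaving the per-tool versions in place. The scoring scheme those per-tool makeGFF routines use is sketched below with hypothetical helper names (scale_scores and gff_row are illustrative only, not code from either commit): the best topn -log10(p) values are squeezed into the 0-1000 UCSC track score range, and each SNP becomes a feature spanning +/-100 bp around its offset.

def scale_scores(logps, topn=1000):
    """Scale the best topn -log10(p) values to integer 0-1000 UCSC track scores."""
    best = sorted(logps, reverse=True)[:topn]
    prange = abs(max(best) - min(best)) + 0.5   # same +0.5 fudge factor the tool code uses
    scalefact = 1000.0 / prange
    return [int(scalefact * p) for p in best]

def gff_row(chrom, snp, offset, score, logp, source='rgTDT', halfwidth=100):
    """Build one GFF feature: a 2*halfwidth bp window centred on the SNP offset."""
    return '\t'.join(['chr%s' % chrom, source, 'variation',
                      '%d' % (offset - halfwidth), '%d' % (offset + halfwidth),
                      '%d' % score, '.', '.', '%s logp=%1.2f' % (snp, logp)])

# e.g. using the docstring's sample row (rs3094315, chrom 1, offset 792429, log10p 0.597223):
# gff_row('1', 'rs3094315', 792429, scale_scores([0.597223, 0.30])[0], 0.597223)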
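Similarly, the bedToPicInterval/getFlen pair added to rgutils.py converts a UCSC-style BED file into the SAM-style header Picard expects: per-chromosome feature lengths become @SQ lines (natural-sorted so chr2 precedes chr10), and the 0-based BED starts are shifted to 1-based. A condensed sketch under the same caveat (bed_to_picard_interval is a hypothetical name; the real code natural-sorts via keynat and carries through any pre-existing non-@SQ headers):

def bed_to_picard_interval(bed_lines):
    """Return Picard-style lines (@SQ header, then 1-based intervals) for simple BED rows."""
    lengths = {}
    rows = []
    for line in bed_lines:
        fields = line.rstrip('\n').split('\t')
        if len(fields) < 3 or fields[0].startswith(('#', '@')):
            continue
        chrom, start, end = fields[0], int(fields[1]), int(fields[2])
        lengths[chrom] = lengths.get(chrom, 0) + (end - start)  # BED is half-open, so end-start is the feature length
        fields[1] = '%d' % (start + 1)  # 0-based BED start -> 1-based interval start
        rows.append('\t'.join(fields))
    header = ['@SQ\tSN:%s\tLN:%d' % (c, lengths[c]) for c in sorted(lengths)]  # lexicographic here; the real code sorts with keynat
    return header + rows

# e.g. bed_to_picard_interval(['chr1\t67052400\t67052451\t-\tCCDS635.1_cds_0_0_chr1_67052401_r'])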