2 new changesets in galaxy-central:

http://bitbucket.org/galaxy/galaxy-central/changeset/c25c8055e608/
changeset:   r5274:c25c8055e608
user:        fubar
date:        2011-03-25 18:12:21
summary:     Revised rgenetics tools to remove post job hooks
affected #:  35 files (21.8 KB)

--- a/test-data/rgCaCotest1_CaCo_topTable.gff Thu Mar 24 20:29:40 2011 -0400
+++ b/test-data/rgCaCotest1_CaCo_topTable.gff Fri Mar 25 13:12:21 2011 -0400
@@ -1,26 +1,26 @@
-track name=rgGLM_TopTable description="rgCaCotest1" visibility=2 useScore=1 color=0,60,120
-chr22 rgGLM variation 21784622 21784822 0 . . rs2283802 logp=-0.00
-chr22 rgGLM variation 21785266 21785466 42 . . rs2267000 logp=0.05
-chr22 rgGLM variation 21794654 21794854 249 . . rs16997606 logp=0.29
-chr22 rgGLM variation 21794710 21794910 493 . . rs4820537 logp=0.58
-chr22 rgGLM variation 21797704 21797904 237 . . rs3788347 logp=0.28
-chr22 rgGLM variation 21799818 21800018 61 . . rs756632 logp=0.07
-chr22 rgGLM variation 21807870 21808070 153 . . rs4820539 logp=0.18
-chr22 rgGLM variation 21820235 21820435 44 . . rs2283804 logp=0.05
-chr22 rgGLM variation 21820890 21821090 44 . . rs2267006 logp=0.05
-chr22 rgGLM variation 21820900 21821100 184 . . rs4822363 logp=0.22
-chr22 rgGLM variation 21827574 21827774 317 . . rs5751592 logp=0.37
-chr22 rgGLM variation 21832608 21832808 0 . . rs5759608 logp=-0.00
-chr22 rgGLM variation 21833070 21833270 0 . . rs5759612 logp=-0.00
-chr22 rgGLM variation 21860068 21860268 0 . . rs2267009 logp=-0.00
-chr22 rgGLM variation 21864266 21864466 117 . . rs2267010 logp=0.14
-chr22 rgGLM variation 21868598 21868798 571 . . rs5759636 logp=0.67
-chr22 rgGLM variation 21871388 21871588 0 . . rs2071436 logp=-0.00
-chr22 rgGLM variation 21875779 21875979 249 . . rs2267013 logp=0.29
-chr22 rgGLM variation 21889706 21889906 378 . . rs6003566 logp=0.44
-chr22 rgGLM variation 21892791 21892991 249 . . rs2256725 logp=0.29
-chr22 rgGLM variation 21892825 21893025 193 . . rs12160770 logp=0.23
-chr22 rgGLM variation 21895919 21896119 42 . . rs5751611 logp=0.05
-chr22 rgGLM variation 21898758 21898958 40 . . rs762601 logp=0.05
-chr22 rgGLM variation 21898963 21899163 40 . . rs2156921 logp=0.05
-chr22 rgGLM variation 21905542 21905742 40 . . rs4822375 logp=0.05
+track name=rgCaCo_TopTable description="rgCaCotest1" visibility=2 useScore=1 color=0,60,120
+chr22 rgCaCo variation 21784622 21784822 0 . . rs2283802 logp=-0.00
+chr22 rgCaCo variation 21785266 21785466 42 . . rs2267000 logp=0.05
+chr22 rgCaCo variation 21794654 21794854 249 . . rs16997606 logp=0.29
+chr22 rgCaCo variation 21794710 21794910 493 . . rs4820537 logp=0.58
+chr22 rgCaCo variation 21797704 21797904 237 . . rs3788347 logp=0.28
+chr22 rgCaCo variation 21799818 21800018 61 . . rs756632 logp=0.07
+chr22 rgCaCo variation 21807870 21808070 153 . . rs4820539 logp=0.18
+chr22 rgCaCo variation 21820235 21820435 44 . . rs2283804 logp=0.05
+chr22 rgCaCo variation 21820890 21821090 44 . . rs2267006 logp=0.05
+chr22 rgCaCo variation 21820900 21821100 184 . . rs4822363 logp=0.22
+chr22 rgCaCo variation 21827574 21827774 317 . . rs5751592 logp=0.37
+chr22 rgCaCo variation 21832608 21832808 0 . . rs5759608 logp=-0.00
+chr22 rgCaCo variation 21833070 21833270 0 . . rs5759612 logp=-0.00
+chr22 rgCaCo variation 21860068 21860268 0 . . rs2267009 logp=-0.00
+chr22 rgCaCo variation 21864266 21864466 117 . . rs2267010 logp=0.14
+chr22 rgCaCo variation 21868598 21868798 571 . . rs5759636 logp=0.67
+chr22 rgCaCo variation 21871388 21871588 0 . .
rs2071436 logp=-0.00 +chr22 rgCaCo variation 21875779 21875979 249 . . rs2267013 logp=0.29 +chr22 rgCaCo variation 21889706 21889906 378 . . rs6003566 logp=0.44 +chr22 rgCaCo variation 21892791 21892991 249 . . rs2256725 logp=0.29 +chr22 rgCaCo variation 21892825 21893025 193 . . rs12160770 logp=0.23 +chr22 rgCaCo variation 21895919 21896119 42 . . rs5751611 logp=0.05 +chr22 rgCaCo variation 21898758 21898958 40 . . rs762601 logp=0.05 +chr22 rgCaCo variation 21898963 21899163 40 . . rs2156921 logp=0.05 +chr22 rgCaCo variation 21905542 21905742 40 . . rs4822375 logp=0.05 --- a/test-data/rgTDTtest1_TDT_topTable.gff Thu Mar 24 20:29:40 2011 -0400 +++ b/test-data/rgTDTtest1_TDT_topTable.gff Fri Mar 25 13:12:21 2011 -0400 @@ -1,25 +1,25 @@ track name=rgTDT_Top_Table description="rgTDTtest1" visibility=2 useScore=1 color=0,60,120 -chr22 rgGLM variation 21784622 21784822 0 . . rs2283802 logp=-0.00 -chr22 rgGLM variation 21785266 21785466 120 . . rs2267000 logp=0.13 -chr22 rgGLM variation 21794654 21794854 0 . . rs16997606 logp=-0.00 -chr22 rgGLM variation 21794710 21794910 457 . . rs4820537 logp=0.50 -chr22 rgGLM variation 21797704 21797904 120 . . rs3788347 logp=0.13 -chr22 rgGLM variation 21799818 21800018 228 . . rs756632 logp=0.25 -chr22 rgGLM variation 21807870 21808070 120 . . rs4820539 logp=0.13 -chr22 rgGLM variation 21820235 21820435 292 . . rs2283804 logp=0.32 -chr22 rgGLM variation 21820890 21821090 292 . . rs2267006 logp=0.32 -chr22 rgGLM variation 21820900 21821100 0 . . rs4822363 logp=-0.00 -chr22 rgGLM variation 21827574 21827774 541 . . rs5751592 logp=0.59 -chr22 rgGLM variation 21832608 21832808 400 . . rs5759608 logp=0.44 -chr22 rgGLM variation 21833070 21833270 400 . . rs5759612 logp=0.44 -chr22 rgGLM variation 21860068 21860268 0 . . rs2267009 logp=-0.00 -chr22 rgGLM variation 21864266 21864466 0 . . rs2267010 logp=-0.00 -chr22 rgGLM variation 21868598 21868798 457 . . rs5759636 logp=0.50 -chr22 rgGLM variation 21871388 21871588 0 . . rs2071436 logp=-0.00 -chr22 rgGLM variation 21875779 21875979 457 . . rs2267013 logp=0.50 -chr22 rgGLM variation 21889706 21889906 168 . . rs6003566 logp=0.18 -chr22 rgGLM variation 21892791 21892991 457 . . rs2256725 logp=0.50 -chr22 rgGLM variation 21895919 21896119 359 . . rs5751611 logp=0.39 -chr22 rgGLM variation 21898758 21898958 400 . . rs762601 logp=0.44 -chr22 rgGLM variation 21898963 21899163 400 . . rs2156921 logp=0.44 -chr22 rgGLM variation 21905542 21905742 400 . . rs4822375 logp=0.44 +chr22 rgTDT variation 21784622 21784822 0 . . rs2283802 logp=-0.00 +chr22 rgTDT variation 21785266 21785466 120 . . rs2267000 logp=0.13 +chr22 rgTDT variation 21794654 21794854 0 . . rs16997606 logp=-0.00 +chr22 rgTDT variation 21794710 21794910 457 . . rs4820537 logp=0.50 +chr22 rgTDT variation 21797704 21797904 120 . . rs3788347 logp=0.13 +chr22 rgTDT variation 21799818 21800018 228 . . rs756632 logp=0.25 +chr22 rgTDT variation 21807870 21808070 120 . . rs4820539 logp=0.13 +chr22 rgTDT variation 21820235 21820435 292 . . rs2283804 logp=0.32 +chr22 rgTDT variation 21820890 21821090 292 . . rs2267006 logp=0.32 +chr22 rgTDT variation 21820900 21821100 0 . . rs4822363 logp=-0.00 +chr22 rgTDT variation 21827574 21827774 541 . . rs5751592 logp=0.59 +chr22 rgTDT variation 21832608 21832808 400 . . rs5759608 logp=0.44 +chr22 rgTDT variation 21833070 21833270 400 . . rs5759612 logp=0.44 +chr22 rgTDT variation 21860068 21860268 0 . . rs2267009 logp=-0.00 +chr22 rgTDT variation 21864266 21864466 0 . . 
rs2267010 logp=-0.00 +chr22 rgTDT variation 21868598 21868798 457 . . rs5759636 logp=0.50 +chr22 rgTDT variation 21871388 21871588 0 . . rs2071436 logp=-0.00 +chr22 rgTDT variation 21875779 21875979 457 . . rs2267013 logp=0.50 +chr22 rgTDT variation 21889706 21889906 168 . . rs6003566 logp=0.18 +chr22 rgTDT variation 21892791 21892991 457 . . rs2256725 logp=0.50 +chr22 rgTDT variation 21895919 21896119 359 . . rs5751611 logp=0.39 +chr22 rgTDT variation 21898758 21898958 400 . . rs762601 logp=0.44 +chr22 rgTDT variation 21898963 21899163 400 . . rs2156921 logp=0.44 +chr22 rgTDT variation 21905542 21905742 400 . . rs4822375 logp=0.44 --- a/tools/rgenetics/rgCaCo.py Thu Mar 24 20:29:40 2011 -0400 +++ b/tools/rgenetics/rgCaCo.py Fri Mar 25 13:12:21 2011 -0400 @@ -20,7 +20,6 @@ import sys,math,shutil,subprocess,os,time,tempfile,string from os.path import abspath from rgutils import timenow, plinke - imagedir = '/static/rg' # if needed for images myversion = 'V000.1 April 2007' verbose = False @@ -82,7 +81,7 @@ outres = [] # need to resort into chrom offset order for i,lrow in enumerate(resfl): chrom,snp,offset,p, = [lrow[x] for x in wewant] - gff = ('chr%s' % chrom,'rgGLM','variation','%d' % (int(offset)-halfwidth), + gff = ('chr%s' % chrom,'rgCaCo','variation','%d' % (int(offset)-halfwidth), '%d' % (int(offset)+halfwidth),p,'.','.','%s logp=%1.2f' % (snp,pvals[i])) outres.append(gff) outres = [(x[0],int(x[3]),x) for x in outres] # decorate @@ -94,7 +93,6 @@ outf.close() - def plink_assocToGG(plinkout="hm",tag='test'): """ plink --assoc output looks like this # CHR SNP A1 F_A F_U A2 CHISQ P OR @@ -170,14 +168,8 @@ else: headl = headl.split() delim = None - chrpos = headl.index('CHR') - rspos = headl.index('SNP') - testpos = headl.index('TEST') - naffpos = headl.index('AFF') - nuaffpos = headl.index('UNAFF') - chisqpos = headl.index('CHISQ') - ppos = headl.index('P') - wewant = [chrpos,rspos,testpos,naffpos,nuaffpos,chisqpos,ppos] + whatwewant = ['CHR','SNP','TEST','AFF','UNAFF','CHISQ','P'] + wewant = [headl.index(x) for x in whatwewant] llen = len(headl) lnum = anum = 0 lastsnp = None # so we know when to write out a gg line @@ -273,7 +265,7 @@ retval = p.wait() resf = '%s.model' % tname # plink output is here we hope xformModel(bfname,resf,outfname,name,amapf,flog) # leaves the desired summary file - makeGFF(resf=outfname,outfname=gffout,logf=flog,twd=twd,name='rgGLM_TopTable',description=name,topn=topn) + makeGFF(resf=outfname,outfname=gffout,logf=flog,twd=twd,name='rgCaCo_TopTable',description=name,topn=topn) flog.write('\n'.join(logme)) flog.close() # close the log used #shutil.copytree(twd,logoutdir) --- a/tools/rgenetics/rgCaCo.xml Thu Mar 24 20:29:40 2011 -0400 +++ b/tools/rgenetics/rgCaCo.xml Fri Mar 25 13:12:21 2011 -0400 @@ -1,25 +1,19 @@ <tool id="rgCaCo1" name="Case Control:"> - <code file="listFiles.py"/> - <code file="rgCaCo_code.py"/> - <description>for unrelated subjects</description> - <command interpreter="python"> - rgCaCo.py '$i.extra_files_path/$i.metadata.base_name' "$name" - '$out_file1' '$logf' '$logf.files_path' '$gffout' + rgCaCo.py '$i.extra_files_path/$i.metadata.base_name' "$title" '$out_file1' '$logf' '$logf.files_path' '$gffout' </command> - <inputs><param name="i" type="data" label="RGenetics genotype data from your current history" format="pbed" /> - <param name='name' type='text' size="132" value='CaseControl' label="Title for this job"/> + <param name='title' type='text' size="132" value='CaseControl' label="Title for this job"/></inputs><outputs> - <data 
format="tabular" name="out_file1" /> - <data format="txt" name="logf" /> - <data format="gff" name="gffout" /> + <data format="tabular" name="out_file1" label="${title}_rgCaCo.xls" /> + <data format="txt" name="logf" label="${title}_rgCaCo.log"/> + <data format="gff" name="gffout" label="${title}_rgCaCoTop.gff" /></outputs><tests><test> @@ -30,7 +24,7 @@ <composite_data value='tinywga.fam' /><edit_attributes type='name' value='tinywga' /></param> - <param name='name' value='rgCaCotest1' /> + <param name='title' value='rgCaCotest1' /><output name='out_file1' file='rgCaCotest1_CaCo.xls' ftype='tabular' compare='diff' /><output name='logf' file='rgCaCotest1_CaCo_log.txt' ftype='txt' compare='diff' lines_diff='20' /><output name='gffout' file='rgCaCotest1_CaCo_topTable.gff' ftype='gff' compare='diff' /> @@ -79,24 +73,31 @@ spreadsheet data, choose the .xls tab delimited format. It's not a stupid binary excel file. Just a plain old tab delimited one with a header. Fortunately excel is dumb enough to open these without much protest. + ----- +.. class:: infomark + **Attribution** -When you click 'execute', this Galaxy tool will run Plink (from Shaun Purcell) for you (which is GPL, so this must be too I guess) for calculations. For full Plink -attribution, source code and documentation, please see http://pngu.mgh.harvard.edu/~purcell/plink/ +This Galaxy tool relies on Plink (see Plinksrc_) to test Casae Control association models. -The Plink output files will be adjusted into UCSC compatible tracks - gg or wig, or else as tab delimited -for you spreadsheet junkies out there. Python is used for all glue and data format yoga. -Originally designed and written for the Rgenetics Galaxy tools by -ross lazarus (ross spot lazarus ate gmail spot com), who didn't write -either Galaxy or Plink but wishes he had. +So, we rely on the author (Shaun Purcell) for the documentation you need specific to those settings - they are very nicely documented - see +DOC_ -copyright Ross Lazarus 2007 -Licensed under the terms of the LGPL as documented http://www.gnu.org/licenses/lgpl.html -but is about as useful as a chocolate teapot without Plink which is GPL. +Tool and Galaxy datatypes originally designed and written for the Rgenetics +series of whole genome scale statistical genetics tools by ross lazarus (ross.lazarus@gmail.com) + +Copyright Ross Lazarus March 2007 +This Galaxy wrapper is released licensed under the LGPL_ but is about as useful as a chocolate teapot without Plink which is GPL. I'm no lawyer, but it looks like you got GPL if you use this software. Good luck. +.. _Plinksrc: http://pngu.mgh.harvard.edu/~purcell/plink/ + +.. _LGPL: http://www.gnu.org/copyleft/lesser.html + +.. 
_DOC: http://pngu.mgh.harvard.edu/~purcell/plink/anal.shtml#cc + </help></tool> --- a/tools/rgenetics/rgCaCo_code.py Thu Mar 24 20:29:40 2011 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,45 +0,0 @@ -# for caco -# called as plinkCaCo.py $i $name $test $outformat $out_file1 $logf $map -from galaxy import datatypes -import time,string,sys - -def timenow(): - """return current time as a string - """ - return time.strftime('%d/%m/%Y %H:%M:%S', time.localtime(time.time())) - -def exec_after_process(app, inp_data, out_data, param_dict, tool, stdout, stderr): - """Sets the name of the data""" - job_name = param_dict.get( 'name', 'RgCaCo' ).encode() - killme = string.punctuation + string.whitespace - trantab = string.maketrans(killme,'_'*len(killme)) - title = job_name.translate(trantab) - indatname = inp_data['i'].name - outxls = ['tabular','%s_CaCo.xls' % job_name] - logtxt = ['txt','%s_CaCo_log.txt' % job_name] - ggout = ['gff','%s_CaCo_topTable.gff' % job_name] - lookup={} - lookup['out_file1'] = outxls - lookup['logf'] = logtxt - lookup['gffout'] = ggout - info = '%s rgCaCo from %s at %s' % (job_name,indatname,timenow()) - for name in lookup.keys(): - data = out_data[name] - data_type,newname = lookup[name] - data.name = newname - data.info = info - out_data[name] = data - app.model.context.flush() - - -def get_test_opts(): - """return test options""" - dat = [('Armitage test for trend chisq. Does NOT assume HWE!','TREND',True), - ('Allelic (A vs a) chisq. assumes HWE','ALLELIC',False), - ('Genotype (AA vs Aa vs aa)chisq. assumes HWE','GENO',False), - ('Dominant model (AA/Aa vs aa) chisq. assumesWE','DOM',False), - ('Recessive (AA vs Aa/aa) chisq. assumes HWE','REC',False)] - dat.reverse() - return dat - - --- a/tools/rgenetics/rgClean.xml Thu Mar 24 20:29:40 2011 -0400 +++ b/tools/rgenetics/rgClean.xml Fri Mar 25 13:12:21 2011 -0400 @@ -1,6 +1,4 @@ <tool id="rgClean1" name="Clean genotypes:"> - <code file="rgClean_code.py"/> - <description>filter markers, subjects</description><command interpreter="python"> @@ -47,7 +45,7 @@ </inputs><outputs> - <data format="pbed" name="out_file1" metadata_source="input_file" /> + <data format="pbed" name="out_file1" metadata_source="input_file" label="${title}_rgClean.pbed" /></outputs><tests> --- a/tools/rgenetics/rgClean_code.py Thu Mar 24 20:29:40 2011 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,28 +0,0 @@ -from galaxy import app -import os, string, time - -def timenow(): - """return current time as a string - """ - return time.strftime('%d/%m/%Y %H:%M:%S', time.localtime(time.time())) - - - -def exec_after_process(app, inp_data, out_data, param_dict, tool, stdout, stderr): - name,data = out_data.items()[0] - basename = param_dict['title'] - basename = basename.encode() - killme = string.punctuation + string.whitespace - trantab = string.maketrans(killme,'_'*len(killme)) - title = basename.translate(trantab) - info = '%s filtered by rgClean.py at %s' % (title,timenow()) - data.change_datatype('pbed') - #data.file_name = data.file_name - data.metadata.base_name = title - data.name = '%s.pbed' % title - data.info = info - data.readonly = True - app.model.context.flush() - - - --- a/tools/rgenetics/rgEigPCA.xml Thu Mar 24 20:29:40 2011 -0400 +++ b/tools/rgenetics/rgEigPCA.xml Fri Mar 25 13:12:21 2011 -0400 @@ -1,5 +1,4 @@ <tool id="rgEigPCA1" name="Eigensoft:"> - <code file="rgEigPCA_code.py"/><description>PCA Ancestry using SNP</description><command interpreter="python"> @@ -27,12 +26,10 @@ </inputs><outputs> - <data 
name="out_file1" format="html" /> - <data name="pca" format="txt" /> + <data name="out_file1" format="html" label="${title}_rgEig.html"/> + <data name="pca" format="txt" label="${title}_rgEig.txt"/></outputs> -<!-- python $TOOLPATH/$TOOL.py "$INPATH/tinywga" "$NPRE" ${OUTPATH}/${NPRE}.html $OUTPATH 4 2 2 2 $OUTPATH/pca.out $BINPATH --> - <tests><test><param name='i' value='tinywga' ftype='ldindep' > @@ -59,17 +56,45 @@ **Syntax** -- **Genotype data** is the input genotype file chosen from available library files. -- **Title** is used to name the output files -- **Tuning parameters** documented in the Eigensoft documentation - see below +- **Genotype data** is an input genotype dataset in Plink lped (http://pngu.mgh.harvard.edu/~purcell/plink/data.shtml) format. See below for notes +- **Title** is used to name the output files so you can remember what the outputs are for +- **Tuning parameters** are documented in the Eigensoft (http://genepath.med.harvard.edu/~reich/Software.htm) documentation - see below -(Note that you may need to convert an existing genotype file into that format to use this tool) ----- **Summary** +Eigensoft requires ld-reduced genotype data. +Galaxy has an automatic converter for genotype data in Plink linkage pedigree (lped) format. +For details of this generic genotype format, please see the Plink documentation at +http://pngu.mgh.harvard.edu/~purcell/plink/data.shtml + +Reading that documentation, you'll see that the linkage pedigree format is really two related files with the same +file base name - a map and ped file - eg 'mygeno.ped' and 'mygeno.map'. +The map file has the chromosome, offset, genetic offset and snp name corresponding to each +genotype stored as separate alleles in the ped file. The ped file has family id, individual id, father id (or 0), mother id +(or 0), gender (1=male, 2=female, 0=unknown) and affection (1=unaffected, 2=affected, 0=unknown), +then two separate allele columns for each genotype. + +Once you have your data in the right format, you can upload those into your Galaxy history using the "upload" tool. + +To upload your lped data in the upload tool, choose 'lped' as the 'file format'. The tool form will change to +allow you to navigate to and select each member of the pair of ped and map files stored on your local computer +(or available at a public URL for Galaxy to grab). +Give the dataset a meaningful name (replace rgeneticsData with something more useful!) and click execute. + +When the upload is done, your new lped format dataset will appear in your history and then, +when you choose the ancestry tool, that history dataset will be available as input. + +**Warning for the Impatient** + +When you execute the tool, it will look like it has not started running for a while as the automatic converter +reduces the amount of LD - otherwise eigenstrat gives biased results. + + **Attribution** + This tool runs and relies on the work of many others, including the maintainers of the Eigensoft program, and the R and Bioconductor projects. For full attribution, source code and documentation, please see @@ -84,7 +109,7 @@ Licensed under the terms of the LGPL as documented http://www.gnu.org/licenses/lgpl.html but is about as useful as a sponge boat without EIGENSOFT pca code. 
-**README from eigensoft2** +**README from eigensoft2 distribution at http://genepath.med.harvard.edu/~reich/Software.htm** [rerla@beast eigensoft2]$ cat README EIGENSOFT version 2.0, January 2008 (for Linux only) --- a/tools/rgenetics/rgEigPCA_code.py Thu Mar 24 20:29:40 2011 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,42 +0,0 @@ -from galaxy import datatypes,model -import sys,time,string,shutil,os - -def timenow(): - """return current time as a string - """ - return time.strftime('%d/%m/%Y %H:%M:%S', time.localtime(time.time())) - -def exec_after_process(app, inp_data, out_data, param_dict, tool, stdout, stderr): - """Sets the name of the html output file - if we created a set of ldreduced files, we need to move them into the input files_path - so it doesn't need to be done again - """ - indat = inp_data['i'] - indatname = indat.name - base_name = indat.metadata.base_name - todir = indat.extra_files_path # where the ldreduced stuff should be - job_name = param_dict.get( 'title', 'Eigenstrat run' ) - job_name = job_name.encode() - killme = string.punctuation + string.whitespace - trantab = string.maketrans(killme,'_'*len(killme)) - job_name = job_name.translate(trantab) - info = '%s rgEigPCA2 on %s at %s' % (job_name,indatname,timenow()) - exts = ['html','txt'] - for i,ofname in enumerate(['out_file1','pca']): - data = out_data[ofname] - ext = exts[i] - newname = '%s.%s' % (job_name,ext) - data.name = newname - data.info = info - out_data[ofname] = data - if i == 0: - fromdir = data.extra_files_path - ldfname = '%s_INDEP_THIN' % base_name # we store ld reduced and thinned data - ldout = os.path.join(todir,ldfname) - ldin = os.path.join(fromdir,ldfname) - if os.path.exists('%s.bed' % ldin) and not os.path.exists('%s.bed' % ldout): # copy ldreduced to input for next time - for ext in ['bim','bed','fam']: - src = '%s.%s' % (ldin,ext) - dest = '%s.%s' % (ldout,ext) - shutil.copy(src,dest) - app.model.context.flush() --- a/tools/rgenetics/rgGLM.py Thu Mar 24 20:29:40 2011 -0400 +++ b/tools/rgenetics/rgGLM.py Fri Mar 25 13:12:21 2011 -0400 @@ -95,6 +95,7 @@ outf.close() + def xformQassoc(resf='',outfname='',logf=None,twd='.'): """ plink.assoc.linear to gg file from the docs --- a/tools/rgenetics/rgGLM.xml Thu Mar 24 20:29:40 2011 -0400 +++ b/tools/rgenetics/rgGLM.xml Fri Mar 25 13:12:21 2011 -0400 @@ -1,18 +1,15 @@ <tool id="rgGLM1" name="Linear Models:" version="0.2"> - <code file="listFiles.py"/> + <description>for genotype data</description><code file="rgGLM_code.py"/> - - <description>for genotype data</description> - <command interpreter="python"> rgGLM.py '$i.extra_files_path/$i.metadata.base_name' '$phef.extra_files_path/$phef.metadata.base_name' - "$title1" '$predvar' '$covar' '$out_file1' '$logf' '$i.metadata.base_name' + "$title" '$predvar' '$covar' '$out_file1' '$logf' '$i.metadata.base_name' '$inter' '$cond' '$gender' '$mind' '$geno' '$maf' '$logistic' '$gffout' </command><inputs><page> - <param name='title1' label='Title for outputs' type='text' value='GLM' size="80" /> + <param name='title' label='Title for outputs' type='text' value='GLM' size="80" /><param name="i" type="data" format="pbed" label="Genotype file" size="80" /><param name="phef" type="data" format="pphe" label="Phenotype file" size="80" help="Dependent variable and covariates will be chosen from this file on the next page"/> @@ -41,9 +38,9 @@ </inputs><outputs> - <data format="tabular" name="out_file1" /> - <data format="txt" name="logf" /> - <data format="gff" name="gffout" /> + <data format="tabular" 
name="out_file1" label="${title}_rgGLM.xls"/> + <data format="txt" name="logf" label="${title}_rgGLMlog.txt" /> + <data format="gff" name="gffout" label="${title}_rgGLM.gff"/></outputs><tests><test> @@ -59,7 +56,7 @@ <composite_data value='tinywga.pphe' /><edit_attributes type='name' value='tinywga' /></param> - <param name='title1' value='rgGLMtest1' /> + <param name='title' value='rgGLMtest1' /><param name='predvar' value='c1' /><param name='covar' value='None' /><param name='inter' value='0' /> @@ -124,18 +121,26 @@ **Attribution** -This tool allows you to control settings for models using Plink linear models. So, we rely on the author (Shaun Purcell) -for the documentation you need specific to those settings - they are very nicely documented at -http://pngu.mgh.harvard.edu/~purcell/plink/anal.shtml#glm +This Galaxy tool relies on Plink (see Plinksrc_) to test GLM models. + +So, we rely on the author (Shaun Purcell) for the documentation you need specific to those settings - they are very nicely documented - see +DOC_ Tool and Galaxy datatypes originally designed and written for the Rgenetics series of whole genome scale statistical genetics tools by ross lazarus (ross.lazarus@gmail.com) -supported by NIH grant -Shaun Purcell created and maintains Plink -Please acknowledge your use of this tool, Galaxy and Plink in your publications and let -us know so we can keep track. These tools all rely on highly competitive grant funding -so your letting us know about publications is important to our ongoing support. +Copyright Ross Lazarus March 2007 +This Galaxy wrapper is released licensed under the LGPL_ but is about as useful as a chocolate teapot without Plink which is GPL. + +I'm no lawyer, but it looks like you got GPL if you use this software. Good luck. + +.. _Plinksrc: http://pngu.mgh.harvard.edu/~purcell/plink/ + +.. _LGPL: http://www.gnu.org/copyleft/lesser.html + +.. 
_DOC: http://pngu.mgh.harvard.edu/~purcell/plink/anal.shtml#glm + </help></tool> + --- a/tools/rgenetics/rgGLM_code.py Thu Mar 24 20:29:40 2011 -0400 +++ b/tools/rgenetics/rgGLM_code.py Fri Mar 25 13:12:21 2011 -0400 @@ -2,17 +2,6 @@ import os,string,time from galaxy import datatypes -def timenow(): - """return current time as a string - """ - return time.strftime('%d/%m/%Y %H:%M:%S', time.localtime(time.time())) - -def get_out_formats(): - """return options for formats""" - dat = [['ucsc track','wig'],['ucsc genome graphs','gg'],['tab delimited','xls']] - dat = [(x[0],x[1],False) for x in dat] - dat.reverse() - return dat def get_phecols(phef='',selectOne=0): """return column names """ @@ -32,28 +21,3 @@ res = [('None','no phenotype columns found',False),] return res - -def exec_after_process(app, inp_data, out_data, param_dict, tool, stdout, stderr): - """Sets the name of the data""" - killme=string.punctuation+string.whitespace - trantab = string.maketrans(killme,'_'*len(killme)) - job_name = param_dict.get( 'title1', 'GLM' ) - job_name = job_name.encode().translate(trantab) - outxls = ['tabular','%s_GLM.xls' % job_name] - logtxt = ['txt','%s_GLM_log.txt' % job_name] - ggout = ['gg','%s_GLM_topTable.gff' % job_name] - lookup={} - lookup['out_file1'] = outxls - lookup['logf'] = logtxt - lookup['gffout'] = ggout - info = '%s GLM output by rgGLM created at %s' % (job_name,timenow()) - for name in lookup.keys(): - data = out_data[name] - data_type,newname = lookup.get(name,(None,None)) - if data_type <> None: - data.name = newname - data.info = info - out_data[name] = data - else: - print >> stdout,'no output matching %s in exec after hook for rgGLM' % name - app.model.context.flush() --- a/tools/rgenetics/rgGRR.xml Thu Mar 24 20:29:40 2011 -0400 +++ b/tools/rgenetics/rgGRR.xml Fri Mar 25 13:12:21 2011 -0400 @@ -1,20 +1,19 @@ <tool id="rgGRR1" name="GRR:"> - <code file="rgGRR_code.py"/><description>Pairwise Allele Sharing</description><command interpreter="python"> rgGRR.py $i.extra_files_path/$i.metadata.base_name "$i.metadata.base_name" - '$out_file1' '$out_file1.files_path' "$title1" '$n' '$Z' + '$out_file1' '$out_file1.files_path' "$title" '$n' '$Z' </command><inputs><param name="i" type="data" label="Genotype data file from your current history" format="ldindep" /> - <param name='title1' type='text' size="80" value='rgGRR' label="Title for this job"/> + <param name='title' type='text' size="80" value='rgGRR' label="Title for this job"/><param name="n" type="integer" label="N snps to use (0=all)" value="5000" /><param name="Z" type="float" label="Z score cutoff for outliers (eg 2)" value="6" help="2 works but for very large numbers of pairs, you might want to see less than 5%" /></inputs><outputs> - <data format="html" name="out_file1" /> + <data format="html" name="out_file1" label="${title}_rgGRR.html"/></outputs><tests> @@ -26,7 +25,7 @@ <composite_data value='tinywga.fam' /><edit_attributes type='name' value='tinywga' /></param> - <param name='title1' value='rgGRRtest1' /> + <param name='title' value='rgGRRtest1' /><param name='n' value='100' /><param name='Z' value='6' /><param name='force' value='true' /> --- a/tools/rgenetics/rgGRR_code.py Thu Mar 24 20:29:40 2011 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,37 +0,0 @@ -from galaxy import datatypes,model -import sys,time,string,os,shutil - -def timenow(): - """return current time as a string - """ - return time.strftime('%d/%m/%Y %H:%M:%S', time.localtime(time.time())) - -def exec_after_process(app, inp_data, 
out_data, param_dict, tool, stdout, stderr): - """Sets the name of the html output file - """ - killme = string.punctuation + string.whitespace - trantab = string.maketrans(killme,'_'*len(killme)) - job_name = param_dict.get( 'title1', 'rgGRR' ) - job_name = job_name.encode() - newname = '%s.html' % job_name.translate(trantab) - indatname = inp_data['i'].name - info = '%s Mean allele sharing on %s at %s' % (job_name,indatname,timenow()) - ofname = 'out_file1' - data = out_data[ofname] - data.name = newname - data.info = info - out_data[ofname] = data - fromdir = data.extra_files_path - indat = inp_data['i'] - indatname = indat.name - base_name = indat.metadata.base_name - todir = indat.extra_files_path # where the ldreduced stuff should be - ldfname = '%s_INDEP_THIN' % base_name # we store ld reduced and thinned data - ldout = os.path.join(todir,ldfname) - ldin = os.path.join(fromdir,ldfname) - if os.path.exists('%s.bed' % ldin) and not os.path.exists('%s.bed' % ldout): # copy ldreduced to input for next time - for ext in ['bim','bed','fam']: - src = '%s.%s' % (ldin,ext) - dest = '%s.%s' % (ldout,ext) - shutil.copy(src,dest) - app.model.context.flush() --- a/tools/rgenetics/rgGTOOL.xml Thu Mar 24 20:29:40 2011 -0400 +++ b/tools/rgenetics/rgGTOOL.xml Fri Mar 25 13:12:21 2011 -0400 @@ -1,6 +1,5 @@ <tool id="rgGTOOL1" name="Converter"> - <code file="listFiles.py"/> - <code file="rgGTOOL_code.py"/> + <description>from linkage format to SNPTEST Marchini files</description> --- a/tools/rgenetics/rgGTOOL_code.py Thu Mar 24 20:29:40 2011 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,19 +0,0 @@ -# for GTOOL linkage ped to marchini snptest format -from galaxy import datatypes - - -def exec_before_job(app, inp_data, out_data, param_dict, tool=None): - """Sets the name of the data""" - job_name = param_dict.get( 'o', 'Marchini' ) - log = ['txt','%s_rgGTOOL_report.log' % job_name,'txt'] - lookup={} - lookup['logf'] = log - for name in lookup.keys(): - data = out_data[name] - data_type,newname,ext = lookup[name] - data = app.datatypes_registry.change_datatype(data, data_type) - data.name = newname - out_data[name] = data - - - --- a/tools/rgenetics/rgHaploView.py Thu Mar 24 20:29:40 2011 -0400 +++ b/tools/rgenetics/rgHaploView.py Fri Mar 25 13:12:21 2011 -0400 @@ -105,15 +105,8 @@ self.lf = file(self.log_file,'w') s = 'PATH=%s\n' % os.environ.get('PATH','?') self.lf.write(s) - - def setupRegions(self): - """ - """ - chromosome = '' - spos = epos = -9 - rslist = [] - rsdict = {} + def getRs(self): if self.region > '': useRs = [] useRsdict={} @@ -138,6 +131,20 @@ else: useRs = self.orslist.split() # galaxy replaces newlines with XX - go figure useRsdict = dict(zip(useRs,useRs)) + return useRs, useRsdict + + + def setupRegions(self): + """ + This turns out to be complex because we allow the user + flexibility - paste a list of rs or give a region. + In most cases, some subset has to be generated correctly before running Haploview + """ + chromosome = '' + spos = epos = -9 + rslist = [] + rsdict = {} + useRs,useRsdict = self.getRs() self.useTemp = False try: dfile = open(self.DATA_FILE, 'r') @@ -284,7 +291,60 @@ self.lf.write(s) print >> sys.stdout,s sys.exit(1) - + + def run(self,vcl): + """ + """ + p=subprocess.Popen(vcl,shell=True,cwd=self.outfpath,stderr=self.lf,stdout=self.lf) + retval = p.wait() + self.lf.write('## executing %s returned %d\n' % (vcl,retval)) + + def plotHmPanels(self,ste): + """ + """ + sp = '%d' % (self.spos/1000.) # hapmap wants kb + ep = '%d' % (self.epos/1000.) 
+ fnum=0 + for panel in self.hmpanels: + if panel > '' and panel.lower() <> 'none': # in case someone checks that option too :) + ptran = panel.strip() + ptran = ptran.replace('+','_') + fnum += 1 # preserve an order or else we get sorted + vcl = [javabin,'-jar',self.hvbin,'-n','-memory','%d' % self.memSize, + '-chromosome',self.chromosome, '-panel',panel.strip(), + '-hapmapDownload','-startpos',sp,'-endpos',ep, + '-ldcolorscheme',self.ldType] + if self.minMaf: + vcl += ['-minMaf','%f' % self.minMaf] + if self.maxDist: + vcl += ['-maxDistance',self.maxDist] + if self.hiRes: + vcl.append('-png') + else: + vcl.append('-compressedpng') + if self.infotrack: + vcl.append('-infoTrack') + p=subprocess.Popen(' '.join(vcl),shell=True,cwd=self.outfpath,stderr=ste,stdout=self.lf) + retval = p.wait() + inpng = 'Chromosome%s%s.LD.PNG' % (self.chromosome,panel) + inpng = inpng.replace(' ','') # mysterious spaces! + outpng = '%d_HapMap_%s_%s.png' % (fnum,ptran,self.chromosome) + # hack for stupid chb+jpt + outpng = outpng.replace(' ','') + tmppng = '%s.tmp.png' % self.title + tmppng = tmppng.replace(' ','') + outpng = os.path.split(outpng)[-1] + vcl = [self.convert, '-resize 800x400!', inpng, tmppng] + self.run(' '.join(vcl)) + s = "text 10,300 'HapMap %s'" % ptran.strip() + vcl = [self.convert, '-pointsize 25','-fill maroon', + '-draw "%s"' % s, tmppng, outpng] + self.run(' '.join(vcl)) + try: + os.remove(os.path.join(self.outfpath,tmppng)) + except: + pass + def doPlots(self): """ """ @@ -308,15 +368,9 @@ vcl += ['-chromosome',self.chromosome] if self.infotrack: vcl.append('-infoTrack') - p=subprocess.Popen(' '.join(vcl),shell=True,cwd=self.outfpath,stderr=ste,stdout=self.lf) - retval = p.wait() - s = '## executing %s returned %d\n' % (' '.join(vcl),retval) - self.lf.write(s) + self.run(' '.join(vcl)) vcl = [self.mogrify, '-resize 800x400!', '*.PNG'] - p=subprocess.Popen(' '.join(vcl),shell=True,cwd=self.outfpath,stderr=self.lf,stdout=self.lf) - retval = p.wait() - s = '## executing %s returned %d\n' % (' '.join(vcl),retval) - self.lf.write(s) + self.run(' '.join(vcl)) inpng = '%s.LD.PNG' % DATA_FILE # stupid but necessary - can't control haploview name mangle inpng = inpng.replace(' ','') inpng = os.path.split(inpng)[-1] @@ -326,89 +380,29 @@ outpng = outpng.replace(' ','') outpng = os.path.split(outpng)[-1] vcl = [self.convert, '-resize 800x400!', inpng, tmppng] - p=subprocess.Popen(' '.join(vcl),shell=True,cwd=self.outfpath,stderr=self.lf,stdout=self.lf) - retval = p.wait() - s = '## executing %s returned %d\n' % (' '.join(vcl),retval) - self.lf.write(s) + self.run(' '.join(vcl)) s = "text 10,300 '%s'" % self.title[:40] vcl = [self.convert, '-pointsize 25','-fill maroon', '-draw "%s"' % s, tmppng, outpng] - p=subprocess.Popen(' '.join(vcl),shell=True,cwd=self.outfpath,stderr=self.lf,stdout=self.lf) - retval = p.wait() - s = '## executing %s returned %d\n' % (' '.join(vcl),retval) - self.lf.write(s) + self.run(' '.join(vcl)) try: os.remove(os.path.join(self.outfpath,tmppng)) except: pass # label all the plots then delete all the .PNG files before munging fnum=1 if self.hmpanels: - sp = '%d' % (self.spos/1000.) # hapmap wants kb - ep = '%d' % (self.epos/1000.) 
- for panel in self.hmpanels: - if panel > '' and panel.lower() <> 'none': # in case someone checks that option too :) - ptran = panel.strip() - ptran = ptran.replace('+','_') - fnum += 1 # preserve an order or else we get sorted - vcl = [javabin,'-jar',self.hvbin,'-n','-memory','%d' % self.memSize, - '-chromosome',self.chromosome, '-panel',panel.strip(), - '-hapmapDownload','-startpos',sp,'-endpos',ep, - '-ldcolorscheme',self.ldType] - if self.minMaf: - vcl += ['-minMaf','%f' % self.minMaf] - if self.maxDist: - vcl += ['-maxDistance',self.maxDist] - if self.hiRes: - vcl.append('-png') - else: - vcl.append('-compressedpng') - if self.infotrack: - vcl.append('-infoTrack') - p=subprocess.Popen(' '.join(vcl),shell=True,cwd=self.outfpath,stderr=ste,stdout=self.lf) - retval = p.wait() - inpng = 'Chromosome%s%s.LD.PNG' % (self.chromosome,panel) - inpng = inpng.replace(' ','') # mysterious spaces! - outpng = '%d_HapMap_%s_%s.png' % (fnum,ptran,self.chromosome) - # hack for stupid chb+jpt - outpng = outpng.replace(' ','') - tmppng = '%s.tmp.png' % self.title - tmppng = tmppng.replace(' ','') - outpng = os.path.split(outpng)[-1] - vcl = [self.convert, '-resize 800x400!', inpng, tmppng] - p=subprocess.Popen(' '.join(vcl),shell=True,cwd=self.outfpath,stderr=self.lf,stdout=self.lf) - retval = p.wait() - s = '## executing %s returned %d\n' % (' '.join(vcl),retval) - self.lf.write(s) - s = "text 10,300 'HapMap %s'" % ptran.strip() - vcl = [self.convert, '-pointsize 25','-fill maroon', - '-draw "%s"' % s, tmppng, outpng] - p=subprocess.Popen(' '.join(vcl),shell=True,cwd=self.outfpath,stderr=self.lf,stdout=self.lf) - retval = p.wait() - s = '## executing %s returned %d\n' % (' '.join(vcl),retval) - self.lf.write(s) - try: - os.remove(os.path.join(self.outfpath,tmppng)) - except: - pass + self.plotHmPanels(ste) nimages = len(glob.glob(os.path.join(self.outfpath,'*.png'))) # rely on HaploView shouting - PNG @! self.lf.write('### nimages=%d\n' % nimages) if nimages > 0: # haploview may fail? vcl = '%s -format pdf -resize 800x400! 
*.png' % self.mogrify - p=subprocess.Popen(vcl,shell=True,cwd=self.outfpath,stderr=self.lf,stdout=self.lf) - retval = p.wait() - self.lf.write('## executing %s returned %d\n' % (vcl,retval)) + self.run(vcl) vcl = '%s *.pdf --fitpaper true --outfile alljoin.pdf' % self.pdfjoin - p=subprocess.Popen(vcl,shell=True,cwd=self.outfpath,stderr=self.lf,stdout=self.lf) - retval = p.wait() - self.lf.write('## executing %s returned %d\n' % (vcl,retval)) + self.run(vcl) vcl = '%s alljoin.pdf --nup 1x%d --outfile allnup.pdf' % (self.pdfnup,nimages) - p=subprocess.Popen(vcl,shell=True,cwd=self.outfpath,stderr=self.lf,stdout=self.lf) - retval = p.wait() - self.lf.write('## executing %s returned %d\n' % (vcl,retval)) + self.run(vcl) vcl = '%s -resize x300 allnup.pdf allnup.png' % (self.convert) - p=subprocess.Popen(vcl,shell=True,cwd=self.outfpath,stderr=self.lf,stdout=self.lf) - retval = p.wait() - self.lf.write('## executing %s returned %d\n' % (vcl,retval)) + self.run(vcl) ste.close() # temp file used to catch haploview blather hblather = open(blog,'r').readlines() # to catch the blather os.unlink(blog) @@ -417,6 +411,10 @@ self.lf.write(''.join(hblather)) self.lf.write('\n') self.lf.close() + + def writeHtml(self): + """ + """ flist = glob.glob(os.path.join(self.outfpath, '*')) flist.sort() ts = '!"#$%&\'()*+,-/:;<=>?@[\\]^_`{|}~' + string.whitespace @@ -509,6 +507,7 @@ sys.exit(1) ld = ldPlot(argv = sys.argv) ld.doPlots() + ld.writeHtml() --- a/tools/rgenetics/rgHaploView_code.py Thu Mar 24 20:29:40 2011 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,84 +0,0 @@ -from galaxy import app -import galaxy.util,string,os,glob,time - -librepos = '/usr/local/galaxy/data/rg/library' -myrepos = '/home/rerla/galaxy' - - -#Provides Upload tool with access to list of available builds - -builds = [] -#Read build names and keys from galaxy.util -for dbkey, build_name in galaxy.util.dbnames: - builds.append((build_name,dbkey,False)) - -def timenow(): - """return current time as a string - """ - return time.strftime('%d/%m/%Y %H:%M:%S', time.localtime(time.time())) - -#Return available builds -def get_available_builds(defval='hg18'): - for i,x in enumerate(builds): - if x[1] == defval: - x = list(x) - x[2] = True - builds[i] = tuple(x) - return builds - -#return available datasets for build -def get_available_data( file_type_dir, build='hg18' ): - #we need to allow switching of builds, and properly set in post run hook - import_files = [('Use the History file chosen below','',True),] - flist = glob.glob(os.path.join(librepos,file_type_dir,'*.ped')) - maplist = glob.glob(os.path.join(librepos,file_type_dir,'*.map')) - maplist = [os.path.splitext(x)[0] for x in maplist] # get unique filenames - mapdict = dict(zip(maplist,maplist)) - flist = [os.path.splitext(x)[0] for x in flist] # get unique filenames - flist = list(set(flist)) # remove dupes - flist = [x for x in flist if mapdict.get(x,None)] - flist.sort() - for i, data in enumerate( flist ): - #import_files.append( (os.path.split(data)[-1], os.path.split(data)[-1], False) ) - import_files.append((data,data, False) ) - if len(import_files) < 1: - import_files = [('No %s data available - please choose a History file below' - % file_type_dir,'',True),] - return import_files - -def get_available_outexts(uid): - userId = uid - print 'userId=',userId - flib = os.path.join(librepos,userId,'fped') - plib = os.path.join(librepos,userId,'lped') - e = [('Fbat style (header row has marker names)',flib,False),('Linkage style (separate .map file)',plib,True)] - return e 
- -def getcols(fname="/usr/local/galaxy/data/camp2007/camp2007.xls"): - """return column names other than chr offset as a select list""" - head = open(fname,'r').next() - c = head.strip().split() - res = [(cname,'%d' % n,False) for n,cname in enumerate(c)] - for i in range(4): - x,y,z = res[i] - res[i] = (x,y,True) # set first couple as selected - return res - - -def exec_after_process(app, inp_data, out_data, param_dict, tool, stdout, stderr): - """Sets the name of the html output file - that's all we need - for a html file result set - """ - ts = '%s%s' % (string.punctuation,string.whitespace) - ptran = string.maketrans(ts,'_'*len(ts)) - job_name = param_dict.get( 'title', 'LD_Plot' ) - job_name = job_name.encode().translate(ptran) - ofname = 'out_file1' - data = out_data[ofname] - newname = job_name - data.name = '%s.html' % newname - info = '%s created by rgHaploView at %s' % (job_name,timenow()) - out_data[ofname] = data - app.model.context.flush() - - --- a/tools/rgenetics/rgLDIndep_code.py Thu Mar 24 20:29:40 2011 -0400 +++ b/tools/rgenetics/rgLDIndep_code.py Fri Mar 25 13:12:21 2011 -0400 @@ -15,7 +15,7 @@ trantab = string.maketrans(killme,'_'*len(killme)) title = basename.encode().translate(trantab) info = '%s filtered by rgLDIndep.py at %s' % (title,timenow()) - #data.file_name = data.file_name + data.file_name = data.file_name data.metadata.base_name = title data.name = '%s.pbed' % title data.info = info --- a/tools/rgenetics/rgManQQ.xml Thu Mar 24 20:29:40 2011 -0400 +++ b/tools/rgenetics/rgManQQ.xml Fri Mar 25 13:12:21 2011 -0400 @@ -44,9 +44,9 @@ <param name='grey' value='0' /><output name='out_html' file='rgtestouts/rgManQQ/rgManQQtest1.html' ftype='html' lines_diff='60'><extra_files type="file" name='Allelep_manhattan.png' value='rgtestouts/rgManQQ/Allelep_manhattan.png' compare="sim_size" - delta = "10000"/> + delta = "20000"/><extra_files type="file" name='Allelep_qqplot.png' value='rgtestouts/rgManQQ/Allelep_qqplot.png' compare="sim_size" - delta = "10000" /> + delta = "20000" /><extra_files type="file" name='rgManQQtest1.R' value='rgtestouts/rgManQQ/rgManQQtest1.R' compare="diff" lines_diff="160"/></output></test> --- a/tools/rgenetics/rgPedSub.xml Thu Mar 24 20:29:40 2011 -0400 +++ b/tools/rgenetics/rgPedSub.xml Fri Mar 25 13:12:21 2011 -0400 @@ -1,5 +1,4 @@ <tool id="rgPedSub1" name="Subset markers:"> - <code file="rgPedSub_code.py"/><description>region or rs list</description> @@ -44,7 +43,7 @@ </inputs><outputs> - <data format="lped" name="output1" metadata_source="input1"/> + <data format="lped" name="output1" metadata_source="input1" label="${title}.lped"/></outputs><configfiles> --- a/tools/rgenetics/rgPedSub_code.py Thu Mar 24 20:29:40 2011 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,78 +0,0 @@ -from galaxy import app -import galaxy.util,string,os,glob,time - -#Provides Upload tool with access to list of available builds - -builds = [] -#Read build names and keys from galaxy.util -for dbkey, build_name in galaxy.util.dbnames: - builds.append((build_name,dbkey,False)) - -def timenow(): - """return current time as a string - """ - return time.strftime('%d/%m/%Y %H:%M:%S', time.localtime(time.time())) - - -#Return available builds -def get_available_builds(defval='hg18'): - for i,x in enumerate(builds): - if x[1] == defval: - x = list(x) - x[2] = True - builds[i] = tuple(x) - return builds - -#return available datasets for build -def get_available_data( file_type_dir, build='hg18' ): - #we need to allow switching of builds, and properly set in post run 
hook - import_files = [] # [('Use the History file chosen below','',False),] - flist = glob.glob(os.path.join(librepos,file_type_dir,'*.ped')) - flist = [os.path.splitext(x)[0] for x in flist] # get unique filenames - flist = list(set(flist)) # remove dupes - flist.sort() - for i, data in enumerate( flist ): - #import_files.append( (os.path.split(data)[-1], os.path.split(data)[-1], False) ) - import_files.append((os.path.split(data)[-1],data, False) ) - if len(import_files) < 1: - import_files.append(('No %s data available - please choose a History file instead' - % file_type_dir,'',True)) - return import_files - -def get_available_outexts(uid): - userId = uid - print 'userId=',userId - flib = os.path.join(librepos,userId,'fped') - plib = os.path.join(librepos,userId,'lped') - e = [('Fbat style (header row has marker names)',flib,False),('Linkage style (separate .map file)',plib,True)] - return e - -def getcols(fname="/usr/local/galaxy/data/camp2007/camp2007.xls"): - """return column names other than chr offset as a select list""" - head = open(fname,'r').next() - c = head.strip().split() - res = [(cname,'%d' % n,False) for n,cname in enumerate(c)] - for i in range(4): - x,y,z = res[i] - res[i] = (x,y,True) # set first couple as selected - return res - - -def exec_after_process(app, inp_data, out_data, param_dict, tool, stdout, stderr): - """from rgConvert_code - """ - name,data = out_data.items()[0] - iname,idata = inp_data.items()[0] - basename = idata.metadata.base_name - title = param_dict.get( 'title', 'Lped Subset' ) - title = title.encode() # make str - unicode is evil here - killme = string.punctuation + string.whitespace - trantab = string.maketrans(killme,'_'*len(killme)) - title = title.translate(trantab) - info = '%s filtered by rgPedSub.py at %s' % (title,timenow()) - #data.file_name = data.file_name - data.metadata.base_name = basename - data.name = '%s.lped' % title - data.info = info - app.model.context.flush() - --- a/tools/rgenetics/rgQC.xml Thu Mar 24 20:29:40 2011 -0400 +++ b/tools/rgenetics/rgQC.xml Fri Mar 25 13:12:21 2011 -0400 @@ -1,21 +1,20 @@ <tool id="rgQC1" name="QC reports:"> - <code file="rgQC_code.py"/><description>Marker and Subject measures</description><command interpreter="python"> - rgQC.py -i '$input_file.extra_files_path/$input_file.metadata.base_name' -o "$out_prefix" + rgQC.py -i '$input_file.extra_files_path/$input_file.metadata.base_name' -o "$title" -s '$html_file' -p '$html_file.files_path' </command><inputs><param name="input_file" type="data" label="RGenetics genotype file in compressed Plink format" size="80" format="pbed" /> - <param name="out_prefix" size="80" type="text" value="RgQC report" label="Descriptive report title"/> + <param name="title" size="80" type="text" value="RgQC report" label="Descriptive report title"/></inputs><outputs> - <data format="html" name="html_file" metadata_source="input_file"/> + <data format="html" name="html_file" metadata_source="input_file" label="${title}.html"/></outputs><tests> @@ -27,9 +26,9 @@ <composite_data value='tinywga.fam' /><edit_attributes type='name' value='tinywga' /></param> - <param name='out_prefix' value='rgQCtest1' /> - <output name='html_file' file='rgtestouts/rgQC/rgQCtest1.html' ftype='html' lines_diff='280'> - <extra_files type="file" name='tinywga_All_Paged.pdf' value="rgtestouts/rgQC/tinywga_All_Paged.pdf" compare="sim_size" delta = "5000"/> + <param name='title' value='rgQCtest1' /> + <output name='html_file' file='rgtestouts/rgQC/rgQCtest1.html' ftype='html' lines_diff='300'> + 
<extra_files type="file" name='tinywga_All_Paged.pdf' value="rgtestouts/rgQC/tinywga_All_Paged.pdf" compare="sim_size" delta = "100000"/><extra_files type="file" name='tinywga.log' value="rgtestouts/rgQC/tinywga.log" compare="diff" lines_diff="15"/><extra_files type="file" name='tinywga.frq' value="rgtestouts/rgQC/tinywga.frq" compare="diff" /><extra_files type="file" name='tinywga.het' value="rgtestouts/rgQC/tinywga.het" compare="diff" lines_diff="90"/> @@ -38,7 +37,7 @@ <extra_files type="file" name='tinywga.imiss' value="rgtestouts/rgQC/tinywga.imiss" compare="diff" /><extra_files type="file" name='tinywga.lmendel' value="rgtestouts/rgQC/tinywga.lmendel" compare="diff" /><extra_files type="file" name='tinywga.lmiss' value="rgtestouts/rgQC/tinywga.lmiss" compare="diff" /> - <extra_files type="file" name='tinywga_All_3x3.pdf' value="rgtestouts/rgQC/tinywga_All_3x3.pdf" compare="sim_size" delta="5000"/> + <extra_files type="file" name='tinywga_All_3x3.pdf' value="rgtestouts/rgQC/tinywga_All_3x3.pdf" compare="sim_size" delta="100000"/><extra_files type="file" name='ldp_tinywga.bed' value="rgtestouts/rgQC/ldp_tinywga.bed" compare="diff" lines_diff="10" /><extra_files type="file" name='ldp_tinywga.bim' value="rgtestouts/rgQC/ldp_tinywga.bim" compare="sim_size" delta="1000" /><extra_files type="file" name='ldp_tinywga.fam' value="rgtestouts/rgQC/ldp_tinywga.fam" compare="diff" /> @@ -46,9 +45,9 @@ <extra_files type="file" name='Ranked_Marker_HWE.xls' value="rgtestouts/rgQC/Ranked_Marker_HWE.xls" compare="diff" /><extra_files type="file" name='Ranked_Marker_MAF.xls' value="rgtestouts/rgQC/Ranked_Marker_MAF.xls" compare="diff" /><extra_files type="file" name='Ranked_Marker_Missing_Genotype.xls' value="rgtestouts/rgQC/Ranked_Marker_Missing_Genotype.xls" compare="diff" lines_diff="5"/> - <extra_files type="file" name='Ranked_Subject_Missing_Genotype.xls' value="rgtestouts/rgQC/Ranked_Subject_Missing_Genotype.xls" compare="diff" lines_diff="10"/> + <extra_files type="file" name='Ranked_Subject_Missing_Genotype.xls' value="rgtestouts/rgQC/Ranked_Subject_Missing_Genotype.xls" compare="diff" lines_diff="40"/><extra_files type="file" name='tinywga_fracmiss_cum.jpg' value="rgtestouts/rgQC/tinywga_fracmiss_cum.jpg" compare="sim_size" delta = "20000"/> - <extra_files type="file" name='tinywga_fracmiss_cum.pdf' value="rgtestouts/rgQC/tinywga_fracmiss_cum.pdf" compare="sim_size" delta = "1000"/> + <extra_files type="file" name='tinywga_fracmiss_cum.pdf' value="rgtestouts/rgQC/tinywga_fracmiss_cum.pdf" compare="sim_size" delta = "100000"/></output></test></tests> --- a/tools/rgenetics/rgQC_code.py Thu Mar 24 20:29:40 2011 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,38 +0,0 @@ -""" -# after running the qc, need to rename various output files - <data format="html" name="html_file" /> - <data format="txt" name="log_file" parent="html_file" /> - <data format="tabular" name="marker_file" parent="html_file" /> - <data format="tabular" name="subject_file" parent="html_file" /> - <data format="tabular" name="freq_file" parent="html_file" /> - </outputs> -""" -from galaxy import datatypes,model -import sys,time - -def timenow(): - """return current time as a string - """ - return time.strftime('%d/%m/%Y %H:%M:%S', time.localtime(time.time())) - - -def exec_after_process(app, inp_data, out_data, param_dict, tool, stdout, stderr): - """Change data file names - - """ - job_name = param_dict.get( 'out_prefix', 'rgQCdefault' ) - html = ['html','%s.html' % job_name] - lookup={} - lookup['html_file'] = html - 
info = '%s QC report by rgQC at %s' % (job_name,timenow()) - for aname in lookup.keys(): - data = out_data[aname] - data_type,newname = lookup[aname] - data = app.datatypes_registry.change_datatype(data, data_type) - data.name = newname - data.info = info - out_data[aname] = data - app.model.context.flush() - - - --- a/tools/rgenetics/rgQQ.xml Thu Mar 24 20:29:40 2011 -0400 +++ b/tools/rgenetics/rgQQ.xml Fri Mar 25 13:12:21 2011 -0400 @@ -4,14 +4,14 @@ <description>for p values from an analysis </description><command interpreter="python"> - rgQQ.py "$input1" "$name" "$sample" "$cols" "$allqq" "$height" "$width" "$logtrans" "$allqq.id" "$__new_file_path__" + rgQQ.py "$input1" "$title" "$sample" "$cols" "$allqq" "$height" "$width" "$logtrans" "$allqq.id" "$__new_file_path__" </command><inputs><page><param name="input1" type="data" label="Choose the History dataset containing p values to QQ plot" size="80" format="tabular" help="Dataset missing? See Tip below" /> - <param name="name" type="text" size="80" label = "Descriptive title for QQ plot" value="QQ" /> + <param name="title" type="text" size="80" label = "Descriptive title for QQ plot" value="QQ" /><param name="logtrans" type="boolean" label = "Use a log scale - recommended for p values in range 0-1.0" truevalue="true" falsevalue="false"/> @@ -28,13 +28,13 @@ </inputs><outputs> - <data format="pdf" name="allqq" /> + <data format="pdf" name="allqq" label="${title}.html"/></outputs><tests><test><param name='input1' value='tinywga.pphe' /> - <param name='name' value="rgQQtest1" /> + <param name='title' value="rgQQtest1" /><param name='logtrans' value="false" /><param name='sample' value='1.0' /><param name='height' value='8' /> --- a/tools/rgenetics/rgQQ_code.py Thu Mar 24 20:29:40 2011 -0400 +++ b/tools/rgenetics/rgQQ_code.py Fri Mar 25 13:12:21 2011 -0400 @@ -48,25 +48,3 @@ else: columns = [('?','?',False),] return columns - - -def exec_after_process(app, inp_data, out_data, param_dict, tool, stdout, stderr): - """Sets the name of the data - <outputs> - <data format="pdf" name="allqq" /> - <data format="pdf" name="lowqq" parent="allqq"/> - </outputs> - """ - job_name = param_dict.get( 'name', 'My rgQQplot' ) - killme = string.punctuation + string.whitespace - trantab = string.maketrans(killme,'_'*len(killme)) - newname = '%s.pdf' % job_name.translate(trantab) - lookup={} - lookup['allqq'] = newname - for aname in lookup.keys(): - data = out_data[aname] - newname = lookup[aname] - data.name = newname - out_data[aname] = data - app.model.context.flush() - --- a/tools/rgenetics/rgRegion.xml Thu Mar 24 20:29:40 2011 -0400 +++ b/tools/rgenetics/rgRegion.xml Fri Mar 25 13:12:21 2011 -0400 @@ -1,24 +1,20 @@ <tool id="rgRegion" name="Subset:"> - <code file="rgRegion_code.py"/> - <code file="listFiles.py"/> - <description>genotypes from genomic region</description><command interpreter="python"> - rgRegion.py $infile $r $tag $out_file1 + rgRegion.py $infile $r $title $out_file1 </command><inputs><page><param name="infile" type="data" format="lped" label="Linkage ped genotype file name from current history" size="80"/> + <param name="title" type="text" size="80" label="Title for output files" optional="true" + help="Descriptive title for new genotype/map files" value="RGRegion" /><param name="r" type="text" label="Region" help="Cut and paste a UCSC browser region" size="80" value="chr9:119,506,000-122,518,000"/><param name="rslist" type="text" area="true" label="List of rs numbers" help="Type (or cut and paste) a space or newline separated list of 
rs numbers" size="5x20"/> - - <param name="tag" type="text" label="Output file name" value="My_favourite_region" size="80"/> - <param name="outformat" type="select" label="Output file format" dynamic_options="get_rgRegionOutFormats()" size="80"/> - <param name="dbkey" type="hidden" value="hg18" /> + <param name="outformat" type="select" label="Output file format" dynamic_options="get_rgRegionOutFormats()" size="80"/></page> @@ -26,7 +22,7 @@ </inputs><outputs> - <data format="lped" name="out_file1" /> + <data format="lped" name="out_file1" label="${title}.lped" metadata_source=infile /></outputs><help> --- a/tools/rgenetics/rgRegion_code.py Thu Mar 24 20:29:40 2011 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,47 +0,0 @@ -from galaxy import app -import galaxy.util,string - -librepos = '/usr/local/galaxy/data/rg' -myrepos = '/home/rerla/galaxy' -marchinirepos = '/usr/local/galaxy/data/rg/snptest' - -#Provides Upload tool with access to list of available builds - - -def get_rgRegionOutFormats(): - """return options for formats""" - dat = [['ucsc track','wig',False],['Strict genome graphs (rs+floats)','gg',True],['tab delimited','xls',False]] - dat = [(x[0],x[1],x[2]) for x in dat] - return dat - - -def get_phecols(phef): - """return column names """ - head = open(phef,'r').next() - c = head.strip().split() - res = [(cname,cname,False) for cname in c] - x,y,z = res[2] # 0,1 = fid,iid - res[2] = (x,y,True) # set second selected - return res - - -def getAllcols(fname="/usr/local/galaxy/data/camp2007/camp2007.xls",outformat='gg'): - """return column names other than chr offset as a select list""" - head = open(fname,'r').next() - c = head.strip().split() - res = [(cname,'%d' % n,True) for n,cname in enumerate(c)] - - return res - - -def exec_after_process(app, inp_data, out_data, param_dict, tool, stdout, stderr): - """Sets the name of the data""" - data_name = param_dict.get( 'tag', 'My region' ) - outformat = param_dict.get( 'outformat', 'gg' ) - outfile = param_dict.get( 'outfile1', 'lped' ) - for name, data in out_data.items(): - if name == 'tag': - data = app.datatypes_registry.change_datatype(data, outformat) - data.name = data_name - out_data[name] = data - --- a/tools/rgenetics/rgTDT.py Thu Mar 24 20:29:40 2011 -0400 +++ b/tools/rgenetics/rgTDT.py Fri Mar 25 13:12:21 2011 -0400 @@ -26,6 +26,7 @@ verbose = False + def makeGFF(resf='',outfname='',logf=None,twd='.',name='track name',description='track description',topn=1000): """ score must be scaled to 0-1000 @@ -84,7 +85,7 @@ outres = [] # need to resort into chrom offset order for i,lrow in enumerate(resfl): chrom,snp,offset,p, = [lrow[x] for x in wewant] - gff = ('chr%s' % chrom,'rgGLM','variation','%d' % (int(offset)-halfwidth), + gff = ('chr%s' % chrom,'rgTDT','variation','%d' % (int(offset)-halfwidth), '%d' % (int(offset)+halfwidth),p,'.','.','%s logp=%1.2f' % (snp,pvals[i])) outres.append(gff) outres = [(x[0],int(x[3]),x) for x in outres] # decorate --- a/tools/rgenetics/rgTDT.xml Thu Mar 24 20:29:40 2011 -0400 +++ b/tools/rgenetics/rgTDT.xml Fri Mar 25 13:12:21 2011 -0400 @@ -1,23 +1,21 @@ <tool id="rgTDT1" name="Transmission Distortion:"> - <code file="rgTDT_code.py"/> - <description>for family data</description><command interpreter="python"> - rgTDT.py -i '$i.extra_files_path/$i.metadata.base_name' -o '$title1' + rgTDT.py -i '$i.extra_files_path/$i.metadata.base_name' -o '$title' -r '$out_file1' -l '$logf' -g '$gffout' </command><inputs><param name="i" type="data" label="Genotypes for analysis from your current history 
datasets" size="132" format="pbed" /> - <param name='title1' type='text' value='rgTDT' size="80"/> + <param name='title' type='text' value='rgTDT' label="Title for the output to remind you what you did" size="80"/></inputs><outputs> - <data format="tabular" name="out_file1" /> - <data format="gff" name="gffout" /> - <data format="txt" name="logf" /> + <data format="tabular" name="out_file1" label="${title}_rgTDT.xls"/> + <data format="gff" name="gffout" label="${title}_rgTDT.gff"/> + <data format="txt" name="logf" label="${title}_rgTDTlog.txt"/></outputs><tests> @@ -29,7 +27,7 @@ <composite_data value='tinywga.fam' /><edit_attributes type='name' value='tinywga' /></param> - <param name='title1' value='rgTDTtest1' /> + <param name='title' value='rgTDTtest1' /><output name='out_file1' file='rgTDTtest1_TDT.xls' ftype='tabular' compare="diff"/><output name='gffout' file='rgTDTtest1_TDT_topTable.gff' ftype='gff' compare="diff" /><output name='logf' file='rgTDTtest1_TDT_log.txt' ftype='txt' lines_diff='79'/> @@ -81,5 +79,30 @@ one with a header. Fortunately excel is dumb enough to open these without much protest. +---- + +.. class:: infomark + +**Attribution** + +This Galaxy tool relies on Plink (see Plinksrc_) to test TDT models. + +So, we rely on the author (Shaun Purcell) for the documentation you need specific to those settings - they are very nicely documented - see +DOC_ + +Tool and Galaxy datatypes originally designed and written for the Rgenetics +series of whole genome scale statistical genetics tools by ross lazarus (ross.lazarus@gmail.com) + +Copyright Ross Lazarus March 2007 +This Galaxy wrapper is released licensed under the LGPL_ but is about as useful as a chocolate teapot without Plink which is GPL. + +I'm no lawyer, but it looks like you got GPL if you use this software. Good luck. + +.. _Plinksrc: http://pngu.mgh.harvard.edu/~purcell/plink/ + +.. _LGPL: http://www.gnu.org/copyleft/lesser.html + +.. 
_DOC: http://pngu.mgh.harvard.edu/~purcell/plink/anal.shtml#tdt + </help></tool> --- a/tools/rgenetics/rgTDT_code.py Thu Mar 24 20:29:40 2011 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,43 +0,0 @@ -# before running the qc, need to rename various output files -import time,string - -def timenow(): - """return current time as a string - """ - return time.strftime('%d/%m/%Y %H:%M:%S', time.localtime(time.time())) - -def get_out_formats(): - """return options for formats""" - dat = [['ucsc genome graphs','gg',True],['ucsc track','wig',False],['tab delimited','xls',False]] - dat = [(x[0],x[1],x[2]) for x in dat] - return dat - - -def exec_after_process(app, inp_data, out_data, param_dict, tool, stdout, stderr): - """Sets the name of the data - <command interpreter="python2.4"> - rgTDT.py -i $i.extra_files_path/$i.metadata.base_name -o $title -f $outformat -r $out_file1 -l $logf - </command> - - """ - dbk = param_dict.get('dbkey','hg18') - job_name = param_dict.get( 'title1', 'rgTDTtest1' ) - killme=string.punctuation+string.whitespace - trantab = string.maketrans(killme,'_'*len(killme)) - job_name = job_name.encode().translate(trantab) - outxls = ['tabular','%s_TDT.xls' % job_name] - logtxt = ['txt','%s_TDT_log.txt' % job_name] - ggout = ['gg','%s_TDT_topTable.gff' % job_name] - lookup={} - lookup['out_file1'] = outxls - lookup['logf'] = logtxt - lookup['gffout'] = ggout - info = 'rgTDT run at %s' % timenow() - for name in lookup.keys(): - data = out_data[name] - data_type,newname = lookup[name] - data.name = newname - data.info = info - data.dbkey = dbk - out_data[name] = data - app.model.context.flush() --- a/tools/rgenetics/rgfakePed.xml Thu Mar 24 20:29:40 2011 -0400 +++ b/tools/rgenetics/rgfakePed.xml Fri Mar 25 13:12:21 2011 -0400 @@ -1,14 +1,13 @@ <tool id="rgfakePed1" name="Null genotypes"><description>for testing</description> - <code file="rgfakePed_code.py"/> - <command interpreter="python">rgfakePed.py --title '$title1' + <command interpreter="python">rgfakePed.py --title '$title' -o '$out_file1' -p '$out_file1.files_path' -c '$ncases' -n '$ntotal' -s '$nsnp' -w '$lowmaf' -v '$missingValue' -l '$outFormat' -d '$mafdist' -m '$missingRate' -M '$mendelRate' </command><inputs><page> - <param name="title1" + <param name="title" type="text" help="Name for outputs from this job" label="Descriptive short name"/> @@ -60,11 +59,11 @@ </inputs><outputs> - <data format="lped" name="out_file1" /> + <data format="lped" name="out_file1" label="${title}.lped" /></outputs><tests><test> - <param name='title1' value='rgfakePedtest1' /> + <param name='title' value='rgfakePedtest1' /><param name="ntotal" value="40" /><param name="ncases" value="20" /><param name="nsnp" value="10" /> --- a/tools/rgenetics/rgfakePed_code.py Thu Mar 24 20:29:40 2011 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,53 +0,0 @@ -import os, string, time -from galaxy import datatypes -from galaxy import app -import galaxy.util - -#Provides Upload tool with access to list of available builds -repospath = '/usr/local/galaxy/data/rg' -builds = [] -#Read build names and keys from galaxy.util -for dbkey, build_name in galaxy.util.dbnames: - builds.append((build_name,dbkey,False)) - -def timenow(): - """return current time as a string - """ - return time.strftime('%d/%m/%Y %H:%M:%S', time.localtime(time.time())) - -#Return available builds -def get_available_builds(defval='hg18'): - for i,x in enumerate(builds): - if x[1] == defval: - x = list(x) - x[2] = True - builds[i] = tuple(x) - return builds - -# Create 
link to files here -def exec_after_process(app, inp_data, out_data, param_dict, tool, stdout, stderr): - base_name = param_dict.get( 'title1', 'Null_Genotypes' ) - base_name = base_name.encode() - outFormat = param_dict.get('outFormat','P') - s = string.whitespace + string.punctuation - ptran = string.maketrans(s,'_'*len(s)) - base_name = base_name.translate(ptran) - pname = "out_file1" - lookup = {'L':('linkped','lped'),'F':('FBAT','fped')} - # /usr/local/galaxy/data/rg/1/lped/ - info = 'Null phenotypes created by rgfakePhe at %s' % timenow() - data = out_data[pname] - ftype,pext = lookup[outFormat] - data.name = '%s.%s' % (base_name,pext) # that's how they've been named in rgfakePhe.py - data.change_datatype(pext) - #data.file_name = data.file_name - data.metadata.base_name = base_name - data.readonly = True - out_data[pname] = data - app.model.context.flush() - - - - - - --- a/tools/rgenetics/rgfakePhe_code.py Thu Mar 24 20:29:40 2011 -0400 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,46 +0,0 @@ -# before running the qc, need to rename various output files -import os, string, time -from galaxy import datatypes -from galaxy import app -import galaxy.util - -#Provides Upload tool with access to list of available builds -repospath = '/usr/local/galaxy/data/rg' -builds = [] -#Read build names and keys from galaxy.util -for dbkey, build_name in galaxy.util.dbnames: - builds.append((build_name,dbkey,False)) - -def timenow(): - """return current time as a string - """ - return time.strftime('%d/%m/%Y %H:%M:%S', time.localtime(time.time())) - -#Return available builds -def get_available_builds(defval='hg18'): - for i,x in enumerate(builds): - if x[1] == defval: - x = list(x) - x[2] = True - builds[i] = tuple(x) - return builds - -# Create link to files here -def exec_after_process(app, inp_data, out_data, param_dict, tool, stdout, stderr): - base_name = param_dict.get( 'title1', 'Null_Phenotype' ) - base_name = base_name.encode() - info = 'Null phenotypes created by rgfakePhe at %s' % timenow() - s = string.whitespace + string.punctuation - ptran = string.maketrans(s,'_'*len(s)) - base_name = base_name.translate(ptran) - data = out_data['ppheout'] - data.name = '%s.phe' % (base_name) # that's how they've been named in rgfakePhe.py - #data.file_name = data.file_name - data.metadata.base_name = base_name - out_data['pheout'] = data - app.model.context.flush() - - - - - --- a/tools/rgenetics/rgutils.py Thu Mar 24 20:29:40 2011 -0400 +++ b/tools/rgenetics/rgutils.py Fri Mar 25 13:12:21 2011 -0400 @@ -5,6 +5,7 @@ # import subprocess, os, sys, time, tempfile,string,plinkbinJZ +import datetime galhtmlprefix = """<?xml version="1.0" encoding="utf-8" ?><!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> @@ -30,6 +31,9 @@ """ return time.strftime('%d/%m/%Y %H:%M:%S', time.localtime(time.time())) +def timestamp(): + return datetime.datetime.now().strftime('%Y%m%d%H%M%S') + def fail( message ): print >> sys.stderr, message return -1 @@ -41,6 +45,401 @@ return os.path.join(path, program) return None +def makeGFF(resf='',outfname='',logf=None,twd='.',name='track name',description='track description',topn=1000,pname='LOG10ARMITAGEP',rgname='rgGLM'): + """ + score must be scaled to 0-1000 + + Want to make some wig tracks from each analysis + Best n -log10(p). Make top hit the window. 
+ we use our tab output which has + rs chrom offset ADD_stat ADD_p ADD_log10p + rs3094315 1 792429 1.151 0.2528 0.597223 + + """ + + def is_number(s): + try: + float(s) + return True + except ValueError: + return False + header = 'track name=%s description="%s" visibility=2 useScore=1 color=0,60,120\n' % (name,description) + column_names = [ 'Seqname', 'Source', 'Feature', 'Start', 'End', 'Score', 'Strand', 'Frame', 'Group' ] + halfwidth=100 + resfpath = os.path.join(twd,resf) + resf = open(resfpath,'r') + resfl = resf.readlines() # dumb but convenient for millions of rows + resfl = [x.split() for x in resfl] + headl = resfl[0] + resfl = resfl[1:] + headl = [x.strip().upper() for x in headl] + headIndex = dict(zip(headl,range(0,len(headl)))) + chrpos = headIndex.get('CHROM',None) + rspos = headIndex.get('RS',None) + offspos = headIndex.get('OFFSET',None) + ppos = headIndex.get(pname,None) + wewant = [chrpos,rspos,offspos,ppos] + if None in wewant: # missing something + logf.write('### Error missing a required header in makeGFF - headIndex=%s, wewant=%s\n' % (headIndex,wewant)) + return + resfl = [x for x in resfl if x[ppos] > ''] + resfl = [(float(x[ppos]),x) for x in resfl] # decorate + resfl.sort() + resfl.reverse() # using -log10 so larger is better + resfl = resfl[:topn] # truncate + pvals = [x[0] for x in resfl] # need to scale + resfl = [x[1] for x in resfl] # drop decoration + if len(pvals) == 0: + logf.write('### no pvalues found in resfl - %s' % (resfl[:3])) + sys.exit(1) + maxp = max(pvals) # need to scale + minp = min(pvals) + prange = abs(maxp-minp) + 0.5 # fudge + scalefact = 1000.0/prange + logf.write('###maxp=%f,minp=%f,prange=%f,scalefact=%f\n' % (maxp,minp,prange,scalefact)) + for i,row in enumerate(resfl): + row[ppos] = '%d' % (int(scalefact*pvals[i])) + resfl[i] = row # replace + outf = file(outfname,'w') + outf.write(header) + outres = [] # need to resort into chrom offset order + for i,lrow in enumerate(resfl): + chrom,snp,offset,p, = [lrow[x] for x in wewant] + gff = ('chr%s' % chrom,rgname,'variation','%d' % (int(offset)-halfwidth), + '%d' % (int(offset)+halfwidth),p,'.','.','%s logp=%1.2f' % (snp,pvals[i])) + outres.append(gff) + outres = [(x[0],int(x[3]),x) for x in outres] # decorate + outres.sort() # into chrom offset + outres=[x[2] for x in outres] # undecorate + outres = ['\t'.join(x) for x in outres] + outf.write('\n'.join(outres)) + outf.write('\n') + outf.close() + + +def bedToPicInterval(infile=None): + """ + Picard tools requiring targets want + a sam style header which incidentally, MUST be sorted in natural order - not lexicographic order: + + @SQ SN:chrM LN:16571 + @SQ SN:chr1 LN:247249719 + @SQ SN:chr2 LN:242951149 + @SQ SN:chr3 LN:199501827 + @SQ SN:chr4 LN:191273063 + added to the start of what looks like a bed style file + chr1 67052400 67052451 - CCDS635.1_cds_0_0_chr1_67052401_r + chr1 67060631 67060788 - CCDS635.1_cds_1_0_chr1_67060632_r + chr1 67065090 67065317 - CCDS635.1_cds_2_0_chr1_67065091_r + chr1 67066082 67066181 - CCDS635.1_cds_3_0_chr1_67066083_r + + see http://genome.ucsc.edu/FAQ/FAQtracks.html#tracks1 + we need to add 1 to start coordinates on the way through - but length calculations are easier + """ + # bedToPicard.py + # ross lazarus October 2010 + # LGPL + # for Rgenetics + + def getFlen(bedfname=None): + """ + find all features in a BED file and sum their lengths + """ + features = {} + try: + infile = open(bedfname,'r') + except: + print '###ERROR: getFlen unable to open bedfile %s' % bedfname + sys.exit(1) + for i,row in 
enumerate(infile): + if row[0] == '@': # shouldn't happen given a bed file! + print 'row %d=%s - should NOT start with @!' % (i,row) + sys.exit(1) + row = row.strip() + if len(row) > 0: + srow = row.split('\t') + f = srow[0] + spos = srow[1] # zero based from UCSC so no need to add 1 - eg 0-100 is 100 bases numbered 0-99 (!) + epos = srow[2] # see http://genome.ucsc.edu/FAQ/FAQtracks.html#tracks1 + flen = int(epos) - int(spos) + features.setdefault(f,0) + features[f] += flen + infile.close() + return features + + def keynat(string): + ''' + borrowed from http://code.activestate.com/recipes/285264-natural-string-sorting/ + A natural sort helper function for sort() and sorted() + without using regular expressions or exceptions. + + >>> items = ('Z', 'a', '10th', '1st', '9') + >>> sorted(items) + ['10th', '1st', '9', 'Z', 'a'] + >>> sorted(items, key=keynat) + ['1st', '9', '10th', 'a', 'Z'] + ''' + it = type(1) + r = [] + for c in string: + if c.isdigit(): + d = int(c) + if r and type( r[-1] ) == it: + r[-1] = r[-1] * 10 + d + else: + r.append(d) + else: + r.append(c.lower()) + return r + + def writePic(outfname=None,bedfname=None): + """ + collect header info and rewrite bed with header for picard + """ + featlen = getFlen(bedfname=bedfname) + try: + outf = open(outfname,'w') + except: + print '###ERROR: writePic unable to open output picard file %s' % outfname + sys.exit(1) + infile = open(bedfname,'r') # already tested in getFlen + k = featlen.keys() + fk = sorted(k, key=keynat) + header = ['@SQ\tSN:%s\tLN:%d' % (x,featlen[x]) for x in fk] + outf.write('\n'.join(header)) + outf.write('\n') + for row in infile: + row = row.strip() + if len(row) > 0: # convert zero based start coordinate to 1 based + srow = row.split('\t') + srow[1] = '%d' % (int(srow[1])+1) # see http://genome.ucsc.edu/FAQ/FAQtracks.html#tracks1 + outf.write('\t'.join(srow)) + outf.write('\n') + outf.close() + infile.close() + + + + # bedToPicInterval starts here + fd,outf = tempfile.mkstemp(prefix='rgPicardHsMetrics') + writePic(outfname=outf,bedfname=infile) + return outf + + +def getFileString(fpath, outpath): + """ + format a nice file size string + """ + size = '' + fp = os.path.join(outpath, fpath) + s = '? ?' 
+ if os.path.isfile(fp): + n = float(os.path.getsize(fp)) + if n > 2**20: + size = ' (%1.1f MB)' % (n/2**20) + elif n > 2**10: + size = ' (%1.1f KB)' % (n/2**10) + elif n > 0: + size = ' (%d B)' % (int(n)) + s = '%s %s' % (fpath, size) + return s + + +def fixPicardOutputs(tempout=None,output_dir=None,log_file=None,html_output=None,progname=None,cl=[],transpose=True): + """ + picard produces long hard to read tab header files + make them available but present them transposed for readability + """ + rstyle="""<style type="text/css"> + tr.d0 td {background-color: oldlace; color: black;} + tr.d1 td {background-color: aliceblue; color: black;} + </style>""" + cruft = [] + dat = [] + try: + r = open(tempout,'r').readlines() + except: + r = [] + for row in r: + if row.strip() > '': + srow = row.split('\t') + if row[0] == '#': + cruft.append(row.strip()) # want strings + else: + dat.append(srow) # want lists + + res = [rstyle,] + res.append(galhtmlprefix % progname) + res.append(galhtmlattr % (progname,timenow())) + flist = os.listdir(output_dir) + pdflist = [x for x in flist if os.path.splitext(x)[-1].lower() == '.pdf'] + if len(pdflist) > 0: # assumes all pdfs come with thumbnail .jpgs + for p in pdflist: + imghref = '%s.jpg' % os.path.splitext(p)[0] # removes .pdf + res.append('<table cellpadding="10"><tr><td>\n') + res.append('<a href="%s"><img src="%s" alt="Click thumbnail to download %s" hspace="10" align="middle"></a>\n' % (p,imghref,p)) + res.append('</tr></td></table>\n') + res.append('<b>Your job produced the following output files.</b><hr/>\n') + res.append('<table>\n') + for i,f in enumerate(flist): + fn = os.path.split(f)[-1] + res.append('<tr><td><a href="%s">%s</a></td></tr>\n' % (fn,fn)) + res.append('</table><p/>\n') + if len(cruft) + len(dat) > 0: + res.append('<b>Picard on line resources</b><ul>\n') + res.append('<li><a href="http://picard.sourceforge.net/index.shtml">Click here for Picard Documentation</a></li>\n') + res.append('<li><a href="http://picard.sourceforge.net/picard-metric-definitions.shtml">Click here for Picard Metrics definitions</a></li></ul><hr/>\n') + if transpose: + res.append('<b>Picard output (transposed for readability)</b><hr/>\n') + else: + res.append('<b>Picard output</b><hr/>\n') + res.append('<table cellpadding="3" >\n') + if len(cruft) > 0: + cres = ['<tr class="d%d"><td>%s</td></tr>' % (i % 2,x) for i,x in enumerate(cruft)] + res += cres + if len(dat) > 0: + maxrows = 100 + if transpose: + tdat = map(None,*dat) # transpose an arbitrary list of lists + missing = len(tdat) - maxrows + tdat = ['<tr class="d%d"><td>%s</td><td>%s</td></tr>\n' % ((i+len(cruft)) % 2,x[0],x[1]) for i,x in enumerate(tdat) if i < maxrows] + if len(tdat) > maxrows: + tdat.append('<tr><td colspan="2">...WARNING: %d rows deleted for sanity...see raw files for all rows</td></tr>' % missing) + else: + tdat = ['<tr class="d%d"><td>%s</td></tr>\n' % ((i+len(cruft)) % 2,x) for i,x in enumerate(dat) if i < maxrows] + if len(dat) > maxrows: + missing = len(dat) - maxrows + tdat.append('<tr><td>...WARNING: %d rows deleted for sanity...see raw files for all rows</td></tr>' % missing) + res += tdat + res.append('</table>\n') + else: + res.append('<b>No Picard output found - please consult the Picard log above for an explanation</b>') + l = open(log_file,'r').readlines() + if len(l) > 0: + res.append('<b>Picard log</b><hr/>\n') + rlog = ['<pre>',] + rlog += l + rlog.append('</pre>') + res += rlog + else: + res.append("Odd, Picard left no log file %s - must have really barfed badly?" 
% log_file) + res.append('<hr/>The freely available <a href="http://picard.sourceforge.net/command-line-overview.shtml">Picard software</a> \n') + res.append( 'generated all outputs reported here, using this command line:<br/>\n<pre>%s</pre>\n' % ''.join(cl)) + res.append(galhtmlpostfix) + outf = open(html_output,'w') + outf.write(''.join(res)) + outf.write('\n') + outf.close() + +def keynat(string): + ''' + borrowed from http://code.activestate.com/recipes/285264-natural-string-sorting/ + A natural sort helper function for sort() and sorted() + without using regular expressions or exceptions. + + >>> items = ('Z', 'a', '10th', '1st', '9') + >>> sorted(items) + ['10th', '1st', '9', 'Z', 'a'] + >>> sorted(items, key=keynat) + ['1st', '9', '10th', 'a', 'Z'] + ''' + it = type(1) + r = [] + for c in string: + if c.isdigit(): + d = int(c) + if r and type( r[-1] ) == it: + r[-1] = r[-1] * 10 + d + else: + r.append(d) + else: + r.append(c.lower()) + return r + +def getFlen(bedfname=None): + """ + find all features in a BED file and sum their lengths + """ + features = {} + otherHeaders = [] + try: + infile = open(bedfname,'r') + except: + print '###ERROR: getFlen unable to open bedfile %s' % bedfname + sys.exit(1) + for i,row in enumerate(infile): + if row.startswith('@'): # add to headers if not @SQ + if not row.startswith('@SQ'): + otherHeaders.append(row) + else: + row = row.strip() + if row.startswith('#') or row.lower().startswith('browser') or row.lower().startswith('track'): + continue # ignore headers + srow = row.split('\t') + if len(srow) > 3: + srow = row.split('\t') + f = srow[0] + spos = srow[1] # zero based from UCSC so no need to add 1 - eg 0-100 is 100 bases numbered 0-99 (!) + epos = srow[2] # see http://genome.ucsc.edu/FAQ/FAQtracks.html#tracks1 + flen = int(epos) - int(spos) + features.setdefault(f,0) + features[f] += flen + infile.close() + fk = features.keys() + fk = sorted(fk, key=keynat) + return features,fk,otherHeaders + +def bedToPicInterval(infile=None,outfile=None): + """ + Picard tools requiring targets want + a sam style header which incidentally, MUST be sorted in natural order - not lexicographic order: + + @SQ SN:chrM LN:16571 + @SQ SN:chr1 LN:247249719 + @SQ SN:chr2 LN:242951149 + @SQ SN:chr3 LN:199501827 + @SQ SN:chr4 LN:191273063 + added to the start of what looks like a bed style file + chr1 67052400 67052451 - CCDS635.1_cds_0_0_chr1_67052401_r + chr1 67060631 67060788 - CCDS635.1_cds_1_0_chr1_67060632_r + chr1 67065090 67065317 - CCDS635.1_cds_2_0_chr1_67065091_r + chr1 67066082 67066181 - CCDS635.1_cds_3_0_chr1_67066083_r + + see http://genome.ucsc.edu/FAQ/FAQtracks.html#tracks1 + we need to add 1 to start coordinates on the way through - but length calculations are easier + """ + # bedToPicard.py + # ross lazarus October 2010 + # LGPL + # for Rgenetics + """ + collect header info and rewrite bed with header for picard + """ + featlen,fk,otherHeaders = getFlen(bedfname=infile) + try: + outf = open(outfile,'w') + except: + print '###ERROR: writePic unable to open output picard file %s' % outfile + sys.exit(1) + inf = open(infile,'r') # already tested in getFlen + header = ['@SQ\tSN:%s\tLN:%d' % (x,featlen[x]) for x in fk] + if len(otherHeaders) > 0: + header += otherHeaders + outf.write('\n'.join(header)) + outf.write('\n') + for row in inf: + row = row.strip() + if len(row) > 0: # convert zero based start coordinate to 1 based + if row.startswith('@'): + continue + else: + srow = row.split('\t') + srow[1] = '%d' % (int(srow[1])+1) # see 
http://genome.ucsc.edu/FAQ/FAQtracks.html#tracks1 + outf.write('\t'.join(srow)) + outf.write('\n') + outf.close() + inf.close() + def oRRun(rcmd=[],outdir=None,title='myR',rexe='R'): """ @@ -201,9 +600,12 @@ The fine Plink docs at http://pngu.mgh.harvard.edu/~purcell/plink/summary.shtml#prune reproduced below -Sometimes it is useful to generate a pruned subset of SNPs that are in approximate linkage equilibrium with each other. This can be achieved via two commands: --indep which prunes based on the variance inflation factor (VIF), which recursively removes SNPs within a sliding window; second, --indep-pairwise which is similar, except it is based only on pairwise genotypic correlation. +Sometimes it is useful to generate a pruned subset of SNPs that are in approximate linkage equilibrium with each other. This can be achieved via two commands: +--indep which prunes based on the variance inflation factor (VIF), which recursively removes SNPs within a sliding window; second, --indep-pairwise which is +similar, except it is based only on pairwise genotypic correlation. -Hint The output of either of these commands is two lists of SNPs: those that are pruned out and those that are not. A separate command using the --extract or --exclude option is necessary to actually perform the pruning. +Hint The output of either of these commands is two lists of SNPs: those that are pruned out and those that are not. A separate command using the --extract or +--exclude option is necessary to actually perform the pruning. The VIF pruning routine is performed: plink --file data --indep 50 5 2 @@ -217,16 +619,23 @@ a --extract or --exclude command. The parameters for --indep are: window size in SNPs (e.g. 50), the number of SNPs to shift the -window at each step (e.g. 5), the VIF threshold. The VIF is 1/(1-R^2) where R^2 is the multiple correlation coefficient for a SNP being regressed on all other SNPs simultaneously. That is, this considers the correlations between SNPs but also between linear combinations of SNPs. A VIF of 10 is often taken to represent near collinearity problems in standard multiple regression analyses (i.e. implies R^2 of 0.9). A VIF of 1 would imply that the SNP is completely independent of all other SNPs. Practically, values between 1.5 and 2 should probably be used; particularly in small samples, if this threshold is too low and/or the window size is too large, too many SNPs may be removed. +window at each step (e.g. 5), the VIF threshold. The VIF is 1/(1-R^2) where R^2 is the multiple correlation coefficient for a SNP being regressed on all other +SNPs simultaneously. That is, this considers the correlations between SNPs but also between linear combinations of SNPs. A VIF of 10 is often taken to represent +near collinearity problems in standard multiple regression analyses (i.e. implies R^2 of 0.9). A VIF of 1 would imply that the SNP is completely independent of +all other SNPs. Practically, values between 1.5 and 2 should probably be used; particularly in small samples, if this threshold is too low and/or the window +size is too large, too many SNPs may be removed. The second procedure is performed: plink --file data --indep-pairwise 50 5 0.5 This generates the same output files as the first version; the only difference is that a -simple pairwise threshold is used. The first two parameters (50 and 5) are the same as above (window size and step); the third parameter represents the r^2 threshold. 
Note: this represents the pairwise SNP-SNP metric now, not the multiple correlation coefficient; also note, this is based on the genotypic correlation, i.e. it does not involve phasing. +simple pairwise threshold is used. The first two parameters (50 and 5) are the same as above (window size and step); the third parameter represents the r^2 +threshold. Note: this represents the pairwise SNP-SNP metric now, not the multiple correlation coefficient; also note, this is based on the genotypic +correlation, i.e. it does not involve phasing. To give a concrete example: the command above that specifies 50 5 0.5 would a) consider a -window of 50 SNPs, b) calculate LD between each pair of SNPs in the window, b) remove one of a pair of SNPs if the LD is greater than 0.5, c) shift the window 5 SNPs forward and repeat the procedure. +window of 50 SNPs, b) calculate LD between each pair of SNPs in the window, b) remove one of a pair of SNPs if the LD is greater than 0.5, c) shift the window 5 +SNPs forward and repeat the procedure. To make a new, pruned file, then use something like (in this example, we also convert the standard PED fileset to a binary one): @@ -276,3 +685,4 @@ mfile.close() return markers,snpcols,rslist,rsdict + http://bitbucket.org/galaxy/galaxy-central/changeset/a892beeb00b4/ changeset: r5275:a892beeb00b4 user: fubar date: 2011-03-25 18:21:13 summary: Remove bogus makeGFF from rgutils affected #: 1 file (2.9 KB) --- a/tools/rgenetics/rgutils.py Fri Mar 25 13:12:21 2011 -0400 +++ b/tools/rgenetics/rgutils.py Fri Mar 25 13:21:13 2011 -0400 @@ -45,77 +45,6 @@ return os.path.join(path, program) return None -def makeGFF(resf='',outfname='',logf=None,twd='.',name='track name',description='track description',topn=1000,pname='LOG10ARMITAGEP',rgname='rgGLM'): - """ - score must be scaled to 0-1000 - - Want to make some wig tracks from each analysis - Best n -log10(p). Make top hit the window. 
- we use our tab output which has - rs chrom offset ADD_stat ADD_p ADD_log10p - rs3094315 1 792429 1.151 0.2528 0.597223 - - """ - - def is_number(s): - try: - float(s) - return True - except ValueError: - return False - header = 'track name=%s description="%s" visibility=2 useScore=1 color=0,60,120\n' % (name,description) - column_names = [ 'Seqname', 'Source', 'Feature', 'Start', 'End', 'Score', 'Strand', 'Frame', 'Group' ] - halfwidth=100 - resfpath = os.path.join(twd,resf) - resf = open(resfpath,'r') - resfl = resf.readlines() # dumb but convenient for millions of rows - resfl = [x.split() for x in resfl] - headl = resfl[0] - resfl = resfl[1:] - headl = [x.strip().upper() for x in headl] - headIndex = dict(zip(headl,range(0,len(headl)))) - chrpos = headIndex.get('CHROM',None) - rspos = headIndex.get('RS',None) - offspos = headIndex.get('OFFSET',None) - ppos = headIndex.get(pname,None) - wewant = [chrpos,rspos,offspos,ppos] - if None in wewant: # missing something - logf.write('### Error missing a required header in makeGFF - headIndex=%s, wewant=%s\n' % (headIndex,wewant)) - return - resfl = [x for x in resfl if x[ppos] > ''] - resfl = [(float(x[ppos]),x) for x in resfl] # decorate - resfl.sort() - resfl.reverse() # using -log10 so larger is better - resfl = resfl[:topn] # truncate - pvals = [x[0] for x in resfl] # need to scale - resfl = [x[1] for x in resfl] # drop decoration - if len(pvals) == 0: - logf.write('### no pvalues found in resfl - %s' % (resfl[:3])) - sys.exit(1) - maxp = max(pvals) # need to scale - minp = min(pvals) - prange = abs(maxp-minp) + 0.5 # fudge - scalefact = 1000.0/prange - logf.write('###maxp=%f,minp=%f,prange=%f,scalefact=%f\n' % (maxp,minp,prange,scalefact)) - for i,row in enumerate(resfl): - row[ppos] = '%d' % (int(scalefact*pvals[i])) - resfl[i] = row # replace - outf = file(outfname,'w') - outf.write(header) - outres = [] # need to resort into chrom offset order - for i,lrow in enumerate(resfl): - chrom,snp,offset,p, = [lrow[x] for x in wewant] - gff = ('chr%s' % chrom,rgname,'variation','%d' % (int(offset)-halfwidth), - '%d' % (int(offset)+halfwidth),p,'.','.','%s logp=%1.2f' % (snp,pvals[i])) - outres.append(gff) - outres = [(x[0],int(x[3]),x) for x in outres] # decorate - outres.sort() # into chrom offset - outres=[x[2] for x in outres] # undecorate - outres = ['\t'.join(x) for x in outres] - outf.write('\n'.join(outres)) - outf.write('\n') - outf.close() - def bedToPicInterval(infile=None): """ Repository URL: https://bitbucket.org/galaxy/galaxy-central/ -- This is a commit notification from bitbucket.org. You are receiving this because you have the service enabled, addressing the recipient of this email.
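For reference, both changesets circle around the same GFF "top table" machinery: the rgTDT.py hunk above changes the hard-coded 'rgGLM' source column to 'rgTDT', and r5275 removes the duplicated makeGFF from rgutils.py, leaving the per-tool versions in place. The scoring scheme those per-tool makeGFF routines use is sketched below with hypothetical helper names (scale_scores and gff_row are illustrative only, not code from either commit): the best topn -log10(p) values are squeezed into the 0-1000 UCSC track score range, and each SNP becomes a feature spanning +/-100 bp around its offset.

def scale_scores(logps, topn=1000):
    """Scale the best topn -log10(p) values to integer 0-1000 UCSC track scores."""
    best = sorted(logps, reverse=True)[:topn]
    prange = abs(max(best) - min(best)) + 0.5   # same +0.5 fudge factor the tool code uses
    scalefact = 1000.0 / prange
    return [int(scalefact * p) for p in best]

def gff_row(chrom, snp, offset, score, logp, source='rgTDT', halfwidth=100):
    """Build one GFF feature: a 2*halfwidth bp window centred on the SNP offset."""
    return '\t'.join(['chr%s' % chrom, source, 'variation',
                      '%d' % (offset - halfwidth), '%d' % (offset + halfwidth),
                      '%d' % score, '.', '.', '%s logp=%1.2f' % (snp, logp)])

# e.g. using the docstring's sample row (rs3094315, chrom 1, offset 792429, log10p 0.597223):
# gff_row('1', 'rs3094315', 792429, scale_scores([0.597223, 0.30])[0], 0.597223)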
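Similarly, the bedToPicInterval/getFlen pair added to rgutils.py converts a UCSC-style BED file into the SAM-style header Picard expects: per-chromosome feature lengths become @SQ lines (natural-sorted so chr2 precedes chr10), and the 0-based BED starts are shifted to 1-based. A condensed sketch under the same caveat (bed_to_picard_interval is a hypothetical name; the real code natural-sorts via keynat and carries through any pre-existing non-@SQ headers):

def bed_to_picard_interval(bed_lines):
    """Return Picard-style lines (@SQ header, then 1-based intervals) for simple BED rows."""
    lengths = {}
    rows = []
    for line in bed_lines:
        fields = line.rstrip('\n').split('\t')
        if len(fields) < 3 or fields[0].startswith(('#', '@')):
            continue
        chrom, start, end = fields[0], int(fields[1]), int(fields[2])
        lengths[chrom] = lengths.get(chrom, 0) + (end - start)  # BED is half-open, so end-start is the feature length
        fields[1] = '%d' % (start + 1)  # 0-based BED start -> 1-based interval start
        rows.append('\t'.join(fields))
    header = ['@SQ\tSN:%s\tLN:%d' % (c, lengths[c]) for c in sorted(lengths)]  # lexicographic here; the real code sorts with keynat
    return header + rows

# e.g. bed_to_picard_interval(['chr1\t67052400\t67052451\t-\tCCDS635.1_cds_0_0_chr1_67052401_r'])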