[hg] galaxy 3657: Small fixes to rgGRR and rgManQQ

details: http://www.bx.psu.edu/hg/galaxy/rev/33eab99a2df5 changeset: 3657:33eab99a2df5 user: fubar: ross Lazarus at gmail period com date: Thu Apr 15 10:51:51 2010 -0400 description: Small fixes to rgGRR and rgManQQ Prevent default base_name overwriting real base_name in root.py Revert to prevent infotrack being requested - even the latest release of Haploview 4.2 is busted and fails to create any images when creating the infotrack - this only happens in headless mode and seems to be an upstream bug in a java library. No sign of a fix despite multiple pleas to the Haploview authors... Allow https in ucsc genome graphs - suggestion from Assaf Gordon - thanks. Fixes to creation of html page for rgenetics composite datatypes related to base_name problems traced to root.py when user tries to autodetect on the metadata edit page diffstat: lib/galaxy/web/controllers/root.py | 2 +- test-data/rgtestouts/rgManQQ/Allelep_manhattan.png | 0 test-data/rgtestouts/rgManQQ/Armitagep_manhattan.png | 0 test-data/rgtestouts/rgManQQ/rgManQQtest1.R | 52 ++++++++------ test-data/rgtestouts/rgManQQ/rgManQQtest1.html | 10 +-- tools/rgenetics/rgGRR.py | 66 +----------------- tools/rgenetics/rgHaploView.py | 1 + tools/rgenetics/rgHaploView.xml | 2 +- tools/rgenetics/rgManQQ.py | 14 ++-- tools/rgenetics/rgManQQ.xml | 2 +- tools/rgenetics/rgQC.py | 3 +- tools/rgenetics/rgtest_one_tool.sh | 5 +- tools/rgenetics/rgutils.py | 63 +++++++++++++++++- 13 files changed, 115 insertions(+), 105 deletions(-) diffs (453 lines): diff -r 6f43a97e7908 -r 33eab99a2df5 lib/galaxy/web/controllers/root.py --- a/lib/galaxy/web/controllers/root.py Thu Apr 15 10:35:31 2010 -0400 +++ b/lib/galaxy/web/controllers/root.py Thu Apr 15 10:51:51 2010 -0400 @@ -314,7 +314,7 @@ return trans.show_error_message( "This dataset is currently being used as input or output. You cannot change metadata until the jobs have completed or you have canceled them." ) for name, spec in data.metadata.spec.items(): # We need to be careful about the attributes we are resetting - if name not in [ 'name', 'info', 'dbkey' ]: + if name not in [ 'name', 'info', 'dbkey', 'base_name' ]: if spec.get( 'default' ): setattr( data.metadata, name, spec.unwrap( spec.get( 'default' ) ) ) if trans.app.config.set_metadata_externally: diff -r 6f43a97e7908 -r 33eab99a2df5 test-data/rgtestouts/rgManQQ/Allelep_manhattan.png Binary file test-data/rgtestouts/rgManQQ/Allelep_manhattan.png has changed diff -r 6f43a97e7908 -r 33eab99a2df5 test-data/rgtestouts/rgManQQ/Armitagep_manhattan.png Binary file test-data/rgtestouts/rgManQQ/Armitagep_manhattan.png has changed diff -r 6f43a97e7908 -r 33eab99a2df5 test-data/rgtestouts/rgManQQ/rgManQQtest1.R --- a/test-data/rgtestouts/rgManQQ/rgManQQtest1.R Thu Apr 15 10:35:31 2010 -0400 +++ b/test-data/rgtestouts/rgManQQ/rgManQQtest1.R Thu Apr 15 10:51:51 2010 -0400 @@ -22,7 +22,7 @@ manhattan = function(chrom=NULL,offset=NULL,pvals=NULL, title=NULL, max.y="max", - suggestiveline=0, genomewide=T, size.x.labels=9, size.y.labels=10, annotate=F, SNPlist=NULL,grey=F) { + suggestiveline=0, genomewide=T, size.x.labels=9, size.y.labels=10, annotate=F, SNPlist=NULL,grey=0) { if (annotate & is.null(SNPlist)) stop("You requested annotation but provided no SNPlist!") genomewideline=NULL # was genomewideline=-log10(5e-8) @@ -43,10 +43,8 @@ lastbase=0 chrlist = unique(d$CHR) nchr = length(chrlist) # may be any number? - print(paste('## manhattan got chrlist=',chrlist,'nchr',nchr)) for (x in c(1:nchr)) { i = chrlist[x] # need the chrom number - may not == index - print(paste('## manhattan got chrlist=',chrlist,'nchr',nchr,'x=',x,'i=',i)) if (x == 1) { # first time d[d$CHR==i, ]$pos=d[d$CHR==i, ]$BP } else { @@ -63,8 +61,9 @@ } if (max.y=="max") maxy=ceiling(max(d$logp)) else maxy=max.y - if (maxy<8) maxy=8 - + maxy = max(maxy,1.1*genomewideline) + # if (maxy<8) maxy=8 + # only makes sense if genome wide is assumed - we could have a fine mapping region? if (annotate) d.annotate=d[as.numeric(substr(d$SNP,3,100)) %in% SNPlist, ] plot=qplot(pos,logp,data=d, ylab=expression(-log[10](italic(p))) , colour=factor(CHR)) @@ -76,11 +75,13 @@ plot=plot + opts(title=title) plot=plot+opts( panel.background=theme_blank(), - panel.grid.minor=theme_blank(), axis.text.x=theme_text(size=size.x.labels, colour="grey50"), axis.text.y=theme_text(size=size.y.labels, colour="grey50"), axis.ticks=theme_segment(colour=NA) ) + #plot = plot + opts(panel.grid.y.minor=theme_blank(),panel.grid.y.major=theme_blank()) + #plot = plot + opts(panel.grid.major=theme_blank()) + if (suggestiveline) plot=plot+geom_hline(yintercept=suggestiveline,colour="blue", alpha=I(1/3)) if (genomewideline) plot=plot+geom_hline(yintercept=genomewideline,colour="red") plot @@ -107,44 +108,51 @@ if (spartan) plot=plot+opts(panel.background=theme_rect(col="grey50"), panel.grid.minor=theme_blank()) plot } - - -#rs Chr Offset Genop log10Genop Armitagep log10Armitagep Allelep log10Allelep Domp log10Domp -#rs3094315 1 792429 1.000 0.000000 0.122 0.912574 0.152 0.817871 1.000 0.000000 -# eg for testing -# this function needs column numbers so galaxy tool is easy to drive - rgqqMan = function(infile="/opt/galaxy/test-data/smallwgaP.xls",chromcolumn=2, offsetcolumn=3, pvalscolumns=c(6,8), - title="rgManQQtest1",outprefix="rgManQQtest1") { - d = read.table(infile,head=T,sep=' ') - print(paste('###',length(d[,1]),'values read from',infile,'read - now running plots',sep=' ')) +title="rgManQQtest1",grey=0) { +rawd = read.table(infile,head=T,sep=' ') +dn = names(rawd) +cc = dn[chromcolumn] +oc = dn[offsetcolumn] +nams = c(cc,oc) +plen = length(rawd[,1]) +doreorder=1 +print(paste('###',plen,'values read from',infile,'read - now running plots',sep=' ')) +if (plen > 0) { for (pvalscolumn in pvalscolumns) { if (pvalscolumn > 0) { - cname = names(d)[pvalscolumn] + cname = names(rawd)[pvalscolumn] mytitle = paste('p=',cname,', ',title,sep='') myfname = chartr(' ','_',cname) - myqqplot = qq(d[,pvalscolumn],title=mytitle) + myqqplot = qq(rawd[,pvalscolumn],title=mytitle) print(paste('## qqplot on',cname,'done')) if ((chromcolumn > 0) & (offsetcolumn > 0)) { + if (doreorder) { + rawd = rawd[do.call(order,rawd[nams]),] + # mmmf - suggested by http://onertipaday.blogspot.com/2007/08/sortingordering-dataframe-according.... + # in case not yet ordered + doreorder = 0 + } print(paste('## manhattan on',cname,'starting',chromcolumn,offsetcolumn,pvalscolumn)) - mymanplot= manhattan(chrom=d[,chromcolumn],offset=d[,offsetcolumn],pvals=d[,pvalscolumn],title=mytitle) + mymanplot= manhattan(chrom=rawd[,chromcolumn],offset=rawd[,offsetcolumn],pvals=rawd[,pvalscolumn],title=mytitle,grey=grey) print(paste('## manhattan plot on',cname,'done')) ggsave(file=paste(myfname,"manhattan.png",sep='_'),mymanplot,width=11,height=8,dpi=100) } else { print(paste('chrom column =',chromcolumn,'offset column = ',offsetcolumn, - 'Cannot parse - no manhattan plot possible')) + 'so no Manhattan plot - supply both chromosome and offset as numerics for Manhattan plots if required')) } - ggsave(file=paste(myfname,"qqplot.png",sep='_'),myqqplot,w=5,h=5,dpi=100) + ggsave(file=paste(myfname,"qqplot.png",sep='_'),myqqplot,width=8,height=11,dpi=100) } else { print(paste('pvalue column =',pvalscolumn,'Cannot parse it so no plots possible')) } } # for pvalscolumn + } else { print('## Problem - no values available to plot - was there really a chromosome and offset column?') } } rgqqMan() # execute with defaults as substituted -#R script autogenerated by rgenetics/rgutils.py on 25/03/2010 21:01:09 +#R script autogenerated by rgenetics/rgutils.py on 15/04/2010 09:56:14 diff -r 6f43a97e7908 -r 33eab99a2df5 test-data/rgtestouts/rgManQQ/rgManQQtest1.html --- a/test-data/rgtestouts/rgManQQ/rgManQQtest1.html Thu Apr 15 10:35:31 2010 -0400 +++ b/test-data/rgtestouts/rgManQQ/rgManQQtest1.html Thu Apr 15 10:51:51 2010 -0400 @@ -38,24 +38,16 @@ [1] "## manhattan on Armitagep starting 2 3 6" -[1] "## manhattan got chrlist= 1 nchr 1" - -[1] "## manhattan got chrlist= 1 nchr 1 x= 1 i= 1" - [1] "## manhattan plot on Armitagep done" [1] "## qqplot on Allelep done" [1] "## manhattan on Allelep starting 2 3 8" -[1] "## manhattan got chrlist= 1 nchr 1" - -[1] "## manhattan got chrlist= 1 nchr 1 x= 1 i= 1" - [1] "## manhattan plot on Allelep done" </pre> -<h3><a href="http://rgenetics.org">Rgenetics</a> tool rgManQQ.py run at 25/03/2010 21:01:19</h3> +<h3><a href="http://rgenetics.org">Rgenetics</a> tool rgManQQ.py run at 15/04/2010 09:56:23</h3> </div></body></html> diff -r 6f43a97e7908 -r 33eab99a2df5 tools/rgenetics/rgGRR.py --- a/tools/rgenetics/rgGRR.py Thu Apr 15 10:35:31 2010 -0400 +++ b/tools/rgenetics/rgGRR.py Thu Apr 15 10:51:51 2010 -0400 @@ -63,7 +63,8 @@ except NameError: from Sets import Set as set -from rgutils import timenow,pruneLD,plinke +from rgutils import timenow,pruneLD,plinke,openOrMakeLDreduced + import plinkbinJZ @@ -1002,61 +1003,6 @@ svg.close() return newfiles,explanations,repOut -def makeOpenLDR(ldreduced=None,basename=None,newfpath=None,plinke='plink'): - """we stow a thin ldreduced version of the primary file in the files_path for future runs - if none there yet - ugh the --thin option will happily return zero snps if only a few - """ - ped = None - loglines = [] - ldbedname = '%s.bed' % ldreduced - ldpedname = '%s.ped' % ldreduced - bedname = '%s.bed' % basename - pedname = '%s.ped' % basename - ldbedfn = os.path.join(newfpath,ldbedname) - ldpedfn = os.path.join(newfpath,ldpedname) - bedfn = os.path.join(newfpath,bedname) - pedfn = os.path.join(newfpath,pedname) - bmap = os.path.join(newfpath,'%s.bim' % basename) - pmap = os.path.join(newfpath,'%s.map' % basename) - if os.path.exists(ldbedfn): # joy. already done - ped = plinkbinJZ.BPed(os.path.splitext(ldbedfn)[0]) - ped.parse(quick=True) - elif os.path.exists(ldpedfn): # why bother - for completeness I guess - ped = plinkbinJZ.LPed(os.path.splitext(ldbedfn)[0]) - ped.parse() - elif os.path.exists(bedfn): # run ld prune and thin and save these for next time - nsnp = len(open(bmap,'r').readlines()) - if nsnp > 100: # if 9 snps --thin 0.1 will happily return 0 snps - plinktasks = [['--bfile',basename,'--indep-pairwise 50 40 0.2','--out %s' % basename], - ['--bfile',basename,'--extract %s.prune.in --make-bed --out %s_INDEP' % (basename, basename)], - ['--bfile %s_INDEP --thin 0.1 --make-bed --out %s' % (basename,ldreduced)]] - else: # no thin stage - plinktasks = [['--bfile',basename,'--indep-pairwise 50 40 0.2','--out %s' % basename], - ['--bfile',basename,'--extract %s.prune.in --make-bed --out %s' % (basename, ldreduced)]] - # subset of ld independent markers for eigenstrat and other requirements - vclbase = [plinke,'--noweb'] - loglines = pruneLD(plinktasks=plinktasks,cd=newfpath,vclbase = vclbase) - ped = plinkbinJZ.BPed(os.path.splitext(ldbedfn)[0]) - ped.parse(quick=True) - elif pedname and os.path.exists(pedfn): # screw it - return a bed - quicker to process - nsnp = len(open(pmap,'r').readlines()) - if nsnp > 100: - plinktasks = [['--file',basename,'--make-bed','--out',basename], - ['--bfile',basename,'--indep-pairwise 50 40 0.2','--out %s' % basename], - ['--bfile',basename,'--extract %s.prune.in --make-bed --out %s_INDEP' % (basename, basename)], - ['--bfile %s_INDEP --thin 0.1 --recode --out %s' % (bedname,ldreduced),]] - else: # no thin step - plinktasks = [['--file',basename,'--make-bed','--out',basename], - ['--bfile',basename,'--indep-pairwise 50 40 0.2','--out %s' % basename], - ['--bfile',basename,'--extract %s.prune.in --make-bed --out %s' % (basename, ldreduced)]] - - # subset of ld independent markers for eigenstrat and other requirements - vclbase = [plinke,'--noweb'] - loglines = pruneLD(plinktasks=plinktasks,cd=newfpath,vclbase = vclbase) - ped = plinkbinJZ.BPed(os.path.splitext(ldbedfn)[0]) - ped.parse(quick=True) - return ped,loglines - def doIBS(n=100): """parse parameters from galaxy expect 'input pbed path' 'basename' 'outpath' 'title' 'logpath' 'n' @@ -1078,6 +1024,7 @@ ts = '%s%s' % (string.punctuation,string.whitespace) ptran = string.maketrans(ts,'_'*len(ts)) inpath = sys.argv[1] + ldpath = os.path.split(inpath)[0] basename = sys.argv[2] outhtml = sys.argv[3] newfilepath = sys.argv[4] @@ -1095,14 +1042,13 @@ pass logf = file(logpath,'w') efp,ibase_name = os.path.split(inpath) # need to use these for outputs in files_path - ldreduced = '%s_INDEP_THIN' % ibase_name # we store ld reduced and thinned data - ped,loglines = makeOpenLDR(ldreduced=ldreduced,basename=ibase_name,newfpath=efp,plinke=plinke) + ped,loglines = openOrMakeLDreduced(basename,ldpath,plinke) if ped == None: - print >> sys.stderr, '## doIBSpy problem - cannot open %s or %s - cannot run' % (bedname,pedname) + print >> sys.stderr, '## doIBSpy problem - cannot open %s or %s - cannot run' % (ldreduced,basename) sys.exit(1) if len(loglines) > 0: logf.write('### first time for this input file - log from creating an ld reduced and thinned data set:\n') - logf.write('\n'.join(loglines)) + logf.write(''.join(loglines)) logf.write('\n') newfiles,explanations,repOut = doIBSpy(ped=ped,basename=basename,outdir=newfilepath, logf=logf,nrsSamples=n,title=title,pdftoo=0,Zcutoff=Zcutoff) diff -r 6f43a97e7908 -r 33eab99a2df5 tools/rgenetics/rgHaploView.py --- a/tools/rgenetics/rgHaploView.py Thu Apr 15 10:35:31 2010 -0400 +++ b/tools/rgenetics/rgHaploView.py Thu Apr 15 10:51:51 2010 -0400 @@ -84,6 +84,7 @@ outfpath = sys.argv[12] infotrack = False # note that otherwise this breaks haploview in headless mode #infotrack = sys.argv[13] == 'info' + # this fails in headless mode as at april 2010 with haploview 4.2 tagr2 = sys.argv[14] or '0.8' hmpanels = sys.argv[15] # eg "['CEU','YRI']" if hmpanels: diff -r 6f43a97e7908 -r 33eab99a2df5 tools/rgenetics/rgHaploView.xml --- a/tools/rgenetics/rgHaploView.xml Thu Apr 15 10:35:31 2010 -0400 +++ b/tools/rgenetics/rgHaploView.xml Thu Apr 15 10:51:51 2010 -0400 @@ -53,7 +53,7 @@ <param name="infoTrack" type="select" label="Add Hapmap information track to image" help="Refseq genes and snp density can be added to the plot if desired for orientation" > - <option value="info">Add Information track (Not available - awaiting fix from Haploview authors)</option> + <option value="info">Add Information track (DISABLED! Awaiting fix from Haploview authors)</option> <option value="noinfo" selected="True">No Information track</option> </param> diff -r 6f43a97e7908 -r 33eab99a2df5 tools/rgenetics/rgManQQ.py --- a/tools/rgenetics/rgManQQ.py Thu Apr 15 10:35:31 2010 -0400 +++ b/tools/rgenetics/rgManQQ.py Thu Apr 15 10:51:51 2010 -0400 @@ -129,27 +129,27 @@ cc = dn[chromcolumn] oc = dn[offsetcolumn] nams = c(cc,oc) -plen = length(d[,1]) -doreorder=True +plen = length(rawd[,1]) +doreorder=1 print(paste('###',plen,'values read from',infile,'read - now running plots',sep=' ')) if (plen > 0) { for (pvalscolumn in pvalscolumns) { if (pvalscolumn > 0) { - cname = names(d)[pvalscolumn] + cname = names(rawd)[pvalscolumn] mytitle = paste('p=',cname,', ',title,sep='') myfname = chartr(' ','_',cname) - myqqplot = qq(d[,pvalscolumn],title=mytitle) + myqqplot = qq(rawd[,pvalscolumn],title=mytitle) print(paste('## qqplot on',cname,'done')) if ((chromcolumn > 0) & (offsetcolumn > 0)) { if (doreorder) { - d = rawd[do.call(order,rawd[nams]),] + rawd = rawd[do.call(order,rawd[nams]),] # mmmf - suggested by http://onertipaday.blogspot.com/2007/08/sortingordering-dataframe-according.... # in case not yet ordered - doreorder = False + doreorder = 0 } print(paste('## manhattan on',cname,'starting',chromcolumn,offsetcolumn,pvalscolumn)) - mymanplot= manhattan(chrom=d[,chromcolumn],offset=d[,offsetcolumn],pvals=d[,pvalscolumn],title=mytitle,grey=grey) + mymanplot= manhattan(chrom=rawd[,chromcolumn],offset=rawd[,offsetcolumn],pvals=rawd[,pvalscolumn],title=mytitle,grey=grey) print(paste('## manhattan plot on',cname,'done')) ggsave(file=paste(myfname,"manhattan.png",sep='_'),mymanplot,width=11,height=8,dpi=100) } diff -r 6f43a97e7908 -r 33eab99a2df5 tools/rgenetics/rgManQQ.xml --- a/tools/rgenetics/rgManQQ.xml Thu Apr 15 10:35:31 2010 -0400 +++ b/tools/rgenetics/rgManQQ.xml Thu Apr 15 10:51:51 2010 -0400 @@ -37,7 +37,7 @@ <test> <param name='i' value='smallwgaP.xls' ftype='tabular' > </param> - <param name='name' value='rgManQQ_test1' /> + <param name='name' value='rgManQQtest1' /> <param name='pval_col' value='5,7' /> <param name='chrom_col' value='1' /> <param name='offset_col' value='2' /> diff -r 6f43a97e7908 -r 33eab99a2df5 tools/rgenetics/rgQC.py --- a/tools/rgenetics/rgQC.py Thu Apr 15 10:35:31 2010 -0400 +++ b/tools/rgenetics/rgQC.py Thu Apr 15 10:51:51 2010 -0400 @@ -1264,8 +1264,7 @@ plinktasks = [['--freq',],['--hwe 0.0', '--missing','--hardy'], ['--mendel',],['--check-sex',]] vclbase = [options.plinke,'--noweb','--out',basename,'--bfile',bfn,'--mind','1.0','--geno','1.0','--maf','0.0'] - runPlink(bfn = bfn,ofn = ofn,logf=plogf,plinktasks=plinktasks,cd=newfpath, - vclbase=vclbase) + runPlink(logf=plogf,plinktasks=plinktasks,cd=newfpath, vclbase=vclbase) plinktasks = [['--bfile',bfn,'--indep-pairwise 40 20 0.5','--out %s' % basename], ['--bfile',bfn,'--extract %s.prune.in --make-bed --out ldp_%s' % (basename, basename)], ['--bfile ldp_%s --het --out %s' % (basename,basename)]] diff -r 6f43a97e7908 -r 33eab99a2df5 tools/rgenetics/rgtest_one_tool.sh --- a/tools/rgenetics/rgtest_one_tool.sh Thu Apr 15 10:35:31 2010 -0400 +++ b/tools/rgenetics/rgtest_one_tool.sh Thu Apr 15 10:51:51 2010 -0400 @@ -6,11 +6,12 @@ *) esac GALAXYROOT=`pwd` +PATHTOGALAXY='/opt/galaxy' # whatever echo "using $GALAXYROOT" # change this as needed for your local install INPATH="${GALAXYROOT}/test-data" BINPATH="${GALAXYROOT}/tool-data/rg/bin" -TOOLPATH="${GALAXYROOT}/tools/rgenetics" +TOOLPATH="${PATHTOGALAXY}/tools/rgenetics" OROOT="${GALAXYROOT}/test-data/rgtestouts" NORMALOROOT="${GALAXYROOT}/test-data" case "$1" in @@ -67,6 +68,8 @@ echo "now doing $TOOL" OUTPATH="$OROOT/$TOOL" rm -rf $OUTPATH/* +cmd="$TOOLPATH/$TOOL.py "$INPATH/tinywga" "tinywga" $OUTPATH/${NPRE}.html $OUTPATH "$NPRE" '100' '6'" +echo "Doing $cmd" python $TOOLPATH/$TOOL.py "$INPATH/tinywga" "tinywga" $OUTPATH/${NPRE}.html $OUTPATH "$NPRE" '100' '6' # rgGRR.py $i.extra_files_path/$i.metadata.base_name "$i.metadata.base_name" #'$out_file1' '$out_file1.files_path' "$title" '$n' '$Z' diff -r 6f43a97e7908 -r 33eab99a2df5 tools/rgenetics/rgutils.py --- a/tools/rgenetics/rgutils.py Thu Apr 15 10:35:31 2010 -0400 +++ b/tools/rgenetics/rgutils.py Thu Apr 15 10:51:51 2010 -0400 @@ -4,7 +4,7 @@ # released under the LGPL # -import subprocess, os, sys, time, tempfile,string +import subprocess, os, sys, time, tempfile,string,plinkbinJZ galhtmlprefix = """<?xml version="1.0" encoding="utf-8" ?> <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> @@ -279,3 +279,64 @@ rsdict = dict(zip(rslist,rslist)) mfile.close() return markers,snpcols,rslist,rsdict + +def openOrMakeLDreduced(basename,newfpath,plinke): + """ move out of the way - highly specific plink function to create or open + an ld reduced and thinned data set for one of the ibs/grr methods.. + this should be part of the plinkJZ stuff - or bought in here maybe + """ + ldr = '%s_INDEP_THIN' % basename # we store ld reduced and thinned data + ldreduced = os.path.join(newfpath,ldr) + ped = None + loglines = [] + vclbase = [] + ldbedname = '%s.bed' % ldreduced + ldpedname = '%s.ped' % ldreduced + bedname = '%s.bed' % basename + pedname = '%s.ped' % basename + ldbedfn = os.path.join(newfpath,ldbedname) + ldpedfn = os.path.join(newfpath,ldpedname) + bedfn = os.path.join(newfpath,bedname) + pedfn = os.path.join(newfpath,pedname) + bmap = os.path.join(newfpath,'%s.bim' % basename) + pmap = os.path.join(newfpath,'%s.map' % basename) + if os.path.exists(ldbedfn): # joy. already done + ped = plinkbinJZ.BPed(ldreduced) + ped.parse(quick=True) + return ped,loglines + if os.path.exists(ldpedfn): # why bother - for completeness I guess + ped = plinkbinJZ.LPed(ldreduced) + ped.parse() + return ped,loglines + if os.path.exists(bedfn): # run ld prune and thin and save these for next time + nsnp = len(open(bmap,'r').readlines()) + plinktasks = [['--bfile',basename,'--indep-pairwise 50 40 0.2','--out %s' % basename], + ['--bfile',basename,'--extract %s.prune.in --make-bed --out %s_INDEP' % (basename, basename)]] + if nsnp < 100: # if 9 snps --thin 0.1 will happily return 0 snps + plinktasks += [['--bfile %s_INDEP --make-bed --out %s' % (basename,ldreduced)],] + else: # thin rather than copy + plinktasks += [['--bfile',basename,'--extract %s.prune.in --make-bed --out %s' % (basename, ldreduced)]] + # subset of ld independent markers for eigenstrat and other requirements + vclbase = [plinke,'--noweb'] + loglines = pruneLD(plinktasks=plinktasks,cd=newfpath,vclbase = vclbase) + ped = plinkbinJZ.BPed(ldreduced) + return ped,loglines + if pedname and os.path.exists(pedfn): # screw it - return a bed - quicker to process + nsnp = len(open(pmap,'r').readlines()) + if nsnp > 100: + plinktasks = [['--file',basename,'--make-bed','--out',basename], + ['--bfile',basename,'--indep-pairwise 50 40 0.2','--out %s' % basename], + ['--bfile',basename,'--extract %s.prune.in --make-bed --out %s_INDEP' % (basename, basename)], + ['--bfile %s_INDEP --thin 0.1 --recode --out %s' % (bedname,ldreduced),]] + else: # no thin step + plinktasks = [['--file',basename,'--make-bed','--out',basename], + ['--bfile',basename,'--indep-pairwise 50 40 0.2','--out %s' % basename], + ['--bfile',basename,'--extract %s.prune.in --make-bed --out %s' % (basename, ldreduced)]] + + # subset of ld independent markers for eigenstrat and other requirements + vclbase = [plinke,'--noweb'] + loglines = pruneLD(plinktasks=plinktasks,cd=newfpath,vclbase = vclbase) + ped = plinkbinJZ.BPed(os.path.splitext(ldbedfn)[0]) + ped.parse(quick=True) + return ped,loglines +
participants (1)
-
Greg Von Kuster