[galaxy-commits] galaxy-dist commit 2c2baca25528: rgHaploView.py reorganized to make a little less ugly and easier to maintain

20 Nov 2010

# HG changeset patch -- Bitbucket.org
# Project galaxy-dist
# URL http://bitbucket.org/galaxy/galaxy-dist/overview
# User Ross Lazarus <ross.lazarus@gmail.com>
# Date 1289249136 18000
# Node ID 2c2baca255289cd48ecb806cfb0ea6468d489bde
# Parent  eee0e1d344d21de7fb079fcccc3df8e251d1e025
rgHaploView.py reorganized to make a little less ugly and easier to maintain

--- a/tools/rgenetics/rgHaploView.py
+++ b/tools/rgenetics/rgHaploView.py
@@ -33,11 +33,466 @@ javabin = 'java'
 atrandic = {'A':'1','C':'2','G':'3','T':'4','N':'0','-':'0','1':'1','2':'2','3':'3','4':'4','0':'0'}
 
 class NullDevice:
-    """ """
+    """ a dev/null for ignoring output
+    """
     def write(self, s):
         pass
 
-def ld():
+class ldPlot:
+    
+    def __init__(self, argv=[]):
+        """
+        setup
+        """
+        self.args=argv
+        self.parseArgs(argv=self.args)
+        self.setupRegions()
+        
+    def parseArgs(self,argv=[]):
+        """
+        """
+        ts = '%s%s' % (string.punctuation,string.whitespace)
+        ptran =  string.maketrans(ts,'_'*len(ts))
+        ### Figure out what genomic region we are interested in
+        self.region = argv[1]
+        self.orslist = argv[2].replace('X',' ').lower() # galaxy replaces newlines with XX - go figure
+        self.title = argv[3].translate(ptran)
+        # for outputs
+        self.outfile = argv[4]
+        self.logfn = 'Log_%s.txt' % (self.title)
+        self.histextra = argv[5]
+        self.base_name = argv[6]
+        self.pedFileBase = os.path.join(self.histextra,self.base_name)
+        print 'pedfilebase=%s' % self.pedFileBase
+        self.minMaf=argv[7]
+        if self.minMaf:
+            try:
+                self.minMaf = float(self.minMaf)
+            except:
+                self.minMaf = 0.0
+        self.maxDist=argv[8] or None
+        self.ldType=argv[9] or 'RSQ'
+        self.hiRes = (argv[10].lower() == 'hi')
+        self.memSize= argv[11] or '1000'
+        self.memSize = int(self.memSize)
+        self.outfpath = argv[12]
+        self.infotrack = False # note that otherwise this breaks haploview in headless mode 
+        #infotrack = argv[13] == 'info'
+        # this fails in headless mode as at april 2010 with haploview 4.2
+        self.tagr2 = argv[14] or '0.8'
+        hmpanels = argv[15] # eg "['CEU','YRI']"
+        if hmpanels:
+           hmpanels = hmpanels.replace('[','')
+           hmpanels = hmpanels.replace(']','')
+           hmpanels = hmpanels.replace("'",'')
+           hmpanels = hmpanels.split(',')
+        self.hmpanels = hmpanels
+        self.hvbin = argv[16] # added rml june 2008
+        self.bindir = os.path.split(self.hvbin)[0]
+        # jan 2010 - always assume utes are on path to avoid platform problems
+        self.pdfjoin = 'pdfjoin' # os.path.join(bindir,'pdfjoin')
+        self.pdfnup = 'pdfnup' # os.path.join(bindir,'pdfnup')
+        self.mogrify = 'mogrify' # os.path.join(bindir,'mogrify')
+        self.convert = 'convert' # os.path.join(bindir,'convert')
+        self.log_file = os.path.join(self.outfpath,self.logfn)
+        self.MAP_FILE = '%s.map' % self.pedFileBase
+        self.DATA_FILE = '%s.ped' % self.pedFileBase
+        try:
+            os.makedirs(self.outfpath)
+            s = '## made new path %s\n' % self.outfpath
+        except:
+            pass
+        self.lf = file(self.log_file,'w')
+        s = 'PATH=%s\n' % os.environ.get('PATH','?')
+        self.lf.write(s)
+    
+    def setupRegions(self):
+        """
+        """
+        chromosome = ''
+        spos = epos = -9
+        rslist = []
+        rsdict = {}
+
+        if self.region > '':
+            useRs = []
+            useRsdict={}
+            try: # TODO make a regexp?
+                c,rest = self.region.split(':')
+                chromosome = c.replace('chr','')
+                rest = rest.replace(',','') # remove commas
+                spos,epos = rest.split('-')
+                spos = int(spos)
+                epos = int(epos)
+                s = '## %s parsing chrom %s from %d to %d\n' % (progname,chromosome,spos,epos)
+                self.lf.write(s)
+                self.lf.write('\n')
+                print >> sys.stdout, s
+            except:
+                s = '##! %s unable to parse region %s - MUST look like "chr8:10,000-100,000\n' % (progname,self.region)
+                print >> sys.stdout, s
+                self.lf.write(s)
+                self.lf.write('\n')
+                self.lf.close()
+                sys.exit(1)
+        else:
+            useRs = self.orslist.split() # galaxy replaces newlines with XX - go figure
+            useRsdict = dict(zip(useRs,useRs))
+        self.useTemp = False
+        try:
+            dfile = open(self.DATA_FILE, 'r')
+        except: # bad input file name?
+            s = '##! RGeno unable to open file %s\n' % (self.DATA_FILE)
+            self.lf.write(s)
+            self.lf.write('\n')
+            self.lf.close()
+            print >> sys.stdout, s
+            raise
+            sys.exit(1)
+        try:
+            mfile = open(self.MAP_FILE, 'r')
+        except: # bad input file name?
+            s = '##! RGeno unable to open file %s' % (self.MAP_FILE)
+            lf.write(s)
+            lf.write('\n')
+            lf.close()
+            print >> sys.stdout, s
+            raise
+            sys.exit(1)
+        if len(useRs) > 0 or spos <> -9 : # subset region
+            self.useTemp = True
+            ### Figure out which markers are in this region
+            markers = []
+            snpcols = {}
+            chroms = {}
+            minpos = 2**32
+            maxpos = 0
+            for lnum,row in enumerate(mfile):
+                line = row.strip()
+                if not line: continue
+                chrom, snp, genpos, abspos = line.split()
+                try:
+                    ic = int(chrom)
+                except:
+                    ic = None
+                if ic and ic <= 23:
+                    try:
+                        abspos = int(abspos)
+                        if abspos > maxpos:
+                            maxpos = abspos
+                        if abspos < minpos:
+                            minpos = abspos
+                    except:
+                        abspos = epos + 999999999 # so next test fails
+                if useRsdict.get(snp,None) or (spos <> -9 and chrom == chromosome and (spos <= abspos <= epos)):
+                    if chromosome == '':
+                        chromosome = chrom
+                    chroms.setdefault(chrom,chrom)
+                    markers.append((chrom,abspos,snp)) # decorate for sort into genomic
+                    snpcols[snp] = lnum # so we know which col to find genos for this marker
+            markers.sort()
+            rslist = [x[2] for x in markers] # drop decoration
+            rsdict = dict(zip(rslist,rslist))
+            if len(rslist) == 0:
+                s = '##! %s found no rs numbers in %s' % (progname,argv[1:3])
+                self.lf.write(s)
+                self.lf.write('\n')
+                self.lf.close()
+                print >> sys.stdout, s
+                sys.exit(1)
+            if spos == -9:
+                spos = minpos
+                epos = maxpos
+            s = '## %s looking for %d rs (%s)' % (progname,len(rslist),rslist[:5])
+            self.lf.write(s)
+            print >> sys.stdout, s
+            wewant = [(6+(2*snpcols[x])) for x in rslist] #
+            # column indices of first geno of each marker pair to get the markers into genomic
+            ### ... and then parse the rest of the ped file to pull out
+            ### the genotypes for all subjects for those markers
+            # /usr/local/galaxy/data/rg/1/lped/
+            self.tempMapName = os.path.join(self.outfpath,'%s.info' % self.title)
+            self.tempMap = file(self.tempMapName,'w')
+            self.tempPedName = os.path.join(self.outfpath,'%s.ped' % self.title)
+            self.tempPed = file(self.tempPedName,'w')
+            self.pngpath = '%s.LD.PNG' % self.tempPedName
+            map = ['%s\t%s' % (x[2],x[1]) for x in markers] # snp,abspos in genomic order for haploview
+            self.tempMap.write('%s\n' % '\n'.join(map))
+            self.tempMap.close()
+            nrows = 0
+            for line in dfile:
+                line = line.strip()
+                if not line:
+                    continue
+                fields = line.split()
+                preamble = fields[:6]
+                g = ['%s %s' % (fields[snpcol], fields[snpcol+1]) for snpcol in wewant]
+                g = ' '.join(g)
+                g = g.split() # we'll get there
+                g = [atrandic.get(x,'0') for x in g] # numeric alleles...
+                self.tempPed.write('%s %s\n' % (' '.join(preamble), ' '.join(g)))
+                nrows += 1
+            self.tempPed.close()
+            s = '## %s: wrote %d markers, %d subjects for region %s\n' % (progname,len(rslist),nrows,self.region)
+            self.lf.write(s)
+            self.lf.write('\n')
+            print >> sys.stdout,s
+        else: # even if using all, must set up haploview info file instead of map
+            markers = []
+            chroms = {}
+            spos = sys.maxint
+            epos = -spos
+            for lnum,row in enumerate(mfile):
+              line = row.strip()
+              if not line: continue
+              chrom, snp, genpos, abspos = line.split()
+              try:
+                ic = int(chrom)
+              except:
+                ic = None
+              if ic and ic <= 23:
+                if chromosome == '':
+                    chromosome = chrom
+                chroms.setdefault(chrom,chrom)
+                try:
+                    p = int(abspos)
+                    if p < spos and p <> 0:
+                        spos = p
+                    if p > epos and p <> 0:
+                        epos = p
+                except:
+                    pass
+                markers.append('%s %s' % (snp,abspos)) # no sort - pass
+            # now have spos and epos for hapmap if hmpanels
+            self.tempMapName = os.path.join(self.outfpath,'%s.info' % self.title)
+            self.tempMap = file(self.tempMapName,'w')
+            self.tempMap.write('\n'.join(markers))
+            self.tempMap.close()
+            self.tempPedName = os.path.join(self.outfpath,'%s.ped' % self.title)
+            try: # will fail on winblows!
+                os.symlink(self.DATA_FILE,self.tempPedName)
+            except:
+                shutil.copy(self.DATA_FILE,self.tempPedName) # wasteful but..
+        self.nchroms = len(chroms) # if > 1 can't really do this safely
+        dfile.close()
+        mfile.close()
+        self.spos = spos
+        self.epos = epos
+        self.chromosome = chromosome
+        if self.nchroms > 1:
+            s = '## warning - multiple chromosomes found in your map file - %s\n' % ','.join(chroms.keys())
+            self.lf.write(s)
+            print >> sys.stdout,s
+            sys.exit(1)
+        
+    def doPlots(self):
+        """
+        """    
+        DATA_FILE = self.tempPedName # for haploview
+        INFO_FILE = self.tempMapName
+        fblog,blog = tempfile.mkstemp()
+        ste = open(blog,'w') # to catch the blather
+        # if no need to rewrite - set up names for haploview call
+        vcl = [javabin,'-jar',self.hvbin,'-n','-memory','%d' % self.memSize,'-pairwiseTagging',
+               '-pedfile',DATA_FILE,'-info',INFO_FILE,'-tagrsqcounts',
+               '-tagrsqcutoff',self.tagr2, '-ldcolorscheme',self.ldType]
+        if self.minMaf:
+            vcl += ['-minMaf','%f' % self.minMaf]
+        if self.maxDist:
+            vcl += ['-maxDistance',self.maxDist]
+        if self.hiRes:
+            vcl.append('-png')
+        else:
+            vcl.append('-compressedpng')
+        if self.nchroms == 1:
+            vcl += ['-chromosome',self.chromosome]
+        if self.infotrack:
+            vcl.append('-infoTrack')
+        p=subprocess.Popen(' '.join(vcl),shell=True,cwd=self.outfpath,stderr=ste,stdout=self.lf)
+        retval = p.wait()
+        s = '## executing %s returned %d\n' % (' '.join(vcl),retval)
+        self.lf.write(s)
+        vcl = [self.mogrify, '-resize 800x400!', '*.PNG']
+        p=subprocess.Popen(' '.join(vcl),shell=True,cwd=self.outfpath,stderr=self.lf,stdout=self.lf)
+        retval = p.wait()
+        s = '## executing %s returned %d\n' % (' '.join(vcl),retval)
+        self.lf.write(s)
+        inpng = '%s.LD.PNG' % DATA_FILE # stupid but necessary - can't control haploview name mangle
+        inpng = inpng.replace(' ','')
+        inpng = os.path.split(inpng)[-1]
+        tmppng = '%s.tmp.png' % self.title
+        tmppng = tmppng.replace(' ','')
+        outpng = '1_%s.png' % self.title
+        outpng = outpng.replace(' ','')
+        outpng = os.path.split(outpng)[-1]
+        vcl = [self.convert, '-resize 800x400!', inpng, tmppng]
+        p=subprocess.Popen(' '.join(vcl),shell=True,cwd=self.outfpath,stderr=self.lf,stdout=self.lf)
+        retval = p.wait()
+        s = '## executing %s returned %d\n' % (' '.join(vcl),retval)
+        self.lf.write(s)
+        s = "text 10,300 '%s'" % self.title[:40]
+        vcl = [self.convert, '-pointsize 25','-fill maroon',
+              '-draw "%s"' % s, tmppng, outpng]
+        p=subprocess.Popen(' '.join(vcl),shell=True,cwd=self.outfpath,stderr=self.lf,stdout=self.lf)
+        retval = p.wait()
+        s = '## executing %s returned %d\n' % (' '.join(vcl),retval)
+        self.lf.write(s)
+        try:
+            os.remove(os.path.join(self.outfpath,tmppng))
+        except:
+            pass    # label all the plots then delete all the .PNG files before munging
+        fnum=1
+        if self.hmpanels:
+            sp = '%d' % (self.spos/1000.) # hapmap wants kb
+            ep = '%d' % (self.epos/1000.)
+            for panel in self.hmpanels:
+                if panel > '' and panel.lower() <> 'none': # in case someone checks that option too :)
+                    ptran = panel.strip()
+                    ptran = ptran.replace('+','_')
+                    fnum += 1 # preserve an order or else we get sorted
+                    vcl = [javabin,'-jar',self.hvbin,'-n','-memory','%d' % self.memSize,
+                      '-chromosome',self.chromosome, '-panel',panel.strip(),
+                      '-hapmapDownload','-startpos',sp,'-endpos',ep,
+                      '-ldcolorscheme',self.ldType]
+                    if self.minMaf:
+                        vcl += ['-minMaf','%f' % self.minMaf]
+                    if self.maxDist:
+                        vcl += ['-maxDistance',self.maxDist]
+                    if self.hiRes:
+                        vcl.append('-png')
+                    else:
+                        vcl.append('-compressedpng')
+                    if self.infotrack:
+                        vcl.append('-infoTrack')
+                    p=subprocess.Popen(' '.join(vcl),shell=True,cwd=self.outfpath,stderr=ste,stdout=self.lf)
+                    retval = p.wait()
+                    inpng = 'Chromosome%s%s.LD.PNG' % (self.chromosome,panel)
+                    inpng = inpng.replace(' ','') # mysterious spaces!
+                    outpng = '%d_HapMap_%s_%s.png' % (fnum,ptran,self.chromosome)
+                    # hack for stupid chb+jpt
+                    outpng = outpng.replace(' ','')
+                    tmppng = '%s.tmp.png' % self.title
+                    tmppng = tmppng.replace(' ','')
+                    outpng = os.path.split(outpng)[-1]
+                    vcl = [self.convert, '-resize 800x400!', inpng, tmppng]
+                    p=subprocess.Popen(' '.join(vcl),shell=True,cwd=self.outfpath,stderr=self.lf,stdout=self.lf)
+                    retval = p.wait()
+                    s = '## executing %s returned %d\n' % (' '.join(vcl),retval)
+                    self.lf.write(s)
+                    s = "text 10,300 'HapMap %s'" % ptran.strip()
+                    vcl = [self.convert, '-pointsize 25','-fill maroon',
+                          '-draw "%s"' % s, tmppng, outpng]
+                    p=subprocess.Popen(' '.join(vcl),shell=True,cwd=self.outfpath,stderr=self.lf,stdout=self.lf)
+                    retval = p.wait()
+                    s = '## executing %s returned %d\n' % (' '.join(vcl),retval)
+                    self.lf.write(s)
+                    try:
+                        os.remove(os.path.join(self.outfpath,tmppng))
+                    except:
+                        pass
+        nimages = len(glob.glob(os.path.join(self.outfpath,'*.png'))) # rely on HaploView shouting - PNG @!
+        self.lf.write('### nimages=%d\n' % nimages)
+        if nimages > 0: # haploview may fail?
+            vcl = '%s -format pdf -resize 800x400! *.png' % self.mogrify
+            p=subprocess.Popen(vcl,shell=True,cwd=self.outfpath,stderr=self.lf,stdout=self.lf)
+            retval = p.wait()
+            self.lf.write('## executing %s returned %d\n' % (vcl,retval))
+            vcl = '%s *.pdf --fitpaper true --outfile alljoin.pdf' % self.pdfjoin
+            p=subprocess.Popen(vcl,shell=True,cwd=self.outfpath,stderr=self.lf,stdout=self.lf)
+            retval = p.wait()
+            self.lf.write('## executing %s returned %d\n' % (vcl,retval))
+            vcl = '%s alljoin.pdf --nup 1x%d --outfile allnup.pdf' % (self.pdfnup,nimages)
+            p=subprocess.Popen(vcl,shell=True,cwd=self.outfpath,stderr=self.lf,stdout=self.lf)
+            retval = p.wait()
+            self.lf.write('## executing %s returned %d\n' % (vcl,retval))
+            #vcl = ['convert', 'allnup.pdf', 'allnup.png'] # this fails - bad pdf?
+            #p=subprocess.Popen(' '.join(vcl),shell=True,cwd=outfpath)
+            #retval = p.wait()
+            #lf.write('## executing %s returned %d\n' % (vcl,retval))
+        ste.close() # temp file used to catch haploview blather
+        hblather = open(blog,'r').readlines() # to catch the blather    
+        os.unlink(blog)
+        if len(hblather) > 0:
+           self.lf.write('## In addition, Haploview complained:')
+           self.lf.write(''.join(hblather))
+           self.lf.write('\n')
+        self.lf.close()
+        flist = glob.glob(os.path.join(self.outfpath, '*'))
+        flist.sort()
+        ts = '!"#$%&\'()*+,-/:;<=>?@[\\]^_`{|}~' + string.whitespace
+        ftran =  string.maketrans(ts,'_'*len(ts))
+        outf = file(self.outfile,'w')
+        outf.write(galhtmlprefix % progname)
+        s = '<h4>rgenetics for Galaxy %s, wrapping HaploView</h4>' % (progname)
+        outf.write(s)
+        """
+        as at ashg 2009, convert fails on allnup.pdf - probably too complex...
+        mainthumb = 'allnup.png'
+        mainpdf = 'allnup.pdf'
+        if os.path.exists(mainpdf):
+            if not os.path.exists(mainthumb):
+                outf.write('<table><tr><td colspan="3"><a href="%s">Main combined LD plot</a></td></tr></table>\n' % (mainpdf))
+            else:
+                outf.write('<table><tr><td><a href="%s"><img src="%s" alt="Main combined LD image" hspace="10" align="middle">')
+                outf.write('</td><td>Click this thumbnail to display the main combined LD image</td></tr></table>\n' % (mainpdf,mainthumb))
+        else:
+            outf.write('(No main image was generated - this usually means a Haploview error connecting to Hapmap site - please try later)<br/>\n')
+        outf.write('## Called as %s' % sys.argv)
+        """
+        outf.write('<br><div><hr><ul>\n')
+        for i, data in enumerate( flist ):
+            dn = os.path.split(data)[-1]
+            if dn[:3] <> 'all':
+                continue
+            newdn = dn.translate(ftran)
+            if dn <> newdn:
+                os.rename(os.path.join(self.outfpath,dn),os.path.join(self.outfpath,newdn))
+                dn = newdn
+            dnlabel = dn
+            ext = dn.split('.')[-1]
+            if dn == 'allnup.pdf':
+                dnlabel = 'All pdf plots on a single page'
+            elif dn == 'alljoin.pdf':
+                dnlabel = 'All pdf plots, each on a separate page'
+            outf.write('<li><a href="%s">%s - %s</a></li>\n' % (dn,dn,dnlabel))
+        for i, data in enumerate( flist ):
+            dn = os.path.split(data)[-1]
+            if dn[:3] == 'all':
+                continue
+            newdn = dn.translate(ftran)
+            if dn <> newdn:
+                os.rename(os.path.join(self.outfpath,dn),os.path.join(self.outfpath,newdn))
+                dn = newdn
+            dnlabel = dn
+            ext = dn.split('.')[-1]
+            if dn == 'allnup.pdf':
+                dnlabel = 'All pdf plots on a single page'
+            elif dn == 'alljoin.pdf':
+                dnlabel = 'All pdf plots, each on a separate page'
+            elif ext == 'info':
+                dnlabel = '%s map data for Haploview input' % self.title
+            elif ext == 'ped':
+                dnlabel = '%s genotype data for Haploview input' % self.title
+            elif dn.find('CEU') <> -1 or dn.find('YRI') <> -1 or dn.find('CHB_JPT') <> -1: # is hapmap
+                dnlabel = 'Hapmap data'
+            if ext == 'TAGS' or ext == 'TESTS' or ext == 'CHAPS':
+                dnlabel = dnlabel + ' Tagger output'
+            outf.write('<li><a href="%s">%s - %s</a></li>\n' % (dn,dn,dnlabel))
+        outf.write('</ol><br>')
+        outf.write("</div><div><hr>Job Log follows below (see %s)<pre>" % self.logfn)
+        s = file(self.log_file,'r').readlines()
+        s = '\n'.join(s)
+        outf.write('%s</pre><hr></div>' % s)
+        outf.write('</body></html>')
+        outf.close()
+        if self.useTemp:
+            try:
+                os.unlink(self.tempMapName)
+                os.unlink(self.tempPedName)
+            except:
+                pass
+        
+if __name__ == "__main__":
     """  ### Sanity check the arguments
 
     <command interpreter="python">
@@ -52,444 +507,11 @@ def ld():
     skipcheck?
     """
     progname = os.path.split(sys.argv[0])[-1]
-    ts = '%s%s' % (string.punctuation,string.whitespace)
-    ptran =  string.maketrans(ts,'_'*len(ts))
     if len(sys.argv) < 16:
         s = '##!%s: Expected 16 params in sys.argv, got %d (%s)' % (progname,len(sys.argv), sys.argv)
         print s
         sys.exit(1)
+    ld = ldPlot(argv = sys.argv)
+    ld.doPlots()
 
-    ### Figure out what genomic region we are interested in
-    region = sys.argv[1]
-    orslist = sys.argv[2].replace('X',' ').lower() # galaxy replaces newlines with XX - go figure
-    title = sys.argv[3].translate(ptran)
-    # for outputs
-    outfile = sys.argv[4]
-    logfn = 'Log_%s.txt' % (title)
-    histextra = sys.argv[5]
-    base_name = sys.argv[6]
-    pedFileBase = os.path.join(histextra,base_name)
-    print 'pedfilebase=%s' % pedFileBase
-    minMaf=sys.argv[7]
-    if minMaf:
-        try:
-            minMaf = float(minMaf)
-        except:
-            minMaf = 0.0
-    maxDist=sys.argv[8] or None
-    ldType=sys.argv[9] or 'RSQ'
-    hiRes = (sys.argv[10].lower() == 'hi')
-    memSize= sys.argv[11] or '1000'
-    memSize = int(memSize)
-    outfpath = sys.argv[12]
-    infotrack = False # note that otherwise this breaks haploview in headless mode 
-    #infotrack = sys.argv[13] == 'info'
-    # this fails in headless mode as at april 2010 with haploview 4.2
-    tagr2 = sys.argv[14] or '0.8'
-    hmpanels = sys.argv[15] # eg "['CEU','YRI']"
-    if hmpanels:
-       hmpanels = hmpanels.replace('[','')
-       hmpanels = hmpanels.replace(']','')
-       hmpanels = hmpanels.replace("'",'')
-       hmpanels = hmpanels.split(',')
-    hvbin = sys.argv[16] # added rml june 2008
-    bindir = os.path.split(hvbin)[0]
-    # jan 2010 - always assume utes are on path to avoid platform problems
-    pdfjoin = 'pdfjoin' # os.path.join(bindir,'pdfjoin')
-    pdfnup = 'pdfnup' # os.path.join(bindir,'pdfnup')
-    mogrify = 'mogrify' # os.path.join(bindir,'mogrify')
-    convert = 'convert' # os.path.join(bindir,'convert')
-    log_file = os.path.join(outfpath,logfn)
-    MAP_FILE = '%s.map' % pedFileBase
-    DATA_FILE = '%s.ped' % pedFileBase
-    try:
-        os.makedirs(outfpath)
-        s = '## made new path %s\n' % outfpath
-    except:
-        pass
-    lf = file(log_file,'w')
-    s = 'PATH=%s\n' % os.environ.get('PATH','?')
-    lf.write(s)
-    hlogf = os.path.join(outfpath,'%s.log' % pedFileBase)
-    chromosome = ''
-    spos = epos = -9
-    rslist = []
-    rsdict = {}
-    hlog = []
 
-    if region > '':
-        useRs = []
-        useRsdict={}
-        try: # TODO make a regexp?
-            c,rest = region.split(':')
-            chromosome = c.replace('chr','')
-            rest = rest.replace(',','') # remove commas
-            spos,epos = rest.split('-')
-            spos = int(spos)
-            epos = int(epos)
-            s = '## %s parsing chrom %s from %d to %d\n' % (progname,chromosome,spos,epos)
-            lf.write(s)
-            lf.write('\n')
-            print >> sys.stdout, s
-        except:
-            s = '##! %s unable to parse region %s - MUST look like "chr8:10,000-100,000\n' % (progname,region)
-            print >> sys.stdout, s
-            lf.write(s)
-            lf.write('\n')
-            lf.close()
-            sys.exit(1)
-    else:
-        useRs = orslist.split() # galaxy replaces newlines with XX - go figure
-        useRsdict = dict(zip(useRs,useRs))
-    useTemp = False
-    try:
-        dfile = open(DATA_FILE, 'r')
-    except: # bad input file name?
-        s = '##! RGeno unable to open file %s\n' % (DATA_FILE)
-        lf.write(s)
-        lf.write('\n')
-        lf.close()
-        print >> sys.stdout, s
-        raise
-        sys.exit(1)
-    try:
-        mfile = open(MAP_FILE, 'r')
-    except: # bad input file name?
-        s = '##! RGeno unable to open file %s' % (MAP_FILE)
-        lf.write(s)
-        lf.write('\n')
-        lf.close()
-        print >> sys.stdout, s
-        raise
-        sys.exit(1)
-    if len(useRs) > 0 or spos <> -9 : # subset region
-        useTemp = True
-        ### Figure out which markers are in this region
-        markers = []
-        snpcols = {}
-        chroms = {}
-        minpos = 2**32
-        maxpos = 0
-        for lnum,row in enumerate(mfile):
-            line = row.strip()
-            if not line: continue
-            chrom, snp, genpos, abspos = line.split()
-            try:
-                ic = int(chrom)
-            except:
-                ic = None
-            if ic and ic <= 23:
-                try:
-                    abspos = int(abspos)
-                    if abspos > maxpos:
-                        maxpos = abspos
-                    if abspos < minpos:
-                        minpos = abspos
-                except:
-                    abspos = epos + 999999999 # so next test fails
-            if useRsdict.get(snp,None) or (spos <> -9 and chrom == chromosome and (spos <= abspos <= epos)):
-                if chromosome == '':
-                    chromosome = chrom
-                chroms.setdefault(chrom,chrom)
-                markers.append((chrom,abspos,snp)) # decorate for sort into genomic
-                snpcols[snp] = lnum # so we know which col to find genos for this marker
-        markers.sort()
-        rslist = [x[2] for x in markers] # drop decoration
-        rsdict = dict(zip(rslist,rslist))
-        if len(rslist) == 0:
-            s = '##! %s found no rs numbers in %s' % (progname,sys.argv[1:3])
-            lf.write(s)
-            lf.write('\n')
-            lf.close()
-            print >> sys.stdout, s
-            sys.exit(1)
-        if spos == -9:
-            spos = minpos
-            epos = maxpos
-        s = '## %s looking for %d rs (%s)' % (progname,len(rslist),rslist[:5])
-        lf.write(s)
-        print >> sys.stdout, s
-        wewant = [(6+(2*snpcols[x])) for x in rslist] #
-        # column indices of first geno of each marker pair to get the markers into genomic
-        ### ... and then parse the rest of the ped file to pull out
-        ### the genotypes for all subjects for those markers
-        # /usr/local/galaxy/data/rg/1/lped/
-        tempMapName = os.path.join(outfpath,'%s.info' % title)
-        tempMap = file(tempMapName,'w')
-        tempPedName = os.path.join(outfpath,'%s.ped' % title)
-        tempPed = file(tempPedName,'w')
-        pngpath = '%s.LD.PNG' % tempPedName
-        map = ['%s\t%s' % (x[2],x[1]) for x in markers] # snp,abspos in genomic order for haploview
-        tempMap.write('%s\n' % '\n'.join(map))
-        tempMap.close()
-        nrows = 0
-        for line in dfile:
-            line = line.strip()
-            if not line:
-                continue
-            fields = line.split()
-            preamble = fields[:6]
-            g = ['%s %s' % (fields[snpcol], fields[snpcol+1]) for snpcol in wewant]
-            g = ' '.join(g)
-            g = g.split() # we'll get there
-            g = [atrandic.get(x,'0') for x in g] # numeric alleles...
-            tempPed.write('%s %s\n' % (' '.join(preamble), ' '.join(g)))
-            nrows += 1
-        tempPed.close()
-        s = '## %s: wrote %d markers, %d subjects for region %s\n' % (progname,len(rslist),nrows,region)
-        lf.write(s)
-        lf.write('\n')
-        print >> sys.stdout,s
-    else: # even if using all, must set up haploview info file instead of map
-        markers = []
-        chroms = {}
-        spos = sys.maxint
-        epos = -spos
-        for lnum,row in enumerate(mfile):
-          line = row.strip()
-          if not line: continue
-          chrom, snp, genpos, abspos = line.split()
-          try:
-            ic = int(chrom)
-          except:
-            ic = None
-          if ic and ic <= 23:
-            if chromosome == '':
-                chromosome = chrom
-            chroms.setdefault(chrom,chrom)
-            try:
-                p = int(abspos)
-                if p < spos and p <> 0:
-                    spos = p
-                if p > epos and p <> 0:
-                    epos = p
-            except:
-                pass
-            markers.append('%s %s' % (snp,abspos)) # no sort - pass
-        # now have spos and epos for hapmap if hmpanels
-        tempMapName = os.path.join(outfpath,'%s.info' % title)
-        tempMap = file(tempMapName,'w')
-        tempMap.write('\n'.join(markers))
-        tempMap.close()
-        tempPedName = os.path.join(outfpath,'%s.ped' % title)
-        try: # will fail on winblows!
-            os.symlink(DATA_FILE,tempPedName)
-        except:
-            shutil.copy(DATA_FILE,tempPedName) # wasteful but..
-    nchroms = len(chroms) # if > 1 can't really do this safely
-    if nchroms > 1:
-        s = '## warning - multiple chromosomes found in your map file - %s\n' % ','.join(chroms.keys())
-        lf.write(s)
-        print >> sys.stdout,s
-        sys.exit(1)
-    DATA_FILE = tempPedName # for haploview
-    INFO_FILE = tempMapName
-    dfile.close()
-    mfile.close()
-    fblog,blog = tempfile.mkstemp()
-    ste = open(blog,'w') # to catch the blather
-    # if no need to rewrite - set up names for haploview call
-    vcl = [javabin,'-jar',hvbin,'-n','-memory','%d' % memSize,'-pairwiseTagging',
-           '-pedfile',DATA_FILE,'-info',INFO_FILE,'-tagrsqcounts',
-           '-tagrsqcutoff',tagr2,
-           '-ldcolorscheme',ldType]
-    if minMaf:
-        vcl += ['-minMaf','%f' % minMaf]
-    if maxDist:
-        vcl += ['-maxDistance',maxDist]
-    if hiRes:
-        vcl.append('-png')
-    else:
-        vcl.append('-compressedpng')
-    if nchroms == 1:
-        vcl += ['-chromosome',chromosome]
-    if infotrack:
-        vcl.append('-infoTrack')
-    p=subprocess.Popen(' '.join(vcl),shell=True,cwd=outfpath,stderr=ste,stdout=lf)
-    retval = p.wait()
-    s = '## executing %s returned %d\n' % (' '.join(vcl),retval)
-    lf.write(s)
-    vcl = [mogrify, '-resize 800x400!', '*.PNG']
-    p=subprocess.Popen(' '.join(vcl),shell=True,cwd=outfpath,stderr=lf,stdout=lf)
-    retval = p.wait()
-    s = '## executing %s returned %d\n' % (' '.join(vcl),retval)
-    lf.write(s)
-    inpng = '%s.LD.PNG' % DATA_FILE # stupid but necessary - can't control haploview name mangle
-    inpng = inpng.replace(' ','')
-    inpng = os.path.split(inpng)[-1]
-    tmppng = '%s.tmp.png' % title
-    tmppng = tmppng.replace(' ','')
-    outpng = '1_%s.png' % title
-    outpng = outpng.replace(' ','')
-    outpng = os.path.split(outpng)[-1]
-    vcl = [convert, '-resize 800x400!', inpng, tmppng]
-    p=subprocess.Popen(' '.join(vcl),shell=True,cwd=outfpath,stderr=lf,stdout=lf)
-    retval = p.wait()
-    s = '## executing %s returned %d\n' % (' '.join(vcl),retval)
-    lf.write(s)
-    s = "text 10,300 '%s'" % title[:40]
-    vcl = ['convert', '-pointsize 25','-fill maroon',
-          '-draw "%s"' % s, tmppng, outpng]
-    p=subprocess.Popen(' '.join(vcl),shell=True,cwd=outfpath,stderr=lf,stdout=lf)
-    retval = p.wait()
-    s = '## executing %s returned %d\n' % (' '.join(vcl),retval)
-    lf.write(s)
-    try:
-        os.remove(os.path.join(outfpath,tmppng))
-    except:
-        pass    # label all the plots then delete all the .PNG files before munging
-    fnum=1
-    if hmpanels:
-        sp = '%d' % (spos/1000.) # hapmap wants kb
-        ep = '%d' % (epos/1000.)
-        for panel in hmpanels:
-            if panel > '' and panel.lower() <> 'none': # in case someone checks that option too :)
-                ptran = panel.strip()
-                ptran = ptran.replace('+','_')
-                fnum += 1 # preserve an order or else we get sorted
-                vcl = [javabin,'-jar',hvbin,'-n','-memory','%d' % memSize,
-                  '-chromosome',chromosome, '-panel',panel.strip(),
-                  '-hapmapDownload','-startpos',sp,'-endpos',ep,
-                  '-ldcolorscheme',ldType]
-                if minMaf:
-                    vcl += ['-minMaf','%f' % minMaf]
-                if maxDist:
-                    vcl += ['-maxDistance',maxDist]
-                if hiRes:
-                    vcl.append('-png')
-                else:
-                    vcl.append('-compressedpng')
-                if infotrack:
-                    vcl.append('-infoTrack')
-                p=subprocess.Popen(' '.join(vcl),shell=True,cwd=outfpath,stderr=ste,stdout=lf)
-                retval = p.wait()
-                inpng = 'Chromosome%s%s.LD.PNG' % (chromosome,panel)
-                inpng = inpng.replace(' ','') # mysterious spaces!
-                outpng = '%d_HapMap_%s_%s.png' % (fnum,ptran,chromosome,)
-                # hack for stupid chb+jpt
-                outpng = outpng.replace(' ','')
-                tmppng = '%s.tmp.png' % title
-                tmppng = tmppng.replace(' ','')
-                outpng = os.path.split(outpng)[-1]
-                vcl = [convert, '-resize 800x400!', inpng, tmppng]
-                p=subprocess.Popen(' '.join(vcl),shell=True,cwd=outfpath,stderr=lf,stdout=lf)
-                retval = p.wait()
-                s = '## executing %s returned %d\n' % (' '.join(vcl),retval)
-                lf.write(s)
-                s = "text 10,300 'HapMap %s'" % ptran.strip()
-                vcl = [convert, '-pointsize 25','-fill maroon',
-                      '-draw "%s"' % s, tmppng, outpng]
-                p=subprocess.Popen(' '.join(vcl),shell=True,cwd=outfpath,stderr=lf,stdout=lf)
-                retval = p.wait()
-                s = '## executing %s returned %d\n' % (' '.join(vcl),retval)
-                lf.write(s)
-                try:
-                    os.remove(os.path.join(outfpath,tmppng))
-                except:
-                    pass
-    nimages = len(glob.glob(os.path.join(outfpath,'*.png'))) # rely on HaploView shouting - PNG @!
-    lf.write('### nimages=%d\n' % nimages)
-    if nimages > 0: # haploview may fail?
-        vcl = '%s -format pdf -resize 800x400! *.png' % mogrify
-        p=subprocess.Popen(vcl,shell=True,cwd=outfpath,stderr=lf,stdout=lf)
-        retval = p.wait()
-        lf.write('## executing %s returned %d\n' % (vcl,retval))
-        vcl = '%s *.pdf --fitpaper true --outfile alljoin.pdf' % pdfjoin
-        p=subprocess.Popen(vcl,shell=True,cwd=outfpath,stderr=lf,stdout=lf)
-        retval = p.wait()
-        lf.write('## executing %s returned %d\n' % (vcl,retval))
-        vcl = '%s alljoin.pdf --nup 1x%d --outfile allnup.pdf' % (pdfnup,nimages)
-        p=subprocess.Popen(vcl,shell=True,cwd=outfpath,stderr=lf,stdout=lf)
-        retval = p.wait()
-        lf.write('## executing %s returned %d\n' % (vcl,retval))
-        #vcl = ['convert', 'allnup.pdf', 'allnup.png'] # this fails - bad pdf?
-        #p=subprocess.Popen(' '.join(vcl),shell=True,cwd=outfpath)
-        #retval = p.wait()
-        #lf.write('## executing %s returned %d\n' % (vcl,retval))
-    lf.write('\n'.join(hlog))
-    ste.close() # temp file used to catch haploview blather
-    hblather = open(blog,'r').readlines() # to catch the blather    
-    os.unlink(blog)
-    if len(hblather) > 0:
-       lf.write('## In addition, Haploview complained:')
-       lf.write(''.join(hblather))
-       lf.write('\n')
-    lf.close()
-    flist = glob.glob(os.path.join(outfpath, '*'))
-    flist.sort()
-    ts = '!"#$%&\'()*+,-/:;<=>?@[\\]^_`{|}~' + string.whitespace
-    ftran =  string.maketrans(ts,'_'*len(ts))
-    outf = file(outfile,'w')
-    outf.write(galhtmlprefix % progname)
-    s = '<h4>rgenetics for Galaxy %s, wrapping HaploView</h4>' % (progname)
-    outf.write(s)
-    """
-    as at ashg 2009, convert fails on allnup.pdf - probably too complex...
-    mainthumb = 'allnup.png'
-    mainpdf = 'allnup.pdf'
-    if os.path.exists(mainpdf):
-        if not os.path.exists(mainthumb):
-            outf.write('<table><tr><td colspan="3"><a href="%s">Main combined LD plot</a></td></tr></table>\n' % (mainpdf))
-        else:
-            outf.write('<table><tr><td><a href="%s"><img src="%s" alt="Main combined LD image" hspace="10" align="middle">')
-            outf.write('</td><td>Click this thumbnail to display the main combined LD image</td></tr></table>\n' % (mainpdf,mainthumb))
-    else:
-        outf.write('(No main image was generated - this usually means a Haploview error connecting to Hapmap site - please try later)<br/>\n')
-    outf.write('## Called as %s' % sys.argv)
-    """
-    outf.write('<br><div><hr><ul>\n')
-    for i, data in enumerate( flist ):
-        dn = os.path.split(data)[-1]
-        if dn[:3] <> 'all':
-            continue
-        newdn = dn.translate(ftran)
-        if dn <> newdn:
-            os.rename(os.path.join(outfpath,dn),os.path.join(outfpath,newdn))
-            dn = newdn
-        dnlabel = dn
-        ext = dn.split('.')[-1]
-        if dn == 'allnup.pdf':
-            dnlabel = 'All pdf plots on a single page'
-        elif dn == 'alljoin.pdf':
-            dnlabel = 'All pdf plots, each on a separate page'
-        outf.write('<li><a href="%s">%s - %s</a></li>\n' % (dn,dn,dnlabel))
-    for i, data in enumerate( flist ):
-        dn = os.path.split(data)[-1]
-        if dn[:3] == 'all':
-            continue
-        newdn = dn.translate(ftran)
-        if dn <> newdn:
-            os.rename(os.path.join(outfpath,dn),os.path.join(outfpath,newdn))
-            dn = newdn
-        dnlabel = dn
-        ext = dn.split('.')[-1]
-        if dn == 'allnup.pdf':
-            dnlabel = 'All pdf plots on a single page'
-        elif dn == 'alljoin.pdf':
-            dnlabel = 'All pdf plots, each on a separate page'
-        elif ext == 'info':
-            dnlabel = '%s map data for Haploview input' % title
-        elif ext == 'ped':
-            dnlabel = '%s genotype data for Haploview input' % title
-        elif dn.find('CEU') <> -1 or dn.find('YRI') <> -1 or dn.find('CHB_JPT') <> -1: # is hapmap
-            dnlabel = 'Hapmap data'
-        if ext == 'TAGS' or ext == 'TESTS' or ext == 'CHAPS':
-            dnlabel = dnlabel + ' Tagger output'
-        outf.write('<li><a href="%s">%s - %s</a></li>\n' % (dn,dn,dnlabel))
-    outf.write('</ol><br>')
-    outf.write("</div><div><hr>Job Log follows below (see %s)<pre>" % logfn)
-    s = file(log_file,'r').readlines()
-    s = '\n'.join(s)
-    outf.write('%s</pre><hr></div>' % s)
-    outf.write('</body></html>')
-    outf.close()
-    if useTemp:
-        try:
-            os.unlink(tempMapName)
-            os.unlink(tempPedName)
-        except:
-            pass
-        
-if __name__ == "__main__":
-    ld()
-

    

[galaxy-commits] galaxy-dist commit 2c2baca25528: rgHaploView.py reorganized to make a little less ugly and easier to maintain

commits-noreply＠bitbucket.org