[hg] galaxy 3507: Cleanup genetics.py
details:   http://www.bx.psu.edu/hg/galaxy/rev/70930ea26347
changeset: 3507:70930ea26347
user:      fubar: ross Lazarus at gmail period com
date:      Wed Mar 10 16:09:37 2010 -0500
description:
Cleanup genetics.py
Allow download of composite objects from libraries so content is in the archive
Allow download of composite objects from histories ditto (current default is zip)
Small fix to missing param in download from library
Small fix to twilltestcase when testing a tool with hidden form fields

diffstat:

 lib/galaxy/datatypes/genetics.py             |  468 ++++++++++++++------------
 lib/galaxy/tools/parameters/grouping.py      |   10 +-
 lib/galaxy/web/controllers/dataset.py        |  142 +++++++-
 lib/galaxy/web/controllers/library_common.py |    1 +
 test/base/twilltestcase.py                   |   57 +--
 5 files changed, 414 insertions(+), 264 deletions(-)

diffs (1190 lines):

diff -r 9701e5ee128d -r 70930ea26347 lib/galaxy/datatypes/genetics.py
--- a/lib/galaxy/datatypes/genetics.py	Wed Mar 10 14:25:34 2010 -0500
+++ b/lib/galaxy/datatypes/genetics.py	Wed Mar 10 16:09:37 2010 -0500
@@ -1,5 +1,6 @@
 """
 rgenetics datatypes
+Use at your peril
 Ross Lazarus
 for the rgenetics and galaxy projects
@@ -10,6 +11,7 @@
 ross lazarus for rgenetics
 august 20 2007
 """
+
 import logging, os, sys, time, tempfile, shutil, string, glob
 import data
 from galaxy import util
@@ -24,186 +26,172 @@
 from galaxy.datatypes.interval import Interval
 from galaxy.util.hash_util import *

-log = logging.getLogger(__name__)
+gal_Log = logging.getLogger(__name__)
+verbose = False

-class GenomeGraphs(Interval):
+class GenomeGraphs( Tabular ):
+    """
+    Tab delimited data containing a marker id and any number of numeric values
+    """

-    """gg version viewable at ucsc of Gff format"""
-    file_ext = "gg"
-    column_names = [ 'Seqname', 'Source', 'Feature', 'Start', 'End', 'Score', 'Strand', 'Frame', 'Group' ]
+    MetadataElement( name="markerCol", default=1, desc="Marker ID column", param=metadata.ColumnParameter )
+    MetadataElement( name="columns", default=3, desc="Number of columns", readonly=True )
+    MetadataElement( name="column_types", default=[], desc="Column types", readonly=True, visible=False )
+    file_ext = 'gg'

-    """Add metadata elements"""
-    MetadataElement( name="columns", default=9, desc="Number of columns", readonly=True, visible=False )
-    MetadataElement( name="column_types", default=['str','str','str','int','int','int','str','str','str'], param=metadata.ColumnTypesParameter, desc="Column types", readonly=True, visible=False )
-    MetadataElement( name="chromCol", default=1, desc="Chrom column", param=metadata.ColumnParameter )
-    MetadataElement( name="startCol", default=4, desc="Start column", param=metadata.ColumnParameter )
-    MetadataElement( name="endCol", default=5, desc="End column", param=metadata.ColumnParameter )
-    MetadataElement( name="strandCol", desc="Strand column (click box & select)", param=metadata.ColumnParameter, optional=True, no_value=0 )
-    ###do we need to repeat these? they are the same as should be inherited from interval type
+    def __init__(self, **kwd):
+        """
+        Initialize gg datatype, by adding UCSC display apps
+        """
+        Tabular.__init__(self, **kwd)
+        self.add_display_app ( 'ucsc', 'Genome Graph', 'as_ucsc_display_file', 'ucsc_links' )
+
-    def __init__(self, **kwd):
-        """Initialize datatype, by adding GBrowse display app"""
-        Interval.__init__(self, **kwd)
-        self.add_display_app ( 'ucsc', 'display at UCSC', 'as_ucsc_display_file', 'ucsc_links' )
+    def set_meta(self,dataset,**kwd):
+        Tabular.set_meta( self, dataset, **kwd)
+        dataset.metadata.markerCol = 1
+        header = file(dataset.file_name,'r').readlines()[0].strip().split('\t')
+        dataset.metadata.columns = len(header)
+        t = ['numeric' for x in header]
+        t[0] = 'string'
+        dataset.metadata.column_types = t
+        return True
+
     def as_ucsc_display_file( self, dataset, **kwd ):
-        return open( dataset.file_name )
-    def set_meta( self, dataset, overwrite = True, **kwd ):
-        i = 0
-        for i, line in enumerate( file ( dataset.file_name ) ):
-            line = line.rstrip('\r\n')
-            if line and not line.startswith( '#' ):
-                elems = line.split( '\t' )
-                if len(elems) == 9:
-                    try:
-                        int( elems[3] )
-                        int( elems[4] )
-                        break
-                    except:
-                        pass
-        Interval.set_meta( self, dataset, overwrite = overwrite, skip = i )
+        """
+        Returns file
+        """
+        return file(dataset.file_name,'r')
+
+    def ucsc_links( self, dataset, type, app, base_url ):
+        """
+        from the ever-helpful angie hinrichs angie@soe.ucsc.edu
+        a genome graphs call looks like this
+        http://genome.ucsc.edu/cgi-bin/hgGenome?clade=mammal&org=Human&db=hg18&hgGenome_dataSetName=dname
+        &hgGenome_dataSetDescription=test&hgGenome_formatType=best%20guess&hgGenome_markerType=best%20guess
+        &hgGenome_columnLabels=best%20guess&hgGenome_maxVal=&hgGenome_labelVals=
+        &hgGenome_maxGapToFill=25000000&hgGenome_uploadFile=http://galaxy.esphealth.org/datasets/333/display/index
+        &hgGenome_doSubmitUpload=submit
+        Galaxy gives this for an interval file
+        http://genome.ucsc.edu/cgi-bin/hgTracks?db=hg18&position=chr1:1-1000&hgt.customText=
+        http%3A%2F%2Fgalaxy.esphealth.org%2Fdisplay_as%3Fid%3D339%26display_app%3Ducsc
+        """
+        ret_val = []
+        ggtail = 'hgGenome_doSubmitUpload=submit'
+        if not dataset.dbkey:
+            dataset.dbkey = 'hg18' # punt!
+        if dataset.has_data:
+            for site_name, site_url in util.get_ucsc_by_build(dataset.dbkey):
+                if site_name in app.config.ucsc_display_sites:
+                    site_url = site_url.replace('/hgTracks?','/hgGenome?') # for genome graphs
+                    internal_url = "%s" % url_for( controller='dataset',
+                        dataset_id=dataset.id, action='display_at', filename='ucsc_' + site_name )
+                    if base_url.startswith( 'https://' ):
+                        base_url = base_url.replace( 'https', 'http', 1 )
+                    display_url = "%s%s/display_as?id=%i&display_app=%s&authz_method=display_at" % (base_url, url_for( controller='root' ), dataset.id, type)
+                    display_url = urllib.quote_plus( display_url )
+                    # was display_url = urllib.quote_plus( "%s/display_as?id=%i&display_app=%s" % (base_url, dataset.id, type) )
+                    #redirect_url = urllib.quote_plus( "%sdb=%s&position=%s:%s-%s&hgt.customText=%%s" % (site_url, dataset.dbkey, chrom, start, stop) )
+                    sl = ["%sdb=%s" % (site_url,dataset.dbkey ),]
+                    #sl.append("&hgt.customText=%s")
+                    sl.append("&hgGenome_dataSetName=%s&hgGenome_dataSetDescription=%s" % (dataset.name, 'GalaxyGG_data'))
+                    sl.append("&hgGenome_formatType=best guess&hgGenome_markerType=best guess")
+                    sl.append("&hgGenome_columnLabels=first row&hgGenome_maxVal=&hgGenome_labelVals=")
+                    sl.append("&hgGenome_doSubmitUpload=submit")
+                    sl.append("&hgGenome_maxGapToFill=25000000&hgGenome_uploadFile=%s" % display_url)
+                    s = ''.join(sl)
+                    s = urllib.quote_plus(s)
+                    redirect_url = s
+                    log.debug('## rg gg ucsc rdurl=%s; s = %s' % (redirect_url,s))
+                    link = '%s?redirect_url=%s&display_url=%s' % ( internal_url, redirect_url, display_url )
+                    ret_val.append( (site_name, link) )
+        return ret_val
+
     def make_html_table( self, dataset, skipchars=[] ):
-        """Create HTML table, used for displaying peek"""
+        """
+        Create HTML table, used for displaying peek
+        """
+        npeek = 5
         out = ['<table cellspacing="0" cellpadding="3">']
-        comments = []
+        f = open(dataset.file_name,'r')
+        d = [f.next() for x in range(npeek)]
+        hasheader = 0
+        try:
+            test = ['%f' % x for x in d[0][1:]] # first is name - see if starts all numerics
+        except:
+            hasheader = 1
         try:
             # Generate column header
             out.append( '<tr>' )
-            for i, name in enumerate( self.column_names ):
-                out.append( '<th>%s.%s</th>' % ( str( i+1 ), name ) )
-            out.append( self.make_html_peek_rows( dataset, skipchars=skipchars ) )
+            if hasheader:
+                for i, name in enumerate(d[0].split() ):
+                    out.append( '<th>%s.%s</th>' % ( str( i+1 ), name ) )
+                d.pop(0)
+                out.append('</tr>')
+            for row in d:
+                out.append('<tr>')
+                out.append(''.join(['<td>%s</td>' % x for x in row.split()]))
+                out.append('</tr>')
             out.append( '</table>' )
             out = "".join( out )
         except Exception, exc:
             out = "Can't create peek %s" % exc
         return out
-    def get_estimated_display_viewport( self, dataset ):
+
+    def validate( self, dataset ):
         """
-        Return a chrom, start, stop tuple for viewing a file.  There are slight differences between gff 2 and gff 3
-        formats.  This function should correctly handle both...
+        Validate a gg file - all numeric after header row
         """
-        if True or (dataset.has_data() and dataset.state == dataset.states.OK):
-            try:
-                seqid = ''
-                start = 2147483647  # Maximum value of a signed 32 bit integer ( 2**31 - 1 )
-                stop = 0
-                for i, line in enumerate( file( dataset.file_name ) ):
-                    if i == 0: # track stuff there
-                        continue
-                    line = line.rstrip( '\r\n' )
-                    if not line:
-                        continue
-                    if not line.startswith( '#' ):
-                        elems = line.split( '\t' )
-                        if not seqid:
-                            # We can only set the viewport for a single chromosome
-                            seqid = elems[0]
-                        if seqid == elems[0]:
-                            # Make sure we have not spanned chromosomes
-                            start = min( start, int( elems[3] ) )
-                            stop = max( stop, int( elems[4] ) )
-                        else:
-                            # We've spanned a chromosome
-                            break
-                        if i > 10: # span 10 features
-                            break
-            except:
-                seqid, start, stop = ( '', '', '' )
-            return ( seqid, str( start ), str( stop ) )
-        else:
-            return ( '', '', '' )
-    def gbrowse_links( self, dataset, type, app, base_url ):
-        ret_val = []
-        if dataset.has_data:
-            viewport_tuple = self.get_estimated_display_viewport( dataset )
-            seqid = viewport_tuple[0]
-            start = viewport_tuple[1]
-            stop = viewport_tuple[2]
-            if seqid and start and stop:
-                for site_name, site_url in util.get_gbrowse_sites_by_build( dataset.dbkey ):
-                    if site_name in app.config.gbrowse_display_sites:
-                        link = "%s?start=%s&stop=%s&ref=%s&dbkey=%s" % ( site_url, start, stop, seqid, dataset.dbkey )
-                        ret_val.append( ( site_name, link ) )
-        return ret_val
-    def ucsc_links( self, dataset, type, app, base_url ):
-        ret_val = []
-        if dataset.has_data:
-            viewport_tuple = self.get_estimated_display_viewport(dataset)
-            if viewport_tuple:
-                chrom = viewport_tuple[0]
-                start = viewport_tuple[1]
-                stop = viewport_tuple[2]
-                if start == '' or int(start) < 1:
-                    start='1'
-                if stop == '' or int(stop) <= start:
-                    stop = '%d' % (int(start) + 10000)
-                for site_name, site_url in util.get_ucsc_by_build(dataset.dbkey):
-                    if site_name in app.config.ucsc_display_sites:
-                        # HACK: UCSC doesn't support https, so force http even
-                        # if our URL scheme is https.  Making this work
-                        # requires additional hackery in your upstream proxy.
-                        # If UCSC ever supports https, remove this hack.
-                        internal_url = "%s" % url_for( controller='dataset',
-                            dataset_id=dataset.id, action='display_at', filename='ucsc_' + site_name )
-                        if base_url.startswith( 'https://' ):
-                            base_url = base_url.replace( 'https', 'http', 1 )
-                        display_url = urllib.quote_plus( "%s%s/display_as?id=%i&display_app=%s&authz_method=display_at" % (base_url, url_for( controller='root' ), dataset.id, type) )
-                        redirect_url = urllib.quote_plus( "%sdb=%s&position=%s:%s-%s&hgt.customText=%%s" % (site_url, dataset.dbkey, chrom, start, stop) )
-                        link = '%s?redirect_url=%s&display_url=%s' % ( internal_url, redirect_url, display_url )
-                        ret_val.append( (site_name, link) )
-            else:
-                log.debug('@@@ gg ucsc_links - no viewport_tuple')
-        return ret_val
+        errors = list()
+        infile = open(dataset.file_name, "r")
+        header= infile.next() # header
+        for i,row in enumerate(infile):
+            ll = row.strip().split('\t')[1:] # first is alpha feature identifier
+            badvals = []
+            for j,x in enumerate(ll):
+                try:
+                    x = float(x)
+                except:
+                    badval.append('col%d:%s' % (j+1,x))
+            if len(badvals) > 0:
+                errors.append('row %d, %s' % (' '.join(badvals)))
+        return errors
+
     def sniff( self, filename ):
         """
-        Determines whether the file is in gff format
-
-        GFF lines have nine required fields that must be tab-separated.
+        Determines whether the file is in gg format
         """
         f = open(filename,'r')
-        headers = f.readline().split
-        if headers[0].lower() == 'track':
-            headers = f.readline.split()
+        headers = f.readline().split()
+        rows = [f.readline().split()[1:] for x in range(3)] # small sample
         #headers = get_headers( filename, '\t' )
-        try:
-            if len(headers) < 2:
-                return False
-            for hdr in headers:
-                if hdr and hdr[0].startswith( '##gff-version' ) and hdr[0].find( '2' ) < 0:
-                    return False
-                if hdr and hdr[0] and not hdr[0].startswith( '#' ):
-                    if len(hdr) != 9:
-                        return False
-                    try:
-                        int( hdr[3] )
-                        int( hdr[4] )
-                    except:
-                        return False
-                    if hdr[5] != '.':
-                        try:
-                            score = int(hdr[5])
-                        except:
-                            return False
-                        if (score < 0 or score > 1000):
-                            return False
-                    if hdr[6] not in data.valid_strand:
-                        return False
-            return True
-        except:
-            return False
+        for row in rows:
+            try:
+                nums = [float(x) for x in row] # first col has been removed
+            except:
+                return false
+        return true
+

 class rgTabList(Tabular):
-    """
+    """
     for sampleid and
     for featureid lists of exclusions or inclusions in the clean tool
     featureid subsets on statistical criteria -> specialized display such as gg
     """
     file_ext = "rgTList"
+
     def __init__(self, **kwd):
-        """Initialize featurelistt datatype"""
+        """
+        Initialize featurelistt datatype
+        """
         Tabular.__init__( self, **kwd )
         self.column_names = []
+
     def make_html_table( self, dataset, skipchars=[] ):
-        """Create HTML table, used for displaying peek"""
+        """
+        Create HTML table, used for displaying peek
+        """
         out = ['<table cellspacing="0" cellpadding="3">']
         comments = []
         try:
@@ -222,8 +210,9 @@
             out = "Can't create peek %s" % exc
         return out

+
 class rgSampleList(rgTabList):
-    """
+    """
     for sampleid exclusions or inclusions in the clean tool
     output from QC eg excess het, gender error, ibd pair member,eigen outlier,excess mendel errors,...
     since they can be uploaded, should be flexible
@@ -240,9 +229,8 @@
         self.column_names[0] = 'FID'
         self.column_names[1] = 'IID' # this is what Plink wants as at 2009
+
     def sniff(self,filename):
-        """
-        """
         infile = open(dataset.file_name, "r")
         header= infile.next() # header
         if header[0] == 'FID' and header[1] == 'IID':
@@ -264,12 +252,17 @@
         rgTabList.__init__( self, **kwd )
         for i,s in enumerate(['#FeatureId', 'Chr', 'Genpos', 'Mappos']):
             self.column_names[i] = s
+
 class Rgenetics(Html):
     """
-    class to use for rgenetics
+    base class to use for rgenetics datatypes
+    derived from html - composite datatype elements
+    stored in extra files path
     """
-    MetadataElement( name="base_name", desc="base name for all transformed versions of this genetic dataset", default="rgenetics", readonly=True, set_in_upload=True)
+
+    MetadataElement( name="base_name", desc="base name for all transformed versions of this genetic dataset", default="rgenetics",
+                     readonly=True, set_in_upload=True)

     composite_type = 'auto_primary_file'
     allow_datatype_change = False
@@ -279,16 +272,22 @@
         rval = ['<html><head><title>Rgenetics Galaxy Composite Dataset </title></head><p/>']
         rval.append('<div>This composite dataset is composed of the following files:<p/><ul>')
         for composite_name, composite_file in self.get_composite_files( dataset = dataset ).iteritems():
+            fn = composite_name
             opt_text = ''
             if composite_file.optional:
                 opt_text = ' (optional)'
-            rval.append( '<li><a href="%s" type="application/binary">%s</a>%s' % ( composite_name, composite_name, opt_text ) )
+            if composite_file.get('description'):
+                rval.append( '<li><a href="%s" type="application/binary">%s (%s)</a>%s</li>' % ( fn, fn, composite_file.get('description'), opt_text ) )
+            else:
+                rval.append( '<li><a href="%s" type="application/binary">%s</a>%s</li>' % ( fn, fn, opt_text ) )
         rval.append( '</ul></div></html>' )
         return "\n".join( rval )
+
     def regenerate_primary_file(self,dataset):
         """
         cannot do this until we are setting metadata
         """
+        guessmt = {'.log':'text/plain','.ped':'text/plain', '.map':'text/plain','.out':'text/plain','.in':'text/plain'}
         def fix(oldpath,newbase):
             old,e = os.path.splitext(oldpath)
             head,rest = os.path.split(old)
@@ -301,44 +300,45 @@
         efp = dataset.extra_files_path
         flist = os.listdir(efp)
         proper_base = bn
-        rval = ['<html><head><title>Files for Composite Dataset %s</title></head><p/>Comprises the following files:<p/><ul>' % (bn)]
+        rval = ['<html><head><title>Files for Composite Dataset %s</title></head><p/>Composite %s contains the following files:<p/><ul>' % (dataset.name,dataset.name)]
         for i,fname in enumerate(flist):
             newpath = fix(os.path.join(efp,fname),proper_base)
             sfname = os.path.split(newpath)[-1]
-            rval.append( '<li><a href="%s">%s</a>' % ( sfname, sfname ) )
+            f,e = os.path.splitext(fname)
+            mt = guessmt.get(e,'application/binary')
+            rval.append( '<li><a href="%s" mimetype="%s">%s</a></li>' % ( sfname, mt, sfname) )
         rval.append( '</ul></html>' )
         f = file(dataset.file_name,'w')
         f.write("\n".join( rval ))
         f.write('\n')
         f.close()
+
     def set_meta( self, dataset, **kwd ):
+        """ for lped/pbed eg
+        """
+        Html.set_meta( self, dataset, **kwd )
         if kwd.get('overwrite') == False:
-            #log.debug('@@@ rgenetics set_meta called with overwrite = False')
+            if verbose:
+                gal_Log.debug('@@@ rgenetics set_meta called with overwrite = False')
             return True
         try:
             efp = dataset.extra_files_path
         except:
-            #log.debug('@@@rgenetics set_meta failed %s - dataset %s has no efp ?' % (sys.exc_info()[0], dataset.name))
+            if verbose:
+                gal_Log.debug('@@@rgenetics set_meta failed %s - dataset %s has no efp ?' % (sys.exc_info()[0], dataset.name))
            return False
         try:
             flist = os.listdir(efp)
         except:
-            #log.debug('@@@rgenetics set_meta failed %s - dataset %s has no efp ?' % (sys.exc_info()[0],dataset.name))
+            if verbose: gal_Log.debug('@@@rgenetics set_meta failed %s - dataset %s has no efp ?' % (sys.exc_info()[0],dataset.name))
            return False
         if len(flist) == 0:
-            #log.debug('@@@rgenetics set_meta failed - %s efp %s is empty?' % (dataset.name,efp))
+            if verbose:
+                gal_Log.debug('@@@rgenetics set_meta failed - %s efp %s is empty?' % (dataset.name,efp))
            return False
-        bn = None
-        for f in flist:
-            n,e = os.path.splitext(f)[0]
-            if (not bn) and e in ('.ped','.map','.bim','.fam'):
-                bn = n
-                dataset.metadata.base_name = bn
-        if not bn:
-            bn = '?'
         self.regenerate_primary_file(dataset)
         if not dataset.info:
             dataset.info = 'Galaxy genotype datatype object'
@@ -346,22 +346,23 @@
             dataset.blurb = 'Composite file - Rgenetics Galaxy toolkit'
         return True

+
 class SNPMatrix(Rgenetics):
     """
-    fake class to distinguish different species of Rgenetics data collections
+    BioC SNPMatrix Rgenetics data collections
     """
     file_ext="snpmatrix"

-    def set_peek( self, dataset, is_multi_byte=False ):
+    def set_peek( self, dataset, **kwd ):
         if not dataset.dataset.purged:
             dataset.peek = "Binary RGenetics file"
             dataset.blurb = data.nice_size( dataset.get_size() )
         else:
             dataset.peek = 'file does not exist'
             dataset.blurb = 'file purged from disk'
+
     def sniff(self,filename):
-        """
-        need to check the file header hex code
+        """ need to check the file header hex code
         """
         infile = open(dataset.file_name, "b")
         head = infile.read(16)
@@ -371,9 +372,10 @@
         else:
             return True

+
 class Lped(Rgenetics):
     """
-    fake class to distinguish different species of Rgenetics data collections
+    linkage pedigree (ped,map) Rgenetics data collections
     """
     file_ext="lped"

@@ -382,25 +384,24 @@
         self.add_composite_file( '%s.ped', description = 'Pedigree File', substitute_name_with_metadata = 'base_name', is_binary = True )
         self.add_composite_file( '%s.map', description = 'Map File', substitute_name_with_metadata = 'base_name', is_binary = True )

+
 class Pphe(Rgenetics):
     """
-    fake class to distinguish different species of Rgenetics data collections
+    Plink phenotype file - header must have FID\tIID... Rgenetics data collections
     """
     file_ext="pphe"

     def __init__( self, **kwd ):
         Rgenetics.__init__(self, **kwd)
-        self.add_composite_file( '%s.pphe', description = 'Plink Phenotype File', substitute_name_with_metadata = 'base_name' )
+        self.add_composite_file( '%s.pphe', description = 'Plink Phenotype File', substitute_name_with_metadata = 'base_name', is_binary = True )

-class Lmap(Rgenetics):
-    """
-    fake class to distinguish different species of Rgenetics data collections
-    """
-    file_ext="lmap"
+
+
 class Fphe(Rgenetics):
     """
-    fake class to distinguish different species of Rgenetics data collections
+    fbat pedigree file - mad format with ! as first char on header row
+    Rgenetics data collections
     """
     file_ext="fphe"

@@ -410,7 +411,7 @@

 class Phe(Rgenetics):
     """
-    fake class to distinguish different species of Rgenetics data collections
+    Phenotype file
     """
     file_ext="phe"

@@ -418,9 +419,12 @@
         Rgenetics.__init__(self, **kwd)
         self.add_composite_file( '%s.phe', description = 'Phenotype File', substitute_name_with_metadata = 'base_name' )

+
+
 class Fped(Rgenetics):
     """
-    fake class to distinguish different species of Rgenetics data collections
+    FBAT pedigree format - single file, map is header row of rs numbers.  Strange.
+    Rgenetics data collections
     """
     file_ext="fped"

@@ -428,9 +432,10 @@
         Rgenetics.__init__(self, **kwd)
         self.add_composite_file( '%s.fped', description = 'FBAT format pedfile', substitute_name_with_metadata = 'base_name' )

+
 class Pbed(Rgenetics):
     """
-    fake class to distinguish different species of Rgenetics data collections
+    Plink Binary compressed 2bit/geno Rgenetics data collections
     """
     file_ext="pbed"

@@ -442,7 +447,9 @@

 class Eigenstratgeno(Rgenetics):
     """
-    fake class to distinguish different species of Rgenetics data collections
+    Eigenstrat format - may be able to get rid of this
+    if we move to shellfish
+    Rgenetics data collections
     """
     file_ext="eigenstratgeno"

@@ -451,10 +458,13 @@
         self.add_composite_file( '%s.eigenstratgeno', substitute_name_with_metadata = 'base_name', is_binary = True )
         self.add_composite_file( '%s.ind', substitute_name_with_metadata = 'base_name', is_binary = True )
         self.add_composite_file( '%s.map', substitute_name_with_metadata = 'base_name', is_binary = True )
+
+
 class Eigenstratpca(Rgenetics):
     """
-    fake class to distinguish different species of Rgenetics data collections
+    Eigenstrat PCA file for case control adjustment
+    Rgenetics data collections
     """
     file_ext="eigenstratpca"

@@ -462,18 +472,21 @@
         Rgenetics.__init__(self, **kwd)
         self.add_composite_file( '%s.eigenstratpca', description = 'Eigenstrat PCA file', substitute_name_with_metadata = 'base_name' )

+
 class Snptest(Rgenetics):
     """
-    fake class to distinguish different species of Rgenetics data collections
+    BioC snptest Rgenetics data collections
     """
     file_ext="snptest"

+
 class Pheno(Tabular):
     """
     base class for pheno files
     """
     file_ext = 'pheno'

+
 class RexpBase( Html ):
     """
     base class for BioC data structures in Galaxy
@@ -492,16 +505,19 @@
     composite_type = 'auto_primary_file'
     allow_datatype_change = False

+
     def __init__( self, **kwd ):
         Html.__init__(self,**kwd)
         self.add_composite_file( '%s.pheno', description = 'Phenodata tab text file', substitute_name_with_metadata = 'base_name', is_binary=True)
+
     def generate_primary_file( self, dataset = None ):
-        """
+        """
         This is called only at upload to write the html file
         cannot rename the datasets here - they come with the default unfortunately
         """
         return '<html><head></head><body>AutoGenerated Primary File for Composite Dataset</body></html>'
+
     def get_phecols(self, phenolist=[], maxConc=20):
         """
         sept 2009: cannot use whitespace to split - make a more complex structure here
@@ -527,7 +543,7 @@
             else:
                 for col,code in enumerate(row): # keep column order correct
                     if col >= totcols:
-                        log.warning('### get_phecols error in pheno file - row %d col %d (%s) longer than header %s' % (nrows, col, row, head))
+                        gal_Log.warning('### get_phecols error in pheno file - row %d col %d (%s) longer than header %s' % (nrows, col, row, head))
                     else:
                         concordance[col].setdefault(code,0) # first one is zero
                         concordance[col][code] += 1
@@ -573,6 +589,8 @@
             res = [('no usable phenotype columns found',[('?',0),]),]
         return res

+
+
     def get_pheno(self,dataset):
         """
         expects a .pheno file in the extra_files_dir - ugh
@@ -591,12 +609,12 @@
         else:
             p = []
         return '\n'.join(p)
-    def set_peek( self, dataset, is_multi_byte=False ):
+
+    def set_peek( self, dataset, **kwd ):
         """
         expects a .pheno file in the extra_files_dir - ugh
         note that R is wierd and does not include the row.name in
-        the header. why?
-        """
+        the header. why?"""
         if not dataset.dataset.purged:
             pp = os.path.join(dataset.extra_files_path,'%s.pheno' % dataset.metadata.base_name)
             try:
@@ -608,14 +626,18 @@
         else:
             dataset.peek = 'file does not exist\n'
             dataset.blurb = 'file purged from disk'
+
     def get_peek( self, dataset ):
-        """expects a .pheno file in the extra_files_dir - ugh"""
+        """
+        expects a .pheno file in the extra_files_dir - ugh
+        """
         pp = os.path.join(dataset.extra_files_path,'%s.pheno' % dataset.metadata.base_name)
         try:
             p = file(pp,'r').readlines()
         except:
             p = ['##failed to find %s' % pp]
         return ''.join(p[:5])
+
     def get_file_peek(self,filename):
         """
         can't really peek at a filename - need the extra_files_path and such?
@@ -626,8 +648,10 @@
         except:
             pass
         return ''.join(h[:5])
+
     def regenerate_primary_file(self,dataset):
-        """cannot do this until we are setting metadata
+        """
+        cannot do this until we are setting metadata
         """
         bn = dataset.metadata.base_name
         flist = os.listdir(dataset.extra_files_path)
@@ -640,28 +664,34 @@
         f.write("\n".join( rval ))
         f.write('\n')
         f.close()
+
     def init_meta( self, dataset, copy_from=None ):
-        """Add metadata elements"""
         if copy_from:
             dataset.metadata = copy_from.metadata
+
     def set_meta( self, dataset, **kwd ):
+        """
+        NOTE we apply the tabular machinary to the phenodata extracted from a BioC eSet or affybatch.
+        """
+        Html.set_meta(self, dataset, **kwd)
         try:
             flist = os.listdir(dataset.extra_files_path)
         except:
-            #log.debug('@@@rexpression set_meta failed - no dataset?')
+            if verbose:
+                gal_Log.debug('@@@rexpression set_meta failed - no dataset?')
             return False
-        bn = None
-        for f in flist:
-            n = os.path.splitext(f)[0]
-            if not bn:
-                bn = n
-                dataset.metadata.base_name = bn
+        bn = dataset.metadata.base_name
+        if not bn:
+            for f in flist:
+                n = os.path.splitext(f)[0]
+                bn = n
+                dataset.metadata.base_name = bn
         if not bn:
             bn = '?'
+            dataset.metadata.base_name = bn
         pn = '%s.pheno' % (bn)
         pp = os.path.join(dataset.extra_files_path,pn)
         dataset.metadata.pheno_path=pp
@@ -680,7 +710,7 @@
             dataset.metadata.column_names = []
             dataset.metadata.columns = 0
             dataset.peek = 'No pheno file found'
-        if len(pf) > 1:
+        if pf and len(pf) > 1:
             dataset.metadata.pheCols = self.get_phecols(phenolist=pf)
         else:
             dataset.metadata.pheCols = [('','No useable phenotypes found',False),]
@@ -690,8 +720,11 @@
         if not dataset.blurb:
             dataset.blurb = 'R loadable BioC expression object for the Rexpression Galaxy toolkit'
         return True
+
     def make_html_table( self, pp='nothing supplied from peek\n'):
-        """Create HTML table, used for displaying peek"""
+        """
+        Create HTML table, used for displaying peek
+        """
         out = ['<table cellspacing="0" cellpadding="3">',]
         p = pp.split('\n')
         try:
@@ -712,25 +745,37 @@
         except Exception, exc:
             out = "Can't create html table %s" % str( exc )
         return out
+
     def display_peek( self, dataset ):
-        """Returns formatted html of peek"""
+        """
+        Returns formatted html of peek
+        """
         out=self.make_html_table(dataset.peek)
         return out
+
     def get_mime(self):
-        """Returns the mime type of the datatype"""
+        """
+        Returns the mime type of the datatype
+        """
        return 'text/html'

+
 class Affybatch( RexpBase ):
-    """derived class for BioC data structures in Galaxy """
+    """
+    derived class for BioC data structures in Galaxy
+    """
+
     file_ext = "affybatch"

     def __init__( self, **kwd ):
         RexpBase.__init__(self, **kwd)
         self.add_composite_file( '%s.affybatch', description = 'AffyBatch R object saved to file', substitute_name_with_metadata = 'base_name', is_binary=True )
-
+
 class Eset( RexpBase ):
-    """derived class for BioC data structures in Galaxy """
+    """
+    derived class for BioC data structures in Galaxy
+    """
     file_ext = "eset"

     def __init__( self, **kwd ):
@@ -738,8 +783,11 @@
         self.add_composite_file( '%s.eset', description = 'ESet R object saved to file', substitute_name_with_metadata = 'base_name', is_binary = True )

+
 class MAlist( RexpBase ):
-    """derived class for BioC data structures in Galaxy """
+    """
+    derived class for BioC data structures in Galaxy
+    """
     file_ext = "malist"

     def __init__( self, **kwd ):
@@ -747,6 +795,8 @@
         self.add_composite_file( '%s.malist', description = 'MAlist R object saved to file', substitute_name_with_metadata = 'base_name', is_binary = True )

+
 if __name__ == '__main__':
     import doctest, sys
     doctest.testmod(sys.modules[__name__])
+
diff -r 9701e5ee128d -r 70930ea26347 lib/galaxy/tools/parameters/grouping.py
--- a/lib/galaxy/tools/parameters/grouping.py	Wed Mar 10 14:25:34 2010 -0500
+++ b/lib/galaxy/tools/parameters/grouping.py	Wed Mar 10 16:09:37 2010 -0500
@@ -99,6 +99,10 @@
         self.default_file_type = 'txt'
         self.file_type_to_ext = { 'auto':self.default_file_type }
         self.metadata_ref = 'files_metadata'
+    def get_file_base_name( self, context ):
+        log.debug('### uploadDataset get base name context = %s' % str(context))
+        fd = context.get('files_metadata|base_name','?')
+        return fd
     def get_file_type( self, context ):
         return context.get( self.file_type_name, self.default_file_type )
     def get_datatype_ext( self, trans, context ):
@@ -291,15 +295,13 @@
                 temp_name, is_multi_byte = sniff.stream_to_file( StringIO.StringIO( d_type.generate_primary_file( dataset ) ), prefix='upload_auto_primary_file' )
                 dataset.primary_file = temp_name
                 dataset.space_to_tab = False
-                dataset.precreated_name = dataset.name = 'Uploaded Composite Dataset (%s)' % ( file_type )
+                dataset.precreated_name = dataset.name = dataset.metadata['base_name'] # was 'Uploaded Composite Dataset (%s)' % ( file_type )
             else:
                 file_bunch, warnings = get_one_filename( groups_incoming[ 0 ] )
-                if dataset.datatype.composite_type:
-                    precreated_name = 'Uploaded Composite Dataset (%s)' % ( file_type )
                 writable_files_offset = 1
                 dataset.primary_file = file_bunch.path
                 dataset.space_to_tab = file_bunch.space_to_tab
-                dataset.precreated_name = file_bunch.precreated_name
+                dataset.precreated_name = dataset.metadata['base_name'] # file_bunch.precreated_name
                 dataset.name = file_bunch.precreated_name
                 dataset.warnings.extend( file_bunch.warnings )
             if dataset.primary_file is None:#remove this before finish, this should create an empty dataset
diff -r 9701e5ee128d -r 70930ea26347 lib/galaxy/web/controllers/dataset.py
--- a/lib/galaxy/web/controllers/dataset.py	Wed Mar 10 14:25:34 2010 -0500
+++ b/lib/galaxy/web/controllers/dataset.py	Wed Mar 10 16:09:37 2010 -0500
@@ -1,4 +1,4 @@
-import logging, os, string, shutil, re, socket, mimetypes, smtplib, urllib
+import logging, os, string, shutil, re, socket, mimetypes, smtplib, urllib, tempfile, zipfile, glob

 from galaxy.web.base.controller import *
 from galaxy.web.framework.helpers import time_ago, iff, grids
@@ -7,11 +7,30 @@
 from galaxy.datatypes.display_applications.util import encode_dataset_user, decode_dataset_user

 from email.MIMEText import MIMEText
-
 import pkg_resources; pkg_resources.require( "Paste" )
 import paste.httpexceptions

+tmpd = tempfile.mkdtemp()
+comptypes=[]
+ziptype = '32'
+tmpf = os.path.join( tmpd, 'compression_test.zip' )
+try:
+    archive = zipfile.ZipFile( tmpf, 'w', zipfile.ZIP_DEFLATED, True )
+    archive.close()
+    comptypes.append( 'zip' )
+    ziptype = '64'
+except RuntimeError:
+    log.exception( "Compression error when testing zip compression. This option will be disabled for library downloads." )
+except (TypeError, zipfile.LargeZipFile): # ZIP64 is only in Python2.5+.  Remove TypeError when 2.4 support is dropped
+    log.warning( 'Max zip file size is 2GB, ZIP64 not supported' )
+    comptypes.append( 'zip' )
+try:
+    os.unlink( tmpf )
+except OSError:
+    pass
+os.rmdir( tmpd )
+
 log = logging.getLogger( __name__ )

 error_report_template = """
@@ -182,6 +201,97 @@
             return 'This link may not be followed from within Galaxy.'

     @web.expose
+    def archive_composite_dataset( self, trans, data=None, **kwd ):
+        # save a composite object into a compressed archive for downloading
+        params = util.Params( kwd )
+        if (params.do_action == None):
+            params.do_action = 'zip' # default
+        msg = util.restore_text( params.get( 'msg', '' ) )
+        messagetype = params.get( 'messagetype', 'done' )
+        if not data:
+            msg = "You must select at least one dataset"
+            messagetype = 'error'
+        else:
+            error = False
+            try:
+                if (params.do_action == 'zip'):
+                    # Can't use mkstemp - the file must not exist first
+                    tmpd = tempfile.mkdtemp()
+                    tmpf = os.path.join( tmpd, 'library_download.' + params.do_action )
+                    if ziptype == '64':
+                        archive = zipfile.ZipFile( tmpf, 'w', zipfile.ZIP_DEFLATED, True )
+                    else:
+                        archive = zipfile.ZipFile( tmpf, 'w', zipfile.ZIP_DEFLATED )
+                    archive.add = lambda x, y: archive.write( x, y.encode('CP437') )
+                elif params.do_action == 'tgz':
+                    archive = util.streamball.StreamBall( 'w|gz' )
+                elif params.do_action == 'tbz':
+                    archive = util.streamball.StreamBall( 'w|bz2' )
+            except (OSError, zipfile.BadZipFile):
+                error = True
+                log.exception( "Unable to create archive for download" )
+                msg = "Unable to create archive for %s for download, please report this error" % data.name
+                messagetype = 'error'
+            if not error:
+                current_user_roles = trans.get_current_user_roles()
+                ext = data.extension
+                path = data.file_name
+                fname = os.path.split(path)[-1]
+                basename = data.metadata.base_name
+                efp = data.extra_files_path
+                htmlname = os.path.splitext(data.name)[0]
+                if not htmlname.endswith(ext):
+                    htmlname = '%s_%s' % (htmlname,ext)
+                archname = '%s.html' % htmlname # fake the real nature of the html file
+                try:
+                    archive.add(data.file_name,archname)
+                except IOError:
+                    error = True
+                    log.exception( "Unable to add composite parent %s to temporary library download archive" % data.file_name)
+                    msg = "Unable to create archive for download, please report this error"
+                    messagetype = 'error'
+                flist = glob.glob(os.path.join(efp,'*.*')) # glob returns full paths
+                for fpath in flist:
+                    efp,fname = os.path.split(fpath)
+                    try:
+                        archive.add( fpath,fname )
+                    except IOError:
+                        error = True
+                        log.exception( "Unable to add %s to temporary library download archive" % fname)
+                        msg = "Unable to create archive for download, please report this error"
+                        messagetype = 'error'
+                        continue
+                if not error:
+                    if params.do_action == 'zip':
+                        archive.close()
+                        tmpfh = open( tmpf )
+                        # clean up now
+                        try:
+                            os.unlink( tmpf )
+                            os.rmdir( tmpd )
+                        except OSError:
+                            error = True
+                            msg = "Unable to remove temporary library download archive and directory"
+                            log.exception( msg )
+                            messagetype = 'error'
+                        if not error:
+                            trans.response.set_content_type( "application/x-zip-compressed" )
+                            trans.response.headers[ "Content-Disposition" ] = "attachment; filename=GalaxyCompositeObject.zip"
+                            return tmpfh
+                    else:
+                        trans.response.set_content_type( "application/x-tar" )
+                        outext = 'tgz'
+                        if params.do_action == 'tbz':
+                            outext = 'tbz'
+                        trans.response.headers[ "Content-Disposition" ] = "attachment; filename=GalaxyLibraryFiles.%s" % outext
+                        archive.wsgi_status = trans.response.wsgi_status()
+                        archive.wsgi_headeritems = trans.response.wsgi_headeritems()
+                        return archive.stream
+        return trans.show_error_message( msg )
+
+
+
+    @web.expose
     def display(self, trans, dataset_id=None, preview=False, filename=None, to_ext=None, **kwd):
         """Catches the dataset id and displays file contents as directed"""
@@ -219,15 +329,19 @@
         trans.log_event( "Display dataset id: %s" % str( dataset_id ) )
         if to_ext: # Saving the file
-            trans.response.headers['Content-Length'] = int( os.stat( data.file_name ).st_size )
-            if to_ext[0] != ".":
-                to_ext = "." + to_ext
-            valid_chars = '.,^_-()[]0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
-            fname = data.name
-            fname = ''.join(c in valid_chars and c or '_' for c in fname)[0:150]
-            trans.response.headers["Content-Disposition"] = "attachment; filename=GalaxyHistoryItem-%s-[%s]%s" % (data.hid, fname, to_ext)
-            return open( data.file_name )
-
+            composite_extensions = trans.app.datatypes_registry.get_composite_extensions( )
+            composite_extensions.append('html')
+            if data.ext in composite_extensions:
+                return self.archive_composite_dataset( trans, data, **kwd )
+            else:
+                trans.response.headers['Content-Length'] = int( os.stat( data.file_name ).st_size )
+                if to_ext[0] != ".":
+                    to_ext = "." + to_ext
+                valid_chars = '.,^_-()[]0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
+                fname = data.name
+                fname = ''.join(c in valid_chars and c or '_' for c in fname)[0:150]
+                trans.response.headers["Content-Disposition"] = "attachment; filename=GalaxyHistoryItem-%s-[%s]%s" % (data.hid, fname, to_ext)
+                return open( data.file_name )
         if os.path.exists( data.file_name ):
             max_peek_size = 1000000 # 1 MB
             if preview and os.stat( data.file_name ).st_size > max_peek_size:
@@ -367,7 +481,10 @@
             raise paste.httpexceptions.HTTPRequestRangeNotSatisfiable( "Invalid reference dataset id: %s." % str( dataset_id ) )
         if 'display_url' not in kwd or 'redirect_url' not in kwd:
             return trans.show_error_message( 'Invalid parameters specified for "display at" link, please contact a Galaxy administrator' )
-        redirect_url = kwd['redirect_url'] % urllib.quote_plus( kwd['display_url'] )
+        try:
+            redirect_url = kwd['redirect_url'] % urllib.quote_plus( kwd['display_url'] )
+        except:
+            redirect_url = kwd['redirect_url'] # not all will need custom text
         current_user_roles = trans.get_current_user_roles()
         if trans.app.security_agent.dataset_is_public( data.dataset ):
             return trans.response.send_redirect( redirect_url ) # anon access already permitted by rbac
@@ -591,4 +708,3 @@
         status = SUCCESS
         message = done_msg
     return status, message
-    
\ No newline at end of file
diff -r 9701e5ee128d -r 70930ea26347 lib/galaxy/web/controllers/library_common.py
--- a/lib/galaxy/web/controllers/library_common.py	Wed Mar 10 14:25:34 2010 -0500
+++ b/lib/galaxy/web/controllers/library_common.py	Wed Mar 10 16:09:37 2010 -0500
@@ -1098,6 +1098,7 @@
     def download_dataset_from_folder( self, trans, cntrller, id, library_id=None, **kwd ):
         """Catches the dataset id and displays file contents as directed"""
         show_deleted = util.string_as_bool( kwd.get( 'show_deleted', False ) )
+        params = util.Params( kwd )
         use_panels = util.string_as_bool( params.get( 'use_panels', False ) )
         ldda = trans.sa_session.query( trans.app.model.LibraryDatasetDatasetAssociation ).get( trans.security.decode_id( id ) )
         if not ldda.dataset:
diff -r 9701e5ee128d -r 70930ea26347 test/base/twilltestcase.py
--- a/test/base/twilltestcase.py	Wed Mar 10 14:25:34 2010 -0500
+++ b/test/base/twilltestcase.py	Wed Mar 10 16:09:37 2010 -0500
@@ -23,6 +23,8 @@
 log = logging.getLogger( __name__ )

 class TwillTestCase( unittest.TestCase ):
+    composite_extensions = ['html','lped','pbed','fped','pphe','eigenstratgeno','eset','affybatch','malist','test-data' ]
+
     def setUp( self ):
         # Security helper
@@ -61,9 +63,10 @@
                 else:
                     files_differ = True
             if files_differ:
-                allowed_diff_count = attributes.get( 'lines_diff', 0 )
+                allowed_diff_count = int(attributes.get( 'lines_diff', 0 ))
                 diff = list( difflib.unified_diff( local_file, history_data, "local_file", "history_data" ) )
                 diff_lines = get_lines_diff( diff )
+                log.debug('## files diff on %s and %s lines_diff=%d, found diff = %d' % (file1,file2,allowed_diff_count,diff_lines))
                 if diff_lines > allowed_diff_count:
                     diff_slice = diff[0:40]
                     #FIXME: This pdf stuff is rather special cased and has not been updated to consider lines_diff
@@ -75,7 +78,7 @@
                     # PDF files contain creation dates, modification dates, ids and descriptions that change with each
                     # new file, so we need to handle these differences.  As long as the rest of the PDF file does
                     # not differ we're ok.
-                    valid_diff_strs = [ 'description', 'createdate', 'creationdate', 'moddate', 'id' ]
+                    valid_diff_strs = [ 'description', 'createdate', 'creationdate', 'moddate', 'id', 'producer', 'creator' ]
                     valid_diff = False
                     for line in diff_slice:
                         # Make sure to lower case strings before checking.
@@ -109,7 +112,7 @@
             attributes = {}
         if attributes.get( 'sort', False ):
             history_data.sort()
-        lines_diff = attributes.get( 'lines_diff', 0 )
+        lines_diff = int(attributes.get( 'lines_diff', 0 ))
         line_diff_count = 0
         diffs = []
         for i in range( len( history_data ) ):
@@ -194,36 +197,7 @@
             raise AssertionError, "Invalid hid (%s) created when pasting %s" % ( hid, url_paste )
         # Wait for upload processing to finish (TODO: this should be done in each test case instead)
         self.wait()
-    def upload_composite_datatype_file( self, ftype, ped_file='', map_file='', bim_file='', bed_file='', fam_file='', dbkey='unspecified (?)', base_name='rgenetics' ):
-        """Tests uploading either of 2 different composite data types ( lped and pbed )"""
-        self.visit_url( "%s/tool_runner/index?tool_id=upload1" % self.url )
-        # Handle refresh_on_change
-        self.refresh_form( "file_type", ftype )
-        tc.fv( "1", "dbkey", dbkey )
-        tc.fv( "1", "files_metadata|base_name", base_name )
-        if ftype == 'lped':
-            # lped data types include a ped_file and a map_file
-            ped_file = self.get_filename( ped_file )
-            tc.formfile( "1", "files_0|file_data", ped_file )
-            map_file = self.get_filename( map_file )
-            tc.formfile( "1", "files_1|file_data", map_file )
-        elif ftype == 'pbed':
-            # pbed data types include a bim_file, a bed_file and a fam_file
-            bim_file = self.get_filename( bim_file )
-            tc.formfile( "1", "files_0|file_data", bim_file )
-            bed_file = self.get_filename( bed_file )
-            tc.formfile( "1", "files_1|file_data", bed_file )
-            fam_file = self.get_filename( fam_file )
-            tc.formfile( "1", "files_2|file_data", fam_file )
-        else:
-            raise AssertionError, "Unsupported composite data type (%s) received, currently only lped and pbed data types are supported." % ftype
-        tc.submit( "runtool_btn" )
-        self.check_page_for_string( 'The following job has been succesfully added to the queue:' )
-        check_str = 'Uploaded Composite Dataset (%s)' % ftype
-        self.check_page_for_string( check_str )
-        # Wait for upload processing to finish (TODO: this should be done in each test case instead)
-        self.wait()
-        self.check_history_for_string( check_str )
+
     # Functions associated with histories
     def check_history_for_errors( self ):
         """Raises an exception if there are errors in a history"""
@@ -672,7 +646,7 @@
     def verify_composite_datatype_file_content( self, file_name, hda_id, base_name = None, attributes = None ):
         local_name = self.get_filename( file_name )
         if base_name is None:
-            base_name = file_name
+            base_name = os.path.split(file_name)[-1]
         temp_name = self.get_filename( '%s_temp' % file_name ) #This is a terrible way to generate a temp name
         self.visit_url( "%s/datasets/%s/display/%s" % ( self.url, self.security.encode_id( hda_id ), base_name ) )
         data = self.last_page()
@@ -915,9 +889,14 @@
             # To help with debugging a tool, print out the form controls when the test fails
             print "form '%s' contains the following controls ( note the values )" % f.name
             control_names = []
+            hidden_control_names = [] # cannot change these, so ignore or many complex page tool tests will fail
+            hc_prefix = '<HiddenControl('
             for i, control in enumerate( f.controls ):
-                print "control %d: %s" % ( i, str( control ) )
-                try:
+                print "control %d: %s" % ( i, str( control ) )
+                if hc_prefix in str(control):
+                    hidden_control_names.append(control.name) # cannot do much with these
+                else:
+                    try:
                     #check if a repeat element needs to be added
                     if control.name not in kwd and control.name.endswith( '_add' ):
                         #control name doesn't exist, could be repeat
@@ -946,12 +925,14 @@
                         # Submit for refresh
                         tc.submit( '___refresh_grouping___' )
                         return self.submit_form( form_no=form_no, button=button, **kwd )
-                except Exception, e:
+                    except Exception, e:
                     log.debug( "In submit_form, continuing, but caught exception: %s" % str( e ) )
                     continue
-                control_names.append( control.name )
+                    control_names.append( control.name )
             # No refresh_on_change attribute found in current form, so process as usual
             for control_name, control_value in kwd.items():
+                if control_name in hidden_control_names:
+                    continue # these cannot be handled safely - cause the test to barf out
                 if not isinstance( control_value, list ):
                     control_value = [ control_value ]
                 try:
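
A few of the new genetics.py methods will not run as committed: GenomeGraphs.validate() appends to "badval" where the list is named "badvals" and feeds one value to a two-placeholder format string, GenomeGraphs.sniff() returns the undefined names "false"/"true", and rgSampleList.sniff() and SNPMatrix.sniff() read dataset.file_name although only filename is in scope (and "b" is not a valid open() mode). A corrected standalone sketch of the check validate() appears to intend - column 1 is a marker id, every later column must parse as a number - using a hypothetical helper name rather than the committed method:

    def validate_gg(path):
        """Return 'row N: colM:value' messages for non-numeric cells."""
        errors = []
        infile = open(path, 'r')
        infile.next() # skip the header/first row
        for i, row in enumerate(infile):
            badvals = []
            for j, x in enumerate(row.strip().split('\t')[1:]): # col 1 is the marker id
                try:
                    float(x)
                except ValueError:
                    badvals.append('col%d:%s' % (j + 2, x))
            if badvals:
                errors.append('row %d: %s' % (i + 2, ' '.join(badvals)))
        infile.close()
        return errors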
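The download side of the changeset hangs on one pattern: probe once at import time for ZIP64 support (the module-level compression_test.zip block added to dataset.py), then pack the composite primary html file plus everything under the dataset's extra_files_path into a single archive. Note the probe calls log.exception()/log.warning() a few lines before log is assigned, so either except branch would raise a NameError. A minimal sketch of the archiving flow under those assumptions; zip_composite and its arguments are hypothetical, not the controller's actual signature:

    import glob, os, tempfile, zipfile

    def zip_composite(primary_path, extra_files_path, base_name):
        tmpd = tempfile.mkdtemp() # can't use mkstemp - the file must not exist yet
        tmpf = os.path.join(tmpd, 'composite_download.zip')
        try:
            # 4th argument allowZip64=True lifts the 2GB limit (Python 2.5+ only)
            archive = zipfile.ZipFile(tmpf, 'w', zipfile.ZIP_DEFLATED, True)
        except (TypeError, RuntimeError):
            archive = zipfile.ZipFile(tmpf, 'w', zipfile.ZIP_DEFLATED)
        # store the primary file under an .html name, as the patch does
        archive.write(primary_path, '%s.html' % base_name)
        for fpath in glob.glob(os.path.join(extra_files_path, '*.*')): # full paths
            archive.write(fpath, os.path.split(fpath)[-1])
        archive.close()
        return tmpf # caller streams this back, then unlinks the file and dir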
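In twilltestcase.py the lines_diff attribute is now coerced with int() because test attribute values arrive as strings from the tool test definitions. A rough standalone equivalent of the comparison being made, using difflib directly; files_within_allowed_diff is a stand-in for the testcase's get_lines_diff helper, whose exact counting may differ:

    import difflib

    def files_within_allowed_diff(local_lines, history_lines, lines_diff='0'):
        allowed = int(lines_diff) # attribute values come in as strings
        diff = list(difflib.unified_diff(local_lines, history_lines,
                                         'local_file', 'history_data'))
        # skip the two ---/+++ header lines, then count changed lines
        changed = [l for l in diff[2:] if l.startswith(('+', '-'))]
        return len(changed) <= allowed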