details: http://www.bx.psu.edu/hg/galaxy/rev/79fe45b43b4e changeset: 2886:79fe45b43b4e user: Ross Lazarus <ross.lazarus@gmail.com> date: Thu Oct 15 07:54:10 2009 -0400 description: remove debug and clean up genetics.py 1 file(s) affected in this change: lib/galaxy/datatypes/genetics.py diffs (504 lines): diff -r 7f5489d7c3d8 -r 79fe45b43b4e lib/galaxy/datatypes/genetics.py --- a/lib/galaxy/datatypes/genetics.py Wed Oct 14 18:43:38 2009 -0400 +++ b/lib/galaxy/datatypes/genetics.py Thu Oct 15 07:54:10 2009 -0400 @@ -1,463 +1,3 @@ -<<<<<<< local -""" -rgenetics datatypes -Use at your peril -Ross Lazarus -for the rgenetics and galaxy projects - -genome graphs datatypes derived from Interval datatypes -genome graphs datasets have a header row with appropriate columnames -The first column is always the marker - eg columname = rs, first row= rs12345 if the rows are snps -subsequent row values are all numeric ! Will fail if any non numeric (eg '+' or 'NA') values -ross lazarus for rgenetics -august 20 2007 -""" - -import logging, os, sys, time, tempfile, shutil -import data -from galaxy import util -from cgi import escape -import urllib -from galaxy.web import url_for -from galaxy.datatypes import metadata -from galaxy.datatypes.metadata import MetadataElement -from galaxy.datatypes.data import Text -from galaxy.datatypes.tabular import Tabular -from galaxy.datatypes.images import Html - -log = logging.getLogger(__name__) - - - -class GenomeGraphs( Tabular ): - """Tab delimited data containing a marker id and any number of numeric values""" - - """Add metadata elements""" - MetadataElement( name="markerCol", default=1, desc="Marker ID column", param=metadata.ColumnParameter ) - MetadataElement( name="columns", default=3, desc="Number of columns", readonly=True ) - MetadataElement( name="column_types", default=[], desc="Column types", readonly=True, visible=False ) - file_ext = 'gg' - - def __init__(self, **kwd): - """Initialize gg datatype, by adding UCSC display apps""" - Tabular.__init__(self, **kwd) - self.add_display_app ( 'ucsc', 'Genome Graph', 'as_ucsc_display_file', 'ucsc_links' ) - - def set_peek( self, dataset ): - """Set the peek and blurb text""" - if not dataset.dataset.purged: - dataset.peek = data.get_file_peek( dataset.file_name ) - dataset.blurb = util.commaify( str( data.get_line_count( dataset.file_name ) ) ) + " rows" - #i don't think set_meta should not be called here, it should be called separately - self.set_meta( dataset ) - else: - dataset.peek = 'file does not exist' - dataset.blurb = 'file purged from disk' - - def get_estimated_display_viewport( self, dataset ): - """Return a chrom, start, stop tuple for viewing a file.""" - raise notImplemented - - def as_ucsc_display_file( self, dataset, **kwd ): - """Returns file""" - return file(dataset.file_name,'r') - - def ucsc_links( self, dataset, type, app, base_url ): - """ from the ever-helpful angie hinrichs angie@soe.ucsc.edu - a genome graphs call looks like this - http://genome.ucsc.edu/cgi-bin/hgGenome?clade=mammal&org=Human&db=hg18&hgGenome_dataSetName=dname - &hgGenome_dataSetDescription=test&hgGenome_formatType=best%20guess&hgGenome_markerType=best%20guess - &hgGenome_columnLabels=best%20guess&hgGenome_maxVal=&hgGenome_labelVals= - &hgGenome_maxGapToFill=25000000&hgGenome_uploadFile=http://galaxy.esphealth.org/datasets/333/display/index - &hgGenome_doSubmitUpload=submit - Galaxy gives this for an interval file - http://genome.ucsc.edu/cgi-bin/hgTracks?db=hg18&position=chr1:1-1000&hgt.customText= - http%3A%2F%2Fgalaxy.esphealth.org%2Fdisplay_as%3Fid%3D339%26display_app%3Ducsc - """ - ret_val = [] - ggtail = '&hgGenome_doSubmitUpload=submit' - if not dataset.dbkey: - dataset.dbkey = 'hg18' # punt! - if dataset.has_data: - for site_name, site_url in util.get_ucsc_by_build(dataset.dbkey): - if site_name in app.config.ucsc_display_sites: - site_url = site_url.replace('/hgTracks?','/hgGenome?') # for genome graphs - display_url = urllib.quote_plus( "%s%s/display_as?id=%i&display_app=%s" % (base_url, url_for( controller='root' ), dataset.id, type)) - sl = ["%sdb=%s" % (site_url,dataset.dbkey ),] - sl.append("&hgGenome_dataSetName=%s&hgGenome_dataSetDescription=%s" % (dataset.name, 'GalaxyGG_data')) - sl.append("&hgGenome_formatType=best%20guess&hgGenome_markerType=best%20guess") - sl.append("&hgGenome_columnLabels=first%20row&hgGenome_maxVal=&hgGenome_labelVals=") - sl.append("&hgGenome_maxGapToFill=25000000&hgGenome_uploadFile=%%s") - sl.append(ggtail) - s = urllib.quote_plus( ''.join(sl) ) - link = '%s?redirect_url=%s&display_url=%s' % ( internal_url, s, display_url ) - ret_val.append( (site_name, link) ) - return ret_val - - def validate( self, dataset ): - """Validate a gg file - all numeric after header row""" - errors = list() - infile = open(dataset.file_name, "r") - header= infile.next() # header - for i,row in enumerate(infile): - ll = row.strip().split('\t') - badvals = [] - for j,x in enumerate(ll): - try: - x = float(x) - except: - badval.append('col%d:%s' % (j+1,x)) - if len(badvals) > 0: - errors.append('row %d, %s' % (' '.join(badvals))) - return errors - - def repair_methods( self, dataset ): - """Return options for removing errors along with a description""" - return [("lines","Remove erroneous lines")] - - -class rgTabList(Tabular): - """ for sampleid and for featureid lists of exclusions or inclusions in the clean tool - featureid subsets on statistical criteria -> specialized display such as gg - """ - file_ext = "rgTList" - - - def __init__(self, **kwd): - """Initialize featurelistt datatype""" - Tabular.__init__( self, **kwd ) - self.column_names = [] - - def make_html_table( self, dataset, skipchars=[] ): - """Create HTML table, used for displaying peek""" - out = ['<table cellspacing="0" cellpadding="3">'] - comments = [] - try: - # Generate column header - out.append( '<tr>' ) - for i, name in enumerate( self.column_names ): - out.append( '<th>%s.%s</th>' % ( str( i+1 ), name ) ) - if dataset.metadata.columns - len( self.column_names ) > 0: - for i in range( len( self.column_names ), dataset.metadata.columns ): - out.append( '<th>%s</th>' % str( i+1 ) ) - out.append( '</tr>' ) - out.append( self.make_html_peek_rows( dataset, skipchars=skipchars ) ) - out.append( '</table>' ) - out = "".join( out ) - except Exception, exc: - out = "Can't create peek %s" % exc - return out - -class rgSampleList(rgTabList): - """ for sampleid exclusions or inclusions in the clean tool - output from QC eg excess het, gender error, ibd pair member,eigen outlier,excess mendel errors,... - since they can be uploaded, should be flexible - but they are persistent at least - same infrastructure for expression? - """ - file_ext = "rgSList" - - def __init__(self, **kwd): - """Initialize samplelist datatype""" - rgTabList.__init__( self, **kwd ) - self.column_names[0] = 'FID' - self.column_names[1] = 'IID' - # this is what Plink wants as at 2009 - - -class rgFeatureList( rgTabList ): - """ for featureid lists of exclusions or inclusions in the clean tool - output from QC eg low maf, high missingness, bad hwe in controls, excess mendel errors,... - featureid subsets on statistical criteria -> specialized display such as gg - same infrastructure for expression? - """ - file_ext = "rgFList" - - def __init__(self, **kwd): - """Initialize featurelist datatype""" - rgTabList.__init__( self, **kwd ) - for i,s in enumerate(['#FeatureId', 'Chr', 'Genpos', 'Mappos']): - self.column_names[i] = s - - - -class Rgenetics(Html): - """class to use for rgenetics""" - """Add metadata elements""" - MetadataElement( name="base_name", desc="base name for all transformed versions of this genetic dataset", default="galaxy", readonly=True, set_in_upload=True) - - file_ext="html" - composite_type = 'auto_primary_file' - allow_datatype_change = False - - def missing_meta( self, dataset=None, **kwargs): - """Checks for empty meta values""" - for key, value in dataset.metadata.items(): - if not value: - return True - return False - - def generate_primary_file( self, dataset = None ): - rval = ['<html><head><title>Files for Composite Dataset (%s)</title></head><p/>This composite dataset is composed of the following files:<p/><ul>' % ( self.file_ext ) ] - for composite_name, composite_file in self.get_composite_files( dataset = dataset ).iteritems(): - opt_text = '' - if composite_file.optional: - opt_text = ' (optional)' - rval.append( '<li><a href="%s">%s</a>%s' % ( composite_name, composite_name, opt_text ) ) - rval.append( '</ul></html>' ) - return "\n".join( rval ) - -class SNPMatrix(Rgenetics): - """fake class to distinguish different species of Rgenetics data collections - """ - file_ext="snpmatrix" - - def set_peek( self, dataset ): - if not dataset.dataset.purged: - dataset.peek = "Binary RGenetics file" - dataset.blurb = data.nice_size( dataset.get_size() ) - else: - dataset.peek = 'file does not exist' - dataset.blurb = 'file purged from disk' - - -class Lped(Rgenetics): - """fake class to distinguish different species of Rgenetics data collections - """ - file_ext="lped" - - def __init__( self, **kwd ): - Rgenetics.__init__( self, **kwd ) - self.add_composite_file( '%s.ped', description = 'Pedigree File', substitute_name_with_metadata = 'base_name', is_binary = True ) - self.add_composite_file( '%s.map', description = 'Map File', substitute_name_with_metadata = 'base_name', is_binary = True ) - - -class Pphe(Rgenetics): - """fake class to distinguish different species of Rgenetics data collections - """ - file_ext="pphe" - - def __init__( self, **kwd ): - Rgenetics.__init__( self, **kwd ) - self.add_composite_file( '%s.pphe', description = 'Plink Phenotype File', substitute_name_with_metadata = 'base_name' ) - - -class Lmap(Rgenetics): - """fake class to distinguish different species of Rgenetics data collections - """ - file_ext="lmap" - -class Fphe(Rgenetics): - """fake class to distinguish different species of Rgenetics data collections - """ - file_ext="fphe" - - def __init__( self, **kwd ): - Rgenetics.__init__( self, **kwd ) - self.add_composite_file( '%s.fphe', description = 'FBAT Phenotype File', substitute_name_with_metadata = 'base_name' ) - -class Phe(Rgenetics): - """fake class to distinguish different species of Rgenetics data collections - """ - file_ext="phe" - - def __init__( self, **kwd ): - Rgenetics.__init__( self, **kwd ) - self.add_composite_file( '%s.phe', description = 'Phenotype File', substitute_name_with_metadata = 'base_name' ) - - - -class Fped(Rgenetics): - """fake class to distinguish different species of Rgenetics data collections - """ - file_ext="fped" - - def __init__( self, **kwd ): - Rgenetics.__init__( self, **kwd ) - self.add_composite_file( '%s.fped', description = 'FBAT format pedfile', substitute_name_with_metadata = 'base_name' ) - - -class Pbed(Rgenetics): - """fake class to distinguish different species of Rgenetics data collections - """ - file_ext="pbed" - - def __init__( self, **kwd ): - Rgenetics.__init__( self, **kwd ) - self.add_composite_file( '%s.bim', substitute_name_with_metadata = 'base_name', is_binary = True ) - self.add_composite_file( '%s.bed', substitute_name_with_metadata = 'base_name', is_binary = True ) - self.add_composite_file( '%s.fam', substitute_name_with_metadata = 'base_name', is_binary = True ) - -class Eigenstratgeno(Rgenetics): - """fake class to distinguish different species of Rgenetics data collections - """ - file_ext="eigenstratgeno" - - def __init__( self, **kwd ): - Rgenetics.__init__( self, **kwd ) - self.add_composite_file( '%s.eigenstratgeno', substitute_name_with_metadata = 'base_name', is_binary = True ) - self.add_composite_file( '%s.ind', substitute_name_with_metadata = 'base_name', is_binary = True ) - self.add_composite_file( '%s.map', substitute_name_with_metadata = 'base_name', is_binary = True ) - - - -class Eigenstratpca(Rgenetics): - """fake class to distinguish different species of Rgenetics data collections - """ - file_ext="eigenstratpca" - -class Snptest(Rgenetics): - """fake class to distinguish different species of Rgenetics data collections - """ - file_ext="snptest" - -class RexpBase( Html ): - """base class for BioC data structures in Galaxy - must be constructed with the pheno data in place since that - goes into the metadata for each instance""" - - """Add metadata elements""" - MetadataElement( name="columns", default=0, desc="Number of columns", readonly=True, visible=False ) - MetadataElement( name="column_names", default=[], desc="Column names", readonly=True,visible=True ) - MetadataElement( name="base_name", - desc="base name for all transformed versions of this genetic dataset", readonly=True, default='galaxy', set_in_upload=True) - ### Do we really need these below? can we rely on dataset.extra_files_path: os.path.join( dataset.extra_files_path, '%s.phenodata' % dataset.metadata.base_name ) ? - ### Do these have a different purpose? Ross will need to clarify - ### Uploading these datatypes will not work until this is sorted out (set_peek fails)... - MetadataElement( name="pheno_path", - desc="Path to phenotype data for this experiment", readonly=True) - MetadataElement( name="pheno", - desc="Phenotype data for this experiment", readonly=True) - - file_ext = None - - is_binary = True - - allow_datatype_change = False - - composite_type = 'basic' - - def __init__( self, **kwd ): - Html.__init__( self, **kwd ) - self.add_composite_file( '%s.phenodata', substitute_name_with_metadata = 'base_name', is_binary = True ) - self.metadata.pheno_path = '%s.phenodata' % (self.metadata.base_name) - - def missing_meta( self, dataset=None, **kwargs): - """Checks for empty meta values""" - for key, value in dataset.metadata.items(): - if not value: - return True - return False - - def get_pheno(self,dataset): - """expects a .pheno file in the extra_files_dir - ugh - note that R is wierd and does not include the row.name in - the header. why?""" - p = file(dataset.metadata.pheno_path,'r').readlines() #this fails - head = p[0].strip().split('\t') - head.insert(0,'ChipFileName') # fix R write.table b0rken-ness - p[0] = '\t'.join(head) - p = '\n'.join(p) - return p - - def set_peek( self, dataset ): - """expects a .pheno file in the extra_files_dir - ugh - note that R is wierd and does not include the row.name in - the header. why?""" - p = self.get_pheno(dataset) - dataset.peek = p[:20] - dataset.info = p[0] - dataset.blurb = 'R loadable BioC expression object for the Rexpression Galaxy toolkit' - - # stolen from Tabular - # class Tabular( data.Text ): - """Tab delimited data""" - - """Add metadata elements""" - def init_meta( self, dataset, copy_from=None ): - if copy_from: - dataset.metadata = copy_from.metadata - - def set_readonly_meta( self, dataset, **kwd ): - """Resets the values of readonly metadata elements.""" - RexpBase.set_meta( self, dataset ) - - def set_meta( self, dataset, **kwd ): - - """ - NOTE we apply the tabular machinary to the phenodata extracted - from a BioC eSet or affybatch. - - """ - if not dataset.peek: - dataset.set_peek() - pk = dataset.get_pheno # read the basename.phenodata in the extra_files_path - ###this is probably not the best source, can we just access the raw data directly? - if pk: - p = pk.split('\n') - h = p[0].strip().split('\t') # hope is header - h = [escape(x) for x in h] - dataset.metadata.column_names = h - dataset.metadata.columns = len(h) - else: - dataset.metadata.column_names = [] - dataset.metadata.columns = 0 - - def make_html_table( self, dataset): - """Create HTML table, used for displaying peek""" - out = ['<table cellspacing="0" cellpadding="3">',] - try: - # Generate column header - pk = dataset.peek - p = pk.split('\n') - for i,row in enumerate(p): - lrow = row.strip().split('\t') - if i == 0: - orow = ['<th>%s</th>' % escape(x) for x in lrow] - orow.insert(0,'<tr>') - orow.append('</tr>') - else: - orow = ['<td>%s</td>' % escape(x) for x in lrow] - orow.insert(0,'<tr>') - orow.append('</tr>') - out.append(''.join(orow)) - out.append( '</table>' ) - out = "\n".join( out ) - except Exception, exc: - out = "Can't create peek %s" % str( exc ) - return out - - def display_peek( self, dataset ): - """Returns formatted html of peek""" - if not dataset.peek: - dataset.set_peek() - return self.make_html_table( dataset ) - - def get_mime(self): - """Returns the mime type of the datatype""" - return 'application/gzip' - - -class AffyBatch( RexpBase ): - """derived class for BioC data structures in Galaxy """ - file_ext = "affybatch" - - -class ESet( RexpBase ): - """derived class for BioC data structures in Galaxy """ - file_ext = "eset" - - -class MAList( RexpBase ): - """derived class for BioC data structures in Galaxy """ - file_ext = "malist" - - -if __name__ == '__main__': - import doctest, sys - doctest.testmod(sys.modules[__name__]) - -======= """ rgenetics datatypes Use at your peril @@ -957,7 +497,6 @@ p = file(pp,'r').readlines() except: p = ['##failed to find %s' % pp,] - gal_Log.debug('@@@rexpression set_peek, dataset.name=%s,\npp=%s,\np=%s' % (dataset.name,pp,p[:3])) dataset.peek = ''.join(p[:5]) dataset.blurb = 'Galaxy Rexpression composite file' else: @@ -972,7 +511,6 @@ p = file(pp,'r').readlines() except: p = ['##failed to find %s' % pp] - gal_Log.debug('@@@rexpression get_peek, dataset.file_name=%s,\npp=%s,\np=%s' % (dataset.file_name,pp,p[:3])) return ''.join(p[:5]) def get_file_peek(self,filename): @@ -996,7 +534,6 @@ sfname = os.path.split(fname)[-1] rval.append( '<li><a href="%s">%s</a>' % ( sfname, sfname ) ) rval.append( '</ul></html>' ) - gal_Log.debug('rexpression regenerate primary file, writing %s' % rval) f = file(dataset.file_name,'w') f.write("\n".join( rval )) f.write('\n') @@ -1054,7 +591,6 @@ dataset.info = 'Galaxy Expression datatype object' if not dataset.blurb: dataset.blurb = 'R loadable BioC expression object for the Rexpression Galaxy toolkit' - gal_Log.debug('@@@rexpression set_meta on dsn=%s, pf=%s, peek=%s' % (dataset.file_name,''.join(pf[:5]),dataset.peek)) return True def make_html_table( self, pp='nothing supplied from peek\n'): @@ -1125,4 +661,3 @@ doctest.testmod(sys.modules[__name__]) ->>>>>>> other