details: http://www.bx.psu.edu/hg/galaxy/rev/ba9ee5828e7b
changeset: 3162:ba9ee5828e7b
user: Greg Von Kuster <greg@bx.psu.edu>
date: Wed Dec 09 12:18:27 2009 -0500
description:
Fix the recently introduced ColumnListParameter.from_html() method to handle multi-select lists; this should fix at least the broken tabular2fasta functional test.
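In essence, the groomed value may now be either a single string or a list of strings. A minimal standalone sketch of the logic ( strip_column_prefix is an illustrative name; in Galaxy the logic lives inside ColumnListParameter.from_html(), which then delegates to the superclass ):

    def strip_column_prefix( value ):
        """Strip the leading 'c' from column labels like 'c1', handling
        both a single value and a multi-select list of values."""
        if type( value ) == list:
            # We have a multi-select list: groom each item individually.
            return [ item[1:] if item.startswith( "c" ) else item for item in value ]
        elif value and value.startswith( "c" ):
            return value[1:]
        return value

    # e.g. strip_column_prefix( "c1" ) == "1"
    #      strip_column_prefix( [ "c1", "c2" ] ) == [ "1", "2" ]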
diffstat:
lib/galaxy/tools/parameters/basic.py | 10 +++++++++-
1 files changed, 9 insertions(+), 1 deletions(-)
diffs (20 lines):
diff -r b400212305b6 -r ba9ee5828e7b lib/galaxy/tools/parameters/basic.py
--- a/lib/galaxy/tools/parameters/basic.py Wed Dec 09 10:15:15 2009 -0500
+++ b/lib/galaxy/tools/parameters/basic.py Wed Dec 09 12:18:27 2009 -0500
@@ -769,7 +769,15 @@
Label convention prepends column number with a 'c', but tool uses the integer. This
removes the 'c' when entered into a workflow.
"""
- if value.startswith("c"):
+ if type( value ) == list:
+ # We have a multi-select list
+ new_value = []
+ for item in value:
+ if item.startswith( "c" ):
+ item = item[1:]
+ new_value.append( item )
+ value = new_value
+ elif value and value.startswith( "c" ):
value = value[1:]
return super( ColumnListParameter, self ).from_html( value, trans, context )
def get_column_list( self, trans, other_values ):
details: http://www.bx.psu.edu/hg/galaxy/rev/022a8c94883f
changeset: 3157:022a8c94883f
user: Greg Von Kuster <greg@bx.psu.edu>
date: Tue Dec 08 11:46:13 2009 -0500
description:
Better approach to altering the initial content of an output dataset when necessary. The upload tool now calls the data type's groom_dataset_content() method ( a better name than before_setting_metadata, since the grooming is not related to metadata ). Grooming will now also run on the cluster.
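In outline, the new hook works as in the following condensed sketch ( illustrative only; the full implementations are in the diff below ):

    # lib/galaxy/datatypes/data.py -- base class provides a no-op hook
    class Data( object ):
        def groom_dataset_content( self, file_name ):
            """Called on an output dataset file after the content is initially generated."""
            pass

    # lib/galaxy/datatypes/binary.py -- Bam overrides the hook to sort the file
    class Bam( Data ):
        def groom_dataset_content( self, file_name ):
            # Sort the Bam file contents in place with samtools
            # ( see the full temp-file handling in the diff below ).
            pass

    # tools/data_source/upload.py -- the upload tool invokes the hook
    #   datatype = Registry().get_datatype_by_extension( ext )
    #   datatype.groom_dataset_content( output_path )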
diffstat:
lib/galaxy/datatypes/binary.py | 75 +++++++++----------------
lib/galaxy/datatypes/chrominfo.py | 3 -
lib/galaxy/datatypes/coverage.py | 5 +-
lib/galaxy/datatypes/data.py | 9 +--
lib/galaxy/datatypes/genetics.py | 64 ---------------------
lib/galaxy/datatypes/images.py | 14 +----
lib/galaxy/datatypes/interval.py | 18 ------
lib/galaxy/datatypes/qualityscore.py | 13 +----
lib/galaxy/datatypes/sequence.py | 29 ---------
lib/galaxy/datatypes/tabular.py | 9 ---
lib/galaxy/datatypes/tracks.py | 3 -
lib/galaxy/datatypes/xml.py | 3 -
lib/galaxy/jobs/__init__.py | 1 -
lib/galaxy/tools/__init__.py | 3 -
lib/galaxy/web/controllers/library.py | 2 -
lib/galaxy/web/controllers/library_admin.py | 2 -
lib/galaxy/web/controllers/root.py | 3 -
tools/data_source/hbvar_filter.py | 1 -
tools/data_source/upload.py | 4 +
tools/maf/maf_to_bed_code.py | 1 -
20 files changed, 38 insertions(+), 224 deletions(-)
diffs (833 lines):
diff -r 83dc9642a59e -r 022a8c94883f lib/galaxy/datatypes/binary.py
--- a/lib/galaxy/datatypes/binary.py Tue Dec 08 09:05:35 2009 -0500
+++ b/lib/galaxy/datatypes/binary.py Tue Dec 08 11:46:13 2009 -0500
@@ -17,9 +17,6 @@
class Binary( data.Data ):
"""Binary data"""
- def before_setting_metadata( self, dataset ):
- """This function is called on the dataset before metadata is edited."""
- pass
def set_peek( self, dataset, is_multi_byte=False ):
"""Set the peek and blurb text"""
if not dataset.dataset.purged:
@@ -36,9 +33,6 @@
"""Class describing an ab1 binary sequence file"""
file_ext = "ab1"
- def before_setting_metadata( self, dataset ):
- """This function is called on the dataset before metadata is edited."""
- pass
def set_peek( self, dataset, is_multi_byte=False ):
if not dataset.dataset.purged:
export_url = "/history_add_to?" + urlencode( {'history_id':dataset.history_id,'ext':'ab1','name':'ab1 sequence','info':'Sequence file','dbkey':dataset.dbkey} )
@@ -58,39 +52,32 @@
file_ext = "bam"
MetadataElement( name="bam_index", desc="BAM Index File", param=metadata.FileParameter, readonly=True, no_value=None, visible=False, optional=True )
- def before_setting_metadata( self, dataset ):
- """ Ensures that the Bam file contents are sorted. This function is called on the dataset before set_meta() is called."""
- sorted = False
- try:
- index_file = dataset.metadata.bam_index
- except:
- index_file = None
- if index_file:
- # If an index file already exists on disk, then the data must have previously been sorted
- # since samtools requires a sorted Bam file in order to create an index.
- sorted = os.path.exists( index_file.file_name )
- if not sorted:
- # Use samtools to sort the Bam file
- tmp_dir = tempfile.gettempdir()
- # Create a symlink from the temporary directory to the dataset file so that samtools can mess with it.
- tmp_dataset_file_name = os.path.join( tmp_dir, os.path.basename( dataset.file_name ) )
- # Here tmp_dataset_file_name looks something like /tmp/dataset_XX.dat
- os.symlink( dataset.file_name, tmp_dataset_file_name )
- # Sort alignments by leftmost coordinates. File <out.prefix>.bam will be created.
- # TODO: This command may also create temporary files <out.prefix>.%d.bam when the
- # whole alignment cannot be fitted into memory ( controlled by option -m ). We're
- # not handling this case here.
- tmp_sorted_dataset_file = tempfile.NamedTemporaryFile( prefix=tmp_dataset_file_name )
- tmp_sorted_dataset_file_name = tmp_sorted_dataset_file.name
- tmp_sorted_dataset_file.close()
- command = "samtools sort %s %s 2>/dev/null" % ( tmp_dataset_file_name, tmp_sorted_dataset_file_name )
- proc = subprocess.Popen( args=command, shell=True )
- proc.wait()
- tmp_sorted_bam_file_name = '%s.bam' % tmp_sorted_dataset_file_name
- # Move tmp_sorted_bam_file_name to our output dataset location
- shutil.move( tmp_sorted_bam_file_name, dataset.file_name )
- # Remove all remaining temporary files
- os.unlink( tmp_dataset_file_name )
+ def groom_dataset_content( self, file_name ):
+ """
+ Ensures that the Bam file contents are sorted. This function is called
+ on an output dataset after the content is initially generated.
+ """
+ # Use samtools to sort the Bam file
+ tmp_dir = tempfile.gettempdir()
+ # Create a symlink from the temporary directory to the dataset file so that samtools can mess with it.
+ tmp_dataset_file_name = os.path.join( tmp_dir, os.path.basename( file_name ) )
+ # Here tmp_dataset_file_name looks something like /tmp/dataset_XX.dat
+ os.symlink( file_name, tmp_dataset_file_name )
+ # Sort alignments by leftmost coordinates. File <out.prefix>.bam will be created.
+ # TODO: This command may also create temporary files <out.prefix>.%d.bam when the
+ # whole alignment cannot be fitted into memory ( controlled by option -m ). We're
+ # not handling this case here.
+ tmp_sorted_dataset_file = tempfile.NamedTemporaryFile( prefix=tmp_dataset_file_name )
+ tmp_sorted_dataset_file_name = tmp_sorted_dataset_file.name
+ tmp_sorted_dataset_file.close()
+ command = "samtools sort %s %s 2>/dev/null" % ( tmp_dataset_file_name, tmp_sorted_dataset_file_name )
+ proc = subprocess.Popen( args=command, shell=True )
+ proc.wait()
+ tmp_sorted_bam_file_name = '%s.bam' % tmp_sorted_dataset_file_name
+ # Move tmp_sorted_bam_file_name to our output dataset location
+ shutil.move( tmp_sorted_bam_file_name, file_name )
+ # Remove all remaining temporary files
+ os.unlink( tmp_dataset_file_name )
def init_meta( self, dataset, copy_from=None ):
Binary.init_meta( self, dataset, copy_from=copy_from )
def set_meta( self, dataset, overwrite = True, **kwd ):
@@ -151,9 +138,6 @@
"""Class describing a zip archive of binary sequence files"""
file_ext = "binseq.zip"
- def before_setting_metadata( self, dataset ):
- """This function is called on the dataset before metadata is edited."""
- pass
def set_peek( self, dataset, is_multi_byte=False ):
if not dataset.dataset.purged:
zip_file = zipfile.ZipFile( dataset.file_name, "r" )
@@ -176,9 +160,6 @@
"""Class describing an scf binary sequence file"""
file_ext = "scf"
- def before_setting_metadata( self, dataset ):
- """This function is called on the dataset before metadata is edited."""
- pass
def set_peek( self, dataset, is_multi_byte=False ):
if not dataset.dataset.purged:
export_url = "/history_add_to?" + urlencode({'history_id':dataset.history_id,'ext':'scf','name':'scf sequence','info':'Sequence file','dbkey':dataset.dbkey})
@@ -196,11 +177,9 @@
class Sff( Binary ):
""" Standard Flowgram Format (SFF) """
file_ext = "sff"
+
def __init__( self, **kwd ):
Binary.__init__( self, **kwd )
- def before_setting_metadata( self, dataset ):
- """This function is called on the dataset before metadata is edited."""
- pass
def sniff( self, filename ):
# The first 4 bytes of any sff file is '.sff', and the file is binary. For details
# about the format, see http://www.ncbi.nlm.nih.gov/Traces/trace.cgi?cmd=show&f=formats&m=doc&s=for…
diff -r 83dc9642a59e -r 022a8c94883f lib/galaxy/datatypes/chrominfo.py
--- a/lib/galaxy/datatypes/chrominfo.py Tue Dec 08 09:05:35 2009 -0500
+++ b/lib/galaxy/datatypes/chrominfo.py Tue Dec 08 11:46:13 2009 -0500
@@ -12,6 +12,3 @@
MetadataElement( name="chrom", default=1, desc="Chrom column", param=metadata.ColumnParameter )
MetadataElement( name="length", default=2, desc="Length column", param=metadata.ColumnParameter )
- def before_setting_metadata( self, dataset ):
- """This function is called on the dataset before metadata is edited."""
- pass
diff -r 83dc9642a59e -r 022a8c94883f lib/galaxy/datatypes/coverage.py
--- a/lib/galaxy/datatypes/coverage.py Tue Dec 08 09:05:35 2009 -0500
+++ b/lib/galaxy/datatypes/coverage.py Tue Dec 08 11:46:13 2009 -0500
@@ -28,10 +28,7 @@
MetadataElement( name="forwardCol", default=3, desc="Forward or aggregate read column", param=metadata.ColumnParameter )
MetadataElement( name="reverseCol", desc="Optional reverse read column", param=metadata.ColumnParameter, optional=True, no_value=0 )
MetadataElement( name="columns", default=3, desc="Number of columns", readonly=True, visible=False )
-
- def before_setting_metadata( self, dataset ):
- """This function is called on the dataset before metadata is edited."""
- pass
+
def get_track_window(self, dataset, data, start, end):
"""
Assumes we have a numpy file.
diff -r 83dc9642a59e -r 022a8c94883f lib/galaxy/datatypes/data.py
--- a/lib/galaxy/datatypes/data.py Tue Dec 08 09:05:35 2009 -0500
+++ b/lib/galaxy/datatypes/data.py Tue Dec 08 11:46:13 2009 -0500
@@ -84,6 +84,9 @@
except OSError, e:
log.exception('%s reading a file that does not exist %s' % (self.__class__.__name__, dataset.file_name))
return ''
+ def groom_dataset_content( self, file_name ):
+ """This function is called on an output dataset file after the content is initially generated."""
+ pass
def init_meta( self, dataset, copy_from=None ):
# Metadata should be left mostly uninitialized. Dataset will
# handle returning default values when metadata is not set.
@@ -256,9 +259,6 @@
if return_output:
return converted_dataset
return "The file conversion of %s on data %s has been added to the Queue." % (converter.name, original_dataset.hid)
- def before_setting_metadata( self, dataset ):
- """This function is called on the dataset before metadata is set."""
- pass
def after_setting_metadata( self, dataset ):
"""This function is called on the dataset after metadata is set."""
dataset.clear_associated_files( metadata_safe = True )
@@ -346,9 +346,6 @@
def get_mime(self):
"""Returns the mime type of the datatype"""
return 'text/plain'
- def before_setting_metadata( self, dataset ):
- """This function is called on the dataset before metadata is set."""
- pass
def set_meta( self, dataset, **kwd ):
"""
Set the number of lines of data in dataset,
diff -r 83dc9642a59e -r 022a8c94883f lib/galaxy/datatypes/genetics.py
--- a/lib/galaxy/datatypes/genetics.py Tue Dec 08 09:05:35 2009 -0500
+++ b/lib/galaxy/datatypes/genetics.py Tue Dec 08 11:46:13 2009 -0500
@@ -47,9 +47,6 @@
self.add_display_app ( 'ucsc', 'display at UCSC', 'as_ucsc_display_file', 'ucsc_links' )
def as_ucsc_display_file( self, dataset, **kwd ):
return open( dataset.file_name )
- def before_setting_metadata( self, dataset ):
- """This function is called on the dataset before metadata is edited."""
- pass
def set_meta( self, dataset, overwrite = True, **kwd ):
i = 0
for i, line in enumerate( file ( dataset.file_name ) ):
@@ -205,9 +202,6 @@
"""Initialize featurelistt datatype"""
Tabular.__init__( self, **kwd )
self.column_names = []
- def before_setting_metadata( self, dataset ):
- """This function is called on the dataset before metadata is edited."""
- pass
def make_html_table( self, dataset, skipchars=[] ):
"""Create HTML table, used for displaying peek"""
out = ['<table cellspacing="0" cellpadding="3">']
@@ -246,9 +240,6 @@
self.column_names[0] = 'FID'
self.column_names[1] = 'IID'
# this is what Plink wants as at 2009
- def before_setting_metadata( self, dataset ):
- """This function is called on the dataset before metadata is edited."""
- pass
def sniff(self,filename):
"""
"""
@@ -273,9 +264,6 @@
rgTabList.__init__( self, **kwd )
for i,s in enumerate(['#FeatureId', 'Chr', 'Genpos', 'Mappos']):
self.column_names[i] = s
- def before_setting_metadata( self, dataset ):
- """This function is called on the dataset before metadata is edited."""
- pass
class Rgenetics(Html):
"""
@@ -329,9 +317,6 @@
f.write("\n".join( rval ))
f.write('\n')
f.close()
- def before_setting_metadata( self, dataset ):
- """This function is called on the dataset before metadata is edited."""
- pass
def set_meta( self, dataset, **kwd ):
"""
for lped/pbed eg
@@ -373,9 +358,6 @@
"""
file_ext="snpmatrix"
- def before_setting_metadata( self, dataset ):
- """This function is called on the dataset before metadata is edited."""
- pass
def set_peek( self, dataset, is_multi_byte=False ):
if not dataset.dataset.purged:
dataset.peek = "Binary RGenetics file"
@@ -405,9 +387,6 @@
Rgenetics.__init__(self, **kwd)
self.add_composite_file( '%s.ped', description = 'Pedigree File', substitute_name_with_metadata = 'base_name', is_binary = True )
self.add_composite_file( '%s.map', description = 'Map File', substitute_name_with_metadata = 'base_name', is_binary = True )
- def before_setting_metadata( self, dataset ):
- """This function is called on the dataset before metadata is edited."""
- pass
class Pphe(Rgenetics):
"""
@@ -418,9 +397,6 @@
def __init__( self, **kwd ):
Rgenetics.__init__(self, **kwd)
self.add_composite_file( '%s.pphe', description = 'Plink Phenotype File', substitute_name_with_metadata = 'base_name' )
- def before_setting_metadata( self, dataset ):
- """This function is called on the dataset before metadata is edited."""
- pass
class Lmap(Rgenetics):
"""
@@ -428,10 +404,6 @@
"""
file_ext="lmap"
- def before_setting_metadata( self, dataset ):
- """This function is called on the dataset before metadata is edited."""
- pass
-
class Fphe(Rgenetics):
"""
fake class to distinguish different species of Rgenetics data collections
@@ -441,9 +413,6 @@
def __init__( self, **kwd ):
Rgenetics.__init__(self, **kwd)
self.add_composite_file( '%s.fphe', description = 'FBAT Phenotype File', substitute_name_with_metadata = 'base_name' )
- def before_setting_metadata( self, dataset ):
- """This function is called on the dataset before metadata is edited."""
- pass
class Phe(Rgenetics):
"""
@@ -454,9 +423,6 @@
def __init__( self, **kwd ):
Rgenetics.__init__(self, **kwd)
self.add_composite_file( '%s.phe', description = 'Phenotype File', substitute_name_with_metadata = 'base_name' )
- def before_setting_metadata( self, dataset ):
- """This function is called on the dataset before metadata is edited."""
- pass
class Fped(Rgenetics):
"""
@@ -467,9 +433,6 @@
def __init__( self, **kwd ):
Rgenetics.__init__(self, **kwd)
self.add_composite_file( '%s.fped', description = 'FBAT format pedfile', substitute_name_with_metadata = 'base_name' )
- def before_setting_metadata( self, dataset ):
- """This function is called on the dataset before metadata is edited."""
- pass
class Pbed(Rgenetics):
"""
@@ -482,9 +445,6 @@
self.add_composite_file( '%s.bim', substitute_name_with_metadata = 'base_name', is_binary = True )
self.add_composite_file( '%s.bed', substitute_name_with_metadata = 'base_name', is_binary = True )
self.add_composite_file( '%s.fam', substitute_name_with_metadata = 'base_name', is_binary = True )
- def before_setting_metadata( self, dataset ):
- """This function is called on the dataset before metadata is edited."""
- pass
class Eigenstratgeno(Rgenetics):
"""
@@ -497,9 +457,6 @@
self.add_composite_file( '%s.eigenstratgeno', substitute_name_with_metadata = 'base_name', is_binary = True )
self.add_composite_file( '%s.ind', substitute_name_with_metadata = 'base_name', is_binary = True )
self.add_composite_file( '%s.map', substitute_name_with_metadata = 'base_name', is_binary = True )
- def before_setting_metadata( self, dataset ):
- """This function is called on the dataset before metadata is edited."""
- pass
class Eigenstratpca(Rgenetics):
"""
@@ -510,27 +467,18 @@
def __init__( self, **kwd ):
Rgenetics.__init__(self, **kwd)
self.add_composite_file( '%s.eigenstratpca', description = 'Eigenstrat PCA file', substitute_name_with_metadata = 'base_name' )
- def before_setting_metadata( self, dataset ):
- """This function is called on the dataset before metadata is edited."""
- pass
class Snptest(Rgenetics):
"""
fake class to distinguish different species of Rgenetics data collections
"""
file_ext="snptest"
- def before_setting_metadata( self, dataset ):
- """This function is called on the dataset before metadata is edited."""
- pass
class Pheno(Tabular):
"""
base class for pheno files
"""
file_ext = 'pheno'
- def before_setting_metadata( self, dataset ):
- """This function is called on the dataset before metadata is edited."""
- pass
class RexpBase( Html ):
"""
@@ -698,9 +646,6 @@
f.write("\n".join( rval ))
f.write('\n')
f.close()
- def before_setting_metadata( self, dataset ):
- """This function is called on the dataset before metadata is edited."""
- pass
def init_meta( self, dataset, copy_from=None ):
"""Add metadata elements"""
if copy_from:
@@ -789,9 +734,6 @@
RexpBase.__init__(self, **kwd)
self.add_composite_file( '%s.affybatch', description = 'AffyBatch R object saved to file',
substitute_name_with_metadata = 'base_name', is_binary=True )
- def before_setting_metadata( self, dataset ):
- """This function is called on the dataset before metadata is edited."""
- pass
class Eset( RexpBase ):
"""derived class for BioC data structures in Galaxy """
@@ -801,9 +743,6 @@
RexpBase.__init__(self, **kwd)
self.add_composite_file( '%s.eset', description = 'ESet R object saved to file',
substitute_name_with_metadata = 'base_name', is_binary = True )
- def before_setting_metadata( self, dataset ):
- """This function is called on the dataset before metadata is edited."""
- pass
class MAlist( RexpBase ):
"""derived class for BioC data structures in Galaxy """
@@ -813,9 +752,6 @@
RexpBase.__init__(self, **kwd)
self.add_composite_file( '%s.malist', description = 'MAlist R object saved to file',
substitute_name_with_metadata = 'base_name', is_binary = True )
- def before_setting_metadata( self, dataset ):
- """This function is called on the dataset before metadata is edited."""
- pass
if __name__ == '__main__':
import doctest, sys
diff -r 83dc9642a59e -r 022a8c94883f lib/galaxy/datatypes/images.py
--- a/lib/galaxy/datatypes/images.py Tue Dec 08 09:05:35 2009 -0500
+++ b/lib/galaxy/datatypes/images.py Tue Dec 08 11:46:13 2009 -0500
@@ -15,9 +15,6 @@
class Image( data.Data ):
"""Class describing an image"""
- def before_setting_metadata( self, dataset ):
- """This function is called on the dataset before metadata is edited."""
- pass
def set_peek( self, dataset, is_multi_byte=False ):
if not dataset.dataset.purged:
dataset.peek = 'Image in %s format' % dataset.extension
@@ -54,9 +51,6 @@
"""Class describing a GMAJ Applet"""
file_ext = "gmaj.zip"
copy_safe_peek = False
- def before_setting_metadata( self, dataset ):
- """This function is called on the dataset before metadata is edited."""
- pass
def set_peek( self, dataset, is_multi_byte=False ):
if not dataset.dataset.purged:
if hasattr( dataset, 'history_id' ):
@@ -108,9 +102,7 @@
class Html( data.Text ):
"""Class describing an html file"""
file_ext = "html"
- def before_setting_metadata( self, dataset ):
- """This function is called on the dataset before metadata is edited."""
- pass
+
def set_peek( self, dataset, is_multi_byte=False ):
if not dataset.dataset.purged:
dataset.peek = "HTML file"
@@ -145,9 +137,7 @@
"""Class describing a LAJ Applet"""
file_ext = "laj"
copy_safe_peek = False
- def before_setting_metadata( self, dataset ):
- """This function is called on the dataset before metadata is edited."""
- pass
+
def set_peek( self, dataset, is_multi_byte=False ):
if not dataset.dataset.purged:
if hasattr( dataset, 'history_id' ):
diff -r 83dc9642a59e -r 022a8c94883f lib/galaxy/datatypes/interval.py
--- a/lib/galaxy/datatypes/interval.py Tue Dec 08 09:05:35 2009 -0500
+++ b/lib/galaxy/datatypes/interval.py Tue Dec 08 11:46:13 2009 -0500
@@ -75,9 +75,6 @@
else:
dataset.peek = 'file does not exist'
dataset.blurb = 'file purged from disk'
- def before_setting_metadata( self, dataset ):
- """This function is called on the dataset before metadata is edited."""
- pass
def set_meta( self, dataset, overwrite = True, first_line_is_header = False, **kwd ):
Tabular.set_meta( self, dataset, overwrite = overwrite, skip = 0 )
@@ -343,9 +340,6 @@
MetadataElement( name="columns", default=3, desc="Number of columns", readonly=True, visible=False )
###do we need to repeat these? they are the same as should be inherited from interval type
- def before_setting_metadata( self, dataset ):
- """This function is called on the dataset before metadata is edited."""
- pass
def set_meta( self, dataset, overwrite = True, **kwd ):
"""Sets the metadata information for datasets previously determined to be in bed format."""
i = 0
@@ -504,9 +498,6 @@
"""Initialize datatype, by adding GBrowse display app"""
Tabular.__init__(self, **kwd)
self.add_display_app ( 'c_elegans', 'display in Wormbase', 'as_gbrowse_display_file', 'gbrowse_links' )
- def before_setting_metadata( self, dataset ):
- """This function is called on the dataset before metadata is edited."""
- pass
def set_meta( self, dataset, overwrite = True, **kwd ):
i = 0
for i, line in enumerate( file ( dataset.file_name ) ):
@@ -644,9 +635,6 @@
def __init__(self, **kwd):
"""Initialize datatype, by adding GBrowse display app"""
Gff.__init__(self, **kwd)
- def before_setting_metadata( self, dataset ):
- """This function is called on the dataset before metadata is edited."""
- pass
def set_meta( self, dataset, overwrite = True, **kwd ):
i = 0
for i, line in enumerate( file ( dataset.file_name ) ):
@@ -810,9 +798,6 @@
return ret_val
def make_html_table( self, dataset ):
return Tabular.make_html_table( self, dataset, skipchars=['track', '#'] )
- def before_setting_metadata( self, dataset ):
- """This function is called on the dataset before metadata is edited."""
- pass
def set_meta( self, dataset, overwrite = True, **kwd ):
i = 0
for i, line in enumerate( file ( dataset.file_name ) ):
@@ -904,9 +889,6 @@
"""Initialize interval datatype, by adding UCSC display app"""
Tabular.__init__(self, **kwd)
self.add_display_app ( 'ucsc', 'display at UCSC', 'as_ucsc_display_file', 'ucsc_links' )
- def before_setting_metadata( self, dataset ):
- """This function is called on the dataset before metadata is edited."""
- pass
def set_meta( self, dataset, overwrite = True, **kwd ):
Tabular.set_meta( self, dataset, overwrite = overwrite, skip = 1 )
def display_peek( self, dataset ):
diff -r 83dc9642a59e -r 022a8c94883f lib/galaxy/datatypes/qualityscore.py
--- a/lib/galaxy/datatypes/qualityscore.py Tue Dec 08 09:05:35 2009 -0500
+++ b/lib/galaxy/datatypes/qualityscore.py Tue Dec 08 11:46:13 2009 -0500
@@ -15,9 +15,6 @@
"""
file_ext = "qualsolid"
- def before_setting_metadata( self, dataset ):
- """This function is called on the dataset before metadata is edited."""
- pass
def sniff( self, filename ):
"""
>>> fname = get_test_fname( 'sequence.fasta' )
@@ -67,9 +64,6 @@
"""
file_ext = "qual454"
- def before_setting_metadata( self, dataset ):
- """This function is called on the dataset before metadata is edited."""
- pass
def sniff( self, filename ):
"""
>>> fname = get_test_fname( 'sequence.fasta' )
@@ -108,9 +102,4 @@
until we know more about quality score formats
"""
file_ext = "qualsolexa"
-
- def before_setting_metadata( self, dataset ):
- """This function is called on the dataset before metadata is edited."""
- pass
-
-
\ No newline at end of file
+
\ No newline at end of file
diff -r 83dc9642a59e -r 022a8c94883f lib/galaxy/datatypes/sequence.py
--- a/lib/galaxy/datatypes/sequence.py Tue Dec 08 09:05:35 2009 -0500
+++ b/lib/galaxy/datatypes/sequence.py Tue Dec 08 11:46:13 2009 -0500
@@ -21,9 +21,6 @@
"""Add metadata elements"""
MetadataElement( name="sequences", default=0, desc="Number of sequences", readonly=True, visible=False, optional=True, no_value=0 )
- def before_setting_metadata( self, dataset ):
- """This function is called on the dataset before metadata is edited."""
- pass
def set_meta( self, dataset, **kwd ):
"""
Set the number of sequences and the number of data lines in dataset.
@@ -59,17 +56,10 @@
"""Add metadata elements"""
MetadataElement( name="species", desc="Species", default=[], param=metadata.SelectParameter, multiple=True, readonly=True, no_value=None )
- def before_setting_metadata( self, dataset ):
- """This function is called on the dataset before metadata is edited."""
- pass
-
class Fasta( Sequence ):
"""Class representing a FASTA sequence"""
file_ext = "fasta"
- def before_setting_metadata( self, dataset ):
- """This function is called on the dataset before metadata is edited."""
- pass
def sniff( self, filename ):
"""
Determines whether the file is in fasta format
@@ -122,9 +112,6 @@
""" Class representing the SOLID Color-Space sequence ( csfasta ) """
file_ext = "csfasta"
- def before_setting_metadata( self, dataset ):
- """This function is called on the dataset before metadata is edited."""
- pass
def sniff( self, filename ):
"""
Color-space sequence:
@@ -166,9 +153,6 @@
"""Class representing a generic FASTQ sequence"""
file_ext = "fastq"
- def before_setting_metadata( self, dataset ):
- """This function is called on the dataset before metadata is edited."""
- pass
def set_meta( self, dataset, **kwd ):
"""
Set the number of sequences and the number of data lines
@@ -220,10 +204,6 @@
"""Class representing a FASTQ sequence ( the Sanger variant )"""
file_ext = "fastqsanger"
- def before_setting_metadata( self, dataset ):
- """This function is called on the dataset before metadata is edited."""
- pass
-
try:
from galaxy import eggs
import pkg_resources; pkg_resources.require( "bx-python" )
@@ -316,9 +296,6 @@
MetadataElement( name="species_chromosomes", desc="Species Chromosomes", param=metadata.FileParameter, readonly=True, no_value=None, visible=False, optional=True )
MetadataElement( name="maf_index", desc="MAF Index File", param=metadata.FileParameter, readonly=True, no_value=None, visible=False, optional=True )
- def before_setting_metadata( self, dataset ):
- """This function is called on the dataset before metadata is edited."""
- pass
def init_meta( self, dataset, copy_from=None ):
Alignment.init_meta( self, dataset, copy_from=copy_from )
def set_meta( self, dataset, overwrite = True, **kwd ):
@@ -425,9 +402,6 @@
file_ext = "axt"
- def before_setting_metadata( self, dataset ):
- """This function is called on the dataset before metadata is edited."""
- pass
def sniff( self, filename ):
"""
Determines whether the file is in axt format
@@ -480,9 +454,6 @@
# here simply for backward compatibility ( although it is still in the datatypes registry ). Subclassing
# from data.Text eliminates managing metadata elements inherited from the Alignemnt class.
- def before_setting_metadata( self, dataset ):
- """This function is called on the dataset before metadata is edited."""
- pass
def sniff( self, filename ):
"""
Determines whether the file is in lav format
diff -r 83dc9642a59e -r 022a8c94883f lib/galaxy/datatypes/tabular.py
--- a/lib/galaxy/datatypes/tabular.py Tue Dec 08 09:05:35 2009 -0500
+++ b/lib/galaxy/datatypes/tabular.py Tue Dec 08 11:46:13 2009 -0500
@@ -23,9 +23,6 @@
MetadataElement( name="columns", default=0, desc="Number of columns", readonly=True, visible=False, no_value=0 )
MetadataElement( name="column_types", default=[], desc="Column types", param=metadata.ColumnTypesParameter, readonly=True, visible=False, no_value=[] )
- def before_setting_metadata( self, dataset ):
- """This function is called on the dataset before metadata is edited."""
- pass
def init_meta( self, dataset, copy_from=None ):
data.Text.init_meta( self, dataset, copy_from=copy_from )
def set_meta( self, dataset, overwrite = True, skip = None, **kwd ):
@@ -227,9 +224,6 @@
'Superorder', 'Order', 'Suborder', 'Superfamily', 'Family', 'Subfamily',
'Tribe', 'Subtribe', 'Genus', 'Subgenus', 'Species', 'Subspecies'
]
- def before_setting_metadata( self, dataset ):
- """This function is called on the dataset before metadata is edited."""
- pass
def make_html_table( self, dataset, skipchars=[] ):
"""Create HTML table, used for displaying peek"""
out = ['<table cellspacing="0" cellpadding="3">']
@@ -259,9 +253,6 @@
self.column_names = ['QNAME', 'FLAG', 'RNAME', 'POS', 'MAPQ', 'CIGAR',
'MRNM', 'MPOS', 'ISIZE', 'SEQ', 'QUAL', 'OPT'
]
- def before_setting_metadata( self, dataset ):
- """This function is called on the dataset before metadata is edited."""
- pass
def make_html_table( self, dataset, skipchars=[] ):
"""Create HTML table, used for displaying peek"""
out = ['<table cellspacing="0" cellpadding="3">']
diff -r 83dc9642a59e -r 022a8c94883f lib/galaxy/datatypes/tracks.py
--- a/lib/galaxy/datatypes/tracks.py Tue Dec 08 09:05:35 2009 -0500
+++ b/lib/galaxy/datatypes/tracks.py Tue Dec 08 11:46:13 2009 -0500
@@ -23,9 +23,6 @@
def __init__(self, **kwargs):
super( GeneTrack, self ).__init__( **kwargs )
self.add_display_app( 'genetrack', 'View in', '', 'genetrack_link' )
- def before_setting_metadata( self, dataset ):
- """This function is called on the dataset before metadata is edited."""
- pass
def get_display_links( self, dataset, type, app, base_url, target_frame='galaxy_main', **kwd ):
return data.Data.get_display_links( self, dataset, type, app, base_url, target_frame=target_frame, **kwd )
def genetrack_link( self, hda, type, app, base_url ):
diff -r 83dc9642a59e -r 022a8c94883f lib/galaxy/datatypes/xml.py
--- a/lib/galaxy/datatypes/xml.py Tue Dec 08 09:05:35 2009 -0500
+++ b/lib/galaxy/datatypes/xml.py Tue Dec 08 11:46:13 2009 -0500
@@ -11,9 +11,6 @@
"""NCBI Blast XML Output data"""
file_ext = "blastxml"
- def before_setting_metadata( self, dataset ):
- """This function is called on the dataset before metadata is edited."""
- pass
def set_peek( self, dataset, is_multi_byte=False ):
"""Set the peek and blurb text"""
if not dataset.dataset.purged:
diff -r 83dc9642a59e -r 022a8c94883f lib/galaxy/jobs/__init__.py
--- a/lib/galaxy/jobs/__init__.py Tue Dec 08 09:05:35 2009 -0500
+++ b/lib/galaxy/jobs/__init__.py Tue Dec 08 11:46:13 2009 -0500
@@ -537,7 +537,6 @@
#it would be quicker to just copy the metadata from the originating output dataset,
#but somewhat trickier (need to recurse up the copied_from tree), for now we'll call set_meta()
if not self.external_output_metadata.external_metadata_set_successfully( dataset, self.sa_session ):
- dataset.datatype.before_setting_metadata( dataset )
# Only set metadata values if they are missing...
dataset.set_meta( overwrite = False )
else:
diff -r 83dc9642a59e -r 022a8c94883f lib/galaxy/tools/__init__.py
--- a/lib/galaxy/tools/__init__.py Tue Dec 08 09:05:35 2009 -0500
+++ b/lib/galaxy/tools/__init__.py Tue Dec 08 11:46:13 2009 -0500
@@ -1418,7 +1418,6 @@
if data.extension != data_type:
data = app.datatypes_registry.change_datatype( data, data_type )
elif not isinstance( data.datatype, datatypes.interval.Bed ) and isinstance( data.datatype, datatypes.interval.Interval ):
- data.datatype.before_setting_metadata( data )
data.set_meta()
if data.missing_meta():
data = app.datatypes_registry.change_datatype( data, 'tabular' )
@@ -1473,7 +1472,6 @@
self.sa_session.flush()
child_dataset.set_size()
child_dataset.name = "Secondary Dataset (%s)" % ( designation )
- child_dataset.datatype.before_setting_metadata( child_dataset )
child_dataset.init_meta()
child_dataset.set_meta()
child_dataset.set_peek()
@@ -1533,7 +1531,6 @@
primary_data.set_size()
primary_data.name = outdata.name
primary_data.info = outdata.info
- primary_dataset.datatype.before_setting_metadata( primary_dataset )
primary_data.init_meta( copy_from=outdata )
primary_data.dbkey = dbkey
primary_data.set_meta()
diff -r 83dc9642a59e -r 022a8c94883f lib/galaxy/web/controllers/library.py
--- a/lib/galaxy/web/controllers/library.py Tue Dec 08 09:05:35 2009 -0500
+++ b/lib/galaxy/web/controllers/library.py Tue Dec 08 11:46:13 2009 -0500
@@ -487,7 +487,6 @@
if name not in [ 'name', 'info', 'dbkey' ]:
if spec.get( 'default' ):
setattr( ldda.metadata, name, spec.unwrap( spec.get( 'default' ) ) )
- ldda.datatype.before_setting_metadata( ldda )
ldda.datatype.set_meta( ldda )
ldda.datatype.after_setting_metadata( ldda )
trans.sa_session.flush()
@@ -521,7 +520,6 @@
msg=msg,
messagetype=messagetype )
if trans.app.security_agent.can_modify_library_item( user, roles, ldda ):
- ldda.datatype.before_setting_metadata( ldda )
if "dbkey" in ldda.datatype.metadata_spec and not ldda.metadata.dbkey:
# Copy dbkey into metadata, for backwards compatability
# This looks like it does nothing, but getting the dbkey
diff -r 83dc9642a59e -r 022a8c94883f lib/galaxy/web/controllers/library_admin.py
--- a/lib/galaxy/web/controllers/library_admin.py Tue Dec 08 09:05:35 2009 -0500
+++ b/lib/galaxy/web/controllers/library_admin.py Tue Dec 08 11:46:13 2009 -0500
@@ -493,7 +493,6 @@
if name not in [ 'name', 'info', 'dbkey' ]:
if spec.get( 'default' ):
setattr( ldda.metadata, name, spec.unwrap( spec.get( 'default' ) ) )
- ldda.datatype.before_setting_metadata( ldda )
ldda.datatype.set_meta( ldda )
ldda.datatype.after_setting_metadata( ldda )
trans.sa_session.flush()
@@ -517,7 +516,6 @@
widgets=widgets,
msg=msg,
messagetype=messagetype )
- ldda.datatype.before_setting_metadata( ldda )
if "dbkey" in ldda.datatype.metadata_spec and not ldda.metadata.dbkey:
# Copy dbkey into metadata, for backwards compatability
# This looks like it does nothing, but getting the dbkey
diff -r 83dc9642a59e -r 022a8c94883f lib/galaxy/web/controllers/root.py
--- a/lib/galaxy/web/controllers/root.py Tue Dec 08 09:05:35 2009 -0500
+++ b/lib/galaxy/web/controllers/root.py Tue Dec 08 11:46:13 2009 -0500
@@ -321,7 +321,6 @@
trans.app.datatypes_registry.set_external_metadata_tool.tool_action.execute( trans.app.datatypes_registry.set_external_metadata_tool, trans, incoming = { 'input1':data } )
else:
msg = 'Attributes updated'
- data.datatype.before_setting_metadata( data )
data.set_meta()
data.datatype.after_setting_metadata( data )
trans.sa_session.flush()
@@ -346,7 +345,6 @@
trans.sa_session.refresh( data.dataset )
else:
return trans.show_error_message( "You are not authorized to change this dataset's permissions" )
- data.datatype.before_setting_metadata( data )
if "dbkey" in data.datatype.metadata_spec and not data.metadata.dbkey:
# Copy dbkey into metadata, for backwards compatability
# This looks like it does nothing, but getting the dbkey
@@ -521,7 +519,6 @@
data_file.close()
data.state = data.states.OK
data.set_size()
- data.datatype.before_setting_metadata( data )
data.init_meta()
data.set_meta()
trans.sa_session.flush()
diff -r 83dc9642a59e -r 022a8c94883f tools/data_source/hbvar_filter.py
--- a/tools/data_source/hbvar_filter.py Tue Dec 08 09:05:35 2009 -0500
+++ b/tools/data_source/hbvar_filter.py Tue Dec 08 11:46:13 2009 -0500
@@ -46,7 +46,6 @@
fp.close()
#Set meta data, format file to be valid interval type
if isinstance(data.datatype, datatypes.interval.Interval):
- data.datatype.before_setting_metadata( data )
data.set_meta(first_line_is_header=True)
#check for missing meta data, if all there, comment first line and process file
if not data.missing_meta():
diff -r 83dc9642a59e -r 022a8c94883f tools/data_source/upload.py
--- a/tools/data_source/upload.py Tue Dec 08 09:05:35 2009 -0500
+++ b/tools/data_source/upload.py Tue Dec 08 11:46:13 2009 -0500
@@ -10,6 +10,7 @@
import galaxy.model
from galaxy.datatypes import sniff
from galaxy.datatypes.binary import *
+from galaxy.datatypes.registry import Registry
from galaxy import util
from galaxy.util.json import *
@@ -264,6 +265,9 @@
name = dataset.name,
line_count = line_count )
json_file.write( to_json_string( info ) + "\n" )
+ # Groom the dataset content if necessary
+ datatype = Registry().get_datatype_by_extension( ext )
+ datatype.groom_dataset_content( output_path )
def add_composite_file( dataset, json_file, output_path, files_path ):
if dataset.composite_files:
diff -r 83dc9642a59e -r 022a8c94883f tools/maf/maf_to_bed_code.py
--- a/tools/maf/maf_to_bed_code.py Tue Dec 08 09:05:35 2009 -0500
+++ b/tools/maf/maf_to_bed_code.py Tue Dec 08 11:46:13 2009 -0500
@@ -45,7 +45,6 @@
newdata.info = "The requested file is missing from the system."
newdata.state = newdata.states.ERROR
newdata.dbkey = dbkey
- newdata.datatype.before_setting_metadata( newdata )
newdata.init_meta()
newdata.set_meta()
newdata.set_peek()
details: http://www.bx.psu.edu/hg/galaxy/rev/8feff3bc14bc
changeset: 3155:8feff3bc14bc
user: Greg Von Kuster <greg@bx.psu.edu>
date: Mon Dec 07 16:04:33 2009 -0500
description:
Fix for uploading a Bam file that has not yet been sorted: the call to samtools sort has been moved from the sam_to_bam tool to the Bam().set_meta() method to ensure all Bam datasets are sorted prior to indexing. Added new functional tests to cover uploading unsorted Bam files. Also cleaned up code in the upload tool for uploading various binary data formats.
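In simplified form, the new Bam().set_meta() flow sorts and then indexes ( a sketch assuming samtools is on the PATH; sort_and_index_bam is an illustrative name, and the error handling and temp-symlink bookkeeping are in the diff below ):

    import shutil, subprocess

    def sort_and_index_bam( bam_file_name, sorted_prefix, index_file_name ):
        """Sort a Bam file by leftmost coordinates, then create its index.
        samtools sort writes <sorted_prefix>.bam; samtools index writes a
        .bai file alongside its input."""
        command = "samtools sort %s %s 2>/dev/null" % ( bam_file_name, sorted_prefix )
        subprocess.check_call( command, shell=True )
        tmp_sorted_bam_file_name = '%s.bam' % sorted_prefix
        command = "samtools index %s" % tmp_sorted_bam_file_name
        subprocess.check_call( command, shell=True )
        # Keep the index as metadata and move the sorted Bam over the original.
        shutil.move( '%s.bai' % tmp_sorted_bam_file_name, index_file_name )
        shutil.move( tmp_sorted_bam_file_name, bam_file_name )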
diffstat:
lib/galaxy/datatypes/binary.py | 92 ++++++++++++++++++----
lib/galaxy/datatypes/sniff.py | 3 +
lib/galaxy/datatypes/test/3.bam |
test-data/3.bam |
test/functional/test_get_data.py | 26 ++++++-
tools/data_source/upload.py | 138 +++++++++++++++-------------------
tools/samtools/sam_to_bam.py | 21 +----
tools/samtools/sam_to_bam.xml | 4 -
8 files changed, 164 insertions(+), 120 deletions(-)
diffs (440 lines):
diff -r 119315b57656 -r 8feff3bc14bc lib/galaxy/datatypes/binary.py
--- a/lib/galaxy/datatypes/binary.py Mon Dec 07 15:57:06 2009 -0500
+++ b/lib/galaxy/datatypes/binary.py Mon Dec 07 16:04:33 2009 -0500
@@ -12,7 +12,6 @@
log = logging.getLogger(__name__)
-sniffable_binary_formats = [ 'sff', 'bam' ]
# Currently these supported binary data types must be manually set on upload
unsniffable_binary_formats = [ 'ab1', 'scf' ]
@@ -55,29 +54,84 @@
def init_meta( self, dataset, copy_from=None ):
Binary.init_meta( self, dataset, copy_from=copy_from )
def set_meta( self, dataset, overwrite = True, **kwd ):
- """ Sets index for BAM file. """
+ """ Ensures that the Bam file contents are sorted and creates the index for the BAM file. """
+ errors = False
# These metadata values are not accessible by users, always overwrite
index_file = dataset.metadata.bam_index
- if not index_file:
+ if index_file:
+ # If an index file already exists on disk, then the data must have previously been sorted
+ # since samtools requires a sorted Bam file in order to create an index.
+ sorted = os.path.exists( index_file.file_name )
+ else:
index_file = dataset.metadata.spec['bam_index'].param.new_file( dataset = dataset )
+ sorted = False
+ tmp_dir = tempfile.gettempdir()
try:
- # Using a symlink from ~/database/files/dataset_XX.dat, create a temporary file
- # to store the indexex generated from samtools, something like ~/tmp/dataset_XX.dat.bai
- tmp_dir = tempfile.gettempdir()
- tmp_file_path = os.path.join( tmp_dir, os.path.basename( dataset.file_name ) )
- # Here tmp_file_path looks something like /tmp/dataset_XX.dat
- os.symlink( dataset.file_name, tmp_file_path )
- command = 'samtools index %s' % tmp_file_path
- proc = subprocess.Popen( args=command, shell=True )
- proc.wait()
- except:
- err_msg = 'Error creating index file (%s) for BAM file (%s)' % ( str( tmp_file_path ), str( dataset.file_name ) )
+ # Create a symlink from the temporary directory to the dataset file so that samtools can mess with it.
+ tmp_dataset_file_name = os.path.join( tmp_dir, os.path.basename( dataset.file_name ) )
+ # Here tmp_dataset_file_name looks something like /tmp/dataset_XX.dat
+ os.symlink( dataset.file_name, tmp_dataset_file_name )
+ except Exception, e:
+ errors = True
+ err_msg = 'Error creating tmp symlink to file (%s). ' % str( dataset.file_name )
log.exception( err_msg )
- sys.stderr.write( err_msg )
- # Move the temporary index file ~/tmp/dataset_XX.dat.bai to be ~/database/files/_metadata_files/dataset_XX.dat
- shutil.move( '%s.bai' % ( tmp_file_path ), index_file.file_name )
- os.unlink( tmp_file_path )
- dataset.metadata.bam_index = index_file
+ sys.stderr.write( err_msg + str( e ) )
+ if not errors and not sorted:
+ try:
+ # Sort alignments by leftmost coordinates. File <out.prefix>.bam will be created.
+ # TODO: This command may also create temporary files <out.prefix>.%d.bam when the
+ # whole alignment cannot be fitted into memory ( controlled by option -m ). We're
+ # not handling this case here.
+ tmp_sorted_dataset_file = tempfile.NamedTemporaryFile( prefix=tmp_dataset_file_name )
+ tmp_sorted_dataset_file_name = tmp_sorted_dataset_file.name
+ tmp_sorted_dataset_file.close()
+ command = "samtools sort %s %s 2>/dev/null" % ( tmp_dataset_file_name, tmp_sorted_dataset_file_name )
+ proc = subprocess.Popen( args=command, shell=True )
+ proc.wait()
+ except Exception, e:
+ errors = True
+ err_msg = 'Error sorting alignments from (%s). ' % tmp_dataset_file_name
+ log.exception( err_msg )
+ sys.stderr.write( err_msg + str( e ) )
+ if not errors:
+ if sorted:
+ try:
+ # Create the Bam index
+ command = 'samtools index %s' % tmp_dataset_file_name
+ proc = subprocess.Popen( args=command, shell=True )
+ proc.wait()
+ except Exception, e:
+ errors = True
+ err_msg = 'Error creating index for BAM file (%s)' % str( tmp_dataset_file_name )
+ log.exception( err_msg )
+ sys.stderr.write( err_msg + str( e ) )
+ else:
+ tmp_sorted_bam_file_name = '%s.bam' % tmp_sorted_dataset_file_name
+ try:
+ # Create the Bam index
+ command = 'samtools index %s' % tmp_sorted_bam_file_name
+ proc = subprocess.Popen( args=command, shell=True )
+ proc.wait()
+ except Exception, e:
+ errors = True
+ err_msg = 'Error creating index for BAM file (%s)' % str( tmp_sorted_dataset_file_name )
+ log.exception( err_msg )
+ sys.stderr.write( err_msg + str( e ) )
+ if not errors:
+ if sorted:
+ # Move the temporary index file ~/tmp/dataset_XX.dat.bai to our metadata file
+ # storage location ~/database/files/_metadata_files/dataset_XX.dat
+ shutil.move( '%s.bai' % ( tmp_dataset_file_name ), index_file.file_name )
+ else:
+ # Move tmp_sorted_bam_file_name to our output dataset location
+ shutil.move( tmp_sorted_bam_file_name, dataset.file_name )
+ # Move the temporary sorted index file ~/tmp/dataset_XX.dat.bai to our metadata file
+ # storage location ~/database/files/_metadata_files/dataset_XX.dat
+ shutil.move( '%s.bai' % ( tmp_sorted_bam_file_name ), index_file.file_name )
+ # Remove all remaining temporary files
+ os.unlink( tmp_dataset_file_name )
+ # Set the metadata
+ dataset.metadata.bam_index = index_file
def sniff( self, filename ):
# BAM is compressed in the BGZF format, and must not be uncompressed in Galaxy.
# The first 4 bytes of any bam file is 'BAM\1', and the file is binary.
diff -r 119315b57656 -r 8feff3bc14bc lib/galaxy/datatypes/sniff.py
--- a/lib/galaxy/datatypes/sniff.py Mon Dec 07 15:57:06 2009 -0500
+++ b/lib/galaxy/datatypes/sniff.py Mon Dec 07 16:04:33 2009 -0500
@@ -255,6 +255,9 @@
>>> fname = get_test_fname('1.bam')
>>> guess_ext(fname)
'bam'
+ >>> fname = get_test_fname('3.bam')
+ >>> guess_ext(fname)
+ 'bam'
"""
if sniff_order is None:
datatypes_registry = registry.Registry()
diff -r 119315b57656 -r 8feff3bc14bc lib/galaxy/datatypes/test/3.bam
Binary file lib/galaxy/datatypes/test/3.bam has changed
diff -r 119315b57656 -r 8feff3bc14bc test-data/3.bam
Binary file test-data/3.bam has changed
diff -r 119315b57656 -r 8feff3bc14bc test/functional/test_get_data.py
--- a/test/functional/test_get_data.py Mon Dec 07 15:57:06 2009 -0500
+++ b/test/functional/test_get_data.py Mon Dec 07 16:04:33 2009 -0500
@@ -521,7 +521,7 @@
self.check_metadata_for_string( 'value="1.axt" value="\?" Change data type selected value="axt" selected="yes"' )
self.delete_history( id=self.security.encode_id( history.id ) )
def test_0150_upload_file( self ):
- """Test uploading 1.bam, NOT setting the file format"""
+ """Test uploading 1.bam, which is a sorted Bam file creaed by the Galaxy sam_to_bam tool, NOT setting the file format"""
self.check_history_for_string( 'Your history is empty' )
history = sa_session.query( galaxy.model.History ) \
.filter( and_( galaxy.model.History.table.c.deleted==False,
@@ -535,8 +535,30 @@
assert hda is not None, "Problem retrieving hda from database"
self.verify_dataset_correctness( '1.bam', hid=str( hda.hid ) )
self.check_history_for_string( '<span class="bam">bam</span>' )
+ # Make sure the Bam index was created
+ assert hda.metadata.bam_index is not None, "Bam index was not correctly created for 1.bam"
self.delete_history( id=self.security.encode_id( history.id ) )
- def test_0155_url_paste( self ):
+ def test_0155_upload_file( self ):
+ """Test uploading 3.bam, which is an unsorted Bam file, NOT setting the file format"""
+ self.check_history_for_string( 'Your history is empty' )
+ history = sa_session.query( galaxy.model.History ) \
+ .filter( and_( galaxy.model.History.table.c.deleted==False,
+ galaxy.model.History.table.c.user_id==admin_user.id ) ) \
+ .order_by( desc( galaxy.model.History.table.c.create_time ) ) \
+ .first()
+ self.upload_file( '3.bam' )
+ hda = sa_session.query( galaxy.model.HistoryDatasetAssociation ) \
+ .order_by( desc( galaxy.model.HistoryDatasetAssociation.table.c.create_time ) ) \
+ .first()
+ assert hda is not None, "Problem retrieving hda from database"
+ # Since 3.bam is not sorted, we cannot verify dataset correctness since the uploaded
+ # dataset will be sorted. However, the check below to see if the index was created is
+ # sufficient.
+ self.check_history_for_string( '<span class="bam">bam</span>' )
+ # Make sure the Bam index was created
+ assert hda.metadata.bam_index is not None, "Bam index was not correctly created for 3.bam"
+ self.delete_history( id=self.security.encode_id( history.id ) )
+ def test_0160_url_paste( self ):
"""Test url paste behavior"""
# Logged in as admin_user
# Deleting the current history should have created a new history
diff -r 119315b57656 -r 8feff3bc14bc tools/data_source/upload.py
--- a/tools/data_source/upload.py Mon Dec 07 15:57:06 2009 -0500
+++ b/tools/data_source/upload.py Mon Dec 07 16:04:33 2009 -0500
@@ -9,7 +9,7 @@
# need to import model before sniff to resolve a circular import dependency
import galaxy.model
from galaxy.datatypes import sniff
-from galaxy.datatypes.binary import sniffable_binary_formats, unsniffable_binary_formats
+from galaxy.datatypes.binary import *
from galaxy import util
from galaxy.util.json import *
@@ -61,62 +61,54 @@
if chunk is None:
temp.close()
return False
-def check_binary( temp_name, chunk=None ):
- if chunk is None:
+def check_binary( temp_name ):
+ is_binary = False
+ temp = open( temp_name, "U" )
+ chars_read = 0
+ for chars in temp:
+ for char in chars:
+ chars_read += 1
+ if ord( char ) > 128:
+ is_binary = True
+ break
+ if chars_read > 100:
+ break
+ if chars_read > 100:
+ break
+ temp.close()
+ return is_binary
+def check_bam( temp_name ):
+ return Bam().sniff( temp_name )
+def check_sff( temp_name ):
+ return Sff().sniff( temp_name )
+def check_gzip( temp_name ):
+ # This method returns a tuple of booleans representing ( is_gzipped, is_valid )
+ # Make sure we have a gzipped file
+ try:
temp = open( temp_name, "U" )
- else:
- temp = chunk
- lineno = 0
- for line in temp:
- lineno += 1
- line = line.strip()
- if line:
- for char in line:
- if ord( char ) > 128:
- if chunk is None:
- temp.close()
- return True
- if lineno > 10:
- break
- if chunk is None:
+ magic_check = temp.read( 2 )
temp.close()
- return False
-def check_gzip( temp_name ):
- # This is sort of hacky. BAM is compressed in the BGZF format, and must
- # not be uncompressed in upon upload ( it will be detected as gzipped ).
- # The tuple we're returning from here contains boolean values for
- # ( is_compressed, is_valid, is_bam ).
- temp = open( temp_name, "U" )
- magic_check = temp.read( 2 )
- temp.close()
- if magic_check != util.gzip_magic:
- return ( False, False, False )
+ if magic_check != util.gzip_magic:
+ return ( False, False )
+ except:
+ return ( False, False )
+ # We support some binary data types, so check if the compressed binary file is valid
+ # If the file is Bam, it should already have been detected as such, so we'll just check
+ # for sff format.
+ try:
+ header = gzip.open( temp_name ).read(4)
+ if binascii.b2a_hex( header ) == binascii.hexlify( '.sff' ):
+ return ( True, True )
+ except:
+ return( False, False )
CHUNK_SIZE = 2**15 # 32Kb
- gzipped_file = gzip.GzipFile( temp_name )
+ gzipped_file = gzip.GzipFile( temp_name, mode='rb' )
chunk = gzipped_file.read( CHUNK_SIZE )
gzipped_file.close()
+ # See if we have a compressed HTML file
if check_html( temp_name, chunk=chunk ):
- return ( True, False, False )
- if check_binary( temp_name, chunk=chunk ):
- # We do support some binary data types, so check if the compressed binary file is valid
- # We currently only check for [ 'sff', 'bam' ]
- # TODO: this should be fixed to more easily support future-supported binary data types.
- # This is currently just copied from the sniff methods.
- # The first 4 bytes of any bam file is 'BAM\1', and the file is binary.
- try:
- header = gzip.open( temp_name ).read(4)
- if binascii.b2a_hex( header ) == binascii.hexlify( 'BAM\1' ):
- return ( True, True, True )
- except:
- pass
- try:
- header = gzip.open( temp_name ).read(4)
- if binascii.b2a_hex( header ) == binascii.hexlify( '.sff' ):
- return ( True, True, False )
- except:
- pass
- return ( True, False, False )
- return ( True, True, False )
+ return ( True, False )
+ return ( True, True )
def check_zip( temp_name ):
if not zipfile.is_zipfile( temp_name ):
return ( False, False, None )
@@ -126,7 +118,7 @@
# 2. All file extensions within an archive must be the same
name = zip_file.namelist()[0]
test_ext = name.split( "." )[1].strip().lower()
- if not ( test_ext == 'scf' or test_ext == 'ab1' or test_ext == 'txt' ):
+ if not ( test_ext in unsniffable_binary_formats or test_ext == 'txt' ):
return ( True, False, test_ext )
for name in zip_file.namelist():
ext = name.split( "." )[1].strip().lower()
@@ -163,21 +155,25 @@
dataset.is_multi_byte = util.is_multi_byte( codecs.open( dataset.path, 'r', 'utf-8' ).read( 100 ) )
except UnicodeDecodeError, e:
dataset.is_multi_byte = False
+ # Is dataset content multi-byte?
if dataset.is_multi_byte:
data_type = 'multi-byte char'
ext = sniff.guess_ext( dataset.path, is_multi_byte=True )
+ # Is dataset content supported sniffable binary?
+ elif check_bam( dataset.path ):
+ ext = 'bam'
+ data_type = 'bam'
+ elif check_sff( dataset.path ):
+ ext = 'sff'
+ data_type = 'sff'
else:
# See if we have a gzipped file, which, if it passes our restrictions, we'll uncompress
- is_gzipped, is_valid, is_bam = check_gzip( dataset.path )
+ is_gzipped, is_valid = check_gzip( dataset.path )
if is_gzipped and not is_valid:
file_err( 'The uploaded file contains inappropriate content', dataset, json_file )
return
- elif is_gzipped and is_valid and is_bam:
- ext = 'bam'
- data_type = 'bam'
- elif is_gzipped and is_valid and not is_bam:
- # We need to uncompress the temp_name file, but BAM files must remain compressed
- # in order for samtools to function on them
+ elif is_gzipped and is_valid:
+ # We need to uncompress the temp_name file, but BAM files must remain compressed in the BGZF format
CHUNK_SIZE = 2**20 # 1Mb
fd, uncompressed = tempfile.mkstemp( prefix='data_id_%s_upload_gunzip_' % dataset.dataset_id, dir=os.path.dirname( dataset.path ), text=False )
gzipped_file = gzip.GzipFile( dataset.path, 'rb' )
@@ -207,7 +203,7 @@
elif is_zipped and is_valid:
# Currently, we force specific tools to handle this case. We also require the user
# to manually set the incoming file_type
- if ( test_ext == 'ab1' or test_ext == 'scf' ) and dataset.file_type != 'binseq.zip':
+ if ( test_ext in unsniffable_binary_formats ) and dataset.file_type != 'binseq.zip':
file_err( "Invalid 'File Format' for archive consisting of binary files - use 'Binseq.zip'", dataset, json_file )
return
elif test_ext == 'txt' and dataset.file_type != 'txtseq.zip':
@@ -220,35 +216,25 @@
ext = dataset.file_type
if not data_type:
if check_binary( dataset.path ):
+ # We have a binary dataset, but it is not Bam or Sff
data_type = 'binary'
- binary_ok = False
+ #binary_ok = False
parts = dataset.name.split( "." )
if len( parts ) > 1:
ext = parts[1].strip().lower()
- if ext in unsniffable_binary_formats and dataset.file_type == ext:
- binary_ok = True
+ if ext not in unsniffable_binary_formats:
+ file_err( 'The uploaded file contains inappropriate content', dataset, json_file )
+ return
elif ext in unsniffable_binary_formats and dataset.file_type != ext:
err_msg = "You must manually set the 'File Format' to '%s' when uploading %s files." % ( ext.capitalize(), ext )
file_err( err_msg, dataset, json_file )
return
- if not binary_ok and ext in sniffable_binary_formats:
- # Sniff the file to confirm it's data type
- tmp_ext = sniff.guess_ext( dataset.path )
- if tmp_ext == ext:
- binary_ok = True
- else:
- err_msg = "The content of the file does not match its type (%s)." % ext.capitalize()
- file_err( err_msg, dataset, json_file )
- return
- if not binary_ok:
- file_err( 'The uploaded file contains inappropriate content', dataset, json_file )
- return
if not data_type:
# We must have a text file
if check_html( dataset.path ):
file_err( 'The uploaded file contains inappropriate content', dataset, json_file )
return
- if data_type != 'bam' and data_type != 'binary' and data_type != 'zip':
+ if data_type != 'binary' and data_type != 'zip':
if dataset.space_to_tab:
line_count = sniff.convert_newlines_sep2tabs( dataset.path )
else:
diff -r 119315b57656 -r 8feff3bc14bc tools/samtools/sam_to_bam.py
--- a/tools/samtools/sam_to_bam.py Mon Dec 07 15:57:06 2009 -0500
+++ b/tools/samtools/sam_to_bam.py Mon Dec 07 16:04:33 2009 -0500
@@ -79,35 +79,18 @@
tmp_aligns_file = tempfile.NamedTemporaryFile()
tmp_aligns_file_name = tmp_aligns_file.name
tmp_aligns_file.close()
- # IMPORTANT NOTE: for some reason the samtools view command gzips the resulting bam file without warning,
- # and the docs do not currently state that this occurs ( very bad ).
command = "samtools view -bt %s -o %s %s 2>/dev/null" % ( fai_index_file_path, tmp_aligns_file_name, options.input1 )
proc = subprocess.Popen( args=command, shell=True )
proc.wait()
+ shutil.move( tmp_aligns_file_name, options.output1 )
except Exception, e:
stop_err( 'Error extracting alignments from (%s), %s' % ( options.input1, str( e ) ) )
- try:
- # Sort alignments by leftmost coordinates. File <out.prefix>.bam will be created. This command
- # may also create temporary files <out.prefix>.%d.bam when the whole alignment cannot be fitted
- # into memory ( controlled by option -m ).
- tmp_sorted_aligns_file = tempfile.NamedTemporaryFile()
- tmp_sorted_aligns_file_name = tmp_sorted_aligns_file.name
- tmp_sorted_aligns_file.close()
- command = "samtools sort %s %s 2>/dev/null" % ( tmp_aligns_file_name, tmp_sorted_aligns_file_name )
- proc = subprocess.Popen( args=command, shell=True )
- proc.wait()
- except Exception, e:
- stop_err( 'Error sorting alignments from (%s), %s' % ( tmp_aligns_file_name, str( e ) ) )
- # Move tmp_aligns_file_name to our output dataset location
- sorted_bam_file = '%s.bam' % tmp_sorted_aligns_file_name
- shutil.move( sorted_bam_file, options.output1 )
+ # NOTE: samtools requires the Bam file to be sorted, but this occurs in Bam().set_meta() to ensure that uploaded Bam files are sorted as well.
if options.ref_file != "None":
# Remove the symlink from /tmp/dataset_13.dat to ~/database/files/000/dataset_13.dat
os.unlink( fai_index_file_path )
# Remove the index file
index_file_name = '%s.fai' % fai_index_file_path
os.unlink( index_file_name )
- # Remove the tmp_aligns_file_name
- os.unlink( tmp_aligns_file_name )
if __name__=="__main__": __main__()
diff -r 119315b57656 -r 8feff3bc14bc tools/samtools/sam_to_bam.xml
--- a/tools/samtools/sam_to_bam.xml Mon Dec 07 15:57:06 2009 -0500
+++ b/tools/samtools/sam_to_bam.xml Mon Dec 07 16:04:33 2009 -0500
@@ -31,10 +31,6 @@
<data name="output1" format="bam"/>
</outputs>
<tests>
- <!--
- # IMPORTANT NOTE: for some reason the samtools view command gzips the resulting bam file without warning,
- # and the docs do not currently state that this occurs ( very bad ).
- -->
<test>
<param name="index_source" value="history" />
<param name="input1" value="3.sam" ftype="sam" />