details: http://www.bx.psu.edu/hg/galaxy/rev/97226da136bf
changeset: 3171:97226da136bf
user: Dan Blankenberg <dan@bx.psu.edu>
date: Thu Dec 10 14:44:05 2009 -0500
description:
Fix for external metadata auto-detect when using track_jobs_in_database.
diffstat:
lib/galaxy/datatypes/metadata.py | 4 ++--
lib/galaxy/tools/actions/metadata.py | 3 +++
2 files changed, 5 insertions(+), 2 deletions(-)
diffs (39 lines):
diff -r 9666acdf74f4 -r 97226da136bf lib/galaxy/datatypes/metadata.py
--- a/lib/galaxy/datatypes/metadata.py Thu Dec 10 14:37:22 2009 -0500
+++ b/lib/galaxy/datatypes/metadata.py Thu Dec 10 14:44:05 2009 -0500
@@ -510,8 +510,8 @@
.first() #there should only be one or None
return None
def get_dataset_metadata_key( self, dataset ):
- # Set meta can be called on library items and history items,
- # need to make different keys for them, since ids can overlap
+ # Set meta can be called on library items and history items,
+ # need to make different keys for them, since ids can overlap
return "%s_%d" % ( dataset.__class__.__name__, dataset.id )
def setup_external_metadata( self, datasets, sa_session, exec_dir=None, tmp_dir=None, dataset_files_path=None,
output_fnames=None, config_root=None, datatypes_config=None, job_metadata=None, kwds={} ):
diff -r 9666acdf74f4 -r 97226da136bf lib/galaxy/tools/actions/metadata.py
--- a/lib/galaxy/tools/actions/metadata.py Thu Dec 10 14:37:22 2009 -0500
+++ b/lib/galaxy/tools/actions/metadata.py Thu Dec 10 14:44:05 2009 -0500
@@ -21,11 +21,13 @@
job.session_id = trans.get_galaxy_session().id
job.history_id = trans.history.id
job.tool_id = tool.id
+ start_job_state = job.state #should be job.states.NEW
try:
# For backward compatibility, some tools may not have versions yet.
job.tool_version = tool.version
except:
job.tool_version = "1.0.0"
+ job.state = job.states.WAITING #we need to set job state to something other than NEW, or else when tracking jobs in db it will be picked up before we have added input / output parameters
trans.sa_session.add( job )
trans.sa_session.flush() #ensure job.id is available
@@ -51,6 +53,7 @@
#Need a special state here to show that metadata is being set and also allow the job to run
# i.e. if state was set to 'running' the set metadata job would never run, as it would wait for input (the dataset to set metadata on) to be in a ready state
dataset.state = dataset.states.SETTING_METADATA
+ job.state = start_job_state #job inputs have been configured, restore initial job state
trans.sa_session.flush()
# Queue the job for execution
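The essence of this fix: when track_jobs_in_database is enabled, the job manager polls the database for jobs in the NEW state, so a metadata job that is flushed before its input/output parameters exist can be picked up too early. The diff parks the job in WAITING while it is wired up, then restores the starting state. Below is a minimal sketch of that pattern using hypothetical stand-ins for the Galaxy Job model and the SQLAlchemy session (not the real classes):

    class JobStates(object):
        NEW = 'new'
        WAITING = 'waiting'

    class Job(object):
        states = JobStates
        def __init__(self):
            self.state = self.states.NEW

    class FakeSession(object):
        # Stand-in for trans.sa_session; the real one persists to the database.
        def add(self, obj):
            pass
        def flush(self):
            pass

    def queue_metadata_job(session, job, wire_inputs_and_outputs):
        start_job_state = job.state            # normally job.states.NEW
        # Park the job in a non-NEW state so a poller that dispatches NEW jobs
        # cannot pick it up before its inputs/outputs are attached.
        job.state = job.states.WAITING
        session.add(job)
        session.flush()                        # job.id becomes available here
        wire_inputs_and_outputs(job)           # add parameters, inputs, outputs
        # Inputs and outputs exist now; restore the original state so the job
        # tracker dispatches the job on its next sweep.
        job.state = start_job_state
        session.flush()

    job = Job()
    queue_metadata_job(FakeSession(), job, lambda j: None)
    assert job.state == JobStates.NEW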
details: http://www.bx.psu.edu/hg/galaxy/rev/f7dee0438854
changeset: 3173:f7dee0438854
user: Dan Blankenberg <dan@bx.psu.edu>
date: Fri Dec 11 11:11:15 2009 -0500
description:
Modify the way that data_source tools and tool_types are handled.
Data source tools now use the DataSourceTool class. A lot of cleanup is still possible in the existing data source tool code; see the comments.
diffstat:
lib/galaxy/jobs/__init__.py | 6 +-
lib/galaxy/tools/__init__.py | 150 +++++++++++++++++++++----------------
2 files changed, 87 insertions(+), 69 deletions(-)
diffs (243 lines):
diff -r ab7877640903 -r f7dee0438854 lib/galaxy/jobs/__init__.py
--- a/lib/galaxy/jobs/__init__.py Thu Dec 10 15:37:00 2009 -0500
+++ b/lib/galaxy/jobs/__init__.py Fri Dec 11 11:11:15 2009 -0500
@@ -377,8 +377,7 @@
param_dict = self.tool.build_param_dict( incoming, inp_data, out_data, self.get_output_fnames(), self.working_directory )
# Certain tools require tasks to be completed prior to job execution
# ( this used to be performed in the "exec_before_job" hook, but hooks are deprecated ).
- if self.tool.tool_type is not None:
- out_data = self.tool.exec_before_job( self.queue.app, inp_data, out_data, param_dict )
+ self.tool.exec_before_job( self.queue.app, inp_data, out_data, param_dict )
# Run the before queue ("exec_before_job") hook
self.tool.call_hook( 'exec_before_job', self.queue.app, inp_data=inp_data,
out_data=out_data, tool=self.tool, param_dict=incoming)
@@ -600,8 +599,7 @@
param_dict.update({'__collected_datasets__':collected_datasets})
# Certain tools require tasks to be completed after job execution
# ( this used to be performed in the "exec_after_process" hook, but hooks are deprecated ).
- if self.tool.tool_type is not None:
- self.tool.exec_after_process( self.queue.app, inp_data, out_data, param_dict, job = job )
+ self.tool.exec_after_process( self.queue.app, inp_data, out_data, param_dict, job = job )
# Call 'exec_after_process' hook
self.tool.call_hook( 'exec_after_process', self.queue.app, inp_data=inp_data,
out_data=out_data, param_dict=param_dict,
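Dropping the `if self.tool_type is not None` guard works because the hooks now live on the Tool base class as no-ops and subclasses override them (see the tools/__init__.py diff below), so the job wrapper can call them unconditionally. A small illustrative sketch of that dispatch, with hypothetical method bodies:

    class Tool(object):
        def exec_before_job(self, app, inp_data, out_data, param_dict={}):
            # Ordinary tools have no pre-job work; subclasses override as needed.
            pass

    class DataSourceTool(Tool):
        def exec_before_job(self, app, inp_data, out_data, param_dict={}):
            # Data-source-specific preparation happens here instead of behind a
            # tool_type check at the call site.
            for name, data in out_data.items():
                data.info = param_dict.get('info', None)

    # The job wrapper no longer cares which subclass it holds:
    #     self.tool.exec_before_job( self.queue.app, inp_data, out_data, param_dict )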
diff -r ab7877640903 -r f7dee0438854 lib/galaxy/tools/__init__.py
--- a/lib/galaxy/tools/__init__.py Thu Dec 10 15:37:00 2009 -0500
+++ b/lib/galaxy/tools/__init__.py Fri Dec 11 11:11:15 2009 -0500
@@ -134,6 +134,8 @@
cls = type_elem.get( 'class' )
mod = __import__( module, globals(), locals(), [cls])
ToolClass = getattr( mod, cls )
+ elif root.get( 'tool_type', None ) is not None:
+ ToolClass = tool_types.get( root.get( 'tool_type' ) )
else:
ToolClass = Tool
return ToolClass( config_file, root, self.app )
@@ -263,6 +265,7 @@
"""
Represents a computational tool that can be executed through Galaxy.
"""
+ tool_type = 'default'
def __init__( self, config_file, root, app ):
"""
Load a tool from the config named by `config_file`
@@ -296,8 +299,6 @@
self.version = "1.0.0"
# Support multi-byte tools
self.is_multi_byte = util.string_as_bool( root.get( "is_multi_byte", False ) )
- # Type of tool
- self.tool_type = root.get( "tool_type", None )
#Force history to fully refresh after job execution for this tool. Useful i.e. when an indeterminate number of outputs are created by a tool.
self.force_history_refresh = util.string_as_bool( root.get( 'force_history_refresh', 'False' ) )
#load input translator, used by datasource tools to change names/values of incoming parameters
@@ -696,7 +697,7 @@
rval = dict()
for key, param in self.inputs_by_page[page].iteritems():
if not isinstance( param, ToolParameter ):
- raise Exception( "'get_param_html_map' only supported for simple paramters" )
+ raise Exception( "'get_param_html_map' only supported for simple paramters" )
rval[key] = param.get_html( trans, other_values=other_values )
return rval
@@ -1236,8 +1237,8 @@
param_dict[ "_CHILD___%s___%s" % ( name, child.designation ) ] = DatasetFilenameWrapper( child )
for out_name, output in self.outputs.iteritems():
if out_name not in param_dict and output.filters:
- #assume the reason we lack this output is because a filter failed to pass; for tool writing convienence, provide a NoneDataset
- param_dict[ out_name ] = NoneDataset( datatypes_registry = self.app.datatypes_registry, ext = output.format )
+ #assume the reason we lack this output is because a filter failed to pass; for tool writing convienence, provide a NoneDataset
+ param_dict[ out_name ] = NoneDataset( datatypes_registry = self.app.datatypes_registry, ext = output.format )
# We add access to app here, this allows access to app.config, etc
param_dict['__app__'] = RawObjectWrapper( self.app )
# More convienent access to app.config.new_file_path; we don't need to wrap a string
@@ -1345,9 +1346,9 @@
redirect_url += "&%s=%s" % ( p_name, rup_dict[ p_name ] )
# Add the current user email to redirect_url
if data.history.user:
- USERNAME = str( data.history.user.email )
+ USERNAME = str( data.history.user.email )
else:
- USERNAME = 'Anonymous'
+ USERNAME = 'Anonymous'
redirect_url += "&USERNAME=%s" % USERNAME
return redirect_url
@@ -1365,65 +1366,10 @@
raise
def exec_before_job( self, app, inp_data, out_data, param_dict={} ):
- if self.tool_type == 'data_source':
- dbkey = param_dict.get( 'dbkey' )
- organism = param_dict.get( 'organism' )
- table = param_dict.get( 'table' )
- description = param_dict.get( 'description' )
- info = param_dict.get( 'info' )
- if description == 'range':
- description = param_dict.get( 'position', '' )
- if not description:
- description = 'unknown position'
- gb_landmark_region = param_dict.get( 'q' )
- data_type = param_dict.get( 'data_type' )
- items = out_data.items()
- for name, data in items:
- if organism and table and description:
- # This is UCSC
- data.name = '%s on %s: %s (%s)' % ( data.name, organism, table, description )
- elif gb_landmark_region:
- # This is GBrowse
- data.name = '%s on %s' % ( data.name, gb_landmark_region )
- data.info = info
- data.dbkey = dbkey
- if data_type not in app.datatypes_registry.datatypes_by_extension:
- # Setting data_type to tabular will force the data to be sniffed in exec_after_process()
- data_type = 'tabular'
- data = app.datatypes_registry.change_datatype( data, data_type )
- # Store external data source's request parameters temporarily in output file.
- # In case the config setting for "outputs_to_working_directory" is True, we must write to
- # the DatasetFilenameWrapper object in the param_dict since it's "false_path" attribute
- # is the temporary path to the output dataset ( until the job is run ). However,
- # even if the "outputs_to_working_directory" setting is False, we can still open the file
- # the same way for temporarily storing the request parameters.
- out = open( str( param_dict.get( name ) ), 'w' )
- for key, value in param_dict.items():
- print >> out, '%s\t%s' % ( key, value )
- out.close()
- out_data[ name ] = data
- return out_data
+ pass
def exec_after_process( self, app, inp_data, out_data, param_dict, job = None ):
- if self.tool_type == 'data_source':
- name, data = out_data.items()[0]
- data.set_size()
- if data.state == data.states.OK:
- data.name = param_dict.get( 'name', data.name )
- data.info = param_dict.get( 'info', data.name )
- data.dbkey = param_dict.get( 'dbkey', data.dbkey )
- data.extension = param_dict.get( 'data_type', data.extension )
- if data.extension in [ 'txt', 'tabular' ]:
- data_type = sniff.guess_ext( data.file_name, sniff_order=app.datatypes_registry.sniff_order )
- if data.extension != data_type:
- data = app.datatypes_registry.change_datatype( data, data_type )
- elif not isinstance( data.datatype, datatypes.interval.Bed ) and isinstance( data.datatype, datatypes.interval.Interval ):
- data.set_meta()
- if data.missing_meta():
- data = app.datatypes_registry.change_datatype( data, 'tabular' )
- data.set_peek()
- self.sa_session.add( data )
- self.sa_session.flush()
+ pass
def collect_associated_files( self, output, job_working_directory ):
for name, hda in output.items():
@@ -1559,7 +1505,77 @@
self.sa_session.flush()
return primary_datasets
+class DataSourceTool( Tool ):
+ tool_type = 'data_source'
+ def exec_before_job( self, app, inp_data, out_data, param_dict={} ):
+ #TODO: Allow for a generic way for all Tools to have output dataset properties be set to input parameter values
+ #as defined in a tool XML
+ dbkey = param_dict.get( 'dbkey' )
+ organism = param_dict.get( 'organism' )
+ table = param_dict.get( 'table' )
+ description = param_dict.get( 'description' )
+ info = param_dict.get( 'info' )
+ if description == 'range':
+ description = param_dict.get( 'position', '' )
+ if not description:
+ description = 'unknown position'
+ gb_landmark_region = param_dict.get( 'q' )
+ data_type = param_dict.get( 'data_type' )
+ items = out_data.items()
+ for name, data in items:
+ if organism and table and description:
+ # This is UCSC
+ data.name = '%s on %s: %s (%s)' % ( data.name, organism, table, description )
+ elif gb_landmark_region:
+ # This is GBrowse
+ data.name = '%s on %s' % ( data.name, gb_landmark_region )
+ data.info = info
+ data.dbkey = dbkey
+ if data_type not in app.datatypes_registry.datatypes_by_extension:
+ # Setting data_type to tabular will force the data to be sniffed in exec_after_process()
+ data_type = 'tabular'
+ data.change_datatype( data_type )
+ # Store external data source's request parameters temporarily in output file.
+ # In case the config setting for "outputs_to_working_directory" is True, we must write to
+ # the DatasetFilenameWrapper object in the param_dict since it's "false_path" attribute
+ # is the temporary path to the output dataset ( until the job is run ). However,
+ # even if the "outputs_to_working_directory" setting is False, we can still open the file
+ # the same way for temporarily storing the request parameters.
+
+ ## TODO: Input parameters should be jsonified and written into a <configfile> and passed to data_source.py,
+ ## instead of writing tab separated key, value pairs to the output file
+ out = open( str( param_dict.get( name ) ), 'w' )
+ for key, value in param_dict.items():
+ print >> out, '%s\t%s' % ( key, value )
+ out.close()
+
+ def exec_after_process( self, app, inp_data, out_data, param_dict, job = None ):
+ log.debug('after proc called')
+ name, data = out_data.items()[0]
+ data.set_size()
+ #TODO: these should be already be set before the tool runs:
+ if data.state == data.states.OK:
+ data.name = param_dict.get( 'name', data.name )
+ data.info = param_dict.get( 'info', data.name )
+ data.dbkey = param_dict.get( 'dbkey', data.dbkey )
+ data.extension = param_dict.get( 'data_type', data.extension )
+ #TODO: these should be possible as part of data_source.py and external set_meta, see the upload tool:
+ if data.extension in [ 'txt', 'tabular' ]:
+ data_type = sniff.guess_ext( data.file_name, sniff_order=app.datatypes_registry.sniff_order )
+ if data.extension != data_type:
+ data.change_datatype( data_type )
+ elif not isinstance( data.datatype, datatypes.interval.Bed ) and isinstance( data.datatype, datatypes.interval.Interval ):
+ if data.missing_meta():
+ data.change_datatype( 'tabular' )
+ data.set_peek()
+ self.sa_session.add( data )
+ self.sa_session.flush()
+
+class DataDestinationTool( Tool ):
+ tool_type = 'data_destination'
+
class SetMetadataTool( Tool ):
+ tool_type = 'set_metadata'
def exec_after_process( self, app, inp_data, out_data, param_dict, job = None ):
for name, dataset in inp_data.iteritems():
external_metadata = galaxy.datatypes.metadata.JobExternalOutputMetadataWrapper( job )
@@ -1572,7 +1588,11 @@
self.sa_session.add( dataset )
self.sa_session.flush()
-
+#load tool_type to ToolClass mappings
+tool_types = {}
+for tool_class in [ Tool, DataDestinationTool, SetMetadataTool, DataSourceTool ]:
+ tool_types[ tool_class.tool_type ] = tool_class
+
# ---- Utility classes to be factored out -----------------------------------
class BadValue( object ):
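The tool-class selection added near the top of the diff is a small class-attribute registry: each Tool subclass declares a tool_type string, a tool_type-to-class mapping is built at import time, and the loader consults it when the tool XML root carries a tool_type attribute. Condensed here into a standalone sketch (the real classes carry far more behavior, and `root` is assumed to be an ElementTree-style element):

    class Tool(object):
        tool_type = 'default'

    class DataSourceTool(Tool):
        tool_type = 'data_source'

    class DataDestinationTool(Tool):
        tool_type = 'data_destination'

    class SetMetadataTool(Tool):
        tool_type = 'set_metadata'

    # Load tool_type to ToolClass mappings, as at the bottom of the diff.
    tool_types = {}
    for tool_class in [Tool, DataDestinationTool, SetMetadataTool, DataSourceTool]:
        tool_types[tool_class.tool_type] = tool_class

    def resolve_tool_class(root):
        # Selection order mirrors the diff: an explicit <type module=... class=...>
        # element wins, then the tool_type attribute, then plain Tool.
        type_elem = root.find('type')
        if type_elem is not None:
            module = type_elem.get('module')
            cls = type_elem.get('class')
            mod = __import__(module, globals(), locals(), [cls])
            return getattr(mod, cls)
        if root.get('tool_type', None) is not None:
            return tool_types.get(root.get('tool_type'))
        return Tool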
details: http://www.bx.psu.edu/hg/galaxy/rev/7e1e7c5d8dbe
changeset: 3167:7e1e7c5d8dbe
user: Kelly Vincent <kpvincent@bx.psu.edu>
date: Wed Dec 09 18:50:12 2009 -0500
description:
Updated the megablast_wrapper tool to allow a build date to be included with the database name and displayed (so the user knows how current the database is).
diffstat:
tool-data/blastdb.loc.sample | 19 ++++++++++++-------
tools/metag_tools/megablast_wrapper.py | 30 +++++++++++++++---------------
tools/metag_tools/megablast_wrapper.xml | 5 +++--
3 files changed, 30 insertions(+), 24 deletions(-)
diffs (138 lines):
diff -r 0ba4a2b77f65 -r 7e1e7c5d8dbe tool-data/blastdb.loc.sample
--- a/tool-data/blastdb.loc.sample Wed Dec 09 16:20:12 2009 -0500
+++ b/tool-data/blastdb.loc.sample Wed Dec 09 18:50:12 2009 -0500
@@ -1,14 +1,18 @@
#This is a sample file distributed with Galaxy that is used by some
-#short read tools. The blastdb.loc file has this format (white space
-#characters are TAB characters):
+#short read tools. The blastdb.loc file has this format:
#
-#<database> <path to base name>
+#<database> <build_date> <path to base name>
+#
+#where a single space separates the first two and a tab the last two.
+#It is important that the actual database name does not have a space in it,
+#and that the first tab that appears in the line is right before the path.
+#The <build_date> can look any way you want.
#
#So, for example, if your database is nt and the path to your base name
#is /depot/data2/galaxy/blastdb/nt/nt.chunk, then the blastdb.loc entry
#would look like this:
#
-#nt /depot/data2/galaxy/blastdb/nt/nt.chunk
+#nt 02 Dec 2009 /depot/data2/galaxy/blastdb/nt/nt.chunk
#
#and your /depot/data2/galaxy/blastdb/nt directory would contain all of
#your "base names" (e.g.):
@@ -16,11 +20,12 @@
#-rw-r--r-- 1 wychung galaxy 23437408 2008-04-09 11:26 nt.chunk.00.nhr
#-rw-r--r-- 1 wychung galaxy 3689920 2008-04-09 11:26 nt.chunk.00.nin
#-rw-r--r-- 1 wychung galaxy 251215198 2008-04-09 11:26 nt.chunk.00.nsq
+#...etc...
#
#Your blastdb.loc file should include an entry per line for each "base name"
#you have stored. For example:
#
-#nt /depot/data2/galaxy/blastdb/nt/nt.chunk
-#wgs /depot/data2/galaxy/blastdb/wgs/wgs.chunk
-#test /depot/data2/galaxy/blastdb/test/test.fa
+#nt 02 Dec 2009 /depot/data2/galaxy/blastdb/nt/nt.chunk
+#wgs 30 Nov 2009 /depot/data2/galaxy/blastdb/wgs/wgs.chunk
+#test 20 Sep 2008 /depot/data2/galaxy/blastdb/test/test
#...etc...
diff -r 0ba4a2b77f65 -r 7e1e7c5d8dbe tools/metag_tools/megablast_wrapper.py
--- a/tools/metag_tools/megablast_wrapper.py Wed Dec 09 16:20:12 2009 -0500
+++ b/tools/metag_tools/megablast_wrapper.py Wed Dec 09 18:50:12 2009 -0500
@@ -30,7 +30,7 @@
#Parse Command Line
options, args = doc_optparse.parse( __doc__ )
- db_build = options.db_build
+ db_build = options.db_build.split( ' ' )[0]
query_filename = options.input.strip()
output_filename = options.output.strip()
mega_word_size = options.word_size # -W
@@ -43,33 +43,33 @@
# megablast parameters
try:
- int(mega_word_size)
+ int( mega_word_size )
except:
- stop_err('Invalid value for word size')
+ stop_err( 'Invalid value for word size' )
try:
float(mega_iden_cutoff)
except:
- stop_err('Invalid value for identity cut-off')
+ stop_err( 'Invalid value for identity cut-off' )
try:
float(mega_evalue_cutoff)
except:
- stop_err('Invalid value for Expectation value')
+ stop_err( 'Invalid value for Expectation value' )
# prepare the database
db = {}
for i, line in enumerate( file( DB_LOC ) ):
line = line.rstrip( '\r\n' )
- if not line or line.startswith('#'):
+ if not line or line.startswith( '#' ):
continue
- fields = line.split()
- if len(fields) == 2:
- db[(fields[0])] = fields[1]
+ fields = line.split( '\t' )
+ if len( fields ) == 2:
+ db[ fields[0].split( ' ' )[0] ] = fields[1]
- if not db.has_key(db_build):
- stop_err('Cannot locate the target database. Please check your location file.')
+ if not db.has_key( db_build ):
+ stop_err( 'Cannot locate the target database. Please check your location file.' )
# arguments for megablast
- chunk = db[(db_build)]
+ chunk = db[ ( db_build ) ]
megablast_command = "megablast -d %s -i %s -o %s -m 8 -a 8 -W %s -p %s -e %s -F %s > /dev/null 2>&1 " \
% ( chunk, query_filename, mega_temp_output, mega_word_size, mega_iden_cutoff, mega_evalue_cutoff, mega_filter )
@@ -80,16 +80,16 @@
except Exception, e:
stop_err( str( e ) )
- output = open(output_filename,'w')
+ output = open( output_filename, 'w' )
invalid_lines = 0
for i, line in enumerate( file( mega_temp_output ) ):
line = line.rstrip( '\r\n' )
fields = line.split()
try:
# get gi and length of that gi seq
- gi, gi_len = fields[1].split('_')
+ gi, gi_len = fields[1].split( '_' )
# convert the last column (causing problem in filter tool) to float
- fields[-1] = float(fields[-1])
+ fields[-1] = float( fields[-1] )
new_line = "%s\t%s\t%s\t%s\t%0.1f" % ( fields[0], gi, gi_len, '\t'.join( fields[2:-1] ), fields[-1] )
except:
diff -r 0ba4a2b77f65 -r 7e1e7c5d8dbe tools/metag_tools/megablast_wrapper.xml
--- a/tools/metag_tools/megablast_wrapper.xml Wed Dec 09 16:20:12 2009 -0500
+++ b/tools/metag_tools/megablast_wrapper.xml Wed Dec 09 18:50:12 2009 -0500
@@ -1,5 +1,5 @@
-<tool id="megablast_wrapper" name="Megablast" version="1.0.0">
- <description> compare short reads against nt and wgs databases</description>
+<tool id="megablast_wrapper" name="Megablast" version="1.0.5">
+ <description> compare short reads against htgs, nt, and wgs databases</description>
<command interpreter="python">
megablast_wrapper.py
--db_build=$source_select
@@ -39,6 +39,7 @@
<tests>
<test>
<param name="input_query" value="megablast_wrapper_test1.fa" ftype="fasta"/>
+ <!-- source_select needs to match the entry in the blastdb.loc file, which includes the last update date if appropriate -->
<param name="source_select" value="phiX" />
<param name="word_size" value="28" />
<param name="iden_cutoff" value="99.0" />