details:   http://www.bx.psu.edu/hg/galaxy/rev/f7dee0438854
changeset: 3173:f7dee0438854
user:      Dan Blankenberg <dan@bx.psu.edu>
date:      Fri Dec 11 11:11:15 2009 -0500
description:
Modify the way that data_source tools and tool_types are handled. Data source tools are now of the DataSourceTool class.
A lot of cleanup here is possible in the existing datasource tool code, see comments.

diffstat:

 lib/galaxy/jobs/__init__.py  |    6 +-
 lib/galaxy/tools/__init__.py |  150 +++++++++++++++++++++----------------
 2 files changed, 87 insertions(+), 69 deletions(-)

diffs (243 lines):

diff -r ab7877640903 -r f7dee0438854 lib/galaxy/jobs/__init__.py
--- a/lib/galaxy/jobs/__init__.py	Thu Dec 10 15:37:00 2009 -0500
+++ b/lib/galaxy/jobs/__init__.py	Fri Dec 11 11:11:15 2009 -0500
@@ -377,8 +377,7 @@
         param_dict = self.tool.build_param_dict( incoming, inp_data, out_data, self.get_output_fnames(), self.working_directory )
         # Certain tools require tasks to be completed prior to job execution
         # ( this used to be performed in the "exec_before_job" hook, but hooks are deprecated ).
-        if self.tool.tool_type is not None:
-            out_data = self.tool.exec_before_job( self.queue.app, inp_data, out_data, param_dict )
+        self.tool.exec_before_job( self.queue.app, inp_data, out_data, param_dict )
         # Run the before queue ("exec_before_job") hook
         self.tool.call_hook( 'exec_before_job', self.queue.app, inp_data=inp_data,
                              out_data=out_data, tool=self.tool, param_dict=incoming)
@@ -600,8 +599,7 @@
             param_dict.update({'__collected_datasets__':collected_datasets})
         # Certain tools require tasks to be completed after job execution
         # ( this used to be performed in the "exec_after_process" hook, but hooks are deprecated ).
-        if self.tool.tool_type is not None:
-            self.tool.exec_after_process( self.queue.app, inp_data, out_data, param_dict, job = job )
+        self.tool.exec_after_process( self.queue.app, inp_data, out_data, param_dict, job = job )
         # Call 'exec_after_process' hook
         self.tool.call_hook( 'exec_after_process', self.queue.app, inp_data=inp_data,
                              out_data=out_data, param_dict=param_dict,
diff -r ab7877640903 -r f7dee0438854 lib/galaxy/tools/__init__.py
--- a/lib/galaxy/tools/__init__.py	Thu Dec 10 15:37:00 2009 -0500
+++ b/lib/galaxy/tools/__init__.py	Fri Dec 11 11:11:15 2009 -0500
@@ -134,6 +134,8 @@
             cls = type_elem.get( 'class' )
             mod = __import__( module, globals(), locals(), [cls])
             ToolClass = getattr( mod, cls )
+        elif root.get( 'tool_type', None ) is not None:
+            ToolClass = tool_types.get( root.get( 'tool_type' ) )
         else:
             ToolClass = Tool
         return ToolClass( config_file, root, self.app )
@@ -263,6 +265,7 @@
     """
     Represents a computational tool that can be executed through Galaxy.
     """
+    tool_type = 'default'
    def __init__( self, config_file, root, app ):
        """
        Load a tool from the config named by `config_file`
@@ -296,8 +299,6 @@
             self.version = "1.0.0"
         # Support multi-byte tools
         self.is_multi_byte = util.string_as_bool( root.get( "is_multi_byte", False ) )
-        # Type of tool
-        self.tool_type = root.get( "tool_type", None )
         #Force history to fully refresh after job execution for this tool. Useful i.e. when an indeterminate number of outputs are created by a tool.
         self.force_history_refresh = util.string_as_bool( root.get( 'force_history_refresh', 'False' ) )
         #load input translator, used by datasource tools to change names/values of incoming parameters
@@ -696,7 +697,7 @@
         rval = dict()
         for key, param in self.inputs_by_page[page].iteritems():
             if not isinstance( param, ToolParameter ):
-                raise Exception( "'get_param_html_map' only supported for simple paramters" )
+                raise Exception( "'get_param_html_map' only supported for simple paramters" )
             rval[key] = param.get_html( trans, other_values=other_values )
         return rval

@@ -1236,8 +1237,8 @@
                     param_dict[ "_CHILD___%s___%s" % ( name, child.designation ) ] = DatasetFilenameWrapper( child )
         for out_name, output in self.outputs.iteritems():
             if out_name not in param_dict and output.filters:
-                #assume the reason we lack this output is because a filter failed to pass; for tool writing convienence, provide a NoneDataset
-                param_dict[ out_name ] = NoneDataset( datatypes_registry = self.app.datatypes_registry, ext = output.format )
+                #assume the reason we lack this output is because a filter failed to pass; for tool writing convienence, provide a NoneDataset
+                param_dict[ out_name ] = NoneDataset( datatypes_registry = self.app.datatypes_registry, ext = output.format )
         # We add access to app here, this allows access to app.config, etc
         param_dict['__app__'] = RawObjectWrapper( self.app )
         # More convienent access to app.config.new_file_path; we don't need to wrap a string
@@ -1345,9 +1346,9 @@
                 redirect_url += "&%s=%s" % ( p_name, rup_dict[ p_name ] )
         # Add the current user email to redirect_url
         if data.history.user:
-            USERNAME = str( data.history.user.email )
+            USERNAME = str( data.history.user.email )
         else:
-            USERNAME = 'Anonymous'
+            USERNAME = 'Anonymous'
         redirect_url += "&USERNAME=%s" % USERNAME
         return redirect_url

@@ -1365,65 +1366,10 @@
             raise

    def exec_before_job( self, app, inp_data, out_data, param_dict={} ):
-        if self.tool_type == 'data_source':
-            dbkey = param_dict.get( 'dbkey' )
-            organism = param_dict.get( 'organism' )
-            table = param_dict.get( 'table' )
-            description = param_dict.get( 'description' )
-            info = param_dict.get( 'info' )
-            if description == 'range':
-                description = param_dict.get( 'position', '' )
-                if not description:
-                    description = 'unknown position'
-            gb_landmark_region = param_dict.get( 'q' )
-            data_type = param_dict.get( 'data_type' )
-            items = out_data.items()
-            for name, data in items:
-                if organism and table and description:
-                    # This is UCSC
-                    data.name = '%s on %s: %s (%s)' % ( data.name, organism, table, description )
-                elif gb_landmark_region:
-                    # This is GBrowse
-                    data.name = '%s on %s' % ( data.name, gb_landmark_region )
-                data.info = info
-                data.dbkey = dbkey
-                if data_type not in app.datatypes_registry.datatypes_by_extension:
-                    # Setting data_type to tabular will force the data to be sniffed in exec_after_process()
-                    data_type = 'tabular'
-                data = app.datatypes_registry.change_datatype( data, data_type )
-                # Store external data source's request parameters temporarily in output file.
-                # In case the config setting for "outputs_to_working_directory" is True, we must write to
-                # the DatasetFilenameWrapper object in the param_dict since it's "false_path" attribute
-                # is the temporary path to the output dataset ( until the job is run ). However,
-                # even if the "outputs_to_working_directory" setting is False, we can still open the file
-                # the same way for temporarily storing the request parameters.
-                out = open( str( param_dict.get( name ) ), 'w' )
-                for key, value in param_dict.items():
-                    print >> out, '%s\t%s' % ( key, value )
-                out.close()
-                out_data[ name ] = data
-        return out_data
+        pass

    def exec_after_process( self, app, inp_data, out_data, param_dict, job = None ):
-        if self.tool_type == 'data_source':
-            name, data = out_data.items()[0]
-            data.set_size()
-            if data.state == data.states.OK:
-                data.name = param_dict.get( 'name', data.name )
-                data.info = param_dict.get( 'info', data.name )
-                data.dbkey = param_dict.get( 'dbkey', data.dbkey )
-                data.extension = param_dict.get( 'data_type', data.extension )
-            if data.extension in [ 'txt', 'tabular' ]:
-                data_type = sniff.guess_ext( data.file_name, sniff_order=app.datatypes_registry.sniff_order )
-                if data.extension != data_type:
-                    data = app.datatypes_registry.change_datatype( data, data_type )
-            elif not isinstance( data.datatype, datatypes.interval.Bed ) and isinstance( data.datatype, datatypes.interval.Interval ):
-                data.set_meta()
-                if data.missing_meta():
-                    data = app.datatypes_registry.change_datatype( data, 'tabular' )
-            data.set_peek()
-            self.sa_session.add( data )
-            self.sa_session.flush()
+        pass

    def collect_associated_files( self, output, job_working_directory ):
        for name, hda in output.items():
@@ -1559,7 +1505,77 @@
             self.sa_session.flush()
         return primary_datasets

+class DataSourceTool( Tool ):
+    tool_type = 'data_source'
+    def exec_before_job( self, app, inp_data, out_data, param_dict={} ):
+        #TODO: Allow for a generic way for all Tools to have output dataset properties be set to input parameter values
+        #as defined in a tool XML
+        dbkey = param_dict.get( 'dbkey' )
+        organism = param_dict.get( 'organism' )
+        table = param_dict.get( 'table' )
+        description = param_dict.get( 'description' )
+        info = param_dict.get( 'info' )
+        if description == 'range':
+            description = param_dict.get( 'position', '' )
+            if not description:
+                description = 'unknown position'
+        gb_landmark_region = param_dict.get( 'q' )
+        data_type = param_dict.get( 'data_type' )
+        items = out_data.items()
+        for name, data in items:
+            if organism and table and description:
+                # This is UCSC
+                data.name = '%s on %s: %s (%s)' % ( data.name, organism, table, description )
+            elif gb_landmark_region:
+                # This is GBrowse
+                data.name = '%s on %s' % ( data.name, gb_landmark_region )
+            data.info = info
+            data.dbkey = dbkey
+            if data_type not in app.datatypes_registry.datatypes_by_extension:
+                # Setting data_type to tabular will force the data to be sniffed in exec_after_process()
+                data_type = 'tabular'
+            data.change_datatype( data_type )
+            # Store external data source's request parameters temporarily in output file.
+            # In case the config setting for "outputs_to_working_directory" is True, we must write to
+            # the DatasetFilenameWrapper object in the param_dict since it's "false_path" attribute
+            # is the temporary path to the output dataset ( until the job is run ). However,
+            # even if the "outputs_to_working_directory" setting is False, we can still open the file
+            # the same way for temporarily storing the request parameters.
+
+            ## TODO: Input parameters should be jsonified and written into a <configfile> and passed to data_source.py,
+            ## instead of writing tab separated key, value pairs to the output file
+            out = open( str( param_dict.get( name ) ), 'w' )
+            for key, value in param_dict.items():
+                print >> out, '%s\t%s' % ( key, value )
+            out.close()
+
+    def exec_after_process( self, app, inp_data, out_data, param_dict, job = None ):
+        log.debug('after proc called')
+        name, data = out_data.items()[0]
+        data.set_size()
+        #TODO: these should be already be set before the tool runs:
+        if data.state == data.states.OK:
+            data.name = param_dict.get( 'name', data.name )
+            data.info = param_dict.get( 'info', data.name )
+            data.dbkey = param_dict.get( 'dbkey', data.dbkey )
+            data.extension = param_dict.get( 'data_type', data.extension )
+        #TODO: these should be possible as part of data_source.py and external set_meta, see the upload tool:
+        if data.extension in [ 'txt', 'tabular' ]:
+            data_type = sniff.guess_ext( data.file_name, sniff_order=app.datatypes_registry.sniff_order )
+            if data.extension != data_type:
+                data.change_datatype( data_type )
+        elif not isinstance( data.datatype, datatypes.interval.Bed ) and isinstance( data.datatype, datatypes.interval.Interval ):
+            if data.missing_meta():
+                data.change_datatype( 'tabular' )
+        data.set_peek()
+        self.sa_session.add( data )
+        self.sa_session.flush()
+
+class DataDestinationTool( Tool ):
+    tool_type = 'data_destination'
+
 class SetMetadataTool( Tool ):
+    tool_type = 'set_metadata'
     def exec_after_process( self, app, inp_data, out_data, param_dict, job = None ):
         for name, dataset in inp_data.iteritems():
             external_metadata = galaxy.datatypes.metadata.JobExternalOutputMetadataWrapper( job )
@@ -1572,7 +1588,11 @@
             self.sa_session.add( dataset )
             self.sa_session.flush()

-
+#load tool_type to ToolClass mappings
+tool_types = {}
+for tool_class in [ Tool, DataDestinationTool, SetMetadataTool, DataSourceTool ]:
+    tool_types[ tool_class.tool_type ] = tool_class
+
 # ---- Utility classes to be factored out -----------------------------------

 class BadValue( object ):
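
The pattern this changeset introduces: each Tool subclass declares a tool_type class attribute, the module-level tool_types dict maps those strings to classes, the tool loader selects the class from the XML root's tool_type attribute, and per-type behavior lives in overridden exec_before_job/exec_after_process methods instead of branches on self.tool_type. A minimal standalone sketch of that dispatch follows (not Galaxy code; load_tool, the simplified constructor, and the XML snippet are illustrative assumptions only):

    from xml.etree import ElementTree

    class Tool( object ):
        tool_type = 'default'
        def __init__( self, root ):
            self.name = root.get( 'name' )
        def exec_before_job( self, app, inp_data, out_data, param_dict={} ):
            # Base class does nothing; subclasses override instead of checking self.tool_type.
            pass

    class DataSourceTool( Tool ):
        tool_type = 'data_source'
        def exec_before_job( self, app, inp_data, out_data, param_dict={} ):
            # Stand-in for the output renaming/retyping the real DataSourceTool performs.
            out_data[ 'output' ] = '%s on %s' % ( self.name, param_dict.get( 'dbkey', '?' ) )

    # Build the tool_type -> ToolClass registry the same way the changeset does.
    tool_types = {}
    for tool_class in [ Tool, DataSourceTool ]:
        tool_types[ tool_class.tool_type ] = tool_class

    def load_tool( xml_text ):
        # Choose the class from the root element's tool_type attribute,
        # falling back to the plain Tool class when the attribute is absent.
        root = ElementTree.fromstring( xml_text )
        ToolClass = tool_types.get( root.get( 'tool_type' ), Tool )
        return ToolClass( root )

    tool = load_tool( '<tool name="ucsc_table_direct" tool_type="data_source"/>' )
    out_data = {}
    tool.exec_before_job( None, {}, out_data, { 'dbkey': 'hg18' } )
    print( '%s -> %s' % ( type( tool ).__name__, out_data ) )

This also explains why the tool_type checks could be dropped from lib/galaxy/jobs/__init__.py: every tool now responds to exec_before_job/exec_after_process, with the base implementations reduced to no-ops.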