commit/galaxy-central: jmchilton: Simpler, more powerful subcollection mapping.
1 new commit in galaxy-central: https://bitbucket.org/galaxy/galaxy-central/commits/9e502242af28/

Changeset: 9e502242af28
User:      jmchilton
Date:      2014-05-07 18:29:51
Summary:   Simpler, more powerful subcollection mapping.

Collections can be mapped over 'data' parameters, and sufficiently nested collections can be mapped over 'data_collection' parameters (for instance, a list of 5 pairs can be supplied to a tool taking in a pair, and 5 jobs will be executed). I (perhaps poorly) term these concepts collection mapping and subcollection mapping.

Prior to this changeset, the APIs for collection mapping and subcollection mapping were somewhat divergent, and the tool execution code explicitly forbade doing both kinds of mapping in the same tool execution, even when the effective collections could be matched (e.g. it could not map a 'data' parameter over a list of 5 datasets and a pair 'data_collection' parameter over a list of 5 pairs in the same execution). This changeset should remedy that - as long as the effective collection mappings can match up, such jobs should be possible. The workflow editor (and, I think, the runner) already assumed this was possible, so this changeset reduces the tool-workflow impedance mismatch - an existing problem exacerbated by the recent introduction of dataset collections.

This all needs much more testing - tests that workflows execute this way, a functional test of a tool execution combining collection mapping and subcollection mapping, etc.

Affected #:  4 files

diff -r c5b736d3e4e5027391a04fd187e2f9d2aa4ebe46 -r 9e502242af28473660ee70ee0f98345f3604e137 lib/galaxy/tools/__init__.py
--- a/lib/galaxy/tools/__init__.py
+++ b/lib/galaxy/tools/__init__.py
@@ -1925,7 +1925,7 @@
         # Fixed set of input parameters may correspond to any number of jobs.
         # Expand these out to individual parameters for given jobs (tool
         # executions).
-        expanded_incomings, collection_info = expand_meta_parameters( trans, incoming, self.inputs )
+        expanded_incomings, collection_info = expand_meta_parameters( trans, self, incoming )
 
         if not expanded_incomings:
             raise exceptions.MessageException( "Tool execution failed, trying to run a tool over an empty collection." )
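As a point of reference, the parameter convention this unifies looks roughly as follows in a tool's 'inputs' payload (a sketch only - the encoded HDCA ids and the 'input1' name are hypothetical; the 'f1' form mirrors the updated functional test below):

    # Collection mapping: a 'data' parameter mapped over a flat collection.
    # The multirun value is just an encoded HDCA id - one job per element.
    inputs = {
        "input1|__collection_multirun__": encoded_hdca_id,
    }

    # Subcollection mapping: a 'data_collection' pair parameter mapped over
    # a list of pairs. The value is "hdca_id|subcollection_type" - one job
    # per pair in the list.
    inputs = {
        "f1|__collection_multirun__": "%s|paired" % encoded_hdca_id,
    }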
diff -r c5b736d3e4e5027391a04fd187e2f9d2aa4ebe46 -r 9e502242af28473660ee70ee0f98345f3604e137 lib/galaxy/tools/parameters/basic.py
--- a/lib/galaxy/tools/parameters/basic.py
+++ b/lib/galaxy/tools/parameters/basic.py
@@ -2021,7 +2021,7 @@
         self._ensure_selection( field )
         return field
 
-    def _get_select_dataset_collection_field( self, trans, history, multiple=False, suffix="|__subcollection_multirun__", value=None, other_values=None ):
+    def _get_select_dataset_collection_field( self, trans, history, multiple=False, suffix="|__collection_multirun__", value=None, other_values=None ):
        field_name = "%s%s" % ( self.name, suffix )
        field = form_builder.SelectField( field_name, multiple, None, self.refresh_on_change, refresh_on_change_values=self.refresh_on_change_values )
        dataset_matcher = DatasetMatcher( trans, self, value, other_values )

diff -r c5b736d3e4e5027391a04fd187e2f9d2aa4ebe46 -r 9e502242af28473660ee70ee0f98345f3604e137 lib/galaxy/tools/parameters/meta.py
--- a/lib/galaxy/tools/parameters/meta.py
+++ b/lib/galaxy/tools/parameters/meta.py
@@ -9,7 +9,7 @@
 
 log = logging.getLogger( __name__ )
 
 
-def expand_meta_parameters( trans, incoming, inputs ):
+def expand_meta_parameters( trans, tool, incoming ):
     """
     Take in a dictionary of raw incoming parameters and expand to a list
     of expanded incoming parameters (one set of parameters per tool
@@ -34,26 +34,23 @@
 
     def collection_classifier( input_key ):
         multirun_key = "%s|__collection_multirun__" % input_key
         if multirun_key in incoming:
-            encoded_hdc_id = incoming[ multirun_key ]
-            hdc_id = trans.app.security.decode_id( encoded_hdc_id )
-            hdc = trans.sa_session.query( model.HistoryDatasetCollectionAssociation ).get( hdc_id )
-            collections_to_match.add( input_key, hdc )
-            hdas = hdc.collection.dataset_instances
-            return permutations.input_classification.MATCHED, hdas
-        else:
-            return permutations.input_classification.SINGLE, incoming[ input_key ]
-
-    def subcollection_classifier( input_key ):
-        multirun_key = "%s|__subcollection_multirun__" % input_key
-        if multirun_key in incoming:
             incoming_val = incoming[ multirun_key ]
-            # value will be "hdca_id|subcollection_type"
-            encoded_hdc_id, subcollection_type = incoming_val.split( "|", 1 )
+            # If subcollection multirun of data_collection param - value will
+            # be "hdca_id|subcollection_type" else it will just be hdca_id
+            if "|" in incoming_val:
+                encoded_hdc_id, subcollection_type = incoming_val.split( "|", 1 )
+            else:
+                encoded_hdc_id = incoming_val
+                subcollection_type = None
             hdc_id = trans.app.security.decode_id( encoded_hdc_id )
             hdc = trans.sa_session.query( model.HistoryDatasetCollectionAssociation ).get( hdc_id )
             collections_to_match.add( input_key, hdc, subcollection_type=subcollection_type )
-            subcollection_elements = subcollections.split_dataset_collection_instance( hdc, subcollection_type )
-            return permutations.input_classification.MATCHED, subcollection_elements
+            if subcollection_type is not None:
+                subcollection_elements = subcollections.split_dataset_collection_instance( hdc, subcollection_type )
+                return permutations.input_classification.MATCHED, subcollection_elements
+            else:
+                hdas = hdc.collection.dataset_instances
+                return permutations.input_classification.MATCHED, hdas
         else:
             return permutations.input_classification.SINGLE, incoming[ input_key ]
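The merged classifier hands its MATCHED/SINGLE classifications to permutations.expand_multi_inputs, which is not shown in this diff. As a rough mental model only - a simplified sketch, not Galaxy's implementation - MATCHED inputs of equal length are zipped element-wise into one parameter set per job, while SINGLE inputs are repeated:

    # Hypothetical, simplified stand-in for permutations.expand_multi_inputs.
    def expand_sketch( template, classified ):
        # classified maps input name -> ( classification, value ); "matched"
        # values are equal-length lists, "single" values are scalars.
        matched = dict( ( k, v ) for k, ( c, v ) in classified.items() if c == "matched" )
        count = len( next( iter( matched.values() ) ) ) if matched else 1
        expanded = []
        for i in range( count ):
            params = dict( template )
            for key, ( classification, value ) in classified.items():
                params[ key ] = value[ i ] if classification == "matched" else value
            expanded.append( params )
        return expanded

    # Two matched inputs of length 2 yield 2 jobs, paired by index - the
    # combined collection + subcollection mapping the summary describes.
    jobs = expand_sketch(
        { "n": 1 },
        { "input1": ( "matched", [ "hda_1", "hda_2" ] ),
          "f1": ( "matched", [ "pair_1", "pair_2" ] ) },
    )
    assert len( jobs ) == 2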
@@ -72,26 +69,17 @@
 
     multirun_found = False
     collection_multirun_found = False
-    subcollection_multirun_found = False
     for key, value in incoming.iteritems():
         multirun_found = try_replace_key( key, "|__multirun__" ) or multirun_found
         collection_multirun_found = try_replace_key( key, "|__collection_multirun__" ) or collection_multirun_found
-        subcollection_multirun_found = try_replace_key( key, "|__subcollection_multirun__" ) or subcollection_multirun_found
 
-    if sum( [ 1 if f else 0 for f in [ multirun_found, collection_multirun_found, subcollection_multirun_found ] ] ) > 1:
+    if sum( [ 1 if f else 0 for f in [ multirun_found, collection_multirun_found ] ] ) > 1:
         # In theory doable, but too complicated for a first pass.
         message = "Cannot specify parallel execution across both multiple datasets and dataset collections."
         raise exceptions.ToolMetaParameterException( message )
 
     if multirun_found:
         return permutations.expand_multi_inputs( incoming_template, classifier ), None
-    elif subcollection_multirun_found:
-        expanded_incomings = permutations.expand_multi_inputs( incoming_template, subcollection_classifier )
-        if collections_to_match.has_collections():
-            collection_info = trans.app.dataset_collections_service.match_collections( collections_to_match )
-        else:
-            collection_info = None
-        return expanded_incomings, collection_info
     else:
         expanded_incomings = permutations.expand_multi_inputs( incoming_template, collection_classifier )
         if collections_to_match.has_collections():

diff -r c5b736d3e4e5027391a04fd187e2f9d2aa4ebe46 -r 9e502242af28473660ee70ee0f98345f3604e137 test/api/test_tools.py
--- a/test/api/test_tools.py
+++ b/test/api/test_tools.py
@@ -264,7 +264,7 @@
         history_id = self.dataset_populator.new_history()
         hdca_list_id = self.__build_nested_list( history_id )
         inputs = {
-            "f1|__subcollection_multirun__": "%s|paired" % hdca_list_id
+            "f1|__collection_multirun__": "%s|paired" % hdca_list_id
         }
         # Following wait not really needed - just getting so many database
         # locked errors with sqlite.
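For intuition about what the updated test exercises: __build_nested_list creates a list whose elements are 'paired' collections, and splitting that list at subcollection type 'paired' yields one pair per job. A plain-Python sketch of the idea (hypothetical data, not Galaxy's model classes or its split_dataset_collection_instance):

    # Stand-in for a list:paired HDCA - a list of two dataset pairs.
    nested_list = [
        { "forward": "hda_1", "reverse": "hda_2" },
        { "forward": "hda_3", "reverse": "hda_4" },
    ]

    # Splitting at subcollection type "paired" returns the pair elements;
    # each becomes the 'data_collection' input of one tool execution.
    pairs = list( nested_list )
    assert len( pairs ) == 2  # two jobs expected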
Repository URL: https://bitbucket.org/galaxy/galaxy-central/

--

This is a commit notification from bitbucket.org. You are receiving this because you have the service enabled, addressing the recipient of this email.