1 new commit in galaxy-central:
https://bitbucket.org/galaxy/galaxy-central/commits/b9ec027d4b1e/
Changeset:   b9ec027d4b1e
User:        jmchilton
Date:        2015-02-02 21:29:42+00:00
Summary:     Expose improved sample tracking to tools for implicit map/reduce ops.

Tools may now use $input.element_identifier during tool evaluation for input 'data' parameters, with the following semantics:
- If the input was specified as a single dataset by the user, this simply falls back to providing $input.name.
- If the input was mapped over a collection (to produce many jobs), or if the input is a 'multiple="true"' input that was provided a collection, $input.element_identifier will be the element identifier for the corresponding collection item (generally much more useful than the dataset name, since it is preserved throughout workflows).
'data_collection' parameters can already access this kind of information, but it is something of a best practice to use simple 'data' parameters, since they are compatible with more traditional, un-collected datasets.
This commit really needs more comments - but Philip Mabon has been patiently waiting for this functionality for a long time.

Affected #:  9 files
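For illustration, here is a minimal Python sketch (a toy class, not the Galaxy code in the diffs below) of the fallback behavior described above: a plain dataset input surfaces its name, while a mapped collection element surfaces its element identifier.

    class DatasetWrapper( object ):
        # Toy stand-in for the tool-evaluation wrapper.
        def __init__( self, name, identifier=None ):
            self.name = name
            self._element_identifier = identifier

        @property
        def element_identifier( self ):
            # Plain inputs carry no identifier and fall back to the name.
            if self._element_identifier is None:
                return self.name
            return self._element_identifier

    single = DatasetWrapper( "Pasted Entry" )
    mapped = DatasetWrapper( "dataset 1", identifier="forward" )
    print( single.element_identifier )  # Pasted Entry
    print( mapped.element_identifier )  # forward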
diff -r ff3c5721f74857cf689966abfdeb0dc9390bf605 -r b9ec027d4b1e1b67f26632a55076dbf53e3225bd lib/galaxy/model/__init__.py
--- a/lib/galaxy/model/__init__.py
+++ b/lib/galaxy/model/__init__.py
@@ -2685,6 +2685,16 @@
         return instances
 
     @property
+    def dataset_elements( self ):
+        elements = []
+        for element in self.elements:
+            if element.is_collection:
+                elements.extend( element.child_collection.dataset_elements )
+            else:
+                elements.append( element )
+        return elements
+
+    @property
     def state( self ):
         # TODO: DatasetCollection state handling...
         return 'ok'
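A toy illustration (assumed class names, mirroring the hunk above) of how dataset_elements recursively flattens nested collections into a flat list of dataset-carrying elements:

    class Element( object ):
        def __init__( self, identifier, child_collection=None ):
            self.element_identifier = identifier
            self.child_collection = child_collection

        @property
        def is_collection( self ):
            return self.child_collection is not None

    class Collection( object ):
        def __init__( self, elements ):
            self.elements = elements

        @property
        def dataset_elements( self ):
            elements = []
            for element in self.elements:
                if element.is_collection:
                    # Recurse into nested collections and splice in their leaves.
                    elements.extend( element.child_collection.dataset_elements )
                else:
                    elements.append( element )
            return elements

    nested = Collection( [
        Element( "pair1", Collection( [ Element( "forward" ), Element( "reverse" ) ] ) ),
        Element( "unpaired" ),
    ] )
    print( [ e.element_identifier for e in nested.dataset_elements ] )
    # ['forward', 'reverse', 'unpaired']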
diff -r ff3c5721f74857cf689966abfdeb0dc9390bf605 -r b9ec027d4b1e1b67f26632a55076dbf53e3225bd lib/galaxy/tools/actions/__init__.py
--- a/lib/galaxy/tools/actions/__init__.py
+++ b/lib/galaxy/tools/actions/__init__.py
@@ -191,6 +191,10 @@
             if data.dbkey not in [None, '?']:
                 input_dbkey = data.dbkey
 
+            identifier = getattr( data, "element_identifier", None )
+            if identifier is not None:
+                incoming[ "%s|__identifier__" % name ] = identifier
+
         # Collect chromInfo dataset and add as parameters to incoming
         ( chrom_info, db_dataset ) = trans.app.genome_builds.get_chrom_info( input_dbkey, trans=trans, custom_build_hack_get_len_from_fasta_conversion=tool.id != 'CONVERTER_fasta_to_len' )
         if db_dataset:
diff -r ff3c5721f74857cf689966abfdeb0dc9390bf605 -r b9ec027d4b1e1b67f26632a55076dbf53e3225bd lib/galaxy/tools/evaluation.py
--- a/lib/galaxy/tools/evaluation.py
+++ b/lib/galaxy/tools/evaluation.py
@@ -154,7 +154,7 @@
             if isinstance( input, DataToolParameter ) and input.multiple:
                 dataset_instances = input_values[ input.name ]
                 if isinstance( dataset_instances, model.HistoryDatasetCollectionAssociation ):
-                    dataset_instances = dataset_instances.collection.dataset_instances[:]
+                    dataset_instances = dataset_instances.collection.dataset_elements[:]
                 input_values[ input.name ] = \
                     DatasetListWrapper( dataset_instances,
                                         dataset_paths=input_dataset_paths,
@@ -199,6 +199,9 @@
                     tool=self,
                     name=input.name )
+                identifier_key = "%s|__identifier__" % input.name
+                if identifier_key in param_dict:
+                    wrapper_kwds["identifier"] = param_dict[identifier_key]
                 if dataset:
                     #A None dataset does not have a filename
                     real_path = dataset.file_name
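To make the plumbing in the two hunks above concrete, here is a small sketch (dict contents are illustrative, not real Galaxy state) of the round trip: the tool action stashes each element identifier in the parameter dict under a "<input name>|__identifier__" key, and the evaluator later pulls it back out when building the dataset wrapper.

    # Producer side (tools/actions): stash the identifier, if any.
    incoming = { "input1": "<dataset ref>" }
    name, data_identifier = "input1", "forward"
    if data_identifier is not None:
        incoming[ "%s|__identifier__" % name ] = data_identifier

    # Consumer side (tools/evaluation): recover it for the wrapper.
    param_dict = incoming
    wrapper_kwds = {}
    identifier_key = "%s|__identifier__" % name
    if identifier_key in param_dict:
        wrapper_kwds[ "identifier" ] = param_dict[ identifier_key ]

    print( wrapper_kwds )  # {'identifier': 'forward'}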
diff -r ff3c5721f74857cf689966abfdeb0dc9390bf605 -r b9ec027d4b1e1b67f26632a55076dbf53e3225bd lib/galaxy/tools/parameters/meta.py
--- a/lib/galaxy/tools/parameters/meta.py
+++ b/lib/galaxy/tools/parameters/meta.py
@@ -14,6 +14,13 @@
     execution).
     """
 
+    to_remove = []
+    for key in incoming.keys():
+        if key.endswith("|__identifier__"):
+            to_remove.append(key)
+    for key in to_remove:
+        incoming.pop(key)
+
     def classify_unmodified_parameter( input_key ):
         value = incoming[ input_key ]
         if isinstance( value, dict ) and 'values' in value:
@@ -118,7 +125,11 @@
         subcollection_elements = subcollections.split_dataset_collection_instance( hdc, subcollection_type )
         return subcollection_elements
     else:
-        hdas = hdc.collection.dataset_instances
+        hdas = []
+        for element in hdc.collection.dataset_elements:
+            hda = element.dataset_instance
+            hda.element_identifier = element.element_identifier
+            hdas.append( hda )
         return hdas
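The first hunk above guards the map/reduce classifier against these bookkeeping keys. A sketch of the effect (dict contents illustrative):

    incoming = {
        "input1": "<dataset ref>",
        "input1|__identifier__": "forward",
        "text_param": "abc",
    }
    # Strip "...|__identifier__" keys before classifying parameters.
    to_remove = [ key for key in incoming.keys() if key.endswith( "|__identifier__" ) ]
    for key in to_remove:
        incoming.pop( key )
    print( sorted( incoming.keys() ) )  # ['input1', 'text_param']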
diff -r ff3c5721f74857cf689966abfdeb0dc9390bf605 -r b9ec027d4b1e1b67f26632a55076dbf53e3225bd lib/galaxy/tools/wrappers.py
--- a/lib/galaxy/tools/wrappers.py
+++ b/lib/galaxy/tools/wrappers.py
@@ -187,7 +187,7 @@
     def items( self ):
         return iter( [ ( k, self.get( k ) ) for k, v in self.metadata.items() ] )
 
-    def __init__( self, dataset, datatypes_registry=None, tool=None, name=None, dataset_path=None ):
+    def __init__( self, dataset, datatypes_registry=None, tool=None, name=None, dataset_path=None, identifier=None ):
         if not dataset:
             try:
                 # TODO: allow this to work when working with grouping
@@ -205,6 +205,14 @@
         self.datatypes_registry = datatypes_registry
         self.false_path = getattr( dataset_path, "false_path", None )
         self.false_extra_files_path = getattr( dataset_path, "false_extra_files_path", None )
+        self._element_identifier = identifier
+
+    @property
+    def element_identifier( self ):
+        identifier = self._element_identifier
+        if identifier is None:
+            identifier = self.name
+        return identifier
 
     @property
     def is_collection( self ):
@@ -270,6 +278,10 @@
             datasets = [datasets]
 
         def to_wrapper( dataset ):
+            if hasattr(dataset, "element_identifier"):
+                element = dataset
+                dataset = element.dataset_instance
+                kwargs["identifier"] = element.element_identifier
             return self._dataset_wrapper( dataset, dataset_paths, **kwargs )
 
         list.__init__( self, map( to_wrapper, datasets ) )
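The to_wrapper change above duck-types on element_identifier: items that look like collection elements are unwrapped to their underlying dataset, with the identifier forwarded to the wrapper. A toy sketch (class and variable names assumed):

    class ToyElement( object ):
        def __init__( self, identifier, dataset_instance ):
            self.element_identifier = identifier
            self.dataset_instance = dataset_instance

    def to_wrapper( dataset, kwargs ):
        if hasattr( dataset, "element_identifier" ):
            # A collection element: unwrap it and remember its identifier.
            element = dataset
            dataset = element.dataset_instance
            kwargs[ "identifier" ] = element.element_identifier
        return dataset, kwargs

    print( to_wrapper( ToyElement( "forward", "<hda>" ), {} ) )
    # ('<hda>', {'identifier': 'forward'})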
diff -r ff3c5721f74857cf689966abfdeb0dc9390bf605 -r b9ec027d4b1e1b67f26632a55076dbf53e3225bd test/api/test_tools.py
--- a/test/api/test_tools.py
+++ b/test/api/test_tools.py
@@ -553,6 +553,93 @@
         self.assertEquals( output1_content.strip(), "123" )
         self.assertEquals( output2_content.strip(), "456" )
 
+    @skip_without_tool( "identifier_single" )
+    def test_identifier_in_map( self ):
+        history_id = self.dataset_populator.new_history()
+        hdca_id = self.__build_pair( history_id, [ "123", "456" ] )
+        inputs = {
+            "input1": { 'batch': True, 'values': [ { 'src': 'hdca', 'id': hdca_id } ] },
+        }
+        create_response = self._run( "identifier_single", history_id, inputs )
+        self._assert_status_code_is( create_response, 200 )
+        create = create_response.json()
+        outputs = create[ 'outputs' ]
+        jobs = create[ 'jobs' ]
+        implicit_collections = create[ 'implicit_collections' ]
+        self.assertEquals( len( jobs ), 2 )
+        self.assertEquals( len( outputs ), 2 )
+        self.assertEquals( len( implicit_collections ), 1 )
+        output1 = outputs[ 0 ]
+        output2 = outputs[ 1 ]
+        output1_content = self.dataset_populator.get_history_dataset_content( history_id, dataset=output1 )
+        output2_content = self.dataset_populator.get_history_dataset_content( history_id, dataset=output2 )
+        self.assertEquals( output1_content.strip(), "forward" )
+        self.assertEquals( output2_content.strip(), "reverse" )
+
+    @skip_without_tool( "identifier_single" )
+    def test_identifier_outside_map( self ):
+        history_id = self.dataset_populator.new_history()
+        new_dataset1 = self.dataset_populator.new_dataset( history_id, content='123' )
+        inputs = {
+            "input1": { 'src': 'hda', 'id': new_dataset1["id"] },
+        }
+        create_response = self._run( "identifier_single", history_id, inputs )
+        self._assert_status_code_is( create_response, 200 )
+        create = create_response.json()
+        outputs = create[ 'outputs' ]
+        jobs = create[ 'jobs' ]
+        implicit_collections = create[ 'implicit_collections' ]
+        self.assertEquals( len( jobs ), 1 )
+        self.assertEquals( len( outputs ), 1 )
+        self.assertEquals( len( implicit_collections ), 0 )
+        output1 = outputs[ 0 ]
+        output1_content = self.dataset_populator.get_history_dataset_content( history_id, dataset=output1 )
+        self.assertEquals( output1_content.strip(), "Pasted Entry" )
+
+    @skip_without_tool( "identifier_multiple" )
+    def test_identifier_in_multiple_reduce( self ):
+        history_id = self.dataset_populator.new_history()
+        hdca_id = self.__build_pair( history_id, [ "123", "456" ] )
+        inputs = {
+            "input1": { 'src': 'hdca', 'id': hdca_id },
+        }
+        create_response = self._run( "identifier_multiple", history_id, inputs )
+        self._assert_status_code_is( create_response, 200 )
+        create = create_response.json()
+        outputs = create[ 'outputs' ]
+        jobs = create[ 'jobs' ]
+        implicit_collections = create[ 'implicit_collections' ]
+        self.assertEquals( len( jobs ), 1 )
+        self.assertEquals( len( outputs ), 1 )
+        self.assertEquals( len( implicit_collections ), 0 )
+        output1 = outputs[ 0 ]
+        output1_content = self.dataset_populator.get_history_dataset_content( history_id, dataset=output1 )
+        self.assertEquals( output1_content.strip(), "forward\nreverse" )
+
+    @skip_without_tool( "identifier_multiple" )
+    def test_identifier_with_multiple_normal_datasets( self ):
+        history_id = self.dataset_populator.new_history()
+        new_dataset1 = self.dataset_populator.new_dataset( history_id, content='123' )
+        new_dataset2 = self.dataset_populator.new_dataset( history_id, content='456' )
+        inputs = {
+            "input1": [
+                { 'src': 'hda', 'id': new_dataset1["id"] },
+                { 'src': 'hda', 'id': new_dataset2["id"] }
+            ]
+        }
+        create_response = self._run( "identifier_multiple", history_id, inputs )
+        self._assert_status_code_is( create_response, 200 )
+        create = create_response.json()
+        outputs = create[ 'outputs' ]
+        jobs = create[ 'jobs' ]
+        implicit_collections = create[ 'implicit_collections' ]
+        self.assertEquals( len( jobs ), 1 )
+        self.assertEquals( len( outputs ), 1 )
+        self.assertEquals( len( implicit_collections ), 0 )
+        output1 = outputs[ 0 ]
+        output1_content = self.dataset_populator.get_history_dataset_content( history_id, dataset=output1 )
+        self.assertEquals( output1_content.strip(), "Pasted Entry\nPasted Entry" )
+
     @skip_without_tool( "cat1" )
     def test_map_over_nested_collections_legacy( self ):
         history_id = self.dataset_populator.new_history()
diff -r ff3c5721f74857cf689966abfdeb0dc9390bf605 -r b9ec027d4b1e1b67f26632a55076dbf53e3225bd test/functional/tools/identifier_multiple.xml
--- /dev/null
+++ b/test/functional/tools/identifier_multiple.xml
@@ -0,0 +1,15 @@
+<tool id="identifier_multiple" name="identifier_multiple">
+    <command>
+        #for $input in $input1#
+        echo '$input.element_identifier' >> 'output1';
+        #end for#
+    </command>
+    <inputs>
+        <param type="data" name="input1" label="Input 1" multiple="true" />
+    </inputs>
+    <outputs>
+        <data name="output1" type="tabular" from_work_dir="output1" />
+    </outputs>
+    <tests>
+    </tests>
+</tool>
diff -r ff3c5721f74857cf689966abfdeb0dc9390bf605 -r b9ec027d4b1e1b67f26632a55076dbf53e3225bd test/functional/tools/identifier_single.xml
--- /dev/null
+++ b/test/functional/tools/identifier_single.xml
@@ -0,0 +1,13 @@
+<tool id="identifier_single" name="identifier_single">
+    <command>
+        echo '$input1.element_identifier' > 'output1'
+    </command>
+    <inputs>
+        <param type="data" name="input1" label="Input 1" />
+    </inputs>
+    <outputs>
+        <data name="output1" type="tabular" from_work_dir="output1" />
+    </outputs>
+    <tests>
+    </tests>
+</tool>
diff -r ff3c5721f74857cf689966abfdeb0dc9390bf605 -r b9ec027d4b1e1b67f26632a55076dbf53e3225bd test/functional/tools/samples_tool_conf.xml
--- a/test/functional/tools/samples_tool_conf.xml
+++ b/test/functional/tools/samples_tool_conf.xml
@@ -30,6 +30,8 @@
     <tool file="validation_default.xml" />
     <tool file="validation_sanitizer.xml" />
     <tool file="validation_repeat.xml" />
+    <tool file="identifier_single.xml" />
+    <tool file="identifier_multiple.xml" />
     <tool file="collection_paired_test.xml" />
     <tool file="collection_nested_test.xml" />
     <tool file="collection_mixed_param.xml" />
Repository URL: https://bitbucket.org/galaxy/galaxy-central/
--
This is a commit notification from bitbucket.org. You are receiving this because you have the service enabled, addressing the recipient of this email.
galaxy-commits@lists.galaxyproject.org