commit/galaxy-central: 7 new changesets
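As a quick standalone illustration (not taken from the changesets below): the headline addition in this batch is a pair of jobs API endpoints, GET /api/jobs/{id}/inputs and GET /api/jobs/{id}/outputs, which the updated test interactor uses to locate runtime discovered datasets. A minimal sketch of calling them follows; the Galaxy URL, API key and encoded job id are placeholders.

    import requests

    # Placeholder values - substitute a real Galaxy instance, API key and encoded job id.
    GALAXY_URL = "http://localhost:8080"
    API_KEY = "<api-key>"
    JOB_ID = "<encoded-job-id>"

    # The new endpoints return a list of dataset associations, e.g.
    # [{"name": "out1", "dataset": {"src": "hda", "id": "<encoded-id>"}}].
    for action in ("inputs", "outputs"):
        response = requests.get(
            "%s/api/jobs/%s/%s" % (GALAXY_URL, JOB_ID, action),
            params={"key": API_KEY},
        )
        print("%s: %s" % (action, response.json()))
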
7 new commits in galaxy-central: https://bitbucket.org/galaxy/galaxy-central/commits/92c23f534d2f/ Changeset: 92c23f534d2f User: jmchilton Date: 2014-03-29 23:11:17 Summary: Include job id in dataset provenance API. This allows linking this information with information produced by the jobs API. Affected #: 1 file diff -r f83f4d9965283e8dc8640698262da52080081496 -r 92c23f534d2ff322476e988c998d14672d558e31 lib/galaxy/webapps/galaxy/api/provenance.py --- a/lib/galaxy/webapps/galaxy/api/provenance.py +++ b/lib/galaxy/webapps/galaxy/api/provenance.py @@ -46,6 +46,7 @@ return { "id": trans.security.encode_id(item.id), "uuid": ( lambda uuid: str( uuid ) if uuid else None )( item.dataset.uuid), + "job_id": trans.security.encode_id( job.id ), "tool_id": job.tool_id, "parameters": self._get_job_record(trans, job, follow), "stderr": job.stderr, https://bitbucket.org/galaxy/galaxy-central/commits/839a099d99a3/ Changeset: 839a099d99a3 User: jmchilton Date: 2014-03-29 23:11:17 Summary: Allow fetching input/output dataset association information through jobs API. This will allow tool test framework to discover runtime discovered datastes in subsequent changesets. Affected #: 2 files diff -r 92c23f534d2ff322476e988c998d14672d558e31 -r 839a099d99a3d1f35c35c76740087655eacfe46f lib/galaxy/webapps/galaxy/api/jobs.py --- a/lib/galaxy/webapps/galaxy/api/jobs.py +++ b/lib/galaxy/webapps/galaxy/api/jobs.py @@ -13,6 +13,7 @@ from galaxy.web.base.controller import UsesLibraryMixinItems from galaxy import exceptions from galaxy import util +from galaxy import model import logging log = logging.getLogger( __name__ ) @@ -91,10 +92,62 @@ :rtype: dictionary :returns: dictionary containing full description of job data """ + job = self.__get_job( trans, id ) + return self.encode_all_ids( trans, job.to_dict( 'element' ), True ) + + @expose_api + def inputs( self, trans, id, **kwd ): + """ + show( trans, id ) + * GET /api/jobs/{job_id}/inputs + returns input datasets created by job + + :type id: string + :param id: Encoded job id + + :rtype: dictionary + :returns: dictionary containing input dataset associations + """ + job = self.__get_job( trans, id ) + return self.__dictify_associations( trans, job.input_datasets, job.input_library_datasets ) + + @expose_api + def outputs( self, trans, id, **kwd ): + """ + show( trans, id ) + * GET /api/jobs/{job_id}/outputs + returns output datasets created by job + + :type id: string + :param id: Encoded job id + + :rtype: dictionary + :returns: dictionary containing output dataset associations + """ + job = self.__get_job( trans, id ) + return self.__dictify_associations( trans, job.output_datasets, job.output_library_datasets ) + + def __dictify_associations( self, trans, *association_lists ): + rval = [] + for association_list in association_lists: + rval.extend( map( lambda a: self.__dictify_association( trans, a ), association_list ) ) + return rval + + def __dictify_association( self, trans, job_dataset_association ): + dataset_dict = None + dataset = job_dataset_association.dataset + if dataset: + if isinstance( dataset, model.HistoryDatasetAssociation ): + dataset_dict = dict( src="hda", id=trans.security.encode_id( dataset.id ) ) + else: + dataset_dict = dict( src="ldda", id=trans.security.encode_id( dataset.id ) ) + return dict( name=job_dataset_association.name, dataset=dataset_dict ) + + def __get_job( self, trans, id ): try: - decoded_job_id = trans.security.decode_id(id) - except: - raise exceptions.ObjectAttributeInvalidException() + decoded_job_id = 
trans.security.decode_id( id ) + except Exception: + raise exceptions.MalformedId() query = trans.sa_session.query( trans.app.model.Job ).filter( trans.app.model.Job.user == trans.user, trans.app.model.Job.id == decoded_job_id @@ -102,7 +155,7 @@ job = query.first() if job is None: raise exceptions.ObjectNotFound() - return self.encode_all_ids( trans, job.to_dict( 'element' ), True ) + return job @expose_api def create( self, trans, payload, **kwd ): diff -r 92c23f534d2ff322476e988c998d14672d558e31 -r 839a099d99a3d1f35c35c76740087655eacfe46f lib/galaxy/webapps/galaxy/buildapp.py --- a/lib/galaxy/webapps/galaxy/buildapp.py +++ b/lib/galaxy/webapps/galaxy/buildapp.py @@ -261,6 +261,8 @@ 'jobs', path_prefix='/api' ) webapp.mapper.connect( 'job_search', '/api/jobs/search', controller='jobs', action='search', conditions=dict( method=['POST'] ) ) + webapp.mapper.connect( 'job_inputs', '/api/jobs/{id}/inputs', controller='jobs', action='inputs', conditions=dict( method=['GET'] ) ) + webapp.mapper.connect( 'job_outputs', '/api/jobs/{id}/outputs', controller='jobs', action='outputs', conditions=dict( method=['GET'] ) ) # Job files controllers. Only for consumption by remote job runners. webapp.mapper.resource( 'file', https://bitbucket.org/galaxy/galaxy-central/commits/0ce5e9eb984b/ Changeset: 0ce5e9eb984b User: jmchilton Date: 2014-03-29 23:11:17 Summary: Allow tools to test runtime discovered datasets. Affected #: 5 files diff -r 839a099d99a3d1f35c35c76740087655eacfe46f -r 0ce5e9eb984bc70f1f1c338a3f1d21ed9d6ddee4 lib/galaxy/tools/test.py --- a/lib/galaxy/tools/test.py +++ b/lib/galaxy/tools/test.py @@ -297,6 +297,19 @@ if name is None: raise Exception( "Test output does not have a 'name'" ) + file, attributes = __parse_test_attributes( output_elem, attrib ) + primary_datasets = {} + for primary_elem in ( output_elem.findall( "discovered_dataset" ) or [] ): + primary_attrib = dict( primary_elem.attrib ) + designation = primary_attrib.pop( 'designation', None ) + if designation is None: + raise Exception( "Test primary dataset does not have a 'designation'" ) + primary_datasets[ designation ] = __parse_test_attributes( primary_elem, primary_attrib ) + attributes[ "primary_datasets" ] = primary_datasets + return name, file, attributes + + +def __parse_test_attributes( output_elem, attrib ): assert_list = __parse_assert_list( output_elem ) file = attrib.pop( 'file', None ) # File no longer required if an list of assertions was present. @@ -321,7 +334,7 @@ attributes['assert_list'] = assert_list attributes['extra_files'] = extra_files attributes['metadata'] = metadata - return name, file, attributes + return file, attributes def __parse_assert_list( output_elem ): diff -r 839a099d99a3d1f35c35c76740087655eacfe46f -r 0ce5e9eb984bc70f1f1c338a3f1d21ed9d6ddee4 test/base/interactor.py --- a/test/base/interactor.py +++ b/test/base/interactor.py @@ -41,16 +41,44 @@ self.api_key = self.__get_user_key( twill_test_case.user_api_key, twill_test_case.master_api_key, test_user=test_user ) self.uploads = {} - def verify_output( self, history_id, output_data, outfile, attributes, shed_tool_id, maxseconds ): + def verify_output( self, history_id, output_data, output_testdef, shed_tool_id, maxseconds ): + outfile = output_testdef.outfile + attributes = output_testdef.attributes + name = output_testdef.name self.wait_for_history( history_id, maxseconds ) hid = self.__output_id( output_data ) fetcher = self.__dataset_fetcher( history_id ) ## TODO: Twill version verifys dataset is 'ok' in here. 
self.twill_test_case.verify_hid( outfile, hda_id=hid, attributes=attributes, dataset_fetcher=fetcher, shed_tool_id=shed_tool_id ) + + primary_datasets = attributes.get( 'primary_datasets', {} ) + if primary_datasets: + job_id = self._dataset_provenance( history_id, hid )[ "job_id" ] + outputs = self._get( "jobs/%s/outputs" % ( job_id ) ).json() + + for designation, ( primary_outfile, primary_attributes ) in primary_datasets.iteritems(): + primary_output = None + for output in outputs: + if output[ "name" ] == '__new_primary_file_%s|%s__' % ( name, designation ): + primary_output = output + break + + if not primary_output: + msg_template = "Failed to find primary dataset with designation [%s] for output with name [%s]" + msg_args = ( designation, name ) + raise Exception( msg_template % msg_args ) + + primary_hda_id = primary_output[ "dataset" ][ "id" ] + self.twill_test_case.verify_hid( primary_outfile, hda_id=primary_hda_id, attributes=primary_attributes, dataset_fetcher=fetcher, shed_tool_id=shed_tool_id ) + self._verify_metadata( history_id, primary_hda_id, primary_attributes ) + + self._verify_metadata( history_id, hid, attributes ) + + def _verify_metadata( self, history_id, hid, attributes ): metadata = attributes.get( 'metadata', {} ).copy() for key, value in metadata.copy().iteritems(): new_key = "metadata_%s" % key - metadata[ new_key ] = metadata[ key ] + metadata[ new_key ] = metadata[ key ] del metadata[ key ] expected_file_type = attributes.get( 'ftype', None ) if expected_file_type: @@ -316,7 +344,10 @@ def __init__( self, twill_test_case ): self.twill_test_case = twill_test_case - def verify_output( self, history, output_data, outfile, attributes, shed_tool_id, maxseconds ): + def verify_output( self, history, output_data, output_testdef, shed_tool_id, maxseconds ): + outfile = output_testdef.outfile + attributes = output_testdef.attributes + hid = output_data.get( 'hid' ) self.twill_test_case.verify_dataset_correctness( outfile, hid=hid, attributes=attributes, shed_tool_id=shed_tool_id, maxseconds=maxseconds ) diff -r 839a099d99a3d1f35c35c76740087655eacfe46f -r 0ce5e9eb984bc70f1f1c338a3f1d21ed9d6ddee4 test/functional/test_toolbox.py --- a/test/functional/test_toolbox.py +++ b/test/functional/test_toolbox.py @@ -3,6 +3,7 @@ from base.twilltestcase import TwillTestCase from base.interactor import build_interactor, stage_data_in_history from galaxy.tools import DataManagerTool +from galaxy.util import bunch import logging log = logging.getLogger( __name__ ) @@ -52,6 +53,7 @@ for output_index, output_tuple in enumerate(testdef.outputs): # Get the correct hid name, outfile, attributes = output_tuple + output_testdef = bunch.Bunch( name=name, outfile=outfile, attributes=attributes ) try: output_data = data_list[ name ] except (TypeError, KeyError): @@ -64,7 +66,7 @@ output_data = data_list[ len(data_list) - len(testdef.outputs) + output_index ] self.assertTrue( output_data is not None ) try: - galaxy_interactor.verify_output( history, output_data, outfile, attributes=attributes, shed_tool_id=shed_tool_id, maxseconds=maxseconds ) + galaxy_interactor.verify_output( history, output_data, output_testdef=output_testdef, shed_tool_id=shed_tool_id, maxseconds=maxseconds ) except Exception: for stream in ['stdout', 'stderr']: stream_output = galaxy_interactor.get_job_stream( history, output_data, stream=stream ) diff -r 839a099d99a3d1f35c35c76740087655eacfe46f -r 0ce5e9eb984bc70f1f1c338a3f1d21ed9d6ddee4 test/functional/tools/multi_output.xml --- a/test/functional/tools/multi_output.xml 
+++ b/test/functional/tools/multi_output.xml @@ -1,7 +1,7 @@ <tool id="multi_output" name="Multi_Output" description="multi_output" force_history_refresh="True" version="0.1.0"><command> echo "Hello" > $report; - echo "World" > '${__new_file_path__}/primary_${report.id}_moo_visible_?' + echo "World Contents" > '${__new_file_path__}/primary_${report.id}_world_visible_?' </command><inputs><param name="input" type="integer" value="7" /> @@ -16,6 +16,11 @@ <assert_contents><has_line line="Hello" /></assert_contents> + <discovered_dataset designation="world"> + <assert_contents> + <has_line line="World Contents" /> + </assert_contents> + </discovered_dataset></output></test></tests> diff -r 839a099d99a3d1f35c35c76740087655eacfe46f -r 0ce5e9eb984bc70f1f1c338a3f1d21ed9d6ddee4 test/functional/workflow.py --- a/test/functional/workflow.py +++ b/test/functional/workflow.py @@ -4,6 +4,7 @@ from base.interactor import GalaxyInteractorApi, stage_data_in_history from galaxy.util import parse_xml +from galaxy.util import bunch from galaxy.tools.test import parse_param_elem, require_file, test_data_iter, parse_output_elems from json import load, dumps @@ -66,10 +67,11 @@ for expected_output_def in workflow_test.outputs: # Get the correct hid name, outfile, attributes = expected_output_def + output_testdef = bunch.Bunch( name=name, outfile=outfile, attributes=attributes ) output_data = outputs[ int( name ) ] try: - galaxy_interactor.verify_output( test_history, output_data, outfile, attributes=attributes, shed_tool_id=None, maxseconds=maxseconds ) + galaxy_interactor.verify_output( test_history, output_data, output_testdef=output_testdef, shed_tool_id=None, maxseconds=maxseconds ) except Exception: for stream in ['stdout', 'stderr']: stream_output = galaxy_interactor.get_job_stream( test_history, output_data, stream=stream ) https://bitbucket.org/galaxy/galaxy-central/commits/681f2cc6dcfa/ Changeset: 681f2cc6dcfa User: jmchilton Date: 2014-03-29 23:11:17 Summary: Refactor Tool.collect_primary_datasets into own module. Want to break it down in smaller pieces and make process more configurable. Will be easier if its outside of monolithic tool class. Affected #: 2 files diff -r 0ce5e9eb984bc70f1f1c338a3f1d21ed9d6ddee4 -r 681f2cc6dcfa2a6674875d10fa8c7699be63dba4 lib/galaxy/tools/__init__.py --- a/lib/galaxy/tools/__init__.py +++ b/lib/galaxy/tools/__init__.py @@ -42,6 +42,7 @@ from galaxy.tools.deps import build_dependency_manager from galaxy.tools.deps.requirements import parse_requirements_from_xml from galaxy.tools.parameters import check_param, params_from_strings, params_to_strings +from galaxy.tools.parameters import output_collect from galaxy.tools.parameters.basic import (BaseURLToolParameter, DataToolParameter, HiddenToolParameter, LibraryDatasetToolParameter, SelectToolParameter, ToolParameter, UnvalidatedValue, @@ -2757,94 +2758,7 @@ Find any additional datasets generated by a tool and attach (for cases where number of outputs is not known in advance). 
""" - new_primary_datasets = {} - try: - json_file = open( os.path.join( job_working_directory, jobs.TOOL_PROVIDED_JOB_METADATA_FILE ), 'r' ) - for line in json_file: - line = json.loads( line ) - if line.get( 'type' ) == 'new_primary_dataset': - new_primary_datasets[ os.path.split( line.get( 'filename' ) )[-1] ] = line - except Exception: - # This should not be considered an error or warning condition, this file is optional - pass - # Loop through output file names, looking for generated primary - # datasets in form of: - # 'primary_associatedWithDatasetID_designation_visibility_extension(_DBKEY)' - primary_datasets = {} - for name, outdata in output.items(): - filenames = [] - if 'new_file_path' in self.app.config.collect_outputs_from: - filenames.extend( glob.glob(os.path.join(self.app.config.new_file_path, "primary_%i_*" % outdata.id) ) ) - if 'job_working_directory' in self.app.config.collect_outputs_from: - filenames.extend( glob.glob(os.path.join(job_working_directory, "primary_%i_*" % outdata.id) ) ) - for filename in filenames: - if not name in primary_datasets: - primary_datasets[name] = {} - fields = os.path.basename(filename).split("_") - fields.pop(0) - parent_id = int(fields.pop(0)) - designation = fields.pop(0) - visible = fields.pop(0).lower() - if visible == "visible": - visible = True - else: - visible = False - ext = fields.pop(0).lower() - dbkey = outdata.dbkey - if fields: - dbkey = fields[ 0 ] - # Create new primary dataset - primary_data = self.app.model.HistoryDatasetAssociation( extension=ext, - designation=designation, - visible=visible, - dbkey=dbkey, - create_dataset=True, - sa_session=self.sa_session ) - self.app.security_agent.copy_dataset_permissions( outdata.dataset, primary_data.dataset ) - self.sa_session.add( primary_data ) - self.sa_session.flush() - # Move data from temp location to dataset location - self.app.object_store.update_from_file(primary_data.dataset, file_name=filename, create=True) - primary_data.set_size() - primary_data.name = "%s (%s)" % ( outdata.name, designation ) - primary_data.info = outdata.info - primary_data.init_meta( copy_from=outdata ) - primary_data.dbkey = dbkey - # Associate new dataset with job - job = None - for assoc in outdata.creating_job_associations: - job = assoc.job - break - if job: - assoc = self.app.model.JobToOutputDatasetAssociation( '__new_primary_file_%s|%s__' % ( name, designation ), primary_data ) - assoc.job = job - self.sa_session.add( assoc ) - self.sa_session.flush() - primary_data.state = outdata.state - #add tool/metadata provided information - new_primary_datasets_attributes = new_primary_datasets.get( os.path.split( filename )[-1] ) - if new_primary_datasets_attributes: - dataset_att_by_name = dict( ext='extension' ) - for att_set in [ 'name', 'info', 'ext', 'dbkey' ]: - dataset_att_name = dataset_att_by_name.get( att_set, att_set ) - setattr( primary_data, dataset_att_name, new_primary_datasets_attributes.get( att_set, getattr( primary_data, dataset_att_name ) ) ) - primary_data.set_meta() - primary_data.set_peek() - self.sa_session.add( primary_data ) - self.sa_session.flush() - outdata.history.add_dataset( primary_data ) - # Add dataset to return dict - primary_datasets[name][designation] = primary_data - # Need to update all associated output hdas, i.e. 
history was - # shared with job running - for dataset in outdata.dataset.history_associations: - if outdata == dataset: - continue - new_data = primary_data.copy() - dataset.history.add_dataset( new_data ) - self.sa_session.add( new_data ) - self.sa_session.flush() - return primary_datasets + return output_collect.collect_primary_datatasets( self, output, job_working_directory ) def to_dict( self, trans, link_details=False, io_details=False ): """ Returns dict of tool. """ diff -r 0ce5e9eb984bc70f1f1c338a3f1d21ed9d6ddee4 -r 681f2cc6dcfa2a6674875d10fa8c7699be63dba4 lib/galaxy/tools/parameters/output_collect.py --- /dev/null +++ b/lib/galaxy/tools/parameters/output_collect.py @@ -0,0 +1,101 @@ +""" Code allowing tools to define extra files associated with an output datset. +""" +import os +import glob +import json + + +from galaxy import jobs + + +def collect_primary_datatasets( tool, output, job_working_directory ): + app = tool.app + sa_session = tool.sa_session + new_primary_datasets = {} + try: + json_file = open( os.path.join( job_working_directory, jobs.TOOL_PROVIDED_JOB_METADATA_FILE ), 'r' ) + for line in json_file: + line = json.loads( line ) + if line.get( 'type' ) == 'new_primary_dataset': + new_primary_datasets[ os.path.split( line.get( 'filename' ) )[-1] ] = line + except Exception: + # This should not be considered an error or warning condition, this file is optional + pass + # Loop through output file names, looking for generated primary + # datasets in form of: + # 'primary_associatedWithDatasetID_designation_visibility_extension(_DBKEY)' + primary_datasets = {} + for name, outdata in output.items(): + filenames = [] + if 'new_file_path' in app.config.collect_outputs_from: + filenames.extend( glob.glob(os.path.join(app.config.new_file_path, "primary_%i_*" % outdata.id) ) ) + if 'job_working_directory' in app.config.collect_outputs_from: + filenames.extend( glob.glob(os.path.join(job_working_directory, "primary_%i_*" % outdata.id) ) ) + for filename in filenames: + if not name in primary_datasets: + primary_datasets[name] = {} + fields = os.path.basename(filename).split("_") + fields.pop(0) + parent_id = int(fields.pop(0)) + designation = fields.pop(0) + visible = fields.pop(0).lower() + if visible == "visible": + visible = True + else: + visible = False + ext = fields.pop(0).lower() + dbkey = outdata.dbkey + if fields: + dbkey = fields[ 0 ] + # Create new primary dataset + primary_data = app.model.HistoryDatasetAssociation( extension=ext, + designation=designation, + visible=visible, + dbkey=dbkey, + create_dataset=True, + sa_session=sa_session ) + app.security_agent.copy_dataset_permissions( outdata.dataset, primary_data.dataset ) + sa_session.add( primary_data ) + sa_session.flush() + # Move data from temp location to dataset location + app.object_store.update_from_file(primary_data.dataset, file_name=filename, create=True) + primary_data.set_size() + primary_data.name = "%s (%s)" % ( outdata.name, designation ) + primary_data.info = outdata.info + primary_data.init_meta( copy_from=outdata ) + primary_data.dbkey = dbkey + # Associate new dataset with job + job = None + for assoc in outdata.creating_job_associations: + job = assoc.job + break + if job: + assoc = app.model.JobToOutputDatasetAssociation( '__new_primary_file_%s|%s__' % ( name, designation ), primary_data ) + assoc.job = job + sa_session.add( assoc ) + sa_session.flush() + primary_data.state = outdata.state + #add tool/metadata provided information + new_primary_datasets_attributes = new_primary_datasets.get( 
os.path.split( filename )[-1] ) + if new_primary_datasets_attributes: + dataset_att_by_name = dict( ext='extension' ) + for att_set in [ 'name', 'info', 'ext', 'dbkey' ]: + dataset_att_name = dataset_att_by_name.get( att_set, att_set ) + setattr( primary_data, dataset_att_name, new_primary_datasets_attributes.get( att_set, getattr( primary_data, dataset_att_name ) ) ) + primary_data.set_meta() + primary_data.set_peek() + sa_session.add( primary_data ) + sa_session.flush() + outdata.history.add_dataset( primary_data ) + # Add dataset to return dict + primary_datasets[name][designation] = primary_data + # Need to update all associated output hdas, i.e. history was + # shared with job running + for dataset in outdata.dataset.history_associations: + if outdata == dataset: + continue + new_data = primary_data.copy() + dataset.history.add_dataset( new_data ) + sa_session.add( new_data ) + sa_session.flush() + return primary_datasets https://bitbucket.org/galaxy/galaxy-central/commits/7b8a02bb183b/ Changeset: 7b8a02bb183b User: jmchilton Date: 2014-03-29 23:11:17 Summary: Rework parsing of metadata from file names of runtime discovered datasets. Newer method uses regex and named groups instead of split for more extensible approach to parsing metadata fields during primary dataset collecting. This method will be easier to allow override and configuration of in subsequent changesets. Affected #: 1 file diff -r 681f2cc6dcfa2a6674875d10fa8c7699be63dba4 -r 7b8a02bb183bcd7edf6ee9997ea73fe2b6097d5b lib/galaxy/tools/parameters/output_collect.py --- a/lib/galaxy/tools/parameters/output_collect.py +++ b/lib/galaxy/tools/parameters/output_collect.py @@ -1,12 +1,15 @@ """ Code allowing tools to define extra files associated with an output datset. """ import os +import re import glob import json from galaxy import jobs +DEFAULT_EXTRA_FILENAME_PATTERN = re.compile(r"primary_(?P<id>\d+)_(?P<designation>[^_]+)_(?P<visible>[^_]+)_(?P<ext>[^_]+)(_(?P<dbkey>[^_]+))?") + def collect_primary_datatasets( tool, output, job_working_directory ): app = tool.app @@ -34,19 +37,14 @@ for filename in filenames: if not name in primary_datasets: primary_datasets[name] = {} - fields = os.path.basename(filename).split("_") - fields.pop(0) - parent_id = int(fields.pop(0)) - designation = fields.pop(0) - visible = fields.pop(0).lower() - if visible == "visible": - visible = True - else: - visible = False - ext = fields.pop(0).lower() - dbkey = outdata.dbkey - if fields: - dbkey = fields[ 0 ] + fields_match = DEFAULT_EXTRA_FILENAME_PATTERN.match( os.path.basename(filename) ) + if not fields_match: + # Before I guess pop() would just have thrown an IndexError + raise Exception( "Problem parsing metadata fields for file %s" % filename ) + designation = fields_match.group( "designation" ) + visible = fields_match.group( "visible" ).lower() == "visible" + ext = fields_match.group( "ext" ).lower() + dbkey = fields_match.group( "dbkey" ) or outdata.dbkey # Create new primary dataset primary_data = app.model.HistoryDatasetAssociation( extension=ext, designation=designation, https://bitbucket.org/galaxy/galaxy-central/commits/56e9203f0200/ Changeset: 56e9203f0200 User: jmchilton Date: 2014-03-29 23:11:17 Summary: Allow tool outputs to configure runtime dataset discovery. Output tags on tool XML datasets may contain any number of child "discover_datasets" elements describing how Galaxy should discover datasests. This new method only works for job_working_directory collection - new_file_path based discovery should be considered deprecated. 
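As a rough standalone sketch (not part of this changeset) of the idea: named regex groups in a discovery pattern are mapped onto dataset attributes such as designation, dbkey and ext, as done by the CollectedDatasetMatch class in the diff below. The pattern and file names here are invented for illustration and loosely follow the custom-pattern unit test added further down.

    import re

    # Hypothetical discovery pattern: "<designation>__<dbkey>.<ext>".
    PATTERN = r"(?P<designation>.+)__(?P<dbkey>.+)\.(?P<ext>[^.]+)$"

    for filename in ("samp1__hg19.fasta", "samp2__lactLact.fasta", "overview.txt"):
        match = re.match(PATTERN, filename)
        if match is None:
            # Files that do not match the pattern are simply not collected.
            continue
        fields = match.groupdict()
        print("%s -> designation=%s dbkey=%s ext=%s"
              % (filename, fields["designation"], fields["dbkey"], fields["ext"]))
    # samp1__hg19.fasta -> designation=samp1 dbkey=hg19 ext=fasta
    # samp2__lactLact.fasta -> designation=samp2 dbkey=lactLact ext=fasta
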
Example unit and functional tests describe this new configurability in detail. Affected #: 5 files diff -r 7b8a02bb183bcd7edf6ee9997ea73fe2b6097d5b -r 56e9203f020027c8c7a294555f6286ce654f79d2 lib/galaxy/tools/__init__.py --- a/lib/galaxy/tools/__init__.py +++ b/lib/galaxy/tools/__init__.py @@ -1413,6 +1413,7 @@ output.hidden = string_as_bool( data_elem.get("hidden", "") ) output.tool = self output.actions = ToolOutputActionGroup( output, data_elem.find( 'actions' ) ) + output.dataset_collectors = output_collect.dataset_collectors_from_elem( data_elem ) self.outputs[ output.name ] = output # TODO: Include the tool's name in any parsing warnings. diff -r 7b8a02bb183bcd7edf6ee9997ea73fe2b6097d5b -r 56e9203f020027c8c7a294555f6286ce654f79d2 lib/galaxy/tools/parameters/output_collect.py --- a/lib/galaxy/tools/parameters/output_collect.py +++ b/lib/galaxy/tools/parameters/output_collect.py @@ -7,8 +7,11 @@ from galaxy import jobs +from galaxy import util +from galaxy.util import odict -DEFAULT_EXTRA_FILENAME_PATTERN = re.compile(r"primary_(?P<id>\d+)_(?P<designation>[^_]+)_(?P<visible>[^_]+)_(?P<ext>[^_]+)(_(?P<dbkey>[^_]+))?") +DATASET_ID_TOKEN = "DATASET_ID" +DEFAULT_EXTRA_FILENAME_PATTERN = r"primary_DATASET_ID_(?P<designation>[^_]+)_(?P<visible>[^_]+)_(?P<ext>[^_]+)(_(?P<dbkey>[^_]+))?" def collect_primary_datatasets( tool, output, job_working_directory ): @@ -29,22 +32,41 @@ # 'primary_associatedWithDatasetID_designation_visibility_extension(_DBKEY)' primary_datasets = {} for name, outdata in output.items(): - filenames = [] + dataset_collectors = tool.outputs[ name ].dataset_collectors if name in tool.outputs else [ DEFAULT_DATASET_COLLECTOR ] + filenames = odict.odict() if 'new_file_path' in app.config.collect_outputs_from: - filenames.extend( glob.glob(os.path.join(app.config.new_file_path, "primary_%i_*" % outdata.id) ) ) + if DEFAULT_DATASET_COLLECTOR in dataset_collectors: + # 'new_file_path' collection should be considered deprecated, + # only use old-style matching (glob instead of regex and only + # using default collector - if enabled). + for filename in glob.glob(os.path.join(app.config.new_file_path, "primary_%i_*" % outdata.id) ): + filenames[ filename ] = DEFAULT_DATASET_COLLECTOR if 'job_working_directory' in app.config.collect_outputs_from: - filenames.extend( glob.glob(os.path.join(job_working_directory, "primary_%i_*" % outdata.id) ) ) - for filename in filenames: + for extra_file_collector in dataset_collectors: + directory = job_working_directory + if extra_file_collector.directory: + directory = os.path.join( directory, extra_file_collector.directory ) + if not util.in_directory( directory, job_working_directory ): + raise Exception( "Problem with tool configuration, attempting to pull in datasets from outside working directory." 
) + if not os.path.isdir( directory ): + continue + for filename in os.listdir( directory ): + path = os.path.join( directory, filename ) + if not os.path.isfile( path ): + continue + if extra_file_collector.match( outdata, filename ): + filenames[ path ] = extra_file_collector + for filename, extra_file_collector in filenames.iteritems(): if not name in primary_datasets: primary_datasets[name] = {} - fields_match = DEFAULT_EXTRA_FILENAME_PATTERN.match( os.path.basename(filename) ) + fields_match = extra_file_collector.match( outdata, os.path.basename( filename ) ) if not fields_match: # Before I guess pop() would just have thrown an IndexError raise Exception( "Problem parsing metadata fields for file %s" % filename ) - designation = fields_match.group( "designation" ) - visible = fields_match.group( "visible" ).lower() == "visible" - ext = fields_match.group( "ext" ).lower() - dbkey = fields_match.group( "dbkey" ) or outdata.dbkey + designation = fields_match.designation + visible = fields_match.visible + ext = fields_match.ext + dbkey = fields_match.dbkey # Create new primary dataset primary_data = app.model.HistoryDatasetAssociation( extension=ext, designation=designation, @@ -58,7 +80,9 @@ # Move data from temp location to dataset location app.object_store.update_from_file(primary_data.dataset, file_name=filename, create=True) primary_data.set_size() - primary_data.name = "%s (%s)" % ( outdata.name, designation ) + # If match specified a name use otherwise generate one from + # designation. + primary_data.name = fields_match.name or "%s (%s)" % ( outdata.name, designation ) primary_data.info = outdata.info primary_data.init_meta( copy_from=outdata ) primary_data.dbkey = dbkey @@ -97,3 +121,99 @@ sa_session.add( new_data ) sa_session.flush() return primary_datasets + + +# XML can describe custom patterns, but these literals describe named +# patterns that will be replaced. 
+NAMED_PATTERNS = { + "__default__": DEFAULT_EXTRA_FILENAME_PATTERN, + "__name__": r"(?P<name>.*)", + "__designation__": r"(?P<designation>.*)", + "__name_and_ext__": r"(?P<name>.*)\.(?P<ext>[^\.]+)?", + "__designation_and_ext__": r"(?P<designation>.*)\.(?P<ext>[^\._]+)?", +} + + +def dataset_collectors_from_elem( elem ): + primary_dataset_elems = elem.findall( "discover_datasets" ) + if not primary_dataset_elems: + return [ DEFAULT_DATASET_COLLECTOR ] + else: + return map( lambda elem: DatasetCollector( **elem.attrib ), primary_dataset_elems ) + + +class DatasetCollector( object ): + + def __init__( self, **kwargs ): + pattern = kwargs.get( "pattern", "__default__" ) + if pattern in NAMED_PATTERNS: + pattern = NAMED_PATTERNS.get( pattern ) + self.pattern = pattern + self.default_dbkey = kwargs.get( "dbkey", None ) + self.default_ext = kwargs.get( "ext", None ) + self.default_visible = util.asbool( kwargs.get( "visible", None ) ) + self.directory = kwargs.get( "directory", None ) + + def pattern_for_dataset( self, dataset_instance=None ): + token_replacement = r'\d+' + if dataset_instance: + token_replacement = str( dataset_instance.id ) + return self.pattern.replace( DATASET_ID_TOKEN, token_replacement ) + + def match( self, dataset_instance, filename ): + re_match = re.match( self.pattern_for_dataset( dataset_instance ), filename ) + match_object = None + if re_match: + match_object = CollectedDatasetMatch( re_match, self ) + return match_object + + +class CollectedDatasetMatch( object ): + + def __init__( self, re_match, collector ): + self.re_match = re_match + self.collector = collector + + @property + def designation( self ): + re_match = self.re_match + if "designation" in re_match.groupdict(): + return re_match.group( "designation" ) + elif "name" in re_match.groupdict(): + return re_match.group( "name" ) + else: + return None + + @property + def name( self ): + """ Return name or None if not defined by the discovery pattern. 
+ """ + re_match = self.re_match + name = None + if "name" in re_match.groupdict(): + name = re_match.group( "name" ) + return name + + @property + def dbkey( self ): + try: + return self.re_match.group( "dbkey" ) + except IndexError: + return self.collector.default_dbkey + + @property + def ext( self ): + try: + return self.re_match.group( "ext" ) + except IndexError: + return self.collector.default_ext + + @property + def visible( self ): + try: + return self.re_match.group( "visible" ).lower() == "visible" + except IndexError: + return self.collector.default_visible + + +DEFAULT_DATASET_COLLECTOR = DatasetCollector() diff -r 7b8a02bb183bcd7edf6ee9997ea73fe2b6097d5b -r 56e9203f020027c8c7a294555f6286ce654f79d2 test/functional/tools/multi_output_configured.xml --- /dev/null +++ b/test/functional/tools/multi_output_configured.xml @@ -0,0 +1,43 @@ +<tool id="multi_output_configured" name="Multi_Output_Configured" description="multi_output_configured" force_history_refresh="True" version="0.1.0"> + <command> + echo "Hello" > $report; + mkdir subdir1; + echo "This" > subdir1/this.txt; + echo "That" > subdir1/that.txt; + mkdir subdir2; + echo "1" > subdir2/CUSTOM_1.txt; + echo "2" > subdir2/CUSTOM_2.tabular; + echo "3" > subdir2/CUSTOM_3.txt; + </command> + <inputs> + <param name="input" type="integer" value="7" /> + </inputs> + <outputs> + <data format="txt" name="report"> + <discover_datasets pattern="__designation_and_ext__" directory="subdir1" /> + <discover_datasets pattern="CUSTOM_(?P<designation>.+)\.(?P<ext>.+)" directory="subdir2" /> + </data> + </outputs> + <tests> + <test> + <param name="input" value="7" /> + <output name="report"> + <assert_contents> + <has_line line="Hello" /> + </assert_contents> + <discovered_dataset designation="this" ftype="txt"> + <assert_contents><has_line line="This" /></assert_contents> + </discovered_dataset> + <discovered_dataset designation="that" ftype="txt"> + <assert_contents><has_line line="That" /></assert_contents> + </discovered_dataset> + <discovered_dataset designation="1" ftype="txt"> + <assert_contents><has_line line="1" /></assert_contents> + </discovered_dataset> + <discovered_dataset designation="2" ftype="tabular"> + <assert_contents><has_line line="2" /></assert_contents> + </discovered_dataset> + </output> + </test> + </tests> +</tool> diff -r 7b8a02bb183bcd7edf6ee9997ea73fe2b6097d5b -r 56e9203f020027c8c7a294555f6286ce654f79d2 test/functional/tools/samples_tool_conf.xml --- a/test/functional/tools/samples_tool_conf.xml +++ b/test/functional/tools/samples_tool_conf.xml @@ -8,6 +8,7 @@ <tool file="multi_page.xml"/><tool file="multi_select.xml" /><tool file="multi_output.xml" /> + <tool file="multi_output_configured.xml" /><tool file="composite_output.xml" /><tool file="metadata.xml" /><tool file="output_order.xml" /> diff -r 7b8a02bb183bcd7edf6ee9997ea73fe2b6097d5b -r 56e9203f020027c8c7a294555f6286ce654f79d2 test/unit/tools/test_collect_primary_datasets.py --- a/test/unit/tools/test_collect_primary_datasets.py +++ b/test/unit/tools/test_collect_primary_datasets.py @@ -5,6 +5,8 @@ import tools_support from galaxy import model +from galaxy import util +from galaxy.tools.parameters import output_collect DEFAULT_TOOL_OUTPUT = "out1" DEFAULT_EXTRA_NAME = "test1" @@ -114,6 +116,75 @@ extra_job_assoc = filter( lambda job_assoc: job_assoc.name.startswith( "__" ), self.job.output_datasets )[ 0 ] assert extra_job_assoc.name == "__new_primary_file_out1|test1__" + def test_pattern_override_designation( self ): + self._replace_output_collectors( 
'''<output><discover_datasets pattern="__designation__" directory="subdir" ext="txt" /></output>''' ) + self._setup_extra_file( subdir="subdir", filename="foo.txt" ) + primary_outputs = self._collect( )[ DEFAULT_TOOL_OUTPUT ] + assert len( primary_outputs ) == 1 + created_hda = primary_outputs.values()[ 0 ] + assert "foo.txt" in created_hda.name + assert created_hda.ext == "txt" + + def test_name_and_ext_pattern( self ): + self._replace_output_collectors( '''<output><discover_datasets pattern="__name_and_ext__" directory="subdir" /></output>''' ) + self._setup_extra_file( subdir="subdir", filename="foo1.txt" ) + self._setup_extra_file( subdir="subdir", filename="foo2.tabular" ) + primary_outputs = self._collect( )[ DEFAULT_TOOL_OUTPUT ] + assert len( primary_outputs ) == 2 + assert primary_outputs[ "foo1" ].ext == "txt" + assert primary_outputs[ "foo2" ].ext == "tabular" + + def test_custom_pattern( self ): + # Hypothetical oral metagenomic classifier that populates a directory + # of files based on name and genome. Use custom regex pattern to grab + # and classify these files. + self._replace_output_collectors( '''<output><discover_datasets pattern="(?P<designation>.*)__(?P<dbkey>.*).fasta" directory="genome_breakdown" ext="fasta" /></output>''' ) + self._setup_extra_file( subdir="genome_breakdown", filename="samp1__hg19.fasta" ) + self._setup_extra_file( subdir="genome_breakdown", filename="samp2__lactLact.fasta" ) + self._setup_extra_file( subdir="genome_breakdown", filename="samp3__hg19.fasta" ) + self._setup_extra_file( subdir="genome_breakdown", filename="samp4__lactPlan.fasta" ) + self._setup_extra_file( subdir="genome_breakdown", filename="samp5__fusoNucl.fasta" ) + + # Put a file in directory we don't care about, just to make sure + # it doesn't get picked up by pattern. + self._setup_extra_file( subdir="genome_breakdown", filename="overview.txt" ) + + primary_outputs = self._collect( )[ DEFAULT_TOOL_OUTPUT ] + assert len( primary_outputs ) == 5 + genomes = dict( samp1="hg19", samp2="lactLact", samp3="hg19", samp4="lactPlan", samp5="fusoNucl" ) + for key, hda in primary_outputs.iteritems(): + assert hda.dbkey == genomes[ key ] + + def test_name_versus_designation( self ): + """ This test demonstrates the difference between name and desgination + in grouping patterns and named patterns such as __designation__, + __name__, __designation_and_ext__, and __name_and_ext__. + """ + self._replace_output_collectors( '''<output> + <discover_datasets pattern="__name_and_ext__" directory="subdir_for_name_discovery" /> + <discover_datasets pattern="__designation_and_ext__" directory="subdir_for_designation_discovery" /> + </output>''') + self._setup_extra_file( subdir="subdir_for_name_discovery", filename="example1.txt" ) + self._setup_extra_file( subdir="subdir_for_designation_discovery", filename="example2.txt" ) + primary_outputs = self._collect( )[ DEFAULT_TOOL_OUTPUT ] + name_output = primary_outputs[ "example1" ] + designation_output = primary_outputs[ "example2" ] + # While name is also used for designation, designation is not the name - + # it is used in the calculation of the name however... 
+ assert name_output.name == "example1" + assert designation_output.name == "%s (%s)" % ( self.hda.name, "example2" ) + + def test_cannot_read_files_outside_job_directory( self ): + self._replace_output_collectors( '''<output> + <discover_datasets pattern="__name_and_ext__" directory="../../secrets" /> + </output>''') + exception_thrown = False + try: + self._collect( ) + except Exception: + exception_thrown = True + assert exception_thrown + def _collect_default_extra( self, **kwargs ): return self._collect( **kwargs )[ DEFAULT_TOOL_OUTPUT ][ DEFAULT_EXTRA_NAME ] @@ -122,6 +193,12 @@ job_working_directory = self.test_directory return self.tool.collect_primary_datasets( self.outputs, job_working_directory ) + def _replace_output_collectors( self, xml_str ): + # Rewrite tool as if it had been created with output containing + # supplied dataset_collector elem. + elem = util.parse_xml_string( xml_str ) + self.tool.outputs[ DEFAULT_TOOL_OUTPUT ].dataset_collectors = output_collect.dataset_collectors_from_elem( elem ) + def _append_job_json( self, object, output_path=None, line_type="new_primary_dataset" ): object[ "type" ] = line_type if output_path: @@ -133,7 +210,8 @@ def _setup_extra_file( self, **kwargs ): path = kwargs.get( "path", None ) - if not path: + filename = kwargs.get( "filename", None ) + if not path and not filename: name = kwargs.get( "name", DEFAULT_EXTRA_NAME ) visible = kwargs.get( "visible", "visible" ) ext = kwargs.get( "ext", "data" ) @@ -142,6 +220,13 @@ path = os.path.join( directory, "primary_%s_%s_%s_%s" % template_args ) if "dbkey" in kwargs: path = "%s_%s" % ( path, kwargs[ "dbkey" ] ) + if not path: + assert filename + subdir = kwargs.get( "subdir", "." ) + path = os.path.join( self.test_directory, subdir, filename ) + directory = os.path.dirname( path ) + if not os.path.exists( directory ): + os.makedirs( directory ) contents = kwargs.get( "contents", "test contents" ) open( path, "w" ).write( contents ) return path https://bitbucket.org/galaxy/galaxy-central/commits/8e6cda4c1b3d/ Changeset: 8e6cda4c1b3d User: jmchilton Date: 2014-05-06 15:13:29 Summary: Merged in jmchilton/galaxy-central-fork-1 (pull request #356) Enhancements for Runtime Discovered (Collected Primary) Datasets Affected #: 13 files diff -r 74b6e23ed7882f3c091d2b66ce85025241372017 -r 8e6cda4c1b3d1685a8ce07412dc542f34eb8b44b lib/galaxy/tools/__init__.py --- a/lib/galaxy/tools/__init__.py +++ b/lib/galaxy/tools/__init__.py @@ -42,6 +42,7 @@ from galaxy.tools.deps import build_dependency_manager from galaxy.tools.deps.requirements import parse_requirements_from_xml from galaxy.tools.parameters import check_param, params_from_strings, params_to_strings +from galaxy.tools.parameters import output_collect from galaxy.tools.parameters.basic import (BaseURLToolParameter, DataToolParameter, HiddenToolParameter, LibraryDatasetToolParameter, SelectToolParameter, ToolParameter, UnvalidatedValue, @@ -1452,6 +1453,7 @@ output.hidden = string_as_bool( data_elem.get("hidden", "") ) output.tool = self output.actions = ToolOutputActionGroup( output, data_elem.find( 'actions' ) ) + output.dataset_collectors = output_collect.dataset_collectors_from_elem( data_elem ) self.outputs[ output.name ] = output # TODO: Include the tool's name in any parsing warnings. @@ -2800,94 +2802,7 @@ Find any additional datasets generated by a tool and attach (for cases where number of outputs is not known in advance). 
""" - new_primary_datasets = {} - try: - json_file = open( os.path.join( job_working_directory, jobs.TOOL_PROVIDED_JOB_METADATA_FILE ), 'r' ) - for line in json_file: - line = json.loads( line ) - if line.get( 'type' ) == 'new_primary_dataset': - new_primary_datasets[ os.path.split( line.get( 'filename' ) )[-1] ] = line - except Exception: - # This should not be considered an error or warning condition, this file is optional - pass - # Loop through output file names, looking for generated primary - # datasets in form of: - # 'primary_associatedWithDatasetID_designation_visibility_extension(_DBKEY)' - primary_datasets = {} - for name, outdata in output.items(): - filenames = [] - if 'new_file_path' in self.app.config.collect_outputs_from: - filenames.extend( glob.glob(os.path.join(self.app.config.new_file_path, "primary_%i_*" % outdata.id) ) ) - if 'job_working_directory' in self.app.config.collect_outputs_from: - filenames.extend( glob.glob(os.path.join(job_working_directory, "primary_%i_*" % outdata.id) ) ) - for filename in filenames: - if not name in primary_datasets: - primary_datasets[name] = {} - fields = os.path.basename(filename).split("_") - fields.pop(0) - parent_id = int(fields.pop(0)) - designation = fields.pop(0) - visible = fields.pop(0).lower() - if visible == "visible": - visible = True - else: - visible = False - ext = fields.pop(0).lower() - dbkey = outdata.dbkey - if fields: - dbkey = fields[ 0 ] - # Create new primary dataset - primary_data = self.app.model.HistoryDatasetAssociation( extension=ext, - designation=designation, - visible=visible, - dbkey=dbkey, - create_dataset=True, - sa_session=self.sa_session ) - self.app.security_agent.copy_dataset_permissions( outdata.dataset, primary_data.dataset ) - self.sa_session.add( primary_data ) - self.sa_session.flush() - # Move data from temp location to dataset location - self.app.object_store.update_from_file(primary_data.dataset, file_name=filename, create=True) - primary_data.set_size() - primary_data.name = "%s (%s)" % ( outdata.name, designation ) - primary_data.info = outdata.info - primary_data.init_meta( copy_from=outdata ) - primary_data.dbkey = dbkey - # Associate new dataset with job - job = None - for assoc in outdata.creating_job_associations: - job = assoc.job - break - if job: - assoc = self.app.model.JobToOutputDatasetAssociation( '__new_primary_file_%s|%s__' % ( name, designation ), primary_data ) - assoc.job = job - self.sa_session.add( assoc ) - self.sa_session.flush() - primary_data.state = outdata.state - #add tool/metadata provided information - new_primary_datasets_attributes = new_primary_datasets.get( os.path.split( filename )[-1] ) - if new_primary_datasets_attributes: - dataset_att_by_name = dict( ext='extension' ) - for att_set in [ 'name', 'info', 'ext', 'dbkey' ]: - dataset_att_name = dataset_att_by_name.get( att_set, att_set ) - setattr( primary_data, dataset_att_name, new_primary_datasets_attributes.get( att_set, getattr( primary_data, dataset_att_name ) ) ) - primary_data.set_meta() - primary_data.set_peek() - self.sa_session.add( primary_data ) - self.sa_session.flush() - outdata.history.add_dataset( primary_data ) - # Add dataset to return dict - primary_datasets[name][designation] = primary_data - # Need to update all associated output hdas, i.e. 
history was - # shared with job running - for dataset in outdata.dataset.history_associations: - if outdata == dataset: - continue - new_data = primary_data.copy() - dataset.history.add_dataset( new_data ) - self.sa_session.add( new_data ) - self.sa_session.flush() - return primary_datasets + return output_collect.collect_primary_datatasets( self, output, job_working_directory ) def to_dict( self, trans, link_details=False, io_details=False ): """ Returns dict of tool. """ diff -r 74b6e23ed7882f3c091d2b66ce85025241372017 -r 8e6cda4c1b3d1685a8ce07412dc542f34eb8b44b lib/galaxy/tools/parameters/output_collect.py --- /dev/null +++ b/lib/galaxy/tools/parameters/output_collect.py @@ -0,0 +1,219 @@ +""" Code allowing tools to define extra files associated with an output datset. +""" +import os +import re +import glob +import json + + +from galaxy import jobs +from galaxy import util +from galaxy.util import odict + +DATASET_ID_TOKEN = "DATASET_ID" +DEFAULT_EXTRA_FILENAME_PATTERN = r"primary_DATASET_ID_(?P<designation>[^_]+)_(?P<visible>[^_]+)_(?P<ext>[^_]+)(_(?P<dbkey>[^_]+))?" + + +def collect_primary_datatasets( tool, output, job_working_directory ): + app = tool.app + sa_session = tool.sa_session + new_primary_datasets = {} + try: + json_file = open( os.path.join( job_working_directory, jobs.TOOL_PROVIDED_JOB_METADATA_FILE ), 'r' ) + for line in json_file: + line = json.loads( line ) + if line.get( 'type' ) == 'new_primary_dataset': + new_primary_datasets[ os.path.split( line.get( 'filename' ) )[-1] ] = line + except Exception: + # This should not be considered an error or warning condition, this file is optional + pass + # Loop through output file names, looking for generated primary + # datasets in form of: + # 'primary_associatedWithDatasetID_designation_visibility_extension(_DBKEY)' + primary_datasets = {} + for name, outdata in output.items(): + dataset_collectors = tool.outputs[ name ].dataset_collectors if name in tool.outputs else [ DEFAULT_DATASET_COLLECTOR ] + filenames = odict.odict() + if 'new_file_path' in app.config.collect_outputs_from: + if DEFAULT_DATASET_COLLECTOR in dataset_collectors: + # 'new_file_path' collection should be considered deprecated, + # only use old-style matching (glob instead of regex and only + # using default collector - if enabled). + for filename in glob.glob(os.path.join(app.config.new_file_path, "primary_%i_*" % outdata.id) ): + filenames[ filename ] = DEFAULT_DATASET_COLLECTOR + if 'job_working_directory' in app.config.collect_outputs_from: + for extra_file_collector in dataset_collectors: + directory = job_working_directory + if extra_file_collector.directory: + directory = os.path.join( directory, extra_file_collector.directory ) + if not util.in_directory( directory, job_working_directory ): + raise Exception( "Problem with tool configuration, attempting to pull in datasets from outside working directory." 
) + if not os.path.isdir( directory ): + continue + for filename in os.listdir( directory ): + path = os.path.join( directory, filename ) + if not os.path.isfile( path ): + continue + if extra_file_collector.match( outdata, filename ): + filenames[ path ] = extra_file_collector + for filename, extra_file_collector in filenames.iteritems(): + if not name in primary_datasets: + primary_datasets[name] = {} + fields_match = extra_file_collector.match( outdata, os.path.basename( filename ) ) + if not fields_match: + # Before I guess pop() would just have thrown an IndexError + raise Exception( "Problem parsing metadata fields for file %s" % filename ) + designation = fields_match.designation + visible = fields_match.visible + ext = fields_match.ext + dbkey = fields_match.dbkey + # Create new primary dataset + primary_data = app.model.HistoryDatasetAssociation( extension=ext, + designation=designation, + visible=visible, + dbkey=dbkey, + create_dataset=True, + sa_session=sa_session ) + app.security_agent.copy_dataset_permissions( outdata.dataset, primary_data.dataset ) + sa_session.add( primary_data ) + sa_session.flush() + # Move data from temp location to dataset location + app.object_store.update_from_file(primary_data.dataset, file_name=filename, create=True) + primary_data.set_size() + # If match specified a name use otherwise generate one from + # designation. + primary_data.name = fields_match.name or "%s (%s)" % ( outdata.name, designation ) + primary_data.info = outdata.info + primary_data.init_meta( copy_from=outdata ) + primary_data.dbkey = dbkey + # Associate new dataset with job + job = None + for assoc in outdata.creating_job_associations: + job = assoc.job + break + if job: + assoc = app.model.JobToOutputDatasetAssociation( '__new_primary_file_%s|%s__' % ( name, designation ), primary_data ) + assoc.job = job + sa_session.add( assoc ) + sa_session.flush() + primary_data.state = outdata.state + #add tool/metadata provided information + new_primary_datasets_attributes = new_primary_datasets.get( os.path.split( filename )[-1] ) + if new_primary_datasets_attributes: + dataset_att_by_name = dict( ext='extension' ) + for att_set in [ 'name', 'info', 'ext', 'dbkey' ]: + dataset_att_name = dataset_att_by_name.get( att_set, att_set ) + setattr( primary_data, dataset_att_name, new_primary_datasets_attributes.get( att_set, getattr( primary_data, dataset_att_name ) ) ) + primary_data.set_meta() + primary_data.set_peek() + sa_session.add( primary_data ) + sa_session.flush() + outdata.history.add_dataset( primary_data ) + # Add dataset to return dict + primary_datasets[name][designation] = primary_data + # Need to update all associated output hdas, i.e. history was + # shared with job running + for dataset in outdata.dataset.history_associations: + if outdata == dataset: + continue + new_data = primary_data.copy() + dataset.history.add_dataset( new_data ) + sa_session.add( new_data ) + sa_session.flush() + return primary_datasets + + +# XML can describe custom patterns, but these literals describe named +# patterns that will be replaced. 
+NAMED_PATTERNS = { + "__default__": DEFAULT_EXTRA_FILENAME_PATTERN, + "__name__": r"(?P<name>.*)", + "__designation__": r"(?P<designation>.*)", + "__name_and_ext__": r"(?P<name>.*)\.(?P<ext>[^\.]+)?", + "__designation_and_ext__": r"(?P<designation>.*)\.(?P<ext>[^\._]+)?", +} + + +def dataset_collectors_from_elem( elem ): + primary_dataset_elems = elem.findall( "discover_datasets" ) + if not primary_dataset_elems: + return [ DEFAULT_DATASET_COLLECTOR ] + else: + return map( lambda elem: DatasetCollector( **elem.attrib ), primary_dataset_elems ) + + +class DatasetCollector( object ): + + def __init__( self, **kwargs ): + pattern = kwargs.get( "pattern", "__default__" ) + if pattern in NAMED_PATTERNS: + pattern = NAMED_PATTERNS.get( pattern ) + self.pattern = pattern + self.default_dbkey = kwargs.get( "dbkey", None ) + self.default_ext = kwargs.get( "ext", None ) + self.default_visible = util.asbool( kwargs.get( "visible", None ) ) + self.directory = kwargs.get( "directory", None ) + + def pattern_for_dataset( self, dataset_instance=None ): + token_replacement = r'\d+' + if dataset_instance: + token_replacement = str( dataset_instance.id ) + return self.pattern.replace( DATASET_ID_TOKEN, token_replacement ) + + def match( self, dataset_instance, filename ): + re_match = re.match( self.pattern_for_dataset( dataset_instance ), filename ) + match_object = None + if re_match: + match_object = CollectedDatasetMatch( re_match, self ) + return match_object + + +class CollectedDatasetMatch( object ): + + def __init__( self, re_match, collector ): + self.re_match = re_match + self.collector = collector + + @property + def designation( self ): + re_match = self.re_match + if "designation" in re_match.groupdict(): + return re_match.group( "designation" ) + elif "name" in re_match.groupdict(): + return re_match.group( "name" ) + else: + return None + + @property + def name( self ): + """ Return name or None if not defined by the discovery pattern. 
+ """ + re_match = self.re_match + name = None + if "name" in re_match.groupdict(): + name = re_match.group( "name" ) + return name + + @property + def dbkey( self ): + try: + return self.re_match.group( "dbkey" ) + except IndexError: + return self.collector.default_dbkey + + @property + def ext( self ): + try: + return self.re_match.group( "ext" ) + except IndexError: + return self.collector.default_ext + + @property + def visible( self ): + try: + return self.re_match.group( "visible" ).lower() == "visible" + except IndexError: + return self.collector.default_visible + + +DEFAULT_DATASET_COLLECTOR = DatasetCollector() diff -r 74b6e23ed7882f3c091d2b66ce85025241372017 -r 8e6cda4c1b3d1685a8ce07412dc542f34eb8b44b lib/galaxy/tools/test.py --- a/lib/galaxy/tools/test.py +++ b/lib/galaxy/tools/test.py @@ -297,6 +297,19 @@ if name is None: raise Exception( "Test output does not have a 'name'" ) + file, attributes = __parse_test_attributes( output_elem, attrib ) + primary_datasets = {} + for primary_elem in ( output_elem.findall( "discovered_dataset" ) or [] ): + primary_attrib = dict( primary_elem.attrib ) + designation = primary_attrib.pop( 'designation', None ) + if designation is None: + raise Exception( "Test primary dataset does not have a 'designation'" ) + primary_datasets[ designation ] = __parse_test_attributes( primary_elem, primary_attrib ) + attributes[ "primary_datasets" ] = primary_datasets + return name, file, attributes + + +def __parse_test_attributes( output_elem, attrib ): assert_list = __parse_assert_list( output_elem ) file = attrib.pop( 'file', None ) # File no longer required if an list of assertions was present. @@ -321,7 +334,7 @@ attributes['assert_list'] = assert_list attributes['extra_files'] = extra_files attributes['metadata'] = metadata - return name, file, attributes + return file, attributes def __parse_assert_list( output_elem ): diff -r 74b6e23ed7882f3c091d2b66ce85025241372017 -r 8e6cda4c1b3d1685a8ce07412dc542f34eb8b44b lib/galaxy/webapps/galaxy/api/jobs.py --- a/lib/galaxy/webapps/galaxy/api/jobs.py +++ b/lib/galaxy/webapps/galaxy/api/jobs.py @@ -13,6 +13,7 @@ from galaxy.web.base.controller import UsesLibraryMixinItems from galaxy import exceptions from galaxy import util +from galaxy import model import logging log = logging.getLogger( __name__ ) @@ -91,10 +92,62 @@ :rtype: dictionary :returns: dictionary containing full description of job data """ + job = self.__get_job( trans, id ) + return self.encode_all_ids( trans, job.to_dict( 'element' ), True ) + + @expose_api + def inputs( self, trans, id, **kwd ): + """ + show( trans, id ) + * GET /api/jobs/{job_id}/inputs + returns input datasets created by job + + :type id: string + :param id: Encoded job id + + :rtype: dictionary + :returns: dictionary containing input dataset associations + """ + job = self.__get_job( trans, id ) + return self.__dictify_associations( trans, job.input_datasets, job.input_library_datasets ) + + @expose_api + def outputs( self, trans, id, **kwd ): + """ + show( trans, id ) + * GET /api/jobs/{job_id}/outputs + returns output datasets created by job + + :type id: string + :param id: Encoded job id + + :rtype: dictionary + :returns: dictionary containing output dataset associations + """ + job = self.__get_job( trans, id ) + return self.__dictify_associations( trans, job.output_datasets, job.output_library_datasets ) + + def __dictify_associations( self, trans, *association_lists ): + rval = [] + for association_list in association_lists: + rval.extend( map( lambda a: 
self.__dictify_association( trans, a ), association_list ) ) + return rval + + def __dictify_association( self, trans, job_dataset_association ): + dataset_dict = None + dataset = job_dataset_association.dataset + if dataset: + if isinstance( dataset, model.HistoryDatasetAssociation ): + dataset_dict = dict( src="hda", id=trans.security.encode_id( dataset.id ) ) + else: + dataset_dict = dict( src="ldda", id=trans.security.encode_id( dataset.id ) ) + return dict( name=job_dataset_association.name, dataset=dataset_dict ) + + def __get_job( self, trans, id ): try: - decoded_job_id = trans.security.decode_id(id) - except: - raise exceptions.ObjectAttributeInvalidException() + decoded_job_id = trans.security.decode_id( id ) + except Exception: + raise exceptions.MalformedId() query = trans.sa_session.query( trans.app.model.Job ).filter( trans.app.model.Job.user == trans.user, trans.app.model.Job.id == decoded_job_id @@ -102,7 +155,7 @@ job = query.first() if job is None: raise exceptions.ObjectNotFound() - return self.encode_all_ids( trans, job.to_dict( 'element' ), True ) + return job @expose_api def create( self, trans, payload, **kwd ): diff -r 74b6e23ed7882f3c091d2b66ce85025241372017 -r 8e6cda4c1b3d1685a8ce07412dc542f34eb8b44b lib/galaxy/webapps/galaxy/api/provenance.py --- a/lib/galaxy/webapps/galaxy/api/provenance.py +++ b/lib/galaxy/webapps/galaxy/api/provenance.py @@ -46,6 +46,7 @@ return { "id": trans.security.encode_id(item.id), "uuid": ( lambda uuid: str( uuid ) if uuid else None )( item.dataset.uuid), + "job_id": trans.security.encode_id( job.id ), "tool_id": job.tool_id, "parameters": self._get_job_record(trans, job, follow), "stderr": job.stderr, diff -r 74b6e23ed7882f3c091d2b66ce85025241372017 -r 8e6cda4c1b3d1685a8ce07412dc542f34eb8b44b lib/galaxy/webapps/galaxy/buildapp.py --- a/lib/galaxy/webapps/galaxy/buildapp.py +++ b/lib/galaxy/webapps/galaxy/buildapp.py @@ -281,6 +281,8 @@ 'jobs', path_prefix='/api' ) webapp.mapper.connect( 'job_search', '/api/jobs/search', controller='jobs', action='search', conditions=dict( method=['POST'] ) ) + webapp.mapper.connect( 'job_inputs', '/api/jobs/{id}/inputs', controller='jobs', action='inputs', conditions=dict( method=['GET'] ) ) + webapp.mapper.connect( 'job_outputs', '/api/jobs/{id}/outputs', controller='jobs', action='outputs', conditions=dict( method=['GET'] ) ) # Job files controllers. Only for consumption by remote job runners. webapp.mapper.resource( 'file', diff -r 74b6e23ed7882f3c091d2b66ce85025241372017 -r 8e6cda4c1b3d1685a8ce07412dc542f34eb8b44b test/base/interactor.py --- a/test/base/interactor.py +++ b/test/base/interactor.py @@ -44,16 +44,44 @@ self.api_key = self.__get_user_key( twill_test_case.user_api_key, twill_test_case.master_api_key, test_user=test_user ) self.uploads = {} - def verify_output( self, history_id, output_data, outfile, attributes, shed_tool_id, maxseconds ): + def verify_output( self, history_id, output_data, output_testdef, shed_tool_id, maxseconds ): + outfile = output_testdef.outfile + attributes = output_testdef.attributes + name = output_testdef.name self.wait_for_history( history_id, maxseconds ) hid = self.__output_id( output_data ) fetcher = self.__dataset_fetcher( history_id ) ## TODO: Twill version verifys dataset is 'ok' in here. 
self.twill_test_case.verify_hid( outfile, hda_id=hid, attributes=attributes, dataset_fetcher=fetcher, shed_tool_id=shed_tool_id ) + + primary_datasets = attributes.get( 'primary_datasets', {} ) + if primary_datasets: + job_id = self._dataset_provenance( history_id, hid )[ "job_id" ] + outputs = self._get( "jobs/%s/outputs" % ( job_id ) ).json() + + for designation, ( primary_outfile, primary_attributes ) in primary_datasets.iteritems(): + primary_output = None + for output in outputs: + if output[ "name" ] == '__new_primary_file_%s|%s__' % ( name, designation ): + primary_output = output + break + + if not primary_output: + msg_template = "Failed to find primary dataset with designation [%s] for output with name [%s]" + msg_args = ( designation, name ) + raise Exception( msg_template % msg_args ) + + primary_hda_id = primary_output[ "dataset" ][ "id" ] + self.twill_test_case.verify_hid( primary_outfile, hda_id=primary_hda_id, attributes=primary_attributes, dataset_fetcher=fetcher, shed_tool_id=shed_tool_id ) + self._verify_metadata( history_id, primary_hda_id, primary_attributes ) + + self._verify_metadata( history_id, hid, attributes ) + + def _verify_metadata( self, history_id, hid, attributes ): metadata = attributes.get( 'metadata', {} ).copy() for key, value in metadata.copy().iteritems(): new_key = "metadata_%s" % key - metadata[ new_key ] = metadata[ key ] + metadata[ new_key ] = metadata[ key ] del metadata[ key ] expected_file_type = attributes.get( 'ftype', None ) if expected_file_type: @@ -319,7 +347,10 @@ def __init__( self, twill_test_case ): self.twill_test_case = twill_test_case - def verify_output( self, history, output_data, outfile, attributes, shed_tool_id, maxseconds ): + def verify_output( self, history, output_data, output_testdef, shed_tool_id, maxseconds ): + outfile = output_testdef.outfile + attributes = output_testdef.attributes + hid = output_data.get( 'hid' ) self.twill_test_case.verify_dataset_correctness( outfile, hid=hid, attributes=attributes, shed_tool_id=shed_tool_id, maxseconds=maxseconds ) diff -r 74b6e23ed7882f3c091d2b66ce85025241372017 -r 8e6cda4c1b3d1685a8ce07412dc542f34eb8b44b test/functional/test_toolbox.py --- a/test/functional/test_toolbox.py +++ b/test/functional/test_toolbox.py @@ -3,6 +3,7 @@ from base.twilltestcase import TwillTestCase from base.interactor import build_interactor, stage_data_in_history from galaxy.tools import DataManagerTool +from galaxy.util import bunch import logging log = logging.getLogger( __name__ ) @@ -52,6 +53,7 @@ for output_index, output_tuple in enumerate(testdef.outputs): # Get the correct hid name, outfile, attributes = output_tuple + output_testdef = bunch.Bunch( name=name, outfile=outfile, attributes=attributes ) try: output_data = data_list[ name ] except (TypeError, KeyError): @@ -64,7 +66,7 @@ output_data = data_list[ len(data_list) - len(testdef.outputs) + output_index ] self.assertTrue( output_data is not None ) try: - galaxy_interactor.verify_output( history, output_data, outfile, attributes=attributes, shed_tool_id=shed_tool_id, maxseconds=maxseconds ) + galaxy_interactor.verify_output( history, output_data, output_testdef=output_testdef, shed_tool_id=shed_tool_id, maxseconds=maxseconds ) except Exception: for stream in ['stdout', 'stderr']: stream_output = galaxy_interactor.get_job_stream( history, output_data, stream=stream ) diff -r 74b6e23ed7882f3c091d2b66ce85025241372017 -r 8e6cda4c1b3d1685a8ce07412dc542f34eb8b44b test/functional/tools/multi_output.xml --- a/test/functional/tools/multi_output.xml 
+++ b/test/functional/tools/multi_output.xml @@ -1,7 +1,7 @@ <tool id="multi_output" name="Multi_Output" description="multi_output" force_history_refresh="True" version="0.1.0"><command> echo "Hello" > $report; - echo "World" > '${__new_file_path__}/primary_${report.id}_moo_visible_?' + echo "World Contents" > '${__new_file_path__}/primary_${report.id}_world_visible_?' </command><inputs><param name="input" type="integer" value="7" /> @@ -16,6 +16,11 @@ <assert_contents><has_line line="Hello" /></assert_contents> + <discovered_dataset designation="world"> + <assert_contents> + <has_line line="World Contents" /> + </assert_contents> + </discovered_dataset></output></test></tests> diff -r 74b6e23ed7882f3c091d2b66ce85025241372017 -r 8e6cda4c1b3d1685a8ce07412dc542f34eb8b44b test/functional/tools/multi_output_configured.xml --- /dev/null +++ b/test/functional/tools/multi_output_configured.xml @@ -0,0 +1,43 @@ +<tool id="multi_output_configured" name="Multi_Output_Configured" description="multi_output_configured" force_history_refresh="True" version="0.1.0"> + <command> + echo "Hello" > $report; + mkdir subdir1; + echo "This" > subdir1/this.txt; + echo "That" > subdir1/that.txt; + mkdir subdir2; + echo "1" > subdir2/CUSTOM_1.txt; + echo "2" > subdir2/CUSTOM_2.tabular; + echo "3" > subdir2/CUSTOM_3.txt; + </command> + <inputs> + <param name="input" type="integer" value="7" /> + </inputs> + <outputs> + <data format="txt" name="report"> + <discover_datasets pattern="__designation_and_ext__" directory="subdir1" /> + <discover_datasets pattern="CUSTOM_(?P<designation>.+)\.(?P<ext>.+)" directory="subdir2" /> + </data> + </outputs> + <tests> + <test> + <param name="input" value="7" /> + <output name="report"> + <assert_contents> + <has_line line="Hello" /> + </assert_contents> + <discovered_dataset designation="this" ftype="txt"> + <assert_contents><has_line line="This" /></assert_contents> + </discovered_dataset> + <discovered_dataset designation="that" ftype="txt"> + <assert_contents><has_line line="That" /></assert_contents> + </discovered_dataset> + <discovered_dataset designation="1" ftype="txt"> + <assert_contents><has_line line="1" /></assert_contents> + </discovered_dataset> + <discovered_dataset designation="2" ftype="tabular"> + <assert_contents><has_line line="2" /></assert_contents> + </discovered_dataset> + </output> + </test> + </tests> +</tool> diff -r 74b6e23ed7882f3c091d2b66ce85025241372017 -r 8e6cda4c1b3d1685a8ce07412dc542f34eb8b44b test/functional/tools/samples_tool_conf.xml --- a/test/functional/tools/samples_tool_conf.xml +++ b/test/functional/tools/samples_tool_conf.xml @@ -8,6 +8,7 @@ <tool file="multi_page.xml"/><tool file="multi_select.xml" /><tool file="multi_output.xml" /> + <tool file="multi_output_configured.xml" /><tool file="composite_output.xml" /><tool file="metadata.xml" /><tool file="output_order.xml" /> diff -r 74b6e23ed7882f3c091d2b66ce85025241372017 -r 8e6cda4c1b3d1685a8ce07412dc542f34eb8b44b test/functional/workflow.py --- a/test/functional/workflow.py +++ b/test/functional/workflow.py @@ -4,6 +4,7 @@ from base.interactor import GalaxyInteractorApi, stage_data_in_history from galaxy.util import parse_xml +from galaxy.util import bunch from galaxy.tools.test import parse_param_elem, require_file, test_data_iter, parse_output_elems from json import load, dumps @@ -66,10 +67,11 @@ for expected_output_def in workflow_test.outputs: # Get the correct hid name, outfile, attributes = expected_output_def + output_testdef = bunch.Bunch( name=name, outfile=outfile, 
attributes=attributes ) output_data = outputs[ int( name ) ] try: - galaxy_interactor.verify_output( test_history, output_data, outfile, attributes=attributes, shed_tool_id=None, maxseconds=maxseconds ) + galaxy_interactor.verify_output( test_history, output_data, output_testdef=output_testdef, shed_tool_id=None, maxseconds=maxseconds ) except Exception: for stream in ['stdout', 'stderr']: stream_output = galaxy_interactor.get_job_stream( test_history, output_data, stream=stream ) diff -r 74b6e23ed7882f3c091d2b66ce85025241372017 -r 8e6cda4c1b3d1685a8ce07412dc542f34eb8b44b test/unit/tools/test_collect_primary_datasets.py --- a/test/unit/tools/test_collect_primary_datasets.py +++ b/test/unit/tools/test_collect_primary_datasets.py @@ -5,6 +5,8 @@ import tools_support from galaxy import model +from galaxy import util +from galaxy.tools.parameters import output_collect DEFAULT_TOOL_OUTPUT = "out1" DEFAULT_EXTRA_NAME = "test1" @@ -114,6 +116,75 @@ extra_job_assoc = filter( lambda job_assoc: job_assoc.name.startswith( "__" ), self.job.output_datasets )[ 0 ] assert extra_job_assoc.name == "__new_primary_file_out1|test1__" + def test_pattern_override_designation( self ): + self._replace_output_collectors( '''<output><discover_datasets pattern="__designation__" directory="subdir" ext="txt" /></output>''' ) + self._setup_extra_file( subdir="subdir", filename="foo.txt" ) + primary_outputs = self._collect( )[ DEFAULT_TOOL_OUTPUT ] + assert len( primary_outputs ) == 1 + created_hda = primary_outputs.values()[ 0 ] + assert "foo.txt" in created_hda.name + assert created_hda.ext == "txt" + + def test_name_and_ext_pattern( self ): + self._replace_output_collectors( '''<output><discover_datasets pattern="__name_and_ext__" directory="subdir" /></output>''' ) + self._setup_extra_file( subdir="subdir", filename="foo1.txt" ) + self._setup_extra_file( subdir="subdir", filename="foo2.tabular" ) + primary_outputs = self._collect( )[ DEFAULT_TOOL_OUTPUT ] + assert len( primary_outputs ) == 2 + assert primary_outputs[ "foo1" ].ext == "txt" + assert primary_outputs[ "foo2" ].ext == "tabular" + + def test_custom_pattern( self ): + # Hypothetical oral metagenomic classifier that populates a directory + # of files based on name and genome. Use custom regex pattern to grab + # and classify these files. + self._replace_output_collectors( '''<output><discover_datasets pattern="(?P<designation>.*)__(?P<dbkey>.*).fasta" directory="genome_breakdown" ext="fasta" /></output>''' ) + self._setup_extra_file( subdir="genome_breakdown", filename="samp1__hg19.fasta" ) + self._setup_extra_file( subdir="genome_breakdown", filename="samp2__lactLact.fasta" ) + self._setup_extra_file( subdir="genome_breakdown", filename="samp3__hg19.fasta" ) + self._setup_extra_file( subdir="genome_breakdown", filename="samp4__lactPlan.fasta" ) + self._setup_extra_file( subdir="genome_breakdown", filename="samp5__fusoNucl.fasta" ) + + # Put a file in directory we don't care about, just to make sure + # it doesn't get picked up by pattern. 
+ self._setup_extra_file( subdir="genome_breakdown", filename="overview.txt" ) + + primary_outputs = self._collect( )[ DEFAULT_TOOL_OUTPUT ] + assert len( primary_outputs ) == 5 + genomes = dict( samp1="hg19", samp2="lactLact", samp3="hg19", samp4="lactPlan", samp5="fusoNucl" ) + for key, hda in primary_outputs.iteritems(): + assert hda.dbkey == genomes[ key ] + + def test_name_versus_designation( self ): + """ This test demonstrates the difference between name and desgination + in grouping patterns and named patterns such as __designation__, + __name__, __designation_and_ext__, and __name_and_ext__. + """ + self._replace_output_collectors( '''<output> + <discover_datasets pattern="__name_and_ext__" directory="subdir_for_name_discovery" /> + <discover_datasets pattern="__designation_and_ext__" directory="subdir_for_designation_discovery" /> + </output>''') + self._setup_extra_file( subdir="subdir_for_name_discovery", filename="example1.txt" ) + self._setup_extra_file( subdir="subdir_for_designation_discovery", filename="example2.txt" ) + primary_outputs = self._collect( )[ DEFAULT_TOOL_OUTPUT ] + name_output = primary_outputs[ "example1" ] + designation_output = primary_outputs[ "example2" ] + # While name is also used for designation, designation is not the name - + # it is used in the calculation of the name however... + assert name_output.name == "example1" + assert designation_output.name == "%s (%s)" % ( self.hda.name, "example2" ) + + def test_cannot_read_files_outside_job_directory( self ): + self._replace_output_collectors( '''<output> + <discover_datasets pattern="__name_and_ext__" directory="../../secrets" /> + </output>''') + exception_thrown = False + try: + self._collect( ) + except Exception: + exception_thrown = True + assert exception_thrown + def _collect_default_extra( self, **kwargs ): return self._collect( **kwargs )[ DEFAULT_TOOL_OUTPUT ][ DEFAULT_EXTRA_NAME ] @@ -122,6 +193,12 @@ job_working_directory = self.test_directory return self.tool.collect_primary_datasets( self.outputs, job_working_directory ) + def _replace_output_collectors( self, xml_str ): + # Rewrite tool as if it had been created with output containing + # supplied dataset_collector elem. + elem = util.parse_xml_string( xml_str ) + self.tool.outputs[ DEFAULT_TOOL_OUTPUT ].dataset_collectors = output_collect.dataset_collectors_from_elem( elem ) + def _append_job_json( self, object, output_path=None, line_type="new_primary_dataset" ): object[ "type" ] = line_type if output_path: @@ -133,7 +210,8 @@ def _setup_extra_file( self, **kwargs ): path = kwargs.get( "path", None ) - if not path: + filename = kwargs.get( "filename", None ) + if not path and not filename: name = kwargs.get( "name", DEFAULT_EXTRA_NAME ) visible = kwargs.get( "visible", "visible" ) ext = kwargs.get( "ext", "data" ) @@ -142,6 +220,13 @@ path = os.path.join( directory, "primary_%s_%s_%s_%s" % template_args ) if "dbkey" in kwargs: path = "%s_%s" % ( path, kwargs[ "dbkey" ] ) + if not path: + assert filename + subdir = kwargs.get( "subdir", "." ) + path = os.path.join( self.test_directory, subdir, filename ) + directory = os.path.dirname( path ) + if not os.path.exists( directory ): + os.makedirs( directory ) contents = kwargs.get( "contents", "test contents" ) open( path, "w" ).write( contents ) return path Repository URL: https://bitbucket.org/galaxy/galaxy-central/ -- This is a commit notification from bitbucket.org. You are receiving this because you have the service enabled, addressing the recipient of this email.
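The DatasetCollector properties at the top of this changeset fall back to collector-level defaults whenever the discover_datasets pattern does not define the corresponding named group. A minimal standalone sketch of that group-or-default behaviour, using a plain re pattern and illustrative defaults in place of a real collector instance, could look like this:

import re

# Illustrative pattern in the style of the custom-pattern unit test above;
# filenames carry both a designation and a dbkey.
pattern = re.compile( r"(?P<designation>.+)__(?P<dbkey>.+)\.fasta" )
match = pattern.match( "samp1__hg19.fasta" )

def group_or_default( match, group, default ):
    # Mirrors the try/except IndexError used by the dbkey/ext/visible properties:
    # re raises IndexError when the named group is not defined in the pattern.
    try:
        return match.group( group )
    except IndexError:
        return default

print( group_or_default( match, "designation", None ) )   # samp1
print( group_or_default( match, "dbkey", "?" ) )           # hg19
print( group_or_default( match, "ext", "data" ) )          # no such group, default "data" used

The interactor changes build on two of the API additions above: the dataset provenance response now carries a job_id, and the new /api/jobs/{id}/outputs route lists the job's output dataset associations, including runtime-discovered ("primary") datasets. A client-side sketch of that lookup follows; the base URL, API key, history and dataset ids are placeholders, the provenance route is assumed rather than shown in this diff, and the output name and designation reuse the multi_output.xml test tool:

import requests

base_url = "http://localhost:8080/api"          # placeholder Galaxy instance
params = dict( key="<api-key>" )                # placeholder API key

# 1. Provenance for the visible output now reports the job that created it.
#    The exact provenance route is assumed here, not taken from this diff.
prov_url = "%s/histories/%s/contents/%s/provenance" % ( base_url, "<history-id>", "<dataset-id>" )
job_id = requests.get( prov_url, params=params ).json()[ "job_id" ]

# 2. The new route lists every output dataset association recorded for the job.
outputs = requests.get( "%s/jobs/%s/outputs" % ( base_url, job_id ), params=params ).json()

# 3. Discovered datasets are associated to the job under
#    "__new_primary_file_<output name>|<designation>__" (see the unit test above).
wanted = "__new_primary_file_%s|%s__" % ( "report", "world" )
primary = next( ( o for o in outputs if o[ "name" ] == wanted ), None )
if primary is not None:
    # Each association is rendered as {"name": ..., "dataset": {"src": "hda"|"ldda", "id": ...}}.
    print( primary[ "dataset" ][ "id" ] )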