commit/galaxy-central: dannon: Merged in jmchilton/galaxy-central-fork-1 (pull request #401)
1 new commit in galaxy-central:

https://bitbucket.org/galaxy/galaxy-central/commits/646b5877871b/
Changeset:   646b5877871b
User:        dannon
Date:        2014-06-12 17:09:04
Summary:     Merged in jmchilton/galaxy-central-fork-1 (pull request #401)

             Allow tools and deployers to specify optional Docker-based dependency resolution.

Affected #:  10 files

diff -r a19ef9da5e470d14a31ad8385f14168a14556224 -r 646b5877871b3b15c6fc6a9ecf111fceb23ea270 job_conf.xml.sample_advanced
--- a/job_conf.xml.sample_advanced
+++ b/job_conf.xml.sample_advanced
@@ -97,6 +97,90 @@
              elements. This empty block will simply disable job metrics for the
              corresponding destination. -->
         </destination>
+        <destination id="docker_local" runner="local">
+          <param id="docker_enabled">true</param>
+          <!-- docker_volumes can be used to configure volumes to expose to docker.
+               For added isolation append :ro to the path to mount it read only.
+               Galaxy will attempt to infer a reasonable set of defaults - which
+               volumes should be exposed, and how - based on Galaxy's settings
+               and the destination, but be sure to add any library paths or data
+               indices that may be needed read-only.
+          -->
+          <!--
+          <param id="docker_volumes">$defaults,/mnt/galaxyData/libraries:ro,/mnt/galaxyData/indices:ro</param>
+          -->
+          <!-- For a stock Galaxy instance and traditional job runner $defaults will
+               expand out as:
+
+                 $galaxy_root:ro,$tool_directory:ro,$working_directory:rw,$default_file_path:rw
+
+               This assumes most of what is needed is available under Galaxy's root
+               directory, the tool directory, and Galaxy's file_path (if using the
+               object store creatively you will definitely need to expand the defaults).
+
+               This configuration allows any docker instance to write to any Galaxy
+               file - for greater isolation set outputs_to_working_directory in
+               universe_wsgi.ini. This will cause $defaults to allow writing to much
+               less. It will then expand as follows:
+
+                 $galaxy_root:ro,$tool_directory:ro,$working_directory:rw,$default_file_path:ro
+
+               If using the LWR, the defaults will be even further restricted because
+               the LWR will (by default) stage all needed inputs into the job's
+               job_directory (so there is no need to allow the docker container to
+               read all the files - let alone write over them). The defaults in this
+               case become:
+
+                 $job_directory:ro,$tool_directory:ro,$job_directory/outputs:rw,$working_directory:rw
+
+               Python's string.Template is used to expand volumes, and the values
+               $defaults, $galaxy_root, $default_file_path, $tool_directory, and
+               $working_directory are available to all jobs; $job_directory is also
+               available for LWR jobs.
+          -->
+          <!-- Control the memory allocatable by the docker container with the
+               following option:
+          -->
+          <!-- <param id="docker_memory">24G</param> -->
+          <!-- By default Docker will need to be runnable by Galaxy using
+               password-less sudo - this can be configured by adding the
+               following line to the sudoers file of all compute nodes
+               with docker enabled:
+
+                 galaxy  ALL = (root) NOPASSWD: SETENV: /usr/bin/docker
+
+               The following option can be set to false to disable sudo (docker
+               must likewise be configured to allow this).
+          -->
+          <!-- <param id="docker_sudo">false</param> -->
+          <!-- The following option can be used to tweak the sudo command used by
+               default. -->
+          <!-- <param id="docker_sudo_cmd">/usr/bin/sudo --extra_param</param> -->
+          <!-- By default, the docker container will not have any networking
+               enabled. Host networking can be bridged by uncommenting the next
+               option:
+               http://docs.docker.io/reference/run/#network-settings
+          -->
+          <!-- <param id="docker_net">bridge</param> -->
+          <!-- The following option can be used to tweak the docker command. -->
+          <!-- <param id="docker_cmd">/usr/local/custom_docker/docker</param> -->
+          <!-- The following option can be used to connect to the docker server in
+               different ways (it is translated as the -H argument to the docker
+               client). -->
+          <!-- <param id="docker_host">unix:///var/run/docker.sock</param> -->
+          <!-- <param id="docker_host">:5555</param> -->
+          <!-- <param id="docker_host">tcp://127.0.0.1:4243</param> -->
+
+          <!-- If the deployer wants to use docker for isolation, but does not
+               trust the tool's specified container - a destination-wide override
+               can be set. This will cause all jobs on this destination to use
+               that docker image. -->
+          <!-- <param id="docker_container_id_override">busybox:ubuntu-14.04</param> -->
+
+          <!-- Likewise, if the deployer wants to use docker for isolation and
+               does trust the tool's specified container - but also wants tools
+               not configured to run in a container to have a fallback - the
+               following option can provide a default. -->
+          <!-- <param id="docker_default_container_id">busybox:ubuntu-14.04</param> -->
+
+        </destination>
         <destination id="pbs" runner="pbs" tags="mycluster"/>
         <destination id="pbs_longjobs" runner="pbs" tags="mycluster,longjobs">
             <!-- Define parameters that are native to the job runner plugin. -->
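The $defaults expansion described in the comments above is plain Python string.Template substitution, performed by the new containers module later in this diff. A minimal standalone sketch of the two-pass expansion, with hypothetical paths:

    import string

    # Hypothetical values normally derived from Galaxy's config and the job.
    variables = dict(
        galaxy_root="/srv/galaxy",
        tool_directory="/srv/galaxy/tools/filters",
        working_directory="/srv/galaxy/database/job_working_directory/000/1",
        default_file_path="/srv/galaxy/database/files",
    )

    # First pass: expand $defaults itself (stock Galaxy, traditional runner,
    # so default_file_path stays writable).
    defaults = "$galaxy_root:ro,$tool_directory:ro,$working_directory:rw,$default_file_path:rw"
    variables["defaults"] = string.Template(defaults).safe_substitute(variables)

    # Second pass: expand the deployer-supplied docker_volumes value.
    volumes_raw = "$defaults,/mnt/galaxyData/indices:ro"
    print string.Template(volumes_raw).safe_substitute(variables)
    # /srv/galaxy:ro,/srv/galaxy/tools/filters:ro,
    #   /srv/galaxy/database/job_working_directory/000/1:rw,
    #   /srv/galaxy/database/files:rw,/mnt/galaxyData/indices:ro  (one line)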
diff -r a19ef9da5e470d14a31ad8385f14168a14556224 -r 646b5877871b3b15c6fc6a9ecf111fceb23ea270 lib/galaxy/config.py
--- a/lib/galaxy/config.py
+++ b/lib/galaxy/config.py
@@ -582,6 +582,16 @@
         import galaxy.tools.search
         self.toolbox_search = galaxy.tools.search.ToolBoxSearch( self.toolbox )

+        from galaxy.tools.deps import containers
+        galaxy_root_dir = os.path.abspath(self.config.root)
+        file_path = os.path.abspath(getattr(self.config, "file_path"))
+        app_info = containers.AppInfo(
+            galaxy_root_dir,
+            default_file_path=file_path,
+            outputs_to_working_directory=self.config.outputs_to_working_directory
+        )
+        self.container_finder = containers.ContainerFinder(app_info)
+
     def _configure_tool_data_tables( self, from_shed_config ):
         from galaxy.tools.data import ToolDataTableManager
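A rough, self-contained sketch of how this app-level container_finder gets used at job-dispatch time (ToolInfo, JobInfo, and ContainerDescription are defined later in this diff; all literal values are hypothetical):

    from galaxy.tools.deps import containers
    from galaxy.tools.deps.requirements import ContainerDescription

    # Hypothetical stand-ins for a loaded tool and a job_conf destination.
    tool_containers = [ContainerDescription("busybox:ubuntu-14.04")]
    destination_info = {"docker_enabled": "true", "docker_volumes": "$defaults"}

    app_info = containers.AppInfo(
        "/srv/galaxy",
        default_file_path="/srv/galaxy/database/files",
        outputs_to_working_directory=False,
    )
    finder = containers.ContainerFinder(app_info)

    tool_info = containers.ToolInfo(tool_containers, requirements=[])
    job_info = containers.JobInfo(
        "/srv/galaxy/database/job_working_directory/000/1",
        "/srv/galaxy/tools/filters",
        None,  # no job_directory for traditional (non-LWR) runners
    )

    container = finder.find_container(tool_info, destination_info, job_info)
    if container:  # NULL_CONTAINER is falsy
        print container.containerize_command(
            "/srv/galaxy/database/job_working_directory/000/1/container.sh"
        )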
diff -r a19ef9da5e470d14a31ad8385f14168a14556224 -r 646b5877871b3b15c6fc6a9ecf111fceb23ea270 lib/galaxy/jobs/command_factory.py
--- a/lib/galaxy/jobs/command_factory.py
+++ b/lib/galaxy/jobs/command_factory.py
@@ -1,4 +1,6 @@
 from os import getcwd
+from os import chmod
+from os.path import join
 from os.path import abspath

 CAPTURE_RETURN_CODE = "return_code=$?"
@@ -8,7 +10,14 @@
 log = getLogger( __name__ )

-def build_command( runner, job_wrapper, include_metadata=False, include_work_dir_outputs=True, remote_command_params={} ):
+def build_command(
+    runner,
+    job_wrapper,
+    container=None,
+    include_metadata=False,
+    include_work_dir_outputs=True,
+    remote_command_params={}
+):
     """
     Compose the sequence of commands necessary to execute a job. This will
     currently include:
@@ -29,7 +38,35 @@
     __handle_version_command(commands_builder, job_wrapper)
     __handle_task_splitting(commands_builder, job_wrapper)
-    __handle_dependency_resolution(commands_builder, job_wrapper, remote_command_params)
+
+    # One could imagine also allowing dependency resolution inside of the
+    # container, but that is too sophisticated for a first crack at this -
+    # build your containers ready to go!
+    if not container:
+        __handle_dependency_resolution(commands_builder, job_wrapper, remote_command_params)
+
+    if container:
+        # Stop now and build the command before handling metadata and copying
+        # working directory files back. These should always happen outside of
+        # the docker container - there are no security implications when
+        # generating metadata, it means Galaxy need not be available to the
+        # container, and not copying workdir outputs back means one can be
+        # more restrictive about where the container can write in some
+        # circumstances.
+        local_container_script = join( job_wrapper.working_directory, "container.sh" )
+        fh = open( local_container_script, "w" )
+        fh.write( "#!/bin/sh\n%s" % commands_builder.build() )
+        fh.close()
+        chmod( local_container_script, 0755 )
+
+        compute_container_script = local_container_script
+        if 'working_directory' in remote_command_params:
+            compute_container_script = "/bin/sh %s" % join(remote_command_params['working_directory'], "container.sh")
+
+        run_in_container_command = container.containerize_command(
+            compute_container_script
+        )
+        commands_builder = CommandsBuilder( run_in_container_command )

     if include_work_dir_outputs:
         __handle_work_dir_outputs(commands_builder, job_wrapper, runner, remote_command_params)
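To make the flow concrete: the assembled tool commands are frozen into a container.sh script, and the job's command line collapses to a single docker invocation of that script. A minimal sketch under hypothetical inputs:

    from os import chmod
    from os.path import join
    from tempfile import mkdtemp

    # Hypothetical stand-ins for the job's assembled command and working dir.
    working_directory = mkdtemp()
    tool_commands = 'cat "/data/input.dat" > output.dat; return_code=$?; exit $return_code'

    # build_command freezes the tool command into a script...
    local_container_script = join(working_directory, "container.sh")
    fh = open(local_container_script, "w")
    fh.write("#!/bin/sh\n%s" % tool_commands)
    fh.close()
    chmod(local_container_script, 0755)

    # ...and replaces the job command line with one docker run, roughly:
    #   sudo docker run -e "GALAXY_SLOTS=$GALAXY_SLOTS" -v <path>:<path>:ro ...
    #     -w <working_directory> --net none busybox:ubuntu-14.04
    #     <working_directory>/container.sh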
diff -r a19ef9da5e470d14a31ad8385f14168a14556224 -r 646b5877871b3b15c6fc6a9ecf111fceb23ea270 lib/galaxy/jobs/runners/__init__.py
--- a/lib/galaxy/jobs/runners/__init__.py
+++ b/lib/galaxy/jobs/runners/__init__.py
@@ -177,7 +177,16 @@
         raise NotImplementedError()

     def build_command_line( self, job_wrapper, include_metadata=False, include_work_dir_outputs=True ):
-        return build_command( self, job_wrapper, include_metadata=include_metadata, include_work_dir_outputs=include_work_dir_outputs )
+        # TODO: Eliminate extra kwds no longer used (since the LWR skips this
+        # abstraction and calls build_command directly).
+        container = self._find_container( job_wrapper )
+        return build_command(
+            self,
+            job_wrapper,
+            include_metadata=include_metadata,
+            include_work_dir_outputs=include_work_dir_outputs,
+            container=container
+        )

     def get_work_dir_outputs( self, job_wrapper, job_working_directory=None ):
         """
@@ -276,6 +285,31 @@
             if ajs.job_wrapper.get_state() != model.Job.states.DELETED:
                 self.work_queue.put( ( self.finish_job, ajs ) )

+    def _find_container(
+        self,
+        job_wrapper,
+        compute_working_directory=None,
+        compute_tool_directory=None,
+        compute_job_directory=None
+    ):
+        if not compute_working_directory:
+            compute_working_directory = job_wrapper.working_directory
+
+        if not compute_tool_directory:
+            compute_tool_directory = job_wrapper.tool.tool_dir
+
+        tool = job_wrapper.tool
+        from galaxy.tools.deps import containers
+        tool_info = containers.ToolInfo(tool.containers, tool.requirements)
+        job_info = containers.JobInfo(compute_working_directory, compute_tool_directory, compute_job_directory)
+
+        destination_info = job_wrapper.job_destination.params
+        return self.app.container_finder.find_container(
+            tool_info,
+            destination_info,
+            job_info
+        )
+

 class JobState( object ):
     """
diff -r a19ef9da5e470d14a31ad8385f14168a14556224 -r 646b5877871b3b15c6fc6a9ecf111fceb23ea270 lib/galaxy/jobs/runners/lwr.py
--- a/lib/galaxy/jobs/runners/lwr.py
+++ b/lib/galaxy/jobs/runners/lwr.py
@@ -233,9 +233,21 @@
                 metadata_kwds=metadata_kwds,
                 dependency_resolution=dependency_resolution,
             )
+            remote_working_directory = remote_job_config['working_directory']
+            # TODO: The following definitions work for the LWR (and always
+            # have), but they should be calculated at some other level.
+            remote_job_directory = os.path.abspath(os.path.join(remote_working_directory, os.path.pardir))
+            remote_tool_directory = os.path.abspath(os.path.join(remote_job_directory, "tool_files"))
+            container = self._find_container(
+                job_wrapper,
+                compute_working_directory=remote_working_directory,
+                compute_tool_directory=remote_tool_directory,
+                compute_job_directory=remote_job_directory,
+            )
             command_line = build_command(
                 self,
                 job_wrapper=job_wrapper,
+                container=container,
                 include_metadata=remote_metadata,
                 include_work_dir_outputs=False,
                 remote_command_params=remote_command_params,
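For clarity, the directory derivation above under the LWR's default staging layout (the staging path itself is hypothetical):

    import os

    remote_working_directory = "/lwr/staging/000/1/working"

    # The job directory is the parent of the working directory, and the LWR
    # stages tool files into <job_directory>/tool_files.
    remote_job_directory = os.path.abspath(os.path.join(remote_working_directory, os.path.pardir))
    remote_tool_directory = os.path.abspath(os.path.join(remote_job_directory, "tool_files"))

    print remote_job_directory    # /lwr/staging/000/1
    print remote_tool_directory   # /lwr/staging/000/1/tool_files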
diff -r a19ef9da5e470d14a31ad8385f14168a14556224 -r 646b5877871b3b15c6fc6a9ecf111fceb23ea270 lib/galaxy/tools/__init__.py
--- a/lib/galaxy/tools/__init__.py
+++ b/lib/galaxy/tools/__init__.py
@@ -1346,7 +1346,9 @@
         self.__tests_populated = False

         # Requirements (dependencies)
-        self.requirements = parse_requirements_from_xml( root )
+        requirements, containers = parse_requirements_from_xml( root )
+        self.requirements = requirements
+        self.containers = containers
         # Determine if this tool can be used in workflows
         self.is_workflow_compatible = self.check_workflow_compatible(root)
         # Trackster configuration.
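parse_requirements_from_xml now returns a (requirements, containers) pair; the container half is parsed in requirements.py at the end of this diff. A sketch of what a tool annotated with a container tag yields (the package requirement here is hypothetical, added just to show both halves):

    from xml.etree.ElementTree import fromstring

    from galaxy.tools.deps.requirements import parse_requirements_from_xml

    root = fromstring("""
    <tool id="catdc" name="Concatenate datasets (in docker)">
      <requirements>
        <requirement type="package" version="8.22">coreutils</requirement>
        <container type="docker">busybox:ubuntu-14.04</container>
      </requirements>
    </tool>
    """)

    requirements, containers = parse_requirements_from_xml(root)
    print requirements[0].name, requirements[0].version  # coreutils 8.22
    print containers[0].type, containers[0].identifier   # docker busybox:ubuntu-14.04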
diff -r a19ef9da5e470d14a31ad8385f14168a14556224 -r 646b5877871b3b15c6fc6a9ecf111fceb23ea270 lib/galaxy/tools/deps/containers.py
--- /dev/null
+++ b/lib/galaxy/tools/deps/containers.py
@@ -0,0 +1,246 @@
+from abc import (
+    ABCMeta,
+    abstractmethod
+)
+import os
+import string
+
+from galaxy.util import asbool
+from ..deps import docker_util
+
+import logging
+log = logging.getLogger(__name__)
+
+DEFAULT_CONTAINER_TYPE = "docker"
+
+
+class ContainerFinder(object):
+
+    def __init__(self, app_info):
+        self.app_info = app_info
+        self.container_registry = ContainerRegistry()
+
+    def find_container(self, tool_info, destination_info, job_info):
+        def __destination_container(container_description=None, container_id=None, container_type=None):
+            if container_description:
+                container_id = container_description.identifier
+                container_type = container_description.type
+            container = self.__destination_container(
+                container_id,
+                container_type,
+                tool_info,
+                destination_info,
+                job_info
+            )
+            return container
+
+        # If the destination is forcing Galaxy to use a particular container,
+        # do that. This is likely kind of a corner case - for instance, when
+        # deployers do not trust the containers annotated in tools.
+        for container_type in CONTAINER_CLASSES.keys():
+            container_id = self.__overridden_container_id(container_type, destination_info)
+            if container_id:
+                container = __destination_container(container_type=container_type, container_id=container_id)
+                if container:
+                    return container
+
+        # Otherwise, let's see if we can find a container for the tool.
+
+        # Exact matches first from explicitly listed containers in tools...
+        for container_description in tool_info.container_descriptions:
+            container = __destination_container(container_description)
+            if container:
+                return container
+
+        # Implement the vague concept of looping through all containers
+        # matching the tool's requirements. The exact details still need to
+        # be worked through, but hopefully the idea that this sits below
+        # find_container - so components external to this module don't need
+        # to worry about it - is good enough.
+        container_descriptions = self.container_registry.container_descriptions_for_requirements(tool_info.requirements)
+        for container_description in container_descriptions:
+            container = __destination_container(container_description)
+            if container:
+                return container
+
+        # If we still don't have a container, check to see if any container
+        # types define a default container id and use that.
+        for container_type in CONTAINER_CLASSES.keys():
+            container_id = self.__default_container_id(container_type, destination_info)
+            if container_id:
+                container = __destination_container(container_type=container_type, container_id=container_id)
+                if container:
+                    return container
+
+        return NULL_CONTAINER
+
+    def __overridden_container_id(self, container_type, destination_info):
+        if not self.__container_type_enabled(container_type, destination_info):
+            return None
+        return destination_info.get("%s_container_id_override" % container_type)
+
+    def __default_container_id(self, container_type, destination_info):
+        if not self.__container_type_enabled(container_type, destination_info):
+            return None
+        return destination_info.get("%s_default_container_id" % container_type)
+
+    def __destination_container(self, container_id, container_type, tool_info, destination_info, job_info):
+        # TODO: ensure destination_info is dict-like
+        if not self.__container_type_enabled(container_type, destination_info):
+            return NULL_CONTAINER
+
+        # TODO: Right now this assumes all containers are available when a
+        # container type is - there should be more thought put into this:
+        # checking which are available, setting policies for what can be
+        # auto-fetched, etc....
+        return CONTAINER_CLASSES[container_type](container_id, self.app_info, tool_info, destination_info, job_info)
+
+    def __container_type_enabled(self, container_type, destination_info):
+        return asbool(destination_info.get("%s_enabled" % container_type, False))
+
+
+class ContainerRegistry(object):
+
+    def __init__(self):
+        pass
+
+    def container_descriptions_for_requirements(self, requirements):
+        # Return lists of containers that would match the requirements...
+        return []
+
+
+class AppInfo(object):
+
+    def __init__(self, galaxy_root_dir=None, default_file_path=None, outputs_to_working_directory=False):
+        self.galaxy_root_dir = galaxy_root_dir
+        self.default_file_path = default_file_path
+        # TODO: Vary default value for docker_volumes based on this...
+        self.outputs_to_working_directory = outputs_to_working_directory
+
+
+class ToolInfo(object):
+    # TODO: Introduce tool XML syntax to annotate the optional environment
+    # variables tools can consume (e.g. JVM options, license keys, etc..)
+    # and add these to env_pass_through.
+
+    def __init__(self, container_descriptions=[], requirements=[]):
+        self.container_descriptions = container_descriptions
+        self.requirements = requirements
+        self.env_pass_through = ["GALAXY_SLOTS"]
+
+
+class JobInfo(object):
+
+    def __init__(self, working_directory, tool_directory, job_directory):
+        self.working_directory = working_directory
+        self.job_directory = job_directory
+        # Tool files may be staged remotely - so this is, unintuitively, a
+        # property of the job, not of the tool.
+        self.tool_directory = tool_directory
+
+
+class Container( object ):
+    __metaclass__ = ABCMeta
+
+    def __init__(self, container_id, app_info, tool_info, destination_info, job_info):
+        self.container_id = container_id
+        self.app_info = app_info
+        self.tool_info = tool_info
+        self.destination_info = destination_info
+        self.job_info = job_info
+
+    @abstractmethod
+    def containerize_command(self, command):
+        """
+        Use destination-supplied container configuration parameters,
+        container_id, and command to build a new command that runs the
+        input command in a container.
+        """
+
+
+class DockerContainer(Container):
+
+    def containerize_command(self, command):
+        def prop(name, default):
+            destination_name = "docker_%s" % name
+            return self.destination_info.get(destination_name, default)
+
+        env_directives = []
+        for pass_through_var in self.tool_info.env_pass_through:
+            env_directives.append('"%s=$%s"' % (pass_through_var, pass_through_var))
+
+        # Allow destinations to explicitly set environment variables just for
+        # the docker container. The better approach, however, is to set them
+        # for the destination and then pass through only what the tool needs.
+        # (See the TODO in ToolInfo.)
+        for key, value in self.destination_info.iteritems():
+            if key.startswith("docker_env_"):
+                env = key[len("docker_env_"):]
+                env_directives.append('"%s=%s"' % (env, value))
+
+        working_directory = self.job_info.working_directory
+        if not working_directory:
+            raise Exception("Cannot containerize command [%s] without a defined working directory." % command)
+
+        volumes_raw = self.__expand_str(self.destination_info.get("docker_volumes", "$defaults"))
+        # TODO: Remove redundant volumes...
+        volumes = docker_util.DockerVolume.volumes_from_str(volumes_raw)
+        return docker_util.build_docker_run_command(
+            command,
+            self.container_id,
+            volumes=volumes,
+            env_directives=env_directives,
+            working_directory=working_directory,
+            docker_cmd=prop("cmd", docker_util.DEFAULT_DOCKER_COMMAND),
+            sudo=asbool(prop("sudo", docker_util.DEFAULT_SUDO)),
+            sudo_cmd=prop("sudo_cmd", docker_util.DEFAULT_SUDO_COMMAND),
+            host=prop("host", docker_util.DEFAULT_HOST),
+            net=prop("net", "none")  # By default, the docker instance has networking disabled
+        )
+
+    def __expand_str(self, value):
+        template = string.Template(value)
+        variables = dict()
+
+        def add_var(name, value):
+            if value:
+                variables[name] = os.path.abspath(value)
+
+        add_var("working_directory", self.job_info.working_directory)
+        add_var("job_directory", self.job_info.job_directory)
+        add_var("tool_directory", self.job_info.tool_directory)
+        add_var("galaxy_root", self.app_info.galaxy_root_dir)
+        add_var("default_file_path", self.app_info.default_file_path)
+
+        if self.job_info.job_directory:
+            # We have a job directory, so everything needed (excluding index
+            # files) should be available in job_directory...
+            defaults = "$job_directory:ro,$tool_directory:ro,$job_directory/outputs:rw,$working_directory:rw"
+        elif self.app_info.outputs_to_working_directory:
+            # Should only need default_file_path read-only (which is a coarse
+            # estimate given object stores, anyway).
+            defaults = "$galaxy_root:ro,$tool_directory:ro,$working_directory:rw,$default_file_path:ro"
+        else:
+            defaults = "$galaxy_root:ro,$tool_directory:ro,$working_directory:rw,$default_file_path:rw"
+
+        # Define $defaults so it can easily be extended with external library
+        # and index data without the deployer worrying about the above details.
+        variables["defaults"] = string.Template(defaults).safe_substitute(variables)
+
+        return template.safe_substitute(variables)
+
+
+CONTAINER_CLASSES = dict(
+    docker=DockerContainer,
+)
+
+
+class NullContainer(object):
+
+    def __init__(self):
+        pass
+
+    def __nonzero__(self):
+        return False
+
+
+NULL_CONTAINER = NullContainer()
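A sketch of the environment pass-through behavior above: GALAXY_SLOTS is always forwarded, and any docker_env_* destination param becomes an extra -e directive. All literal values here are hypothetical:

    from galaxy.tools.deps.containers import AppInfo, DockerContainer, JobInfo, ToolInfo

    app_info = AppInfo("/srv/galaxy", default_file_path="/srv/galaxy/database/files")
    tool_info = ToolInfo()  # env_pass_through defaults to ["GALAXY_SLOTS"]
    job_info = JobInfo("/tmp/job1/working", "/srv/galaxy/tools/filters", None)

    # docker_env_* params on the destination become -e directives.
    destination_info = {
        "docker_enabled": "true",
        "docker_env_LICENSE_SERVER": "license.example.org:27000",
    }
    container = DockerContainer("busybox:ubuntu-14.04", app_info, tool_info,
                                destination_info, job_info)
    print container.containerize_command("/tmp/job1/working/container.sh")
    # sudo docker run -e "GALAXY_SLOTS=$GALAXY_SLOTS"
    #   -e "LICENSE_SERVER=license.example.org:27000" -v ... -w /tmp/job1/working
    #   --net none busybox:ubuntu-14.04 /tmp/job1/working/container.sh  (one line)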
+ variables["defaults"] = string.Template(defaults).safe_substitute(variables) + + return template.safe_substitute(variables) + + +CONTAINER_CLASSES = dict( + docker=DockerContainer, +) + + +class NullContainer(object): + + def __init__(self): + pass + + def __nonzero__(self): + return False + + +NULL_CONTAINER = NullContainer() diff -r a19ef9da5e470d14a31ad8385f14168a14556224 -r 646b5877871b3b15c6fc6a9ecf111fceb23ea270 lib/galaxy/tools/deps/docker_util.py --- /dev/null +++ b/lib/galaxy/tools/deps/docker_util.py @@ -0,0 +1,91 @@ + +DEFAULT_DOCKER_COMMAND = "docker" +DEFAULT_SUDO = True +DEFAULT_SUDO_COMMAND = "sudo" +DEFAULT_HOST = None +DEFAULT_VOLUME_MOUNT_TYPE = "rw" +DEFAULT_WORKING_DIRECTORY = None +DEFAULT_NET = None +DEFAULT_MEMORY = None + + +class DockerVolume(object): + + def __init__(self, path, to_path=None, how=DEFAULT_VOLUME_MOUNT_TYPE): + self.from_path = path + self.to_path = to_path or path + if not DockerVolume.__valid_how(how): + raise ValueError("Invalid way to specify docker volume %s" % how) + self.how = how + + @staticmethod + def volumes_from_str(volumes_as_str): + if not volumes_as_str: + return [] + volume_strs = [v.strip() for v in volumes_as_str.split(",")] + return map(DockerVolume.volume_from_str, volume_strs) + + @staticmethod + def volume_from_str(as_str): + if not as_str: + raise ValueError("Failed to parse docker volume from %s" % as_str) + parts = as_str.split(":", 2) + kwds = dict(path=parts[0]) + if len(parts) == 2: + if DockerVolume.__valid_how(parts[1]): + kwds["how"] = parts[1] + else: + kwds["to_path"] = parts[1] + elif len(parts) == 3: + kwds["to_path"] = parts[1] + kwds["how"] = parts[2] + return DockerVolume(**kwds) + + @staticmethod + def __valid_how(how): + return how in ["ro", "rw"] + + def __str__(self): + return ":".join([self.from_path, self.to_path, self.how]) + + +def build_docker_run_command( + container_command, + image, + tag=None, + docker_cmd=DEFAULT_DOCKER_COMMAND, + volumes=[], + memory=DEFAULT_MEMORY, + env_directives=[], + working_directory=DEFAULT_WORKING_DIRECTORY, + sudo=DEFAULT_SUDO, + sudo_cmd=DEFAULT_SUDO_COMMAND, + name=None, + host=DEFAULT_HOST, + net=DEFAULT_NET, +): + command_parts = [] + if sudo: + command_parts.append(sudo_cmd) + command_parts.append(docker_cmd) + if host: + command_parts.append(["-H", host]) + command_parts.append("run") + for env_directive in env_directives: + command_parts.extend(["-e", env_directive]) + for volume in volumes: + command_parts.extend(["-v", str(volume)]) + if memory: + command_parts.extend(["-m", memory]) + if name: + command_parts.extend(["-name", name]) + if working_directory: + command_parts.extend(["-w", working_directory]) + if net: + command_parts.extend(["--net", net]) + full_image = image + if tag: + full_image = "%s:%s" % (full_image, tag) + command_parts.append(full_image) + command_parts.append(container_command) + return " ".join(command_parts) diff -r a19ef9da5e470d14a31ad8385f14168a14556224 -r 646b5877871b3b15c6fc6a9ecf111fceb23ea270 lib/galaxy/tools/deps/requirements.py --- a/lib/galaxy/tools/deps/requirements.py +++ b/lib/galaxy/tools/deps/requirements.py @@ -26,6 +26,25 @@ return ToolRequirement( name=name, type=type, version=version ) +DEFAULT_CONTAINER_TYPE = "docker" + + +class ContainerDescription( object ): + + def __init__( self, identifier=None, type="docker" ): + self.identifier = identifier + self.type = type + + def to_dict( self ): + return dict(identifier=self.identifier, type=self.type) + + @staticmethod + def from_dict( dict ): + identifier = 
dict["identifier"] + type = dict.get("type", DEFAULT_CONTAINER_TYPE) + return ContainerDescription( identifier=identifier, type=type ) + + def parse_requirements_from_xml( xml_root ): """ @@ -63,4 +82,15 @@ requirement = ToolRequirement( name=name, type=type, version=version ) requirements.append( requirement ) - return requirements + container_elems = [] + if requirements_elem is not None: + container_elems = requirements_elem.findall( 'container' ) + + containers = [] + for container_elem in container_elems: + identifier = xml_text( container_elem ) + type = container_elem.get( "type", DEFAULT_CONTAINER_TYPE ) + container = ContainerDescription( identifier=identifier, type=type ) + containers.append( container ) + + return requirements, containers diff -r a19ef9da5e470d14a31ad8385f14168a14556224 -r 646b5877871b3b15c6fc6a9ecf111fceb23ea270 test/functional/tools/catDocker.xml --- /dev/null +++ b/test/functional/tools/catDocker.xml @@ -0,0 +1,28 @@ +<tool id="catdc" name="Concatenate datasets (in docker)"> + <description>tail-to-head</description> + <requirements> + <container type="docker">busybox:ubuntu-14.04</container> + </requirements> + <command> + echo "Galaxy slots passed through contain as \$GALAXY_SLOTS"; + cat $input1 + #for $q in $queries + ${q.input2} + #end for + > $out_file1; + echo "Work dir output" > working_file + + </command> + <inputs> + <param name="input1" type="data" label="Concatenate Dataset"/> + <repeat name="queries" title="Dataset"> + <param name="input2" type="data" label="Select" /> + </repeat> + </inputs> + <outputs> + <data name="out_file1" format="input" metadata_source="input1"/> + <data name="out_file2" format="txt" from_work_dir="working_file" /> + </outputs> + <help> + </help> +</tool> Repository URL: https://bitbucket.org/galaxy/galaxy-central/ -- This is a commit notification from bitbucket.org. You are receiving this because you have the service enabled, addressing the recipient of this email.