1 new commit in galaxy-central: https://bitbucket.org/galaxy/galaxy-central/commits/a1dca14d5b1a/ Changeset: a1dca14d5b1a Branch: stable User: natefoo Date: 2014-11-07 20:41:27+00:00 Summary: Memory limit detection. This code is from default and is a partial graft. It includes the code for detecting if a Slurm job was killed due to exceeding memory limits but does not include resubmit functionality. Affected #: 2 files diff -r 83f821c5ecc1e0ac4bbdc6710c6e7abb7faf6afa -r a1dca14d5b1afbf2b5bde192e3e6b6763836eff8 lib/galaxy/jobs/runners/__init__.py --- a/lib/galaxy/jobs/runners/__init__.py +++ b/lib/galaxy/jobs/runners/__init__.py @@ -337,6 +337,7 @@ """ runner_states = Bunch( WALLTIME_REACHED = 'walltime_reached', + MEMORY_LIMIT_REACHED = 'memory_limit_reached', GLOBAL_WALLTIME_REACHED = 'global_walltime_reached', OUTPUT_SIZE_LIMIT = 'output_size_limit' ) diff -r 83f821c5ecc1e0ac4bbdc6710c6e7abb7faf6afa -r a1dca14d5b1afbf2b5bde192e3e6b6763836eff8 lib/galaxy/jobs/runners/slurm.py --- a/lib/galaxy/jobs/runners/slurm.py +++ b/lib/galaxy/jobs/runners/slurm.py @@ -13,6 +13,8 @@ __all__ = [ 'SlurmJobRunner' ] +SLURM_MEMORY_LIMIT_EXCEEDED_MSG = 'slurmstepd: error: Exceeded job memory limit' + class SlurmJobRunner( DRMAAJobRunner ): runner_name = "SlurmRunner" @@ -62,8 +64,14 @@ except: ajs.fail_message = "This job failed due to a cluster node failure, and an attempt to resubmit the job failed." elif job_info['JobState'] == 'CANCELLED': - log.info( '(%s/%s) Job was cancelled via slurm (e.g. with scancel(1))', ajs.job_wrapper.get_id_tag(), ajs.job_id ) - ajs.fail_message = "This job failed because it was cancelled by an administrator." + # Check to see if the job was killed for exceeding memory consumption + if self.__check_memory_limit( ajs.error_file ): + log.info( '(%s/%s) Job hit memory limit', ajs.job_wrapper.get_id_tag(), ajs.job_id ) + ajs.fail_message = "This job was terminated because it used more memory than it was allocated." + ajs.runner_state = ajs.runner_states.MEMORY_LIMIT_REACHED + else: + log.info( '(%s/%s) Job was cancelled via slurm (e.g. with scancel(1))', ajs.job_wrapper.get_id_tag(), ajs.job_id ) + ajs.fail_message = "This job failed because it was cancelled by an administrator." else: log.warning( '(%s/%s) Job failed due to unknown reasons, JobState was: %s', ajs.job_wrapper.get_id_tag(), ajs.job_id, job_info['JobState'] ) ajs.fail_message = "This job failed for reasons that could not be determined." @@ -77,3 +85,31 @@ super( SlurmJobRunner, self )._complete_terminal_job( ajs, drmaa_state = drmaa_state ) # by default, finish as if the job was successful. super( SlurmJobRunner, self )._complete_terminal_job( ajs, drmaa_state = drmaa_state ) + + def __check_memory_limit( self, efile_path ): + """ + A very poor implementation of tail, but it doesn't need to be fancy + since we are only searching the last 2K + """ + try: + log.debug( 'Checking %s for exceeded memory message from slurm', efile_path ) + with open( efile_path ) as f: + pos = 2 + bof = False + while pos < 2048: + try: + f.seek(-pos, 2) + pos += 1 + except: + f.seek(-pos + 1, 2) + bof = True + + if (bof or f.read(1) == '\n') and f.readline().strip() == SLURM_MEMORY_LIMIT_EXCEEDED_MSG: + return True + + if bof: + break + except: + log.exception('Error reading end of %s:', path) + + return False Repository URL: https://bitbucket.org/galaxy/galaxy-central/ -- This is a commit notification from bitbucket.org. You are receiving this because you have the service enabled, addressing the recipient of this email.