1 new commit in galaxy-central:
https://bitbucket.org/galaxy/galaxy-central/commits/a16007e4df83/
Changeset: a16007e4df83
Branch: stable
User: natefoo
Date: 2014-11-10 14:00:34+00:00
Summary: Update tag latest_2014.10.06 for changeset a1dca14d5b1a
Affected #: 1 file
diff -r a1dca14d5b1afbf2b5bde192e3e6b6763836eff8 -r a16007e4df83c21ff1d9218b3892fae783742fb8 .hgtags
--- a/.hgtags
+++ b/.hgtags
@@ -20,4 +20,4 @@
ca45b78adb4152fc6e7395514d46eba6b7d0b838 release_2014.08.11
548ab24667d6206780237bd807f7d857a484c461 latest_2014.08.11
2092948937ac30ef82f71463a235c66d34987088 release_2014.10.06
-9b8cb8d48a798e8905d09e380138c1102012ce6f latest_2014.10.06
+a1dca14d5b1afbf2b5bde192e3e6b6763836eff8 latest_2014.10.06
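For context on the file changed above: each line of .hgtags pairs a 40-character changeset hash with a tag name, so repointing the latest_2014.10.06 line at a1dca14d5b1a is what moves the tag. A minimal Python sketch (not part of this changeset; the function name is illustrative) of reading that format:

# Sketch (not from this changeset): parse .hgtags content into a
# {tag: changeset} mapping.  Each line is "<40-hex changeset id> <tag name>";
# a later entry for the same tag takes precedence.
def parse_hgtags( text ):
    tags = {}
    for line in text.splitlines():
        line = line.strip()
        if not line:
            continue
        node, _, tag = line.partition( ' ' )
        tags[ tag ] = node
    return tags

sample = (
    "2092948937ac30ef82f71463a235c66d34987088 release_2014.10.06\n"
    "a1dca14d5b1afbf2b5bde192e3e6b6763836eff8 latest_2014.10.06\n"
)
assert parse_hgtags( sample )[ 'latest_2014.10.06' ].startswith( 'a1dca14d5b1a' )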
Repository URL: https://bitbucket.org/galaxy/galaxy-central/
1 new commit in galaxy-central:
https://bitbucket.org/galaxy/galaxy-central/commits/a1dca14d5b1a/
Changeset: a1dca14d5b1a
Branch: stable
User: natefoo
Date: 2014-11-07 20:41:27+00:00
Summary: Memory limit detection. This code is from the default branch and is a partial
graft: it includes the code for detecting whether a Slurm job was killed for
exceeding its memory limit, but does not include the resubmit functionality.
Affected #: 2 files
diff -r 83f821c5ecc1e0ac4bbdc6710c6e7abb7faf6afa -r a1dca14d5b1afbf2b5bde192e3e6b6763836eff8 lib/galaxy/jobs/runners/__init__.py
--- a/lib/galaxy/jobs/runners/__init__.py
+++ b/lib/galaxy/jobs/runners/__init__.py
@@ -337,6 +337,7 @@
    """
    runner_states = Bunch(
        WALLTIME_REACHED = 'walltime_reached',
+        MEMORY_LIMIT_REACHED = 'memory_limit_reached',
        GLOBAL_WALLTIME_REACHED = 'global_walltime_reached',
        OUTPUT_SIZE_LIMIT = 'output_size_limit'
    )
diff -r 83f821c5ecc1e0ac4bbdc6710c6e7abb7faf6afa -r a1dca14d5b1afbf2b5bde192e3e6b6763836eff8 lib/galaxy/jobs/runners/slurm.py
--- a/lib/galaxy/jobs/runners/slurm.py
+++ b/lib/galaxy/jobs/runners/slurm.py
@@ -13,6 +13,8 @@
__all__ = [ 'SlurmJobRunner' ]
+SLURM_MEMORY_LIMIT_EXCEEDED_MSG = 'slurmstepd: error: Exceeded job memory limit'
+
class SlurmJobRunner( DRMAAJobRunner ):
    runner_name = "SlurmRunner"
@@ -62,8 +64,14 @@
                    except:
                        ajs.fail_message = "This job failed due to a cluster node failure, and an attempt to resubmit the job failed."
                elif job_info['JobState'] == 'CANCELLED':
-                    log.info( '(%s/%s) Job was cancelled via slurm (e.g. with scancel(1))', ajs.job_wrapper.get_id_tag(), ajs.job_id )
-                    ajs.fail_message = "This job failed because it was cancelled by an administrator."
+                    # Check to see if the job was killed for exceeding memory consumption
+                    if self.__check_memory_limit( ajs.error_file ):
+                        log.info( '(%s/%s) Job hit memory limit', ajs.job_wrapper.get_id_tag(), ajs.job_id )
+                        ajs.fail_message = "This job was terminated because it used more memory than it was allocated."
+                        ajs.runner_state = ajs.runner_states.MEMORY_LIMIT_REACHED
+                    else:
+                        log.info( '(%s/%s) Job was cancelled via slurm (e.g. with scancel(1))', ajs.job_wrapper.get_id_tag(), ajs.job_id )
+                        ajs.fail_message = "This job failed because it was cancelled by an administrator."
                else:
                    log.warning( '(%s/%s) Job failed due to unknown reasons, JobState was: %s', ajs.job_wrapper.get_id_tag(), ajs.job_id, job_info['JobState'] )
                    ajs.fail_message = "This job failed for reasons that could not be determined."
@@ -77,3 +85,31 @@
                super( SlurmJobRunner, self )._complete_terminal_job( ajs, drmaa_state = drmaa_state )
        # by default, finish as if the job was successful.
        super( SlurmJobRunner, self )._complete_terminal_job( ajs, drmaa_state = drmaa_state )
+
+    def __check_memory_limit( self, efile_path ):
+        """
+        A very poor implementation of tail, but it doesn't need to be fancy
+        since we are only searching the last 2K
+        """
+        try:
+            log.debug( 'Checking %s for exceeded memory message from slurm', efile_path )
+            with open( efile_path ) as f:
+                pos = 2
+                bof = False
+                while pos < 2048:
+                    try:
+                        f.seek(-pos, 2)
+                        pos += 1
+                    except:
+                        f.seek(-pos + 1, 2)
+                        bof = True
+
+                    if (bof or f.read(1) == '\n') and f.readline().strip() == SLURM_MEMORY_LIMIT_EXCEEDED_MSG:
+                        return True
+
+                    if bof:
+                        break
+        except:
+            log.exception('Error reading end of %s:', efile_path)
+
+        return False
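The grafted __check_memory_limit above walks backwards through at most the last 2K of the job's error file and compares each line it lands on against SLURM_MEMORY_LIMIT_EXCEEDED_MSG. A standalone sketch of the same idea (not part of this changeset; the function name is illustrative, and the constant is reproduced from the diff above) that reads the tail in a single pass:

SLURM_MEMORY_LIMIT_EXCEEDED_MSG = 'slurmstepd: error: Exceeded job memory limit'

# Sketch only: read at most the last 2K of the job's error file and look for
# the slurmstepd message on any of the trailing lines.
def tail_has_memory_limit_msg( efile_path, tail_bytes=2048 ):
    try:
        with open( efile_path, 'rb' ) as f:
            f.seek( 0, 2 )                               # jump to end of file
            f.seek( max( f.tell() - tail_bytes, 0 ) )    # back up at most tail_bytes
            tail = f.read().decode( 'utf-8', 'replace' )
    except ( IOError, OSError ):
        return False
    return any( line.strip() == SLURM_MEMORY_LIMIT_EXCEEDED_MSG
                for line in tail.splitlines() )

Either way, only the tail of the file is inspected, which matches the stated intent of the grafted code's docstring: the message, if present, is expected near the end of the error file.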
Repository URL: https://bitbucket.org/galaxy/galaxy-central/
1 new commit in galaxy-central:
https://bitbucket.org/galaxy/galaxy-central/commits/93723d1cf699/
Changeset: 93723d1cf699
User: dannon
Date: 2014-11-08 00:48:15+00:00
Summary: Merged in kellrott/galaxy-farm/workflow-input-fix (pull request #552)
Fixing workflow import to correctly set the uuid
Affected #: 1 file
diff -r 04644e64c498ca4eacbfa1c6df26cb546705b96c -r 93723d1cf699eab771ccde086d34bec1fd1c22e6 lib/galaxy/web/base/controller.py
--- a/lib/galaxy/web/base/controller.py
+++ b/lib/galaxy/web/base/controller.py
@@ -1596,6 +1596,8 @@
        else:
            name = data['name']
        workflow.name = name
+        if 'uuid' in data:
+            workflow.uuid = data['uuid']
        # Assume no errors until we find a step that has some
        workflow.has_errors = False
        # Create each step
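The two added lines above use the common pattern of copying an optional key from the deserialized workflow dict only when the client supplied it. A small standalone sketch of that pattern (not from this changeset; WorkflowStub and the uuid.UUID validation are illustrative additions, whereas the committed code assigns the raw value directly):

import uuid

# Hypothetical stand-in for the Galaxy model object.
class WorkflowStub( object ):
    name = None
    uuid = None

def apply_workflow_data( workflow, data ):
    workflow.name = data['name']
    if 'uuid' in data:
        # uuid.UUID() raises ValueError if the supplied value is malformed
        workflow.uuid = uuid.UUID( str( data['uuid'] ) )
    return workflow

wf = apply_workflow_data( WorkflowStub(), { 'name': 'example workflow',
                                            'uuid': '123e4567-e89b-12d3-a456-426614174000' } )
assert str( wf.uuid ) == '123e4567-e89b-12d3-a456-426614174000'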
Repository URL: https://bitbucket.org/galaxy/galaxy-central/