commit/galaxy-central: natefoo: Handle all submission failures in the drmaa runner.
1 new commit in galaxy-central:

https://bitbucket.org/galaxy/galaxy-central/commits/1bb61cbf8082/
Changeset:   1bb61cbf8082
User:        natefoo
Date:        2014-11-03 18:58:14+00:00
Summary:     Handle all submission failures in the drmaa runner.
Affected #:  1 file

diff -r 25ea95428c508c7982a3f9ff2af855eac7c62dbb -r 1bb61cbf80825b0793a61d83f01925b57467d3a9 lib/galaxy/jobs/runners/drmaa.py
--- a/lib/galaxy/jobs/runners/drmaa.py
+++ b/lib/galaxy/jobs/runners/drmaa.py
@@ -13,6 +13,7 @@
 from galaxy import eggs
 from galaxy import model
 from galaxy.jobs import JobDestination
+from galaxy.jobs.handler import DEFAULT_JOB_PUT_FAILURE_MESSAGE
 from galaxy.jobs.runners import AsynchronousJobState, AsynchronousJobRunner
 
 eggs.require( "drmaa" )
@@ -170,6 +171,7 @@
             # TODO: configurable max tries and sleep
             trynum = 0
             external_job_id = None
+            fail_msg = None
             while external_job_id is None and trynum < 5:
                 try:
                     external_job_id = self.ds.runJob(jt)
@@ -177,10 +179,16 @@
                 except ( drmaa.InternalException, drmaa.DeniedByDrmException ), e:
                     trynum += 1
                     log.warning( '(%s) drmaa.Session.runJob() failed, will retry: %s', galaxy_id_tag, e )
+                    fail_msg = "Unable to run this job due to a cluster error, please retry it later"
                     time.sleep( 5 )
+                except:
+                    log.exception( '(%s) drmaa.Session.runJob() failed unconditionally', galaxy_id_tag )
+                    trynum = 5
             else:
                 log.error( "(%s) All attempts to submit job failed" % galaxy_id_tag )
-                job_wrapper.fail( "Unable to run this job due to a cluster error, please retry it later" )
+                if not fail_msg:
+                    fail_msg = DEFAULT_JOB_PUT_FAILURE_MESSAGE
+                job_wrapper.fail( fail_msg )
                 self.ds.deleteJobTemplate( jt )
                 return
         else:

Repository URL: https://bitbucket.org/galaxy/galaxy-central/

--

This is a commit notification from bitbucket.org. You are receiving
this because you have the service enabled, addressing the recipient of
this email.
participants (1)
-
commits-noreply@bitbucket.org