commit/galaxy-central: natefoo: Retrying operations that can encounter transient failures when running jobs.
1 new commit in galaxy-central: https://bitbucket.org/galaxy/galaxy-central/commits/384240b8cd29/ Changeset: 384240b8cd29 User: natefoo Date: 2013-10-04 22:11:45 Summary: Retrying operations that can encounter transient failures when running jobs. Affected #: 2 files diff -r e8dcf9b9fa2914623435997c567b415b1ad416f1 -r 384240b8cd29963f302a0349476cf83734cfb5df lib/galaxy/jobs/__init__.py --- a/lib/galaxy/jobs/__init__.py +++ b/lib/galaxy/jobs/__init__.py @@ -2,6 +2,7 @@ Support for running a tool in Galaxy via an internal job management system """ +import time import copy import datetime import galaxy @@ -17,7 +18,7 @@ import traceback from galaxy import model, util from galaxy.datatypes import metadata -from galaxy.exceptions import ObjectInvalid +from galaxy.exceptions import ObjectInvalid, ObjectNotFound from galaxy.jobs.actions.post import ActionBox from galaxy.jobs.mapper import JobRunnerMapper from galaxy.jobs.runners import BaseJobRunner @@ -926,6 +927,17 @@ context = self.get_dataset_finish_context( job_context, dataset_assoc.dataset.dataset ) #should this also be checking library associations? - can a library item be added from a history before the job has ended? - lets not allow this to occur for dataset in dataset_assoc.dataset.dataset.history_associations + dataset_assoc.dataset.dataset.library_associations: #need to update all associated output hdas, i.e. history was shared with job running + trynum = 0 + while trynum < self.app.config.retry_job_output_collection: + try: + # Attempt to short circuit NFS attribute caching + os.stat( dataset.dataset.file_name ) + os.chown( dataset.dataset.file_name, os.getuid(), -1 ) + trynum = self.app.config.retry_job_output_collection + except ( OSError, ObjectNotFound ), e: + trynum += 1 + log.warning( 'Error accessing %s, will retry: %s', dataset.dataset.file_name, e ) + time.sleep( 2 ) dataset.blurb = 'done' dataset.peek = 'no peek' dataset.info = (dataset.info or '') diff -r e8dcf9b9fa2914623435997c567b415b1ad416f1 -r 384240b8cd29963f302a0349476cf83734cfb5df lib/galaxy/jobs/runners/drmaa.py --- a/lib/galaxy/jobs/runners/drmaa.py +++ b/lib/galaxy/jobs/runners/drmaa.py @@ -166,7 +166,17 @@ # runJob will raise if there's a submit problem if self.external_runJob_script is None: - external_job_id = self.ds.runJob(jt) + # TODO: create a queue for retrying submission indefinitely + # TODO: configurable max tries and sleep + trynum = 0 + external_job_id = None + while external_job_id is None and trynum < 5: + try: + external_job_id = self.ds.runJob(jt) + except drmaa.InternalException, e: + trynum += 1 + log.warning( '(%s) drmaa.Session.runJob() failed, will retry: %s', galaxy_id_tag, e ) + time.sleep( 5 ) else: job_wrapper.change_ownership_for_run() log.debug( '(%s) submitting with credentials: %s [uid: %s]' % ( galaxy_id_tag, job_wrapper.user_system_pwent[0], job_wrapper.user_system_pwent[2] ) ) @@ -202,16 +212,9 @@ try: assert external_job_id not in ( None, 'None' ), '(%s/%s) Invalid job id' % ( galaxy_id_tag, external_job_id ) state = self.ds.jobStatus( external_job_id ) - # InternalException was reported to be necessary on some DRMs, but - # this could cause failures to be detected as completion! Please - # report if you experience problems with this. - except ( drmaa.InvalidJobException, drmaa.InternalException ), e: - # we should only get here if an orphaned job was put into the queue at app startup - log.info( "(%s/%s) job left DRM queue with following message: %s" % ( galaxy_id_tag, external_job_id, e ) ) - self.work_queue.put( ( self.finish_job, ajs ) ) - continue - except drmaa.DrmCommunicationException, e: - log.warning( "(%s/%s) unable to communicate with DRM: %s" % ( galaxy_id_tag, external_job_id, e )) + # TODO: probably need to keep track of InvalidJobException count and remove after it exceeds some configurable + except ( drmaa.DrmCommunicationException, drmaa.InternalException, drmaa.InvalidJobException ), e: + log.warning( "(%s/%s) job check resulted in %s: %s", galaxy_id_tag, external_job_id, e.__class__.name, e ) new_watched.append( ajs ) continue except Exception, e: Repository URL: https://bitbucket.org/galaxy/galaxy-central/ -- This is a commit notification from bitbucket.org. You are receiving this because you have the service enabled, addressing the recipient of this email.
participants (1)
-
commits-noreply@bitbucket.org