Hi,
I have noticed that from time to time the job queue seems to get
"stuck" and can only be unstuck by restarting Galaxy.
The jobs sit in the queued state, the Python job handler
processes are barely ticking over, and the cluster is empty.
When I restart, the startup procedure notices that all jobs are
in the "new" state, assigns each one a job handler, and the jobs then
start fine.
Any ideas?
Thon
P.S. I am using the June release of Galaxy, and I DO set limits
on my users in job_conf.xml, as shown below. (Maybe that is related? Before
the queue went dormant, this user had started lots of jobs and may have hit
the limit, but I assumed this limit applies to the number of jobs running at
one time, right?)
<?xml version="1.0"?>
<job_conf>
    <plugins workers="4">
        <!-- "workers" is the number of threads for the runner's work queue.
             The default from <plugins> is used if not defined for a <plugin>. -->
        <plugin id="local" type="runner" load="galaxy.jobs.runners.local:LocalJobRunner" workers="2"/>
        <plugin id="drmaa" type="runner" load="galaxy.jobs.runners.drmaa:DRMAAJobRunner" workers="8"/>
        <plugin id="cli" type="runner" load="galaxy.jobs.runners.cli:ShellJobRunner" workers="2"/>
    </plugins>
    <handlers default="handlers">
        <!-- Additional job handlers - the id should match the name of a
             [server:<id>] in universe_wsgi.ini. -->
        <handler id="handler0" tags="handlers"/>
        <handler id="handler1" tags="handlers"/>
        <handler id="handler2" tags="handlers"/>
        <handler id="handler3" tags="handlers"/>
        <!--
        <handler id="handler10" tags="handlers"/>
        <handler id="handler11" tags="handlers"/>
        <handler id="handler12" tags="handlers"/>
        <handler id="handler13" tags="handlers"/>
        -->
    </handlers>
    <destinations default="regularjobs">
        <!-- Destinations define details about remote resources and how jobs
             should be executed on those remote resources. -->
        <destination id="local" runner="local"/>
        <destination id="regularjobs" runner="drmaa" tags="cluster">
            <!-- These are the parameters for qsub, such as queue etc. -->
            <param id="nativeSpecification">-V -q long.q -pe smp 1</param>
        </destination>
        <destination id="longjobs" runner="drmaa" tags="cluster,long_jobs">
            <!-- These are the parameters for qsub, such as queue etc. -->
            <param id="nativeSpecification">-V -q long.q -pe smp 1</param>
        </destination>
        <destination id="shortjobs" runner="drmaa" tags="cluster,short_jobs">
            <!-- These are the parameters for qsub, such as queue etc. -->
            <param id="nativeSpecification">-V -q short.q -pe smp 1</param>
        </destination>
        <destination id="multicorejobs4" runner="drmaa" tags="cluster,multicore_jobs">
            <!-- These are the parameters for qsub, such as queue etc. -->
            <param id="nativeSpecification">-V -q long.q -pe smp 4</param>
        </destination>
        <!--
        <destination id="real_user_cluster" runner="drmaa">
            <param id="galaxy_external_runjob_script">scripts/drmaa_external_runner.py</param>
            <param id="galaxy_external_killjob_script">scripts/drmaa_external_killer.py</param>
            <param id="galaxy_external_chown_script">scripts/external_chown_script.py</param>
        </destination>
        -->
        <destination id="dynamic" runner="dynamic">
            <!-- A destination that represents a method in the dynamic runner. -->
            <param id="type">python</param>
            <param id="function">interactiveOrCluster</param>
        </destination>
    </destinations>
    <tools>
        <!-- Tools can be configured to use specific destinations or handlers,
             identified by either the "id" or "tags" attribute. If assigned to
             a tag, a handler or destination that matches that tag will be
             chosen at random. -->
        <tool id="bwa_wrapper" destination="multicorejobs4"/>
    </tools>
    <limits>
        <!-- Certain limits can be defined.
        <limit type="registered_user_concurrent_jobs">500</limit>
        <limit type="unregistered_user_concurrent_jobs">1</limit>
        <limit type="concurrent_jobs" id="local">1</limit>
        <limit type="concurrent_jobs" tag="cluster">200</limit>
        <limit type="concurrent_jobs" tag="long_jobs">200</limit>
        <limit type="concurrent_jobs" tag="short_jobs">200</limit>
        <limit type="concurrent_jobs" tag="multicore_jobs">100</limit>
        -->
    </limits>
</job_conf>
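
To be explicit, the limit I am asking about is the per-user concurrent-jobs
one. Enabled (i.e. moved outside the comment block), it would read as below;
the 500 is just the example value from the commented section above, not
necessarily the value I should be using:

    <limits>
        <!-- Limit on concurrent jobs per registered user. -->
        <limit type="registered_user_concurrent_jobs">500</limit>
    </limits>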