Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 10 additions & 3 deletions src/modules/job-manager/plugins/limit-duration.c
Original file line number Diff line number Diff line change
Expand Up @@ -193,17 +193,24 @@ static int check_limit (struct limit_duration *ctx,
{
double limit = ctx->general_limit;
double qlimit = queues_lookup (ctx->queues, queue);
bool unlimited = duration == DURATION_UNLIMITED;

if (qlimit != DURATION_INVALID)
limit = qlimit;
if (limit != DURATION_INVALID
&& limit != DURATION_UNLIMITED
&& (duration > limit || duration == DURATION_UNLIMITED)) {
&& (duration > limit || unlimited)) {
char requested[64];
char fsd[64];
fsd_format_duration_ex (requested, sizeof (requested), duration, 2);
fsd_format_duration_ex (fsd, sizeof (fsd), limit, 2);
return errprintf (error,
"requested duration exceeds policy limit of %s",
fsd);
"requested duration (%s) exceeds policy limit of "
"%s%s%s",
unlimited ? "unlimited" : requested,
fsd,
queue ? " for queue " : "",
queue ? queue : "");
}
return 0;
}
Expand Down
81 changes: 50 additions & 31 deletions src/modules/job-manager/plugins/limit-job-size.c
Original file line number Diff line number Diff line change
Expand Up @@ -297,6 +297,45 @@ static int queues_parse (zhashx_t **zhp,
return -1;
}

static int check_limit (const char *queue,
const char *resource,
bool over,
int limit,
int value,
flux_error_t *errp)
{
if ((over && LIMIT_OVER (limit, value))
|| (!over && LIMIT_UNDER (limit, value)))
return errprintf (errp,
"requested %s (%d) %s policy limit of %d%s%s",
resource,
value,
over ? "exceeds" : "is under",
limit,
queue ? " for queue ": "",
queue ? queue : "");
return 0;
}

static int check_over (const char *queue,
const char *resource,
int limit,
int value,
flux_error_t *errp)
{
return check_limit (queue, resource, true, limit, value, errp);
}

static int check_under (const char *queue,
const char *resource,
int limit,
int value,
flux_error_t *errp)
{
return check_limit (queue, resource, false, limit, value, errp);
}


static int check_limits (struct limit_job_size *ctx,
struct jj_counts *counts,
const char *queue,
Expand All @@ -305,41 +344,21 @@ static int check_limits (struct limit_job_size *ctx,
struct limits limits;
struct limits *queue_limits;

int nnodes = counts->nnodes;
int ncores = counts->nslots * counts->slot_size;
int ngpus = counts->nslots * counts->slot_gpus;

limits = ctx->general_limits;
if (queue && (queue_limits = queues_lookup (ctx->queues, queue)))
limits_override (&limits, queue_limits);

if (LIMIT_OVER (limits.max.nnodes, counts->nnodes)) {
return errprintf (error,
"requested nnodes exceeds policy limit of %d",
limits.max.nnodes);
}
if (LIMIT_OVER (limits.max.ncores, counts->nslots * counts->slot_size)) {
return errprintf (error,
"requested ncores exceeds policy limit of %d",
limits.max.ncores);
}
if (LIMIT_OVER (limits.max.ngpus, counts->nslots * counts->slot_gpus)) {
return errprintf (error,
"requested ngpus exceeds policy limit of %d",
limits.max.ngpus);
}
if (LIMIT_UNDER (limits.min.nnodes, counts->nnodes)) {
return errprintf (error,
"requested nnodes is under policy limit of %d",
limits.min.nnodes);
}
if (LIMIT_UNDER (limits.min.ncores, counts->nslots * counts->slot_size)) {
return errprintf (error,
"requested ncores is under policy limit of %d",
limits.min.ncores);
}
if (LIMIT_UNDER (limits.min.ngpus, counts->nslots * counts->slot_gpus)) {
return errprintf (error,
"requested ngpus is under policy limit of %d",
limits.min.ngpus);
}

if (check_over (queue, "nnodes", limits.max.nnodes, nnodes, error) < 0
|| check_over (queue, "ncores", limits.max.ncores, ncores, error) < 0
|| check_over (queue, "ngpus", limits.max.ngpus, ngpus, error) < 0
|| check_under (queue, "nnodes", limits.min.nnodes, nnodes, error) < 0
|| check_under (queue, "ncores", limits.min.ncores, ncores, error) < 0
|| check_under (queue, "ngpus", limits.min.ngpus, ngpus, error) < 0)
return -1;
return 0;
}

Expand Down
12 changes: 10 additions & 2 deletions t/t2221-job-manager-limit-duration.t
Original file line number Diff line number Diff line change
Expand Up @@ -40,10 +40,18 @@ test_expect_success 'configure policy.limits.duration and queue duration' '
flux queue start --all
'
test_expect_success 'a job that exceeds policy.limits.duration is rejected' '
test_must_fail flux submit --queue=debug -t 2h true
test_must_fail flux submit --queue=debug -t 2h true 2>limit.err &&
test_debug "cat limit.err"
'
test_expect_success 'error message includes expected details' '
grep "duration (2h) exceeds.*limit of 1h for queue debug" limit.err
'
test_expect_success 'a job with no limit is also rejected' '
test_must_fail flux submit --queue=debug -t 0 true
test_must_fail flux submit --queue=debug -t 0 true 2>limit2.err &&
test_debug "cat limit2.err"
'
test_expect_success 'error message includes expected details' '
grep "duration (unlimited) exceeds.*limit of 1h for queue debug" limit2.err
'
test_expect_success 'but is accepted by a queue with higher limit' '
flux submit \
Expand Down
6 changes: 5 additions & 1 deletion t/t2222-job-manager-limit-job-size.t
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,11 @@ test_expect_success 'a job is accepted if under general gpu limit' '
flux submit --queue=debug -n1 true
'
test_expect_success 'a job is rejected if over gpu limit' '
test_must_fail flux submit --queue=debug -n1 -g1 true
test_must_fail flux submit --queue=debug -n1 -g1 true 2>debug.err &&
test_debug "cat debug.err"
'
test_expect_success 'error message contains queue name' '
grep "for queue debug" debug.err
'
test_expect_success 'same job is accepted with unlimited queue override' '
flux submit --queue=batch -n1 -g1 true
Expand Down
2 changes: 1 addition & 1 deletion t/t2290-job-update.t
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,7 @@ test_expect_success 'reload update-duration plugin with owner-allow-any=0' '
test_expect_success 'update duration above policy limit now fails' '
test_expect_code 1 flux update $jobid duration=1h 2>limit.err &&
test_debug "cat limit.err" &&
grep "requested duration exceeds policy limit" limit.err
grep "requested duration.*exceeds policy limit" limit.err
'
test_expect_success 'update fails for running job' '
jobid=$(flux submit -t1m --wait-event=start sleep 60) &&
Expand Down
2 changes: 1 addition & 1 deletion t/t2291-job-update-queue.t
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ test_expect_success 'job runs on batch resources' '
test_expect_success 'update to queue with lower duration limit fails' '
jobid=$(flux submit -q batch --urgency=hold hostname) &&
test_must_fail flux update $jobid queue=debug 2>queue.err &&
grep "duration exceeds policy limit" queue.err
grep "duration.*exceeds policy limit" queue.err
'
test_expect_success 'update of duration allows queue update' '
flux update $jobid queue=debug duration=1m &&
Expand Down
Loading