Skip to content

Commit b9156df

Browse files
authored
Merge pull request #160 from jameshcorbett/disable-fluxion-scheduling
DWS: add option to disable fluxion scheduling
2 parents 914258c + b8cc916 commit b9156df

File tree

2 files changed

+81
-15
lines changed

2 files changed

+81
-15
lines changed

src/modules/coral2_dws.py

Lines changed: 38 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -306,7 +306,7 @@ def state_complete(workflow, state):
306306
)
307307

308308

309-
def workflow_state_change_cb(event, handle, k8s_api):
309+
def workflow_state_change_cb(event, handle, k8s_api, disable_fluxion):
310310
"""Exception-catching wrapper around _workflow_state_change_cb_inner."""
311311
try:
312312
workflow = event["object"]
@@ -324,7 +324,9 @@ def workflow_state_change_cb(event, handle, k8s_api):
324324
del _WORKFLOWINFO_CACHE[jobid]
325325
return
326326
try:
327-
_workflow_state_change_cb_inner(workflow, jobid, winfo, handle, k8s_api)
327+
_workflow_state_change_cb_inner(
328+
workflow, jobid, winfo, handle, k8s_api, disable_fluxion
329+
)
328330
except Exception:
329331
LOGGER.exception(
330332
"Failed to process event update for workflow with jobid %s:", jobid
@@ -342,7 +344,9 @@ def workflow_state_change_cb(event, handle, k8s_api):
342344
handle.job_raise(jobid, "exception", 0, "DWS/Rabbit interactions failed")
343345

344346

345-
def _workflow_state_change_cb_inner(workflow, jobid, winfo, handle, k8s_api):
347+
def _workflow_state_change_cb_inner(
348+
workflow, jobid, winfo, handle, k8s_api, disable_fluxion
349+
):
346350
if "state" not in workflow["status"]:
347351
# workflow was just submitted, DWS still needs to give workflow
348352
# a state of 'Proposal'
@@ -379,13 +383,15 @@ def _workflow_state_change_cb_inner(workflow, jobid, winfo, handle, k8s_api):
379383
resources = flux.job.kvslookup.job_kvs_lookup(handle, jobid)["jobspec"][
380384
"resources"
381385
]
386+
if not disable_fluxion:
387+
resources = directivebreakdown.apply_breakdowns(
388+
k8s_api, workflow, resources, _MIN_ALLOCATION_SIZE
389+
)
382390
handle.rpc(
383391
"job-manager.dws.resource-update",
384392
payload={
385393
"id": jobid,
386-
"resources": directivebreakdown.apply_breakdowns(
387-
k8s_api, workflow, resources, _MIN_ALLOCATION_SIZE
388-
),
394+
"resources": resources,
389395
},
390396
).then(log_rpc_response)
391397
elif state_complete(workflow, "Setup"):
@@ -537,7 +543,10 @@ def init_rabbits(k8s_api, handle, watchers, graph_path, disable_draining):
537543
else:
538544
mark_rabbit(handle, rabbit["status"]["status"], *rabbit_rpaths[name], name)
539545
drain_offline_nodes(
540-
handle, name, rabbit["status"]["access"].get("computes", []), disable_draining
546+
handle,
547+
name,
548+
rabbit["status"]["access"].get("computes", []),
549+
disable_draining,
541550
)
542551
watchers.add_watch(
543552
Watch(
@@ -625,6 +634,11 @@ def setup_parsing():
625634
action="store_true",
626635
help="Disable the draining of compute nodes based on k8s status",
627636
)
637+
parser.add_argument(
638+
"--disable-fluxion",
639+
action="store_true",
640+
help="Disable Fluxion scheduling of rabbits",
641+
)
628642
return parser
629643

630644

@@ -729,16 +743,25 @@ def main():
729743
# start watching k8s workflow resources and operate on them when updates occur
730744
# or new RPCs are received
731745
with Watchers(handle, watch_interval=args.watch_interval) as watchers:
732-
init_rabbits(
733-
k8s_api,
734-
handle,
735-
watchers,
736-
args.resourcegraph,
737-
args.disable_compute_node_draining,
738-
)
746+
if not args.disable_fluxion:
747+
init_rabbits(
748+
k8s_api,
749+
handle,
750+
watchers,
751+
args.resourcegraph,
752+
args.disable_compute_node_draining,
753+
)
739754
services = register_services(handle, k8s_api)
740755
watchers.add_watch(
741-
Watch(k8s_api, WORKFLOW_CRD, 0, workflow_state_change_cb, handle, k8s_api)
756+
Watch(
757+
k8s_api,
758+
WORKFLOW_CRD,
759+
0,
760+
workflow_state_change_cb,
761+
handle,
762+
k8s_api,
763+
args.disable_fluxion,
764+
)
742765
)
743766
raise_self_exception(handle)
744767

t/t1002-dws-workflow-obj.t

Lines changed: 43 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -43,7 +43,50 @@ test_expect_success 'exec dws service-providing script with bad arguments' '
4343
-e1 -v -rR.local --foobar
4444
'
4545

46+
test_expect_success 'exec dws service-providing script with fluxion scheduling disabled' '
47+
R=$(flux R encode -r 0) &&
48+
DWS_JOBID=$(flux submit \
49+
--setattr=system.alloc-bypass.R="$R" \
50+
-o per-resource.type=node --output=dws-fluxion-disabled.out \
51+
--error=dws-fluxion-disabled.err python ${DWS_MODULE_PATH} -e1 \
52+
-v --disable-fluxion) &&
53+
flux job wait-event -vt 15 -p guest.exec.eventlog ${DWS_JOBID} shell.start &&
54+
flux job wait-event -vt 15 -m "note=dws watchers setup" ${DWS_JOBID} exception &&
55+
${RPC} "dws.create"
56+
'
57+
58+
test_expect_success 'job submission without DW string works with fluxion-rabbit scheduling disabled' '
59+
jobid=$(flux submit -n1 /bin/true) &&
60+
flux job wait-event -vt 25 -m status=0 ${jobid} finish &&
61+
test_must_fail flux job wait-event -vt 5 -m description=${CREATE_DEP_NAME} \
62+
${jobid} dependency-add
63+
'
64+
65+
test_expect_success 'job submission with valid DW string works with fluxion-rabbit scheduling disabled' '
66+
jobid=$(flux submit --setattr=system.dw="#DW jobdw capacity=10GiB type=xfs name=project1" \
67+
-N1 -n1 hostname) &&
68+
flux job wait-event -vt 10 -m description=${CREATE_DEP_NAME} \
69+
${jobid} dependency-add &&
70+
flux job wait-event -t 10 -m description=${CREATE_DEP_NAME} \
71+
${jobid} dependency-remove &&
72+
flux job wait-event -t 10 -m rabbit_workflow=fluxjob-$(flux job id ${jobid}) \
73+
${jobid} memo &&
74+
flux job wait-event -vt 15 ${jobid} depend &&
75+
flux job wait-event -vt 15 ${jobid} priority &&
76+
flux job wait-event -vt 15 -m description=${PROLOG_NAME} \
77+
${jobid} prolog-start &&
78+
flux job wait-event -vt 25 -m description=${PROLOG_NAME} \
79+
${jobid} prolog-finish &&
80+
flux job wait-event -vt 15 -m status=0 ${jobid} finish &&
81+
flux job wait-event -vt 15 -m description=${EPILOG_NAME} \
82+
${jobid} epilog-start &&
83+
flux job wait-event -vt 30 -m description=${EPILOG_NAME} \
84+
${jobid} epilog-finish &&
85+
flux job wait-event -vt 15 ${jobid} clean
86+
'
87+
4688
test_expect_success 'load fluxion with rabbits' '
89+
flux cancel ${DWS_JOBID} &&
4790
flux R encode -l | flux python ${FLUX_SOURCE_DIR}/src/cmd/flux-dws2jgf.py \
4891
--no-validate | jq . > R.local &&
4992
flux kvs put resource.R="$(cat R.local)" &&

0 commit comments

Comments
 (0)