Skip to content

Commit 303736a

Browse files
authored
support older slurm job_state schema
Differential Revision: D83690140 Pull Request resolved: #1137
1 parent 3b5df3a commit 303736a

File tree

2 files changed

+45
-2
lines changed

2 files changed

+45
-2
lines changed

torchx/schedulers/slurm_scheduler.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,15 @@ def appstate_from_slurm_state(slurm_state: str) -> AppState:
7373
return SLURM_STATES.get(slurm_state, AppState.UNKNOWN)
7474

7575

76+
def get_appstate_from_job(job: dict[str, object]) -> AppState:
    """
    Resolves the ``AppState`` of a single job entry from its ``job_state`` field.

    Prior to slurm-23.11, ``job_state`` was a plain string and not a list.
    Both schemas are accepted here: for the list form only the first entry
    is considered.

    Args:
        job: a single job dictionary; only the ``job_state`` key is read.

    Returns:
        The result of ``appstate_from_slurm_state`` for the extracted state,
        or ``AppState.UNKNOWN`` when ``job_state`` is an empty list.
    """
    job_state = job.get("job_state")
    if isinstance(job_state, list):
        if not job_state:
            # An empty list carries no state at all; report UNKNOWN rather
            # than raising IndexError on job_state[0].
            return AppState.UNKNOWN
        job_state = job_state[0]
    # A missing or non-string value is stringified and looked up like any
    # other state name.
    return appstate_from_slurm_state(str(job_state))
83+
84+
7685
def version() -> Tuple[int, int]:
7786
"""
7887
Uses ``sinfo --version`` to get the slurm version. If the command fails, it
@@ -666,7 +675,7 @@ def _describe_squeue(self, app_id: str) -> Optional[DescribeAppResponse]:
666675

667676
entrypoint = job["command"]
668677
image = job["current_working_directory"]
669-
state = appstate_from_slurm_state(job["job_state"][0])
678+
state = get_appstate_from_job(job)
670679

671680
job_resources = job["job_resources"]
672681

@@ -881,7 +890,7 @@ def _list_squeue(self) -> List[ListAppResponse]:
881890
out.append(
882891
ListAppResponse(
883892
app_id=str(job["job_id"]),
884-
state=SLURM_STATES[job["job_state"][0]],
893+
state=get_appstate_from_job(job),
885894
name=job["name"],
886895
)
887896
)

torchx/schedulers/test/slurm_scheduler_test.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -547,8 +547,14 @@ def test_list_sacct(self, run: MagicMock) -> None:
547547

548548
@patch("subprocess.run")
549549
def test_list_squeue(self, run: MagicMock) -> None:
550+
# First job is patched with a string-type job state
550551
run.return_value.stdout = b"""{
551552
"jobs": [
553+
{
554+
"job_id": 1233,
555+
"name": "bar",
556+
"job_state": "FAILED"
557+
},
552558
{
553559
"job_id": 1234,
554560
"name": "foo",
@@ -588,6 +594,7 @@ def test_list_squeue(self, run: MagicMock) -> None:
588594
}"""
589595
scheduler = create_scheduler("foo")
590596
expected_apps = [
597+
ListAppResponse(app_id="1233", state=AppState.FAILED, name="bar"),
591598
ListAppResponse(app_id="1234", state=AppState.FAILED, name="foo"),
592599
ListAppResponse(app_id="1235", state=AppState.FAILED, name="foo"),
593600
ListAppResponse(app_id="1236", state=AppState.RUNNING, name="foo-0"),
@@ -1128,3 +1135,30 @@ def test_describe_squeue_nodes_as_string(self) -> None:
11281135

11291136
assert result is not None
11301137
assert result.roles_statuses[0].replicas[0].hostname == "compute-node-123"
1138+
1139+
def test_describe_squeue_handles_string_state(self) -> None:
    """Describe must accept a string-valued ``job_state`` (i.e. for SLURM <= 23.02)."""

    # Legacy-style job entry: job_state is a bare string rather than a list.
    legacy_job = {
        "name": "test-job-0",
        "job_state": "TIMEOUT",
        "job_resources": {"nodes": "compute-node-123"},
        "command": "/bin/echo",
        "current_working_directory": "/tmp",
    }
    squeue_payload = json.dumps({"jobs": [legacy_job]})

    # Serve the canned JSON in place of the real subprocess call.
    with patch("subprocess.check_output", return_value=squeue_payload):
        described = SlurmScheduler("test")._describe_squeue("123")

        assert described is not None
        assert described.app_id == "123"
        # the string state "TIMEOUT" is expected to resolve to FAILED
        assert described.state == AppState.FAILED

0 commit comments

Comments
 (0)