meta-pytorch
diff --git a/‎torchx/schedulers/slurm_scheduler.py‎
Lines changed: 84 additions & 14 deletions b/‎torchx/schedulers/slurm_scheduler.py‎
Lines changed: 84 additions & 14 deletions
@@ -20,6 +20,7 @@
 import tempfile
 from dataclasses import dataclass
 from datetime import datetime
+from subprocess import CalledProcessError, PIPE
 from typing import Any, Dict, Iterable, List, Mapping, Optional, Tuple
 
 import torchx
@@ -66,6 +67,11 @@
     "TIMEOUT": AppState.FAILED,
 }
 
+
+def appstate_from_slurm_state(slurm_state: str) -> AppState:
+    return SLURM_STATES.get(slurm_state, AppState.UNKNOWN)
+
+
 SBATCH_JOB_OPTIONS = {
     "comment",
     "mail-user",
@@ -482,10 +488,82 @@ def _cancel_existing(self, app_id: str) -> None:
         subprocess.run(["scancel", app_id], check=True)
 
     def describe(self, app_id: str) -> Optional[DescribeAppResponse]:
-        try:
-            return self._describe_sacct(app_id)
-        except subprocess.CalledProcessError:
-            return self._describe_squeue(app_id)
+        # fallback to using different slurm commands for describing the job
+        for describe in [
+            self._describe_scontrol,  # NOTE: only scontrol fills hostnames
+            self._describe_sacct,
+            self._describe_squeue,
+        ]:
+            try:
+                return describe(app_id)
+            except CalledProcessError:
+                continue
+
+    def _describe_scontrol(self, app_id: str) -> Optional[DescribeAppResponse]:
+        # NOTE: app_id for slurm_scheduler is the job_id (not the heterogenous_job_id).
+        #   For heterogeneous jobs, querying slurm by the base job id returns all the
+        #   "sub-jobs" in it.
+        #   We launch each role's replica on its own srun command where the job_name is set
+        #   to `{role.name}-{replica_id}` (e.g. `worker-0`, `worker-1`, ...).
+        #   So each sub-job maps to a replica in the role.
+
+        output = subprocess.check_output(
+            ["scontrol", "show", "--json", "job", app_id], stderr=PIPE, encoding="utf-8"
+        )
+        output_json = json.loads(output)
+        jobs = output_json["jobs"]
+        if not jobs:
+            # job either finished or does not exist
+            return None
+
+        roles: dict[str, Role] = {}
+        roles_statuses: dict[str, RoleStatus] = {}
+        state = AppState.UNKNOWN
+
+        for job in jobs:
+            # job name is of the form "{role_name}-{replica_id}"
+            role_name, _, replica_id = job["name"].rpartition("-")
+
+            image = job["current_working_directory"]
+            entrypoint = job["command"]
+            state = appstate_from_slurm_state(job["job_state"][0])
+            job_resources = job["job_resources"]
+
+            # nodes is a a hostlist expression (e.g. slurm-compute-node[200-210,212])
+            # but we schedule a job per replica so will always be a single host
+            hostname = job_resources["nodes"]
+
+            role = roles.setdefault(
+                role_name,
+                Role(
+                    name=role_name,
+                    image=image,
+                    entrypoint=entrypoint,
+                    num_replicas=0,
+                ),
+            )
+            role.num_replicas += 1
+
+            role_status = roles_statuses.setdefault(
+                role_name,
+                RoleStatus(role_name, replicas=[]),
+            )
+
+            role_status.replicas.append(
+                ReplicaStatus(
+                    id=int(replica_id),
+                    role=role_name,
+                    state=state,
+                    hostname=hostname,
+                )
+            )
+
+        return DescribeAppResponse(
+            app_id=app_id,
+            roles=list(roles.values()),
+            roles_statuses=list(roles_statuses.values()),
+            state=state,
+        )
 
     def _describe_sacct(self, app_id: str) -> Optional[DescribeAppResponse]:
         p = subprocess.run(
@@ -511,11 +589,7 @@ def _describe_sacct(self, app_id: str) -> Optional[DescribeAppResponse]:
 
             state = row["State"]
             msg = state
-            state_enum = SLURM_STATES.get(state)
-            assert (
-                state_enum
-            ), f"failed to translate slurm state {state} to torchx state"
-            app_state = state_enum
+            app_state = appstate_from_slurm_state(state)
 
             role, _, replica_id = row["JobName"].rpartition("-")
             if not replica_id or not role:
@@ -553,11 +627,7 @@ def _describe_squeue(self, app_id: str) -> Optional[DescribeAppResponse]:
         for job in output_json["jobs"]:
             state = job["job_state"][0]
             msg = state
-            state_enum = SLURM_STATES.get(state)
-            assert (
-                state_enum
-            ), f"failed to translate slurm state {state} to torchx state"
-            app_state = state_enum
+            app_state = appstate_from_slurm_state(state)
 
             role, _, replica_id = job["name"].rpartition("-")
             if not replica_id or not role: