Skip to content

Commit bad0ca7

Browse files
committed
fix: handle kubernetes describe failures gracefully (#1150)
1 parent 1d26b39 commit bad0ca7

File tree

3 files changed

+35
-7
lines changed

3 files changed

+35
-7
lines changed

scripts/setup_pyre.sh

100644 → 100755
File mode changed.

torchx/schedulers/kubernetes_scheduler.py

Lines changed: 20 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -706,16 +706,29 @@ def _run_opts(self) -> runopts:
706706
return opts
707707

708708
def describe(self, app_id: str) -> Optional[DescribeAppResponse]:
709+
from kubernetes.client.rest import ApiException
710+
709711
namespace, name = app_id.split(":")
710712
roles = {}
711713
roles_statuses = {}
712-
resp = self._custom_objects_api().get_namespaced_custom_object_status(
713-
group="batch.volcano.sh",
714-
version="v1alpha1",
715-
namespace=namespace,
716-
plural="jobs",
717-
name=name,
718-
)
714+
try:
715+
resp = self._custom_objects_api().get_namespaced_custom_object_status(
716+
group="batch.volcano.sh",
717+
version="v1alpha1",
718+
namespace=namespace,
719+
plural="jobs",
720+
name=name,
721+
)
722+
except Exception as e:
723+
return DescribeAppResponse(
724+
app_id=app_id,
725+
state=AppState.UNKNOWN,
726+
msg=(
727+
f"{e.reason}: {e.body}"
728+
if hasattr(e, "body") and e.body
729+
else str(getattr(e, "reason", e))
730+
),
731+
)
719732
status = resp.get("status")
720733
if status:
721734
state_str = status["state"]["phase"]

torchx/schedulers/test/kubernetes_scheduler_test.py

Lines changed: 15 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -714,6 +714,21 @@ def test_describe_unknown(
714714
),
715715
)
716716

717+
@patch("kubernetes.client.CustomObjectsApi.get_namespaced_custom_object_status")
718+
def test_describe_api_exception(
719+
self, get_namespaced_custom_object_status: MagicMock
720+
) -> None:
721+
from kubernetes.client.rest import ApiException
722+
723+
api_exc = ApiException(status=404, reason="Not Found")
724+
get_namespaced_custom_object_status.side_effect = api_exc
725+
app_id = "testnamespace:testid"
726+
scheduler = create_scheduler("test")
727+
info = scheduler.describe(app_id)
728+
self.assertEqual(info.app_id, app_id)
729+
self.assertEqual(info.state, specs.AppState.UNKNOWN)
730+
self.assertIn("Not Found", info.msg)
731+
717732
def test_runopts(self) -> None:
718733
scheduler = kubernetes_scheduler.create_scheduler("foo")
719734
runopts = scheduler.run_opts()

0 commit comments

Comments
 (0)