Skip to content

Commit 61107b6

Browse files
committed
fix: handle kubernetes describe failures gracefully (#1150)
1 parent 1d26b39 commit 61107b6

File tree

3 files changed

+40
-7
lines changed

3 files changed

+40
-7
lines changed

scripts/setup_pyre.sh

100644 → 100755
File mode changed.

torchx/schedulers/kubernetes_scheduler.py

Lines changed: 14 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -706,16 +706,23 @@ def _run_opts(self) -> runopts:
706706
return opts
707707

708708
def describe(self, app_id: str) -> Optional[DescribeAppResponse]:
709+
from kubernetes.client.rest import ApiException
710+
709711
namespace, name = app_id.split(":")
710712
roles = {}
711713
roles_statuses = {}
712-
resp = self._custom_objects_api().get_namespaced_custom_object_status(
713-
group="batch.volcano.sh",
714-
version="v1alpha1",
715-
namespace=namespace,
716-
plural="jobs",
717-
name=name,
718-
)
714+
try:
715+
resp = self._custom_objects_api().get_namespaced_custom_object_status(
716+
group="batch.volcano.sh",
717+
version="v1alpha1",
718+
namespace=namespace,
719+
plural="jobs",
720+
name=name,
721+
)
722+
except ApiException as e:
723+
if e.status == 404:
724+
return None
725+
raise
719726
status = resp.get("status")
720727
if status:
721728
state_str = status["state"]["phase"]

torchx/schedulers/test/kubernetes_scheduler_test.py

Lines changed: 26 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -714,6 +714,32 @@ def test_describe_unknown(
714714
),
715715
)
716716

717+
@patch("kubernetes.client.CustomObjectsApi.get_namespaced_custom_object_status")
718+
def test_describe_api_exception_404(
719+
self, get_namespaced_custom_object_status: MagicMock
720+
) -> None:
721+
from kubernetes.client.rest import ApiException
722+
723+
api_exc = ApiException(status=404, reason="Not Found")
724+
get_namespaced_custom_object_status.side_effect = api_exc
725+
app_id = "testnamespace:testid"
726+
scheduler = create_scheduler("test")
727+
info = scheduler.describe(app_id)
728+
self.assertIsNone(info)
729+
730+
@patch("kubernetes.client.CustomObjectsApi.get_namespaced_custom_object_status")
731+
def test_describe_api_exception_other(
732+
self, get_namespaced_custom_object_status: MagicMock
733+
) -> None:
734+
from kubernetes.client.rest import ApiException
735+
736+
api_exc = ApiException(status=500, reason="Internal Server Error")
737+
get_namespaced_custom_object_status.side_effect = api_exc
738+
app_id = "testnamespace:testid"
739+
scheduler = create_scheduler("test")
740+
with self.assertRaises(ApiException):
741+
scheduler.describe(app_id)
742+
717743
def test_runopts(self) -> None:
718744
scheduler = kubernetes_scheduler.create_scheduler("foo")
719745
runopts = scheduler.run_opts()

0 commit comments

Comments
 (0)