def pod_labels(
    app: AppDef, role_idx: int, role: Role, replica_id: int, app_id: str
) -> Dict[str, str]:
    """Build the label mapping attached to a pod for one replica of a role.

    Free-form values (torchx version, app name, role name, unique app id)
    are sanitized so that they are valid Kubernetes label values; the role
    index and replica id are simply stringified.

    Args:
        app: the application the pod belongs to (``app.name`` is used).
        role_idx: index of ``role`` within the app, emitted as a string.
        role: the role the pod belongs to (``role.name`` is used).
        replica_id: replica number within the role, emitted as a string.
        app_id: unique identifier of the submitted app.

    Returns:
        A dict mapping the ``LABEL_*`` keys to their label values.
    """

    def clean(label_value: str) -> str:
        # Makes `label_value` compliant with the pod label value spec in
        # https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/
        #
        # Valid label value:
        #   must be 63 characters or less (can be empty),
        #   unless empty, must begin and end with an alphanumeric character ([a-z0-9A-Z]),
        #   could contain dashes (-), underscores (_), dots (.), and alphanumerics between.

        # Replace invalid characters (allow: alphanum, -, _, .) with "."
        label_value = re.sub(r"[^A-Za-z0-9\-_.]", ".", label_value)

        # After the substitution the only possible non-alphanumeric characters
        # are "-", "_" and ".". Drop them from the front: replacing them with
        # "." (itself non-alphanumeric) would still yield an invalid value.
        label_value = label_value.lstrip("-_.")

        # Trim to 63 characters *before* cleaning the tail, since truncation
        # may expose a "-", "_" or "." as the new last character.
        return label_value[:63].rstrip("-_.")

    return {
        LABEL_VERSION: clean(torchx.__version__),
        LABEL_APP_NAME: clean(app.name),
        LABEL_ROLE_INDEX: str(role_idx),
        LABEL_ROLE_NAME: clean(role.name),
        LABEL_REPLICA_ID: str(replica_id),
        LABEL_KUBE_APP_NAME: clean(app.name),
        LABEL_ORGANIZATION: "torchx.pytorch.org",
        LABEL_UNIQUE_NAME: clean(app_id),
    }
def test_pod_label(self) -> None:
    """``pod_labels`` must replace characters that are invalid in label values."""
    placeholder_image = "__UNUSED__"

    # "+" and "/" are not allowed in pod label values
    app = specs.AppDef(
        name="foo+bar",
        roles=[specs.Role(name="a/b", image=placeholder_image)],
    )
    unique_app_id = normalize_str(make_unique(app.name))

    actual_labels = kubernetes_scheduler.pod_labels(
        app=app,
        role_idx=0,
        role=app.roles[0],
        replica_id=1,
        app_id=unique_app_id,
    )

    # The torchx version follows PEP-440. It is usually 0.x.x or 0.x.xdev0,
    # but org specific builds may carry a local version suffix such as
    # 0.x.xdev0+org_name (e.g. 0.8.0dev0+fb). Since "+" is not a valid pod
    # label character, the version is expected to come back "cleaned",
    # with every invalid character replaced by "." (a valid one).
    expected_version = torchx.__version__.replace("+", ".")

    expected_labels = {
        LABEL_VERSION: expected_version,
        LABEL_APP_NAME: "foo.bar",
        LABEL_ROLE_INDEX: "0",
        LABEL_ROLE_NAME: "a.b",
        LABEL_REPLICA_ID: "1",
        LABEL_KUBE_APP_NAME: "foo.bar",
        LABEL_ORGANIZATION: "torchx.pytorch.org",
        LABEL_UNIQUE_NAME: unique_app_id,
    }
    self.assertDictEqual(actual_labels, expected_labels)