Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 27 additions & 5 deletions torchx/schedulers/kubernetes_scheduler.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,7 @@

import json
import logging
import re
import warnings
from dataclasses import dataclass
from datetime import datetime
Expand Down Expand Up @@ -984,13 +985,34 @@ def create_scheduler(
def pod_labels(
    app: AppDef, role_idx: int, role: Role, replica_id: int, app_id: str
) -> Dict[str, str]:
    """Return the label mapping for one replica of ``role`` in ``app``.

    Free-form values (torchx version, app name, role name, app id) are passed
    through ``clean`` so that they are valid Kubernetes label values; the
    role index and replica id are rendered with ``str()``.

    Args:
        app: application definition; only ``app.name`` is read.
        role_idx: index of ``role``, stored as a string label.
        role: role definition; only ``role.name`` is read.
        replica_id: replica number, stored as a string label.
        app_id: unique application id, stored under ``LABEL_UNIQUE_NAME``.

    Returns:
        A dict keyed by the module's ``LABEL_*`` constants.
    """

    def clean(label_value: str) -> str:
        # cleans the provided `label_value` to make it compliant
        # to pod label specs as described in
        # https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/
        #
        # Valid label value:
        #   must be 63 characters or less (can be empty),
        #   unless empty, must begin and end with an alphanumeric
        #   character ([a-z0-9A-Z]),
        #   could contain dashes (-), underscores (_), dots (.),
        #   and alphanumerics between.

        # Replace invalid characters (allow: alphanum, -, _, .) with "."
        label_value = re.sub(r"[^A-Za-z0-9\-_.]", ".", label_value)
        # Drop any leading non-alphanumerics. Substituting "." here (as was
        # done before) would still leave a value that starts with a
        # non-alphanumeric character, which is invalid.
        label_value = re.sub(r"^[^A-Za-z0-9]+", "", label_value)
        # Trim to 63 characters BEFORE stripping the tail, so that the cut
        # itself cannot leave the value ending in "-", "_" or ".".
        label_value = label_value[:63]
        # Drop any trailing non-alphanumerics (may yield "", which is valid).
        return re.sub(r"[^A-Za-z0-9]+$", "", label_value)

    return {
        LABEL_VERSION: clean(torchx.__version__),
        LABEL_APP_NAME: clean(app.name),
        LABEL_ROLE_INDEX: str(role_idx),
        LABEL_ROLE_NAME: clean(role.name),
        LABEL_REPLICA_ID: str(replica_id),
        LABEL_KUBE_APP_NAME: clean(app.name),
        LABEL_ORGANIZATION: "torchx.pytorch.org",
        LABEL_UNIQUE_NAME: clean(app_id),
    }
49 changes: 48 additions & 1 deletion torchx/schedulers/test/kubernetes_scheduler_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,17 +21,27 @@
from torchx.schedulers import kubernetes_scheduler
from torchx.schedulers.api import DescribeAppResponse, ListAppResponse
from torchx.schedulers.docker_scheduler import has_docker
from torchx.schedulers.ids import make_unique
from torchx.schedulers.kubernetes_scheduler import (
app_to_resource,
create_scheduler,
KubernetesJob,
KubernetesOpts,
KubernetesScheduler,
LABEL_APP_NAME,
LABEL_INSTANCE_TYPE,
LABEL_KUBE_APP_NAME,
LABEL_ORGANIZATION,
LABEL_REPLICA_ID,
LABEL_ROLE_INDEX,
LABEL_ROLE_NAME,
LABEL_UNIQUE_NAME,
LABEL_VERSION,
PLACEHOLDER_FIELD_PATH,
role_to_pod,
)
from torchx.specs import AppDryRunInfo, AppState
from torchx.util.strings import normalize_str

# True when has_docker() reports that docker is not available.
# NOTE(review): presumably consumed by skip decorators on docker-dependent
# tests; the usage is not visible in this hunk — confirm.
SKIP_DOCKER: bool = not has_docker()

Expand Down Expand Up @@ -311,7 +321,7 @@ def test_submit_dryrun(self) -> None:
torchx.pytorch.org/replica-id: '0'
torchx.pytorch.org/role-index: '0'
torchx.pytorch.org/role-name: trainer_foo
torchx.pytorch.org/version: {torchx.__version__}
torchx.pytorch.org/version: {torchx.__version__.replace("+", ".")}
spec:
containers:
- command:
Expand Down Expand Up @@ -1309,6 +1319,43 @@ def test_validate_spec_long_pod_name(self) -> None:
self.assertIn("Pod name", str(ctx.exception))
self.assertIn("exceeds 63 character limit", str(ctx.exception))

def test_pod_label(self) -> None:
    """pod_labels() must rewrite values that are not valid pod label values."""
    placeholder_image = "__UNUSED__"

    app = specs.AppDef(
        name="foo+bar",
        roles=[specs.Role(name="a/b", image=placeholder_image)],
    )
    unique_id = normalize_str(make_unique(app.name))

    actual = kubernetes_scheduler.pod_labels(
        app=app,
        role_idx=0,
        role=app.roles[0],
        replica_id=1,
        app_id=unique_id,
    )

    # The torchx version follows PEP-440. It is usually 0.x.x or 0.x.xdev0,
    # but org specific builds may look like 0.x.xdev0+org_name
    # (e.g. 0.8.0dev0+fb). "+" is not a valid pod label character, so the
    # version is expected to come back "cleaned", with every invalid
    # character replaced by "." (a valid character).
    expected_version = torchx.__version__.replace("+", ".")

    expected = {
        LABEL_VERSION: expected_version,
        LABEL_APP_NAME: "foo.bar",
        LABEL_ROLE_INDEX: "0",
        LABEL_ROLE_NAME: "a.b",
        LABEL_REPLICA_ID: "1",
        LABEL_KUBE_APP_NAME: "foo.bar",
        LABEL_ORGANIZATION: "torchx.pytorch.org",
        LABEL_UNIQUE_NAME: unique_id,
    }
    self.assertDictEqual(actual, expected)


class KubernetesSchedulerNoImportTest(unittest.TestCase):
"""
Expand Down
Loading