Skip to content

Use kueue manifest from CT url #557

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 9 commits into
base: develop
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 0 additions & 6 deletions src/xpk/commands/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,6 @@
install_kueue_crs,
install_kueue_on_cluster,
wait_for_kueue_available,
update_kueue_resources_if_necessary,
)
from ..core.nap import enable_autoprovisioning_on_cluster
from ..core.network import (
Expand Down Expand Up @@ -973,11 +972,6 @@ def install_kueue(args, system: SystemCharacteristics, autoprovisioning_config):
if enable_kueue_credentials_code != 0:
xpk_exit(enable_kueue_credentials_code)

xpk_print('Update Kueue Controller Manager resources')
update_kueue_resources_code = update_kueue_resources_if_necessary(args)
if update_kueue_resources_code != 0:
xpk_exit(update_kueue_resources_code)


def prepare_gpus(args, system: SystemCharacteristics):
xpk_print('Installing NCCL Plugin for cluster')
Expand Down
125 changes: 3 additions & 122 deletions src/xpk/core/kueue.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,9 @@

from argparse import Namespace

import math
import packaging
from packaging.version import Version

from ..core.blueprint.blueprint_generator import cluster_toolkit_version
from ..utils.console import xpk_exit, xpk_print
from ..utils.file import write_tmp_file
from .capacity import B200_DEVICE_TYPE, H100_MEGA_DEVICE_TYPE, H200_DEVICE_TYPE
Expand Down Expand Up @@ -190,92 +189,6 @@
command: [ "sleep", "inf" ]
"""

# Template for the `kueue-controller-manager` Deployment manifest in the
# `kueue-system` namespace. It is rendered with `str.format`, which fills two
# placeholders: `KUEUE_VERSION` (the tag of the kueue container image) and
# `memory_limit_size` (the memory limit of the `manager` container). The
# doubled braces in `emptyDir: {{}}` are an escaped literal `{}` so that
# `str.format` does not treat them as a placeholder.
kueue_controller_manager_yml = """
apiVersion: apps/v1
kind: Deployment
metadata:
labels:
app.kubernetes.io/component: controller
app.kubernetes.io/name: kueue
control-plane: controller-manager
name: kueue-controller-manager
namespace: kueue-system
spec:
replicas: 1
selector:
matchLabels:
control-plane: controller-manager
template:
metadata:
annotations:
kubectl.kubernetes.io/default-container: manager
labels:
app.kubernetes.io/component: controller
app.kubernetes.io/name: kueue
control-plane: controller-manager
spec:
containers:
- args:
- --config=/controller_manager_config.yaml
- --zap-log-level=2
command:
- /manager
image: registry.k8s.io/kueue/kueue:{KUEUE_VERSION}
imagePullPolicy: Always
livenessProbe:
httpGet:
path: /healthz
port: 8081
initialDelaySeconds: 15
periodSeconds: 20
name: manager
ports:
- containerPort: 8082
name: visibility
protocol: TCP
- containerPort: 9443
name: webhook-server
protocol: TCP
readinessProbe:
httpGet:
path: /readyz
port: 8081
initialDelaySeconds: 5
periodSeconds: 10
resources:
limits:
cpu: 500m
memory: {memory_limit_size}
requests:
cpu: 500m
memory: 512Mi
securityContext:
allowPrivilegeEscalation: false
volumeMounts:
- mountPath: /visibility
name: visibility
- mountPath: /tmp/k8s-webhook-server/serving-certs
name: cert
readOnly: true
- mountPath: /controller_manager_config.yaml
name: manager-config
subPath: controller_manager_config.yaml
securityContext:
runAsNonRoot: true
serviceAccountName: kueue-controller-manager
terminationGracePeriodSeconds: 10
volumes:
- name: visibility
emptyDir: {{}}
- name: cert
secret:
defaultMode: 420
secretName: kueue-webhook-server-cert
- configMap:
name: kueue-manager-config
name: manager-config
"""


def verify_kueuectl(args: Namespace) -> None:
"""Verify if kueuectl is installed.
Expand Down Expand Up @@ -359,7 +272,8 @@ def install_kueue_on_cluster(args) -> int:

command = (
'kubectl apply --server-side --force-conflicts -f'
f' https://github.com/kubernetes-sigs/kueue/releases/download/{KUEUE_VERSION}/manifests.yaml'
f' https://raw.githubusercontent.com/GoogleCloudPlatform/cluster-toolkit/refs/tags/{cluster_toolkit_version}'
f'/modules/management/kubectl-apply/manifests/kueue-{KUEUE_VERSION}.yaml'
)
task = 'Set Kueue On Cluster'
return_code = run_command_with_updates_retry(command, task, args)
Expand Down Expand Up @@ -506,36 +420,3 @@ def get_kueue_covered_resources_config(
total_chips=total_chips,
)
return config_string


def update_kueue_resources_if_necessary(args):
  """Re-apply the Kueue controller manager with a node-count-scaled memory limit.

  Counts the cluster nodes with kubectl, derives a memory limit from that
  count, renders `kueue_controller_manager_yml` with it and applies the
  resulting manifest.

  Args:
    args: user provided arguments for running the command.

  Returns:
    0 if successful; 1 if the node count cannot be parsed; otherwise the
    non-zero return code of the `kubectl apply` command. If the node count
    command itself fails, `xpk_exit(1)` is called instead of returning.
  """
  # Get total number of nodes.
  cmd_total_node_num = 'kubectl get node --no-headers | wc -l'
  return_code, out = run_command_for_value(
      cmd_total_node_num, 'Count total nodes', args
  )
  if return_code != 0:
    xpk_exit(1)

  # Guard the conversion: unexpected command output must not surface as an
  # unhandled exception.
  try:
    total_nodes = int(out)
  except (TypeError, ValueError):
    xpk_print(f'Count total nodes returned unexpected output: {out!r}')
    return 1

  # Memory scales with the node count and is floored at the minimum limit
  # (original note: 1.2MiB per VM or 4GiB, whichever is greater).
  memory_limit_mib = max(
      math.ceil(total_nodes * MEMORY_SIZE_PER_VM), MIN_MEMORY_LIMIT_SIZE
  )
  new_memory_limit = f'{memory_limit_mib}Mi'

  yml_string = kueue_controller_manager_yml.format(
      memory_limit_size=new_memory_limit, KUEUE_VERSION=KUEUE_VERSION
  )
  tmp = write_tmp_file(yml_string)
  command = f'kubectl apply -f {tmp.file.name}'

  task = 'Updating Kueue Controller Manager resources'
  return_code = run_command_with_updates_retry(command, task, args)
  if return_code != 0:
    xpk_print(f'{task} returned ERROR {return_code}')
  return return_code
Loading