From 7cf8a46f50e7d97afa378c73bbe7fb8e0a782cac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Piotr=20Paw=C5=82owski?= Date: Wed, 23 Jul 2025 15:53:54 +0000 Subject: [PATCH 1/6] fix kjob.py pyink MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Piotr Pawłowski --- src/xpk/core/kjob.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/xpk/core/kjob.py b/src/xpk/core/kjob.py index ab082e1e4..318fe19d2 100644 --- a/src/xpk/core/kjob.py +++ b/src/xpk/core/kjob.py @@ -333,6 +333,7 @@ def create_job_template_instance( args=args, ) + def create_pod_template_instance(args: Namespace, service_account: str) -> int: """Create new PodTemplate instance on cluster with default settings. From 80ad86823f3aa3bc8a48c0037dc7cb7d4054322a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Piotr=20Paw=C5=82owski?= Date: Fri, 25 Jul 2025 06:54:15 +0000 Subject: [PATCH 2/6] remove kueue manager manifest MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Piotr Pawłowski --- src/xpk/core/kueue.py | 93 +++---------------------------------------- 1 file changed, 6 insertions(+), 87 deletions(-) diff --git a/src/xpk/core/kueue.py b/src/xpk/core/kueue.py index 31430838d..98d1e2a38 100644 --- a/src/xpk/core/kueue.py +++ b/src/xpk/core/kueue.py @@ -190,88 +190,6 @@ command: [ "sleep", "inf" ] """ -kueue_controller_manager_yml = """ -apiVersion: apps/v1 -kind: Deployment -metadata: - labels: - app.kubernetes.io/component: controller - app.kubernetes.io/name: kueue - control-plane: controller-manager - name: kueue-controller-manager - namespace: kueue-system -spec: - replicas: 1 - selector: - matchLabels: - control-plane: controller-manager - template: - metadata: - annotations: - kubectl.kubernetes.io/default-container: manager - labels: - app.kubernetes.io/component: controller - app.kubernetes.io/name: kueue - control-plane: controller-manager - spec: - containers: - - args: - - --config=/controller_manager_config.yaml - - --zap-log-level=2 - command: - - /manager - image: registry.k8s.io/kueue/kueue:{KUEUE_VERSION} - imagePullPolicy: Always - livenessProbe: - httpGet: - path: /healthz - port: 8081 - initialDelaySeconds: 15 - periodSeconds: 20 - name: manager - ports: - - containerPort: 8082 - name: visibility - protocol: TCP - - containerPort: 9443 - name: webhook-server - protocol: TCP - readinessProbe: - httpGet: - path: /readyz - port: 8081 - initialDelaySeconds: 5 - periodSeconds: 10 - resources: - limits: - cpu: 500m - memory: {memory_limit_size} - requests: - cpu: 500m - memory: 512Mi - securityContext: - allowPrivilegeEscalation: false - volumeMounts: - - mountPath: /tmp/k8s-webhook-server/serving-certs - name: cert - readOnly: true - - mountPath: /controller_manager_config.yaml - name: manager-config - subPath: controller_manager_config.yaml - securityContext: - runAsNonRoot: true - serviceAccountName: kueue-controller-manager - terminationGracePeriodSeconds: 10 - volumes: - - name: cert - secret: - defaultMode: 420 - secretName: kueue-webhook-server-cert - - configMap: - name: kueue-manager-config - name: manager-config -""" - def verify_kueuectl(args: Namespace) -> None: """Verify if kueuectl is installed. @@ -524,11 +442,12 @@ def update_kueue_resources_if_necessary(args): new_memory_limit = ( f'{max(math.ceil(int(out) * MEMORY_SIZE_PER_VM), MIN_MEMORY_LIMIT_SIZE)}Mi' ) - yml_string = kueue_controller_manager_yml.format( - memory_limit_size=new_memory_limit, KUEUE_VERSION=KUEUE_VERSION - ) - tmp = write_tmp_file(yml_string) - command = f'kubectl apply -f {str(tmp.file.name)}' + kueue_controller_manager_yaml = 'https://raw.githubusercontent.com/GoogleCloudPlatform/cluster-toolkit/refs/tags/v1.57.1/modules/management/kubectl-apply/manifests/kueue-v0.12.2.yaml' + # yml_string = kueue_controller_manager_yml.format( + # memory_limit_size=new_memory_limit, KUEUE_VERSION=KUEUE_VERSION + # ) + # tmp = write_tmp_file(yml_string) + command = f'kubectl apply -f {kueue_controller_manager_yaml}' task = 'Updating Kueue Controller Manager resources' return_code = run_command_with_updates_retry(command, task, args) From 72c02b7e5c24038cad14c4321b3a3c098b12b4b5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Piotr=20Paw=C5=82owski?= Date: Fri, 25 Jul 2025 06:57:24 +0000 Subject: [PATCH 3/6] fix pylint MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Piotr Pawłowski --- src/xpk/core/kueue.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/xpk/core/kueue.py b/src/xpk/core/kueue.py index 98d1e2a38..dc36ccf68 100644 --- a/src/xpk/core/kueue.py +++ b/src/xpk/core/kueue.py @@ -439,9 +439,9 @@ def update_kueue_resources_if_necessary(args): if return_code != 0: xpk_exit(1) # 1.2MiB per VM or 4GiB (whichever is greater). - new_memory_limit = ( - f'{max(math.ceil(int(out) * MEMORY_SIZE_PER_VM), MIN_MEMORY_LIMIT_SIZE)}Mi' - ) + # new_memory_limit = ( + # f'{max(math.ceil(int(out) * MEMORY_SIZE_PER_VM), MIN_MEMORY_LIMIT_SIZE)}Mi' + # ) kueue_controller_manager_yaml = 'https://raw.githubusercontent.com/GoogleCloudPlatform/cluster-toolkit/refs/tags/v1.57.1/modules/management/kubectl-apply/manifests/kueue-v0.12.2.yaml' # yml_string = kueue_controller_manager_yml.format( # memory_limit_size=new_memory_limit, KUEUE_VERSION=KUEUE_VERSION From d936157bc72db10c41229b9382ce219f254c8825 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Piotr=20Paw=C5=82owski?= Date: Fri, 25 Jul 2025 07:45:21 +0000 Subject: [PATCH 4/6] remove updating kueue resources MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Piotr Pawłowski --- src/xpk/commands/cluster.py | 6 ------ src/xpk/core/kueue.py | 39 ++----------------------------------- 2 files changed, 2 insertions(+), 43 deletions(-) diff --git a/src/xpk/commands/cluster.py b/src/xpk/commands/cluster.py index 99126db65..eec16bf82 100644 --- a/src/xpk/commands/cluster.py +++ b/src/xpk/commands/cluster.py @@ -50,7 +50,6 @@ install_kueue_crs, install_kueue_on_cluster, wait_for_kueue_available, - update_kueue_resources_if_necessary, ) from ..core.nap import enable_autoprovisioning_on_cluster from ..core.network import ( @@ -973,11 +972,6 @@ def install_kueue(args, system: SystemCharacteristics, autoprovisioning_config): if enable_kueue_credentials_code != 0: xpk_exit(enable_kueue_credentials_code) - xpk_print('Update Kueue Controller Manager resources') - update_kueue_resources_code = update_kueue_resources_if_necessary(args) - if update_kueue_resources_code != 0: - xpk_exit(update_kueue_resources_code) - def prepare_gpus(args, system: SystemCharacteristics): xpk_print('Installing NCCL Plugin for cluster') diff --git a/src/xpk/core/kueue.py b/src/xpk/core/kueue.py index dc36ccf68..2be8c03e6 100644 --- a/src/xpk/core/kueue.py +++ b/src/xpk/core/kueue.py @@ -16,10 +16,9 @@ from argparse import Namespace -import math import packaging from packaging.version import Version - +from ..core.blueprint.blueprint_generator import cluster_toolkit_version from ..utils.console import xpk_exit, xpk_print from ..utils.file import write_tmp_file from .capacity import B200_DEVICE_TYPE, H100_MEGA_DEVICE_TYPE, H200_DEVICE_TYPE @@ -273,7 +272,7 @@ def install_kueue_on_cluster(args) -> int: command = ( 'kubectl apply --server-side --force-conflicts -f' - f' https://github.com/kubernetes-sigs/kueue/releases/download/{KUEUE_VERSION}/manifests.yaml' + f' https://raw.githubusercontent.com/GoogleCloudPlatform/cluster-toolkit/refs/tags/{cluster_toolkit_version}/modules/management/kubectl-apply/manifests/kueue-{KUEUE_VERSION}.yaml' ) task = 'Set Kueue On Cluster' return_code = run_command_with_updates_retry(command, task, args) @@ -420,37 +419,3 @@ def get_kueue_covered_resources_config( total_chips=total_chips, ) return config_string - - -def update_kueue_resources_if_necessary(args): - """Update the kueue manifest to increase the resources for the kueue controller manager. - - Args: - args: user provided arguments for running the command. - - Returns: - 0 if successful and 1 otherwise. - """ - # Get total number of nodes - cmd_total_node_num = 'kubectl get node --no-headers | wc -l' - return_code, out = run_command_for_value( - cmd_total_node_num, 'Count total nodes', args - ) - if return_code != 0: - xpk_exit(1) - # 1.2MiB per VM or 4GiB (whichever is greater). - # new_memory_limit = ( - # f'{max(math.ceil(int(out) * MEMORY_SIZE_PER_VM), MIN_MEMORY_LIMIT_SIZE)}Mi' - # ) - kueue_controller_manager_yaml = 'https://raw.githubusercontent.com/GoogleCloudPlatform/cluster-toolkit/refs/tags/v1.57.1/modules/management/kubectl-apply/manifests/kueue-v0.12.2.yaml' - # yml_string = kueue_controller_manager_yml.format( - # memory_limit_size=new_memory_limit, KUEUE_VERSION=KUEUE_VERSION - # ) - # tmp = write_tmp_file(yml_string) - command = f'kubectl apply -f {kueue_controller_manager_yaml}' - - task = 'Updating Kueue Controller Manager resources' - return_code = run_command_with_updates_retry(command, task, args) - if return_code != 0: - xpk_print(f'{task} returned ERROR {return_code}') - return return_code From 06a79c64c47ac82fa05c7c4e9b9b66c3fc13f0d7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Piotr=20Paw=C5=82owski?= Date: Fri, 25 Jul 2025 08:00:08 +0000 Subject: [PATCH 5/6] fix pyink MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Piotr Pawłowski --- src/xpk/core/kueue.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/xpk/core/kueue.py b/src/xpk/core/kueue.py index 2be8c03e6..30d33ca67 100644 --- a/src/xpk/core/kueue.py +++ b/src/xpk/core/kueue.py @@ -272,7 +272,8 @@ def install_kueue_on_cluster(args) -> int: command = ( 'kubectl apply --server-side --force-conflicts -f' - f' https://raw.githubusercontent.com/GoogleCloudPlatform/cluster-toolkit/refs/tags/{cluster_toolkit_version}/modules/management/kubectl-apply/manifests/kueue-{KUEUE_VERSION}.yaml' + f' https://raw.githubusercontent.com/GoogleCloudPlatform/cluster-toolkit/refs/tags/{cluster_toolkit_version}' + f'/modules/management/kubectl-apply/manifests/kueue-{KUEUE_VERSION}.yaml' ) task = 'Set Kueue On Cluster' return_code = run_command_with_updates_retry(command, task, args) From 4f9510f47632cd71328e859d47116228063fe759 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Piotr=20Paw=C5=82owski?= Date: Fri, 25 Jul 2025 08:00:39 +0000 Subject: [PATCH 6/6] fix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Piotr Pawłowski --- src/xpk/core/kueue.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/xpk/core/kueue.py b/src/xpk/core/kueue.py index b3a404db3..30d33ca67 100644 --- a/src/xpk/core/kueue.py +++ b/src/xpk/core/kueue.py @@ -189,6 +189,7 @@ command: [ "sleep", "inf" ] """ + def verify_kueuectl(args: Namespace) -> None: """Verify if kueuectl is installed. Args: