Skip to content

Commit 137d70a

Browse files
committed
Add Diagon installation during cluster creation and modify the workload.py
Add wait_for_deployment_ready() Added unit test update goldens.yaml update goldens.yaml update goldens.yaml Fixed parser/cluster.py update goldens.yaml fixed linter fixed linter pyink Test unit test
1 parent ebae87d commit 137d70a

File tree

6 files changed

+525
-0
lines changed

6 files changed

+525
-0
lines changed

src/xpk/commands/cluster.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,7 @@
8181
from ..utils.templates import get_templates_absolute_path
8282
import shutil
8383
import os
84+
from . import managed_ml_diagnostics
8485

8586
CLUSTER_PREHEAT_JINJA_FILE = 'cluster_preheat.yaml.j2'
8687

@@ -407,6 +408,13 @@ def cluster_create(args) -> None:
407408
# pylint: disable=line-too-long
408409
f' https://console.cloud.google.com/kubernetes/clusters/details/{get_cluster_location(args.project, args.cluster, args.zone)}/{args.cluster}/details?project={args.project}'
409410
)
411+
412+
if args.managed_ml_diagnostics:
413+
return_code = managed_ml_diagnostics.install_mldiagnostics_prerequisites()
414+
if return_code != 0:
415+
xpk_print('Installation of MLDiagnostics failed.')
416+
xpk_exit(return_code)
417+
410418
xpk_exit(0)
411419

412420

src/xpk/commands/cluster_test.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,9 @@ def mocks(mocker) -> _Mocks:
5656
run_command_with_updates_path=(
5757
'xpk.commands.cluster.run_command_with_updates'
5858
),
59+
run_command_for_value_path=(
60+
'xpk.commands.cluster.run_command_for_value'
61+
),
5962
),
6063
)
6164

Lines changed: 246 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,246 @@
1+
"""
2+
Copyright 2024 Google LLC
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
https://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
"""
16+
17+
import time
18+
from packaging.version import Version
19+
from ..core.commands import run_command_for_value, run_command_with_updates
20+
from ..utils.console import xpk_exit, xpk_print
21+
import os
22+
23+
24+
def _install_cert_manager(version: Version = Version('v1.13.0')) -> int:
  """
  Apply the cert-manager release manifest with kubectl.

  Args:
    version: cert-manager release to install. The 'v' prefix is added back
      when the download URL is built.

  Returns:
    The return code of the kubectl apply command: 0 if successful and
    non-zero otherwise.
  """

  command = (
      'kubectl apply -f'
      ' https://github.com/cert-manager/cert-manager/releases/download/'
      f'v{version}/cert-manager.yaml'
  )

  # Hand a failure back to the caller instead of exiting the process here.
  # The caller already checks the return code, which lets it report the
  # failure before terminating, and it keeps this helper consistent with the
  # other installation steps in this module, which all return their code.
  return run_command_with_updates(
      command, f'Applying cert-manager {version} manifest...'
  )
46+
47+
48+
def _download_mldiagnostics_yaml(package_name: str, version: Version) -> int:
  """
  Download an mldiagnostics YAML package from Artifact Registry into /tmp/.

  A failed download whose output reports that the file already exists is
  treated as a success, so repeated runs reuse the local copy.

  Args:
    package_name: name of the generic artifact package to fetch.
    version: package version; it is requested with a leading 'v'.

  Returns:
    0 if the artifact was downloaded or is already present locally, and the
    non-zero return code of the gcloud command otherwise.
  """

  download_command = (
      'gcloud artifacts generic download'
      ' --repository=mldiagnostics-webhook-and-operator-yaml'
      ' --location=us'
      f' --package={package_name}'
      f' --version=v{version}'
      ' --destination=/tmp/'
      ' --project=ai-on-gke'
  )

  status, output = run_command_for_value(
      download_command,
      f'Download {package_name} {version}...',
  )

  if status == 0:
    return status

  # Any failure other than "file is already there" is a real error.
  if 'already exists' not in output:
    return status

  xpk_print(
      f'Artifact file for {package_name} {version} already exists locally.'
      ' Skipping download.'
  )
  return 0
78+
79+
80+
def _create_mldiagnostics_namespace() -> int:
  """
  Create the 'gke-mldiagnostics' namespace with kubectl.

  A failure whose output reports that the namespace already exists is
  treated as a success.

  Returns:
    0 if the namespace was created or already exists, and the non-zero
    return code of the kubectl command otherwise.
  """

  status, output = run_command_for_value(
      'kubectl create namespace gke-mldiagnostics',
      'Create gke-mldiagnostics namespace...',
  )

  if status == 0:
    return status

  # Only the "already exists" failure is benign; report everything else.
  if 'already exists' not in output:
    return status

  xpk_print('Namespace already exists. Skipping creation.')
  return 0
100+
101+
102+
def _install_mldiagnostics_yaml(artifact_filename: str) -> int:
  """
  Apply a previously downloaded mldiagnostics manifest from /tmp.

  The manifest is applied into the 'gke-mldiagnostics' namespace.

  Args:
    artifact_filename: file name of the manifest, resolved against /tmp.

  Returns:
    The return code of the kubectl apply command: 0 if successful and
    non-zero otherwise.
  """
  manifest_path = os.path.join('/tmp', artifact_filename)

  return run_command_with_updates(
      f'kubectl apply -f {manifest_path} -n gke-mldiagnostics',
      f'Install {manifest_path}...',
  )
119+
120+
121+
def _label_default_namespace_mldiagnostics() -> int:
  """
  Add the 'managed-mldiagnostics-gke=true' label to the 'default' namespace.

  Returns:
    The return code of the kubectl label command: 0 if successful and
    non-zero otherwise.
  """

  label_command = (
      'kubectl label namespace default managed-mldiagnostics-gke=true'
  )
  task_description = (
      'Label default namespace with managed-mldiagnostics-gke=true'
  )

  return run_command_with_updates(label_command, task_description)
137+
138+
139+
def install_mldiagnostics_prerequisites() -> int:
  """
  Install everything mldiagnostics needs on the current cluster.

  Steps, in order (the first failing step aborts the sequence):
    1. Wait for the Kueue controller deployment to be ready.
    2. Install cert-manager and wait for its webhook deployment.
    3. Download the injection webhook manifest, create the
       'gke-mldiagnostics' namespace and apply the manifest.
    4. Label the 'default' namespace.
    5. Download and apply the connection operator manifest.

  Returns:
    0 if every step succeeded. Otherwise 1 when a deployment did not become
    ready, or the non-zero return code of the step that failed.
  """

  kueue_deployment_name = 'kueue-controller-manager'
  kueue_namespace_name = 'kueue-system'
  cert_webhook_deployment_name = 'cert-manager-webhook'
  cert_webhook_namespace_name = 'cert-manager'

  if not _wait_for_deployment_ready(
      deployment_name=kueue_deployment_name, namespace=kueue_namespace_name
  ):
    xpk_print(
        f'Application {kueue_deployment_name} failed to become ready within the'
        ' timeout.'
    )
    return 1

  return_code = _install_cert_manager()
  if return_code != 0:
    return return_code

  cert_webhook_ready = _wait_for_deployment_ready(
      deployment_name=cert_webhook_deployment_name,
      namespace=cert_webhook_namespace_name,
  )
  if not cert_webhook_ready:
    xpk_print('The cert-manager-webhook installation failed.')
    return 1

  webhook_package = 'mldiagnostics-injection-webhook'
  webhook_version = 'v0.5.0'
  webhook_filename = f'{webhook_package}-{webhook_version}.yaml'

  return_code = _download_mldiagnostics_yaml(
      package_name=webhook_package, version=Version(webhook_version)
  )
  if return_code != 0:
    return return_code

  return_code = _create_mldiagnostics_namespace()
  if return_code != 0:
    return return_code

  return_code = _install_mldiagnostics_yaml(artifact_filename=webhook_filename)
  if return_code != 0:
    return return_code

  return_code = _label_default_namespace_mldiagnostics()
  if return_code != 0:
    return return_code

  operator_package = 'mldiagnostics-connection-operator'
  operator_version = 'v0.5.0'
  operator_filename = f'{operator_package}-{operator_version}.yaml'

  # Download the operator at its own version so that the downloaded artifact
  # always matches operator_filename, even when the webhook and operator
  # versions diverge.
  return_code = _download_mldiagnostics_yaml(
      package_name=operator_package, version=Version(operator_version)
  )
  if return_code != 0:
    return return_code

  return_code = _install_mldiagnostics_yaml(artifact_filename=operator_filename)
  if return_code != 0:
    return return_code

  xpk_print(
      'All mldiagnostics installation and setup steps have been'
      ' successfully completed!'
  )
  return 0
214+
215+
216+
def _wait_for_deployment_ready(
    deployment_name: str, namespace: str, timeout_seconds: int = 300
) -> bool:
  """
  Wait for a Kubernetes Deployment to finish rolling out.

  Runs a single `kubectl rollout status` command bounded by --timeout, then
  pauses for a fixed settle period before reporting success.

  Args:
    deployment_name: The name of the Kubernetes Deployment (e.g., 'kueue-controller-manager').
    namespace: The namespace where the Deployment is located (e.g., 'kueue-system').
    timeout_seconds: Value passed to kubectl's --timeout, in seconds (default is 300s / 5 minutes).

  Returns:
    bool: True if the rollout command succeeded, False if it returned a non-zero code (timeout or error).
  """

  rollout_command = (
      f'kubectl rollout status deployment/{deployment_name} -n {namespace}'
      f' --timeout={timeout_seconds}s'
  )
  task_description = f'Checking status of deployment {deployment_name}...'

  if run_command_with_updates(rollout_command, task_description) != 0:
    return False

  # A freshly rolled-out deployment may need a short while to fully
  # stabilize, so wait a fixed 30 seconds before declaring it ready.
  time.sleep(30)
  return True

0 commit comments

Comments
 (0)