Skip to content

Commit 3a1969e

Browse files
committed
Add Diagon installation during cluster creation and modify the workload.py
Add wait_for_deployment_ready() Added unit test update goldens.yaml update goldens.yaml update goldens.yaml Fixed parser/cluster.py update goldens.yaml fixed linter fixed linter pyink Test unit test
1 parent ebae87d commit 3a1969e

File tree

6 files changed

+534
-0
lines changed

6 files changed

+534
-0
lines changed

src/xpk/commands/cluster.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -81,6 +81,7 @@
8181
from ..utils.templates import get_templates_absolute_path
8282
import shutil
8383
import os
84+
from . import managed_ml_diagnostics
8485

8586
CLUSTER_PREHEAT_JINJA_FILE = 'cluster_preheat.yaml.j2'
8687

@@ -407,6 +408,13 @@ def cluster_create(args) -> None:
407408
# pylint: disable=line-too-long
408409
f' https://console.cloud.google.com/kubernetes/clusters/details/{get_cluster_location(args.project, args.cluster, args.zone)}/{args.cluster}/details?project={args.project}'
409410
)
411+
412+
if args.managed_ml_diagnostics:
413+
return_code = managed_ml_diagnostics.install_mldiagnostics_prerequisites()
414+
if return_code != 0:
415+
xpk_print('Installation of MLDiagnostics failed.')
416+
xpk_exit(return_code)
417+
410418
xpk_exit(0)
411419

412420

src/xpk/commands/cluster_test.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,9 @@ def mocks(mocker) -> _Mocks:
5656
run_command_with_updates_path=(
5757
'xpk.commands.cluster.run_command_with_updates'
5858
),
59+
run_command_for_value_path=(
60+
'xpk.commands.cluster.run_command_for_value'
61+
),
5962
),
6063
)
6164

Lines changed: 255 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,255 @@
1+
"""
2+
Copyright 2024 Google LLC
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
https://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
"""
16+
17+
import time
18+
from packaging.version import Version
19+
from ..core.commands import run_command_for_value, run_command_with_updates
20+
from ..utils.console import xpk_exit, xpk_print
21+
import os
22+
23+
24+
def _install_cert_manager(version: Version = Version('v1.13.0')) -> int:
  """Apply the upstream cert-manager release manifest with kubectl.

  Args:
    version: cert-manager release to install. The release tag used in the
      download URL is built as 'v' + str(version).

  Returns:
    0 if the manifest was applied successfully, otherwise the non-zero return
    code of the kubectl command.
  """
  command = (
      'kubectl apply -f'
      ' https://github.com/cert-manager/cert-manager/releases/download/'
      f'v{version}/cert-manager.yaml'
  )

  # Report failure through the return value (as the docstring promises and as
  # the download/namespace helpers in this module do) instead of exiting
  # here, so the caller can print its own error message before terminating.
  return run_command_with_updates(
      command, f'Applying cert-manager {version} manifest...'
  )
46+
47+
48+
def _download_mldiagnostics_yaml(package_name: str, version: Version) -> int:
  """Fetch a mldiagnostics YAML package from Artifact Registry into /tmp/.

  Args:
    package_name: name of the generic artifact package to download.
    version: package version; the registry version string is built as
      'v' + str(version).

  Returns:
    0 when the download succeeded or the command output reports that the file
    'already exists'; otherwise the non-zero return code of the gcloud command.
  """
  registry_version = f'v{version}'
  download_command = (
      'gcloud artifacts generic download'
      ' --repository=mldiagnostics-webhook-and-operator-yaml --location=us'
      f' --package={package_name} --version={registry_version} --destination=/tmp/'
      ' --project=ai-on-gke'
  )
  task_description = f'Download {package_name} {version}...'

  exit_code, output = run_command_for_value(download_command, task_description)

  # Success, or a failure that is not the benign "already exists" case: hand
  # the command's own return code back unchanged.
  if exit_code == 0:
    return exit_code
  if 'already exists' not in output:
    return exit_code

  # A previous run already placed the file locally; treat that as success.
  xpk_print(
      f'Artifact file for {package_name} {version} already exists locally.'
      ' Skipping download.'
  )
  return 0
78+
79+
80+
def _create_mldiagnostics_namespace() -> int:
  """Create the 'gke-mldiagnostics' namespace with kubectl.

  Returns:
    0 when the namespace was created or the command output reports that it
    'already exists'; otherwise the non-zero return code of the kubectl
    command.
  """
  exit_code, output = run_command_for_value(
      'kubectl create namespace gke-mldiagnostics',
      'Create gke-mldiagnostics namespace...',
  )

  # A failure caused by a pre-existing namespace is not an error.
  namespace_present = exit_code != 0 and 'already exists' in output
  if namespace_present:
    xpk_print('Namespace already exists. Skipping creation.')
    return 0

  return exit_code
100+
101+
102+
def _install_mldiagnostics_yaml(artifact_filename: str) -> int:
  """Apply a downloaded mldiagnostics YAML manifest to 'gke-mldiagnostics'.

  Args:
    artifact_filename: file name of the manifest, resolved relative to /tmp
      (the directory the download step writes to).

  Returns:
    0 if the manifest was applied successfully, otherwise the non-zero return
    code of the kubectl command.
  """
  full_artifact_path = os.path.join('/tmp', artifact_filename)

  command = f'kubectl apply -f {full_artifact_path} -n gke-mldiagnostics'

  return_code = run_command_with_updates(
      command,
      f'Install {full_artifact_path}...',
  )

  if return_code != 0:
    xpk_print(f'kubectl apply returned with ERROR {return_code}.\n')
    # Return the failure instead of exiting here: this matches the documented
    # contract and guarantees the success message below is never printed
    # after a failed apply. The caller decides how to terminate.
    return return_code

  xpk_print(f'{artifact_filename} applied successfully.')

  return 0
125+
126+
127+
def _label_default_namespace_mldiagnostics() -> int:
  """Label the 'default' namespace with 'managed-mldiagnostics-gke=true'.

  Returns:
    0 if the label command succeeded, otherwise the non-zero return code of
    the kubectl command.
  """
  command = 'kubectl label namespace default managed-mldiagnostics-gke=true'

  # Failure is reported through the return value (matching the docstring and
  # the other helpers that return codes) so the caller can print its own
  # error message and choose how to terminate.
  return run_command_with_updates(
      command,
      'Label default namespace with managed-mldiagnostics-gke=true',
  )
146+
147+
148+
def install_mldiagnostics_prerequisites() -> int:
  """Install everything the managed mldiagnostics feature needs on a cluster.

  Steps, in order (each one aborts the sequence on failure):
    1. Wait for the kueue controller deployment to be ready.
    2. Apply the cert-manager manifest and wait for its webhook deployment.
    3. Download the injection webhook YAML, create the 'gke-mldiagnostics'
       namespace and apply the webhook manifest.
    4. Label the 'default' namespace.
    5. Download and apply the connection operator YAML.

  Returns:
    0 if every step succeeded; otherwise 1 when a deployment did not become
    ready, or the non-zero return code of the step that failed.
  """

  kueue_deployment_name = 'kueue-controller-manager'
  kueue_namespace_name = 'kueue-system'
  cert_webhook_deployment_name = 'cert-manager-webhook'
  cert_webhook_namespace_name = 'cert-manager'

  if not _wait_for_deployment_ready(
      deployment_name=kueue_deployment_name, namespace=kueue_namespace_name
  ):
    xpk_print(
        f'Application {kueue_deployment_name} failed to become ready within the'
        ' timeout.'
    )
    return 1

  return_code = _install_cert_manager()
  if return_code != 0:
    return return_code

  cert_webhook_ready = _wait_for_deployment_ready(
      deployment_name=cert_webhook_deployment_name,
      namespace=cert_webhook_namespace_name,
  )
  if not cert_webhook_ready:
    xpk_print('The cert-manager-webhook installation failed.')
    return 1

  webhook_package = 'mldiagnostics-injection-webhook'
  webhook_version = 'v0.5.0'
  # The download step stores the artifact as '<package>-<version>.yaml'; the
  # version part keeps its leading 'v'.
  webhook_filename = f'{webhook_package}-{webhook_version}.yaml'

  return_code = _download_mldiagnostics_yaml(
      package_name=webhook_package, version=Version(webhook_version)
  )
  if return_code != 0:
    return return_code

  return_code = _create_mldiagnostics_namespace()
  if return_code != 0:
    return return_code

  return_code = _install_mldiagnostics_yaml(artifact_filename=webhook_filename)
  if return_code != 0:
    return return_code

  return_code = _label_default_namespace_mldiagnostics()
  if return_code != 0:
    return return_code

  operator_package = 'mldiagnostics-connection-operator'
  operator_version = 'v0.5.0'
  operator_filename = f'{operator_package}-{operator_version}.yaml'

  # Download the operator at its OWN version. Using the webhook's version
  # here would make the downloaded file and operator_filename disagree as
  # soon as the two versions diverge.
  return_code = _download_mldiagnostics_yaml(
      package_name=operator_package, version=Version(operator_version)
  )
  if return_code != 0:
    return return_code

  return_code = _install_mldiagnostics_yaml(artifact_filename=operator_filename)
  if return_code != 0:
    return return_code

  xpk_print(
      'All mldiagnostics installation and setup steps have been'
      ' successfully completed!'
  )
  return 0
223+
224+
225+
def _wait_for_deployment_ready(
    deployment_name: str,
    namespace: str,
    timeout_seconds: int = 300,
    stabilization_seconds: int = 30,
) -> bool:
  """Block until a Kubernetes Deployment has rolled out, or give up.

  Runs `kubectl rollout status` with a timeout and, on success, waits an
  additional grace period before reporting the Deployment as ready.

  Args:
    deployment_name: The name of the Kubernetes Deployment (e.g.,
      'kueue-controller-manager').
    namespace: The namespace where the Deployment is located (e.g.,
      'kueue-system').
    timeout_seconds: Timeout passed to kubectl, in seconds (default is 300s /
      5 minutes).
    stabilization_seconds: Extra time to sleep after a successful rollout, in
      seconds (default is 30s). Values <= 0 skip the sleep.

  Returns:
    bool: True if the Deployment successfully rolled out, False otherwise
    (kubectl returned a non-zero code, e.g. on timeout or error).
  """

  command = (
      f'kubectl rollout status deployment/{deployment_name} -n {namespace}'
      f' --timeout={timeout_seconds}s'
  )

  return_code = run_command_with_updates(
      command, f'Checking status of deployment {deployment_name}...'
  )

  if return_code != 0:
    return False

  # Even after the rollout is reported complete, the deployment may need a
  # short while to fully stabilize, so wait a grace period (30 seconds by
  # default) before letting callers depend on it.
  if stabilization_seconds > 0:
    time.sleep(stabilization_seconds)
  return True

0 commit comments

Comments
 (0)