1414limitations under the License.
1515"""
1616
17- import time
1817from packaging .version import Version
1918from ..core .commands import run_command_for_value , run_command_with_updates
20- from ..utils .console import xpk_exit , xpk_print
19+ from ..utils .console import xpk_print
2120import os
21+ import tempfile
22+
23+ _KUEUE_DEPLOYMENT_NAME = 'kueue-controller-manager'
24+ _KUEUE_NAMESPACE_NAME = 'kueue-system'
25+ _CERT_WEBHOOK_DEPLOYMENT_NAME = 'cert-manager-webhook'
26+ _CERT_WEBHOOK_NAMESPACE_NAME = 'cert-manager'
2227
2328
2429def _install_cert_manager (version : Version = Version ('v1.13.0' )) -> int :
@@ -39,9 +44,6 @@ def _install_cert_manager(version: Version = Version('v1.13.0')) -> int:
3944 command , f'Applying cert-manager { version } manifest...'
4045 )
4146
42- if return_code != 0 :
43- xpk_exit (return_code )
44-
4547 return return_code
4648
4749
@@ -53,11 +55,10 @@ def _download_mldiagnostics_yaml(package_name: str, version: Version) -> int:
5355 0 if successful and 1 otherwise.
5456 """
5557
56- version_with_v = f'v{ version } '
5758 command = (
5859 'gcloud artifacts generic download'
5960 ' --repository=mldiagnostics-webhook-and-operator-yaml --location=us'
60- f' --package={ package_name } --version={ version_with_v } --destination=/tmp/'
61+ f' --package={ package_name } --version=v { version } --destination=/tmp/'
6162 ' --project=ai-on-gke'
6263 )
6364
@@ -106,7 +107,7 @@ def _install_mldiagnostics_yaml(artifact_filename: str) -> int:
106107 Returns:
107108 0 if successful and 1 otherwise.
108109 """
109- full_artifact_path = os .path .join ('/tmp' , artifact_filename )
110+ full_artifact_path = os .path .join (tempfile . gettempdir () , artifact_filename )
110111
111112 command = f'kubectl apply -f { full_artifact_path } -n gke-mldiagnostics'
112113
@@ -144,17 +145,12 @@ def install_mldiagnostics_prerequisites() -> int:
144145 0 if successful and 1 otherwise.
145146 """
146147
147- kueue_deployment_name = 'kueue-controller-manager'
148- kueue_namespace_name = 'kueue-system'
149- cert_webhook_deployment_name = 'cert-manager-webhook'
150- cert_webhook_namespace_name = 'cert-manager'
151-
152148 if not _wait_for_deployment_ready (
153- deployment_name = kueue_deployment_name , namespace = kueue_namespace_name
149+ deployment_name = _KUEUE_DEPLOYMENT_NAME , namespace = _KUEUE_NAMESPACE_NAME
154150 ):
155151 xpk_print (
156- f'Application { kueue_deployment_name } failed to become ready within the '
157- ' timeout.'
152+ f'Application { _KUEUE_DEPLOYMENT_NAME } failed to become ready within'
153+ ' the timeout.'
158154 )
159155 return 1
160156
@@ -163,19 +159,19 @@ def install_mldiagnostics_prerequisites() -> int:
163159 return return_code
164160
165161 cert_webhook_ready = _wait_for_deployment_ready (
166- deployment_name = cert_webhook_deployment_name ,
167- namespace = cert_webhook_namespace_name ,
162+ deployment_name = _CERT_WEBHOOK_DEPLOYMENT_NAME ,
163+ namespace = _CERT_WEBHOOK_NAMESPACE_NAME ,
168164 )
169165 if not cert_webhook_ready :
170166 xpk_print ('The cert-manager-webhook installation failed.' )
171167 return 1
172168
173169 webhook_package = 'mldiagnostics-injection-webhook'
174- webhook_version = 'v0.5.0'
175- webhook_filename = f'{ webhook_package } -{ webhook_version } .yaml'
170+ webhook_version = Version ( 'v0.5.0' )
171+ webhook_filename = f'{ webhook_package } -v { webhook_version } .yaml'
176172
177173 return_code = _download_mldiagnostics_yaml (
178- package_name = webhook_package , version = Version ( webhook_version )
174+ package_name = webhook_package , version = webhook_version
179175 )
180176 if return_code != 0 :
181177 return return_code
@@ -193,11 +189,11 @@ def install_mldiagnostics_prerequisites() -> int:
193189 return return_code
194190
195191 operator_package = 'mldiagnostics-connection-operator'
196- operator_version = 'v0.5.0'
197- operator_filename = f'{ operator_package } -{ operator_version } .yaml'
192+ operator_version = Version ( 'v0.5.0' )
193+ operator_filename = f'{ operator_package } -v { operator_version } .yaml'
198194
199195 return_code = _download_mldiagnostics_yaml (
200- package_name = operator_package , version = Version ( webhook_version )
196+ package_name = operator_package , version = operator_version
201197 )
202198 if return_code != 0 :
203199 return return_code
@@ -242,5 +238,15 @@ def _wait_for_deployment_ready(
242238 return False
243239
244240 # After the status changes to 'running', the deployment may still need about 10 seconds to fully stabilize, so wait 30 seconds to leave a safety margin.
245- time .sleep (30 )
241+ stabilization_seconds = 30
242+ stabilization_command = f'sleep { stabilization_seconds } '
243+ stabilization_code = run_command_with_updates (
244+ stabilization_command ,
245+ f'Deployment { deployment_name } is ready. Waiting { stabilization_seconds } '
246+ ' seconds for full stabilization' ,
247+ verbose = True ,
248+ )
249+ if stabilization_code != 0 :
250+ return False
251+
246252 return True
0 commit comments