Skip to content

Commit d761a21

Browse files
committed
Resolve conflicts
1 parent 50358ea commit d761a21

File tree

5 files changed

+38
-69
lines changed

5 files changed

+38
-69
lines changed

src/xpk/commands/cluster_test.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,7 @@ def construct_args(**kwargs: Any) -> Namespace:
124124
cluster_cpu_machine_type='',
125125
create_vertex_tensorboard=False,
126126
enable_autoprovisioning=False,
127+
managed_ml_diagnostics=False,
127128
)
128129
args_dict.update(kwargs)
129130
return Namespace(**args_dict)

src/xpk/commands/managed_ml_diagnostics.py

Lines changed: 31 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -14,11 +14,16 @@
1414
limitations under the License.
1515
"""
1616

17-
import time
1817
from packaging.version import Version
1918
from ..core.commands import run_command_for_value, run_command_with_updates
20-
from ..utils.console import xpk_exit, xpk_print
19+
from ..utils.console import xpk_print
2120
import os
21+
import tempfile
22+
23+
_KUEUE_DEPLOYMENT_NAME = 'kueue-controller-manager'
24+
_KUEUE_NAMESPACE_NAME = 'kueue-system'
25+
_CERT_WEBHOOK_DEPLOYMENT_NAME = 'cert-manager-webhook'
26+
_CERT_WEBHOOK_NAMESPACE_NAME = 'cert-manager'
2227

2328

2429
def _install_cert_manager(version: Version = Version('v1.13.0')) -> int:
@@ -39,9 +44,6 @@ def _install_cert_manager(version: Version = Version('v1.13.0')) -> int:
3944
command, f'Applying cert-manager {version} manifest...'
4045
)
4146

42-
if return_code != 0:
43-
xpk_exit(return_code)
44-
4547
return return_code
4648

4749

@@ -53,11 +55,10 @@ def _download_mldiagnostics_yaml(package_name: str, version: Version) -> int:
5355
0 if successful and 1 otherwise.
5456
"""
5557

56-
version_with_v = f'v{version}'
5758
command = (
5859
'gcloud artifacts generic download'
5960
' --repository=mldiagnostics-webhook-and-operator-yaml --location=us'
60-
f' --package={package_name} --version={version_with_v} --destination=/tmp/'
61+
f' --package={package_name} --version=v{version} --destination=/tmp/'
6162
' --project=ai-on-gke'
6263
)
6364

@@ -106,7 +107,7 @@ def _install_mldiagnostics_yaml(artifact_filename: str) -> int:
106107
Returns:
107108
0 if successful and 1 otherwise.
108109
"""
109-
full_artifact_path = os.path.join('/tmp', artifact_filename)
110+
full_artifact_path = os.path.join(tempfile.gettempdir(), artifact_filename)
110111

111112
command = f'kubectl apply -f {full_artifact_path} -n gke-mldiagnostics'
112113

@@ -144,17 +145,12 @@ def install_mldiagnostics_prerequisites() -> int:
144145
0 if successful and 1 otherwise.
145146
"""
146147

147-
kueue_deployment_name = 'kueue-controller-manager'
148-
kueue_namespace_name = 'kueue-system'
149-
cert_webhook_deployment_name = 'cert-manager-webhook'
150-
cert_webhook_namespace_name = 'cert-manager'
151-
152148
if not _wait_for_deployment_ready(
153-
deployment_name=kueue_deployment_name, namespace=kueue_namespace_name
149+
deployment_name=_KUEUE_DEPLOYMENT_NAME, namespace=_KUEUE_NAMESPACE_NAME
154150
):
155151
xpk_print(
156-
f'Application {kueue_deployment_name} failed to become ready within the'
157-
' timeout.'
152+
f'Application {_KUEUE_DEPLOYMENT_NAME} failed to become ready within'
153+
' the timeout.'
158154
)
159155
return 1
160156

@@ -163,19 +159,19 @@ def install_mldiagnostics_prerequisites() -> int:
163159
return return_code
164160

165161
cert_webhook_ready = _wait_for_deployment_ready(
166-
deployment_name=cert_webhook_deployment_name,
167-
namespace=cert_webhook_namespace_name,
162+
deployment_name=_CERT_WEBHOOK_DEPLOYMENT_NAME,
163+
namespace=_CERT_WEBHOOK_NAMESPACE_NAME,
168164
)
169165
if not cert_webhook_ready:
170166
xpk_print('The cert-manager-webhook installation failed.')
171167
return 1
172168

173169
webhook_package = 'mldiagnostics-injection-webhook'
174-
webhook_version = 'v0.5.0'
175-
webhook_filename = f'{webhook_package}-{webhook_version}.yaml'
170+
webhook_version = Version('v0.5.0')
171+
webhook_filename = f'{webhook_package}-v{webhook_version}.yaml'
176172

177173
return_code = _download_mldiagnostics_yaml(
178-
package_name=webhook_package, version=Version(webhook_version)
174+
package_name=webhook_package, version=webhook_version
179175
)
180176
if return_code != 0:
181177
return return_code
@@ -193,11 +189,11 @@ def install_mldiagnostics_prerequisites() -> int:
193189
return return_code
194190

195191
operator_package = 'mldiagnostics-connection-operator'
196-
operator_version = 'v0.5.0'
197-
operator_filename = f'{operator_package}-{operator_version}.yaml'
192+
operator_version = Version('v0.5.0')
193+
operator_filename = f'{operator_package}-v{operator_version}.yaml'
198194

199195
return_code = _download_mldiagnostics_yaml(
200-
package_name=operator_package, version=Version(webhook_version)
196+
package_name=operator_package, version=operator_version
201197
)
202198
if return_code != 0:
203199
return return_code
@@ -242,5 +238,15 @@ def _wait_for_deployment_ready(
242238
return False
243239

244240
# Even after the status changes to 'running', the deployment may need roughly 10 more seconds to fully stabilize, so wait a conservative 30 seconds.
245-
time.sleep(30)
241+
stabilization_seconds = 30
242+
stabilization_command = f'sleep {stabilization_seconds}'
243+
stabilization_code = run_command_with_updates(
244+
stabilization_command,
245+
f'Deployment {deployment_name} is ready. Waiting {stabilization_seconds}'
246+
' seconds for full stabilization',
247+
verbose=True,
248+
)
249+
if stabilization_code != 0:
250+
return False
251+
246252
return True

src/xpk/commands/managed_ml_diagnostics_test.py

Lines changed: 0 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -62,31 +62,6 @@ def mocks(mocker) -> _Mocks:
6262

6363
def construct_args(**kwargs: Any) -> Namespace:
6464
args_dict = dict(
65-
project='project',
66-
zone='us-central1-a',
67-
reservation='',
68-
default_pool_cpu_machine_type='test-machine-type',
69-
cluster='test-cluster',
70-
default_pool_cpu_num_nodes='100',
71-
sub_slicing=False,
72-
gke_version='',
73-
private=False,
74-
authorized_networks=None,
75-
enable_pathways=False,
76-
enable_ray_cluster=False,
77-
enable_workload_identity=False,
78-
enable_gcsfuse_csi_driver=False,
79-
enable_gcpfilestore_csi_driver=False,
80-
enable_parallelstore_csi_driver=False,
81-
enable_pd_csi_driver=False,
82-
enable_lustre_csi_driver=False,
83-
custom_cluster_arguments='',
84-
num_slices=1,
85-
num_nodes=1,
86-
flex=False,
87-
memory_limit='100Gi',
88-
cpu_limit=100,
89-
cluster_cpu_machine_type='',
9065
managed_mldiagnostics=False,
9166
)
9267
args_dict.update(kwargs)
@@ -99,47 +74,41 @@ def test_install_mldiagnostics_prerequisites_commands_executed(
9974
):
10075

10176
mocks.commands_tester.set_result_for_command(
102-
(0, ''),
10377
'kubectl',
10478
'rollout',
10579
'status',
10680
'deployment/kueue-controller-manager',
10781
)
10882

10983
mocks.commands_tester.set_result_for_command(
110-
(0, ''),
11184
'kubectl',
11285
'rollout',
11386
'status',
11487
'deployment/cert-manager-webhook',
11588
)
11689

11790
mocks.commands_tester.set_result_for_command(
118-
(0, ''),
11991
'kubectl',
12092
'apply',
12193
'-f',
12294
'https://github.com/cert-manager/cert-manager/releases/',
12395
)
12496

12597
mocks.commands_tester.set_result_for_command(
126-
(0, ''),
12798
'gcloud',
12899
'artifacts',
129100
'generic',
130101
'download',
131102
)
132103

133104
mocks.commands_tester.set_result_for_command(
134-
(0, ''),
135105
'kubectl',
136106
'create',
137107
'namespace',
138108
'gke-mldiagnostics',
139109
)
140110

141111
mocks.commands_tester.set_result_for_command(
142-
(0, ''),
143112
'kubectl',
144113
'apply',
145114
'-f',
@@ -148,7 +117,6 @@ def test_install_mldiagnostics_prerequisites_commands_executed(
148117
)
149118

150119
mocks.commands_tester.set_result_for_command(
151-
(0, ''),
152120
'kubectl',
153121
'label',
154122
'namespace',

src/xpk/parser/cluster.py

Lines changed: 5 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -151,12 +151,6 @@ def set_cluster_create_parser(cluster_create_parser: ArgumentParser):
151151
),
152152
)
153153

154-
cluster_create_optional_arguments.add_argument(
155-
'--managed-ml-diagnostics',
156-
action='store_true',
157-
help='Enables the installation of required ML Diagnostics components.',
158-
)
159-
160154
if FeatureFlags.SUB_SLICING_ENABLED:
161155
add_cluster_create_sub_slicing_arguments(cluster_create_optional_arguments)
162156

@@ -248,12 +242,6 @@ def set_cluster_create_pathways_parser(
248242
)
249243
add_autoprovisioning_arguments(autoprovisioning_arguments)
250244

251-
cluster_create_pathways_optional_arguments.add_argument(
252-
'--managed-ml-diagnostics',
253-
action='store_true',
254-
help='Enables the installation of required ML Diagnostics components.',
255-
)
256-
257245
### Capacity arguments specific to "cluster create-pathways"
258246
cluster_create_pathways_capacity_arguments = (
259247
cluster_create_pathways_parser.add_argument_group(
@@ -917,6 +905,11 @@ def add_shared_cluster_create_capacity_arguments(
917905
' types.'
918906
),
919907
)
908+
parser_or_group.add_argument(
909+
'--managed-ml-diagnostics',
910+
action='store_true',
911+
help='Enables the installation of required ML Diagnostics components.',
912+
)
920913

921914

922915
def add_shared_cluster_create_mtc_arguments(

src/xpk/parser/cluster_test.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,7 @@ def test_cluster_create_ray_sub_slicing_is_hidden_but_set_to_false():
104104
assert args.sub_slicing is False
105105
assert "--sub-slicing" not in help_str
106106

107+
107108
def test_cluster_create_managed_mldiagnostics():
108109
parser = argparse.ArgumentParser()
109110

0 commit comments

Comments
 (0)