Resolve conflicts

DannyLiCom · DannyLiCom · commit d761a21d13eb · 2025-11-19T18:54:22.000Z
diff --git a/src/xpk/commands/cluster_test.py b/src/xpk/commands/cluster_test.py
@@ -124,6 +124,7 @@ def construct_args(**kwargs: Any) -> Namespace:
       cluster_cpu_machine_type='',
       create_vertex_tensorboard=False,
       enable_autoprovisioning=False,
+      managed_ml_diagnostics=False,
   )
   args_dict.update(kwargs)
   return Namespace(**args_dict)
diff --git a/src/xpk/commands/managed_ml_diagnostics.py b/src/xpk/commands/managed_ml_diagnostics.py
@@ -14,11 +14,16 @@
 limitations under the License.
 """
 
-import time
 from packaging.version import Version
 from ..core.commands import run_command_for_value, run_command_with_updates
-from ..utils.console import xpk_exit, xpk_print
+from ..utils.console import xpk_print
 import os
+import tempfile
+
+_KUEUE_DEPLOYMENT_NAME = 'kueue-controller-manager'
+_KUEUE_NAMESPACE_NAME = 'kueue-system'
+_CERT_WEBHOOK_DEPLOYMENT_NAME = 'cert-manager-webhook'
+_CERT_WEBHOOK_NAMESPACE_NAME = 'cert-manager'
 
 
 def _install_cert_manager(version: Version = Version('v1.13.0')) -> int:
@@ -39,9 +44,6 @@ def _install_cert_manager(version: Version = Version('v1.13.0')) -> int:
       command, f'Applying cert-manager {version} manifest...'
   )
 
-  if return_code != 0:
-    xpk_exit(return_code)
-
   return return_code
 
 
@@ -53,11 +55,10 @@ def _download_mldiagnostics_yaml(package_name: str, version: Version) -> int:
     0 if successful and 1 otherwise.
   """
 
-  version_with_v = f'v{version}'
   command = (
       'gcloud artifacts generic download'
       ' --repository=mldiagnostics-webhook-and-operator-yaml --location=us'
-      f' --package={package_name} --version={version_with_v} --destination=/tmp/'
+      f' --package={package_name} --version=v{version} --destination=/tmp/'
       ' --project=ai-on-gke'
   )
 
@@ -106,7 +107,7 @@ def _install_mldiagnostics_yaml(artifact_filename: str) -> int:
   Returns:
     0 if successful and 1 otherwise.
   """
-  full_artifact_path = os.path.join('/tmp', artifact_filename)
+  full_artifact_path = os.path.join(tempfile.gettempdir(), artifact_filename)
 
   command = f'kubectl apply -f {full_artifact_path} -n gke-mldiagnostics'
 
@@ -144,17 +145,12 @@ def install_mldiagnostics_prerequisites() -> int:
     0 if successful and 1 otherwise.
   """
 
-  kueue_deployment_name = 'kueue-controller-manager'
-  kueue_namespace_name = 'kueue-system'
-  cert_webhook_deployment_name = 'cert-manager-webhook'
-  cert_webhook_namespace_name = 'cert-manager'
-
   if not _wait_for_deployment_ready(
-      deployment_name=kueue_deployment_name, namespace=kueue_namespace_name
+      deployment_name=_KUEUE_DEPLOYMENT_NAME, namespace=_KUEUE_NAMESPACE_NAME
   ):
     xpk_print(
-        f'Application {kueue_deployment_name} failed to become ready within the'
-        ' timeout.'
+        f'Application {_KUEUE_DEPLOYMENT_NAME} failed to become ready within'
+        ' the timeout.'
     )
     return 1
 
@@ -163,19 +159,19 @@ def install_mldiagnostics_prerequisites() -> int:
     return return_code
 
   cert_webhook_ready = _wait_for_deployment_ready(
-      deployment_name=cert_webhook_deployment_name,
-      namespace=cert_webhook_namespace_name,
+      deployment_name=_CERT_WEBHOOK_DEPLOYMENT_NAME,
+      namespace=_CERT_WEBHOOK_NAMESPACE_NAME,
   )
   if not cert_webhook_ready:
     xpk_print('The cert-manager-webhook installation failed.')
     return 1
 
   webhook_package = 'mldiagnostics-injection-webhook'
-  webhook_version = 'v0.5.0'
-  webhook_filename = f'{webhook_package}-{webhook_version}.yaml'
+  webhook_version = Version('v0.5.0')
+  webhook_filename = f'{webhook_package}-v{webhook_version}.yaml'
 
   return_code = _download_mldiagnostics_yaml(
-      package_name=webhook_package, version=Version(webhook_version)
+      package_name=webhook_package, version=webhook_version
   )
   if return_code != 0:
     return return_code
@@ -193,11 +189,11 @@ def install_mldiagnostics_prerequisites() -> int:
     return return_code
 
   operator_package = 'mldiagnostics-connection-operator'
-  operator_version = 'v0.5.0'
-  operator_filename = f'{operator_package}-{operator_version}.yaml'
+  operator_version = Version('v0.5.0')
+  operator_filename = f'{operator_package}-v{operator_version}.yaml'
 
   return_code = _download_mldiagnostics_yaml(
-      package_name=operator_package, version=Version(webhook_version)
+      package_name=operator_package, version=operator_version
   )
   if return_code != 0:
     return return_code
@@ -242,5 +238,15 @@ def _wait_for_deployment_ready(
     return False
 
   # After the status changes to 'running', the deployment may need about 10 seconds to fully stabilize; the wait below is 30 seconds.
-  time.sleep(30)
+  stabilization_seconds = 30
+  stabilization_command = f'sleep {stabilization_seconds}'
+  stabilization_code = run_command_with_updates(
+      stabilization_command,
+      f'Deployment {deployment_name} is ready. Waiting {stabilization_seconds}'
+      ' seconds for full stabilization',
+      verbose=True,
+  )
+  if stabilization_code != 0:
+    return False
+
   return True
diff --git a/src/xpk/commands/managed_ml_diagnostics_test.py b/src/xpk/commands/managed_ml_diagnostics_test.py
@@ -62,31 +62,6 @@ def mocks(mocker) -> _Mocks:
 
 def construct_args(**kwargs: Any) -> Namespace:
   args_dict = dict(
-      project='project',
-      zone='us-central1-a',
-      reservation='',
-      default_pool_cpu_machine_type='test-machine-type',
-      cluster='test-cluster',
-      default_pool_cpu_num_nodes='100',
-      sub_slicing=False,
-      gke_version='',
-      private=False,
-      authorized_networks=None,
-      enable_pathways=False,
-      enable_ray_cluster=False,
-      enable_workload_identity=False,
-      enable_gcsfuse_csi_driver=False,
-      enable_gcpfilestore_csi_driver=False,
-      enable_parallelstore_csi_driver=False,
-      enable_pd_csi_driver=False,
-      enable_lustre_csi_driver=False,
-      custom_cluster_arguments='',
-      num_slices=1,
-      num_nodes=1,
-      flex=False,
-      memory_limit='100Gi',
-      cpu_limit=100,
-      cluster_cpu_machine_type='',
       managed_mldiagnostics=False,
   )
   args_dict.update(kwargs)
@@ -99,47 +74,41 @@ def test_install_mldiagnostics_prerequisites_commands_executed(
 ):
 
   mocks.commands_tester.set_result_for_command(
-      (0, ''),
       'kubectl',
       'rollout',
       'status',
       'deployment/kueue-controller-manager',
   )
 
   mocks.commands_tester.set_result_for_command(
-      (0, ''),
       'kubectl',
       'rollout',
       'status',
       'deployment/cert-manager-webhook',
   )
 
   mocks.commands_tester.set_result_for_command(
-      (0, ''),
       'kubectl',
       'apply',
       '-f',
       'https://github.com/cert-manager/cert-manager/releases/',
   )
 
   mocks.commands_tester.set_result_for_command(
-      (0, ''),
       'gcloud',
       'artifacts',
       'generic',
       'download',
   )
 
   mocks.commands_tester.set_result_for_command(
-      (0, ''),
       'kubectl',
       'create',
       'namespace',
       'gke-mldiagnostics',
   )
 
   mocks.commands_tester.set_result_for_command(
-      (0, ''),
       'kubectl',
       'apply',
       '-f',
@@ -148,7 +117,6 @@ def test_install_mldiagnostics_prerequisites_commands_executed(
   )
 
   mocks.commands_tester.set_result_for_command(
-      (0, ''),
       'kubectl',
       'label',
       'namespace',
diff --git a/src/xpk/parser/cluster.py b/src/xpk/parser/cluster.py
@@ -151,12 +151,6 @@ def set_cluster_create_parser(cluster_create_parser: ArgumentParser):
       ),
   )
 
-  cluster_create_optional_arguments.add_argument(
-      '--managed-ml-diagnostics',
-      action='store_true',
-      help='Enables the installation of required ML Diagnostics components.',
-  )
-
   if FeatureFlags.SUB_SLICING_ENABLED:
     add_cluster_create_sub_slicing_arguments(cluster_create_optional_arguments)
 
@@ -248,12 +242,6 @@ def set_cluster_create_pathways_parser(
   )
   add_autoprovisioning_arguments(autoprovisioning_arguments)
 
-  cluster_create_pathways_optional_arguments.add_argument(
-      '--managed-ml-diagnostics',
-      action='store_true',
-      help='Enables the installation of required ML Diagnostics components.',
-  )
-
   ### Capacity arguments specific to "cluster create-pathways"
   cluster_create_pathways_capacity_arguments = (
       cluster_create_pathways_parser.add_argument_group(
@@ -917,6 +905,11 @@ def add_shared_cluster_create_capacity_arguments(
           ' types.'
       ),
   )
+  parser_or_group.add_argument(
+      '--managed-ml-diagnostics',
+      action='store_true',
+      help='Enables the installation of required ML Diagnostics components.',
+  )
 
 
 def add_shared_cluster_create_mtc_arguments(
diff --git a/src/xpk/parser/cluster_test.py b/src/xpk/parser/cluster_test.py
@@ -104,6 +104,7 @@ def test_cluster_create_ray_sub_slicing_is_hidden_but_set_to_false():
   assert args.sub_slicing is False
   assert "--sub-slicing" not in help_str
 
+
 def test_cluster_create_managed_mldiagnostics():
   parser = argparse.ArgumentParser()
 

Original file line number	Diff line number	Diff line change
`@@ -124,6 +124,7 @@ def construct_args(**kwargs: Any) -> Namespace:`
`124`	`124`	`cluster_cpu_machine_type='',`
`125`	`125`	`create_vertex_tensorboard=False,`
`126`	`126`	`enable_autoprovisioning=False,`
	`127`	`+ managed_ml_diagnostics=False,`
`127`	`128`	`)`
`128`	`129`	`args_dict.update(kwargs)`
`129`	`130`	`return Namespace(**args_dict)`