Skip to content

Commit 598aaf4

Browse files
committed
Update Docker Resource Limits for Pathways Main Workload
1 parent 78d1a31 commit 598aaf4

File tree

6 files changed: 44 additions and 18 deletions

src/xpk/commands/cluster.py

Lines changed: 5 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -280,11 +280,11 @@ def cluster_create(args) -> None:
280280
xpk_print('Installation of RayCluster failed.')
281281
xpk_exit(return_code)
282282

283-
if hasattr(args, 'enable_mtc') and args.enable_mtc:
284-
return_code = install_mtc_on_cluster(args, system)
285-
if return_code != 0:
286-
xpk_print('Installation of MTC failed.')
287-
xpk_exit(return_code)
283+
# if hasattr(args, 'enable_mtc') and args.enable_mtc:
284+
# return_code = install_mtc_on_cluster(args, system)
285+
# if return_code != 0:
286+
# xpk_print('Installation of MTC failed.')
287+
# xpk_exit(return_code)
288288

289289
xpk_print('GKE commands done! Resources are created.')
290290
xpk_print(

src/xpk/core/cluster.py

Lines changed: 8 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -617,14 +617,14 @@ def update_cluster_with_workload_identity_if_necessary(args) -> int:
617617

618618
if is_workload_identity_enabled_on_cluster(args):
619619
return 0
620-
cluster_update_return_code = (
621-
update_gke_cluster_with_workload_identity_enabled(args)
622-
)
623-
if cluster_update_return_code > 0:
624-
xpk_print(
625-
'Updating GKE cluster to enable Workload Identity Federation failed!'
626-
)
627-
return cluster_update_return_code
620+
# cluster_update_return_code = (
621+
# update_gke_cluster_with_workload_identity_enabled(args)
622+
# )
623+
# if cluster_update_return_code > 0:
624+
# xpk_print(
625+
# 'Updating GKE cluster to enable Workload Identity Federation failed!'
626+
# )
627+
# return cluster_update_return_code
628628

629629
return 0
630630

src/xpk/core/docker_resources.py

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -34,7 +34,7 @@ def get_main_container_resources(
3434
Workload resources port as a YAML string
3535
"""
3636
# Resources requirements for Pathways workload containers are known.
37-
resources_yaml = """cpu: "24"
37+
resources_yaml = f"""cpu: "24"
3838
memory: 100G"""
3939
if args.use_pathways:
4040
return resources_yaml

src/xpk/core/nodepool.py

Lines changed: 9 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -310,18 +310,26 @@ def run_gke_node_pool_create_command(
310310
create_commands.append(command)
311311
create_task_names.append(task)
312312

313-
desired_pw_cpu_node_pools = ['cpu-np']
313+
desired_pw_cpu_node_pools = ['cpu-np'] #, 'highmem-cpu-np']
314314
if args.enable_pathways:
315315
# Pathways needs CPU nodepools in addition to TPU nodepools
316316
for node_pool_name in desired_pw_cpu_node_pools:
317317
if node_pool_name in existing_node_pool_names:
318318
continue
319+
# if node_pool_name == 'cpu-np':
319320
command = (
320321
'gcloud beta container node-pools create'
321322
f' {node_pool_name} --node-version={gke_node_pool_version} --cluster={args.cluster} --project={args.project} --node-locations={args.zone} --region={zone_to_region(args.zone)} --num-nodes=1'
322323
f' --machine-type={args.pathways_gce_machine_type} --scopes=storage-full,gke-default,{CLOUD_PLATFORM_AUTH_SCOPE_URL} --enable-autoscaling'
323324
' --min-nodes=1 --max-nodes=20'
324325
)
326+
# else:
327+
# command = (
328+
# 'gcloud beta container node-pools create'
329+
# f' {node_pool_name} --node-version={gke_node_pool_version} --cluster={args.cluster} --project={args.project} --node-locations={args.zone} --region={zone_to_region(args.zone)} --num-nodes=1'
330+
# f' --machine-type={args.pathways_highmem_gce_machine_type} --scopes=storage-full,gke-default,{CLOUD_PLATFORM_AUTH_SCOPE_URL} --enable-autoscaling'
331+
# ' --min-nodes=1 --max-nodes=20'
332+
# )
325333
task = f'NodepoolCreate-{node_pool_name}'
326334
create_commands.append(command)
327335
create_task_names.append(task)

src/xpk/core/pathways.py

Lines changed: 15 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -33,6 +33,14 @@ def add_pw_resource_flavors(args):
3333
nodeLabels:
3434
cloud.google.com/gke-nodepool: cpu-np
3535
---"""
36+
# apiVersion: kueue.x-k8s.io/v1beta1
37+
# kind: ResourceFlavor
38+
# metadata:
39+
# name: highmem-cpu-user
40+
# spec:
41+
# nodeLabels:
42+
# cloud.google.com/gke-nodepool: highmem-cpu-np
43+
# ---"""
3644
if args.enable_pathways:
3745
return resource_flavor_yaml
3846
return ''
@@ -48,6 +56,12 @@ def add_pw_resources_to_kueue(args):
4856
nominalQuota: 480
4957
- name: "memory"
5058
nominalQuota: 2000G"""
59+
# - name: highmem-cpu-user
60+
# resources:
61+
# - name: "cpu"
62+
# nominalQuota: 480
63+
# - name: "memory"
64+
# nominalQuota: 4000G"""
5165
if args.enable_pathways:
5266
return resources_yaml
5367
return ''
@@ -79,7 +93,7 @@ def ensure_pathways_workload_prerequisites(args, system) -> bool:
7993

8094
# Ensure the cluster and CPU nodepools were created with create-pathways
8195
all_node_pools = get_all_nodepools_programmatic(args)
82-
desired_pw_cpu_node_pools = {'cpu-np'}
96+
desired_pw_cpu_node_pools = {'cpu-np'} #, 'highmem-cpu-np'}
8397
if not desired_pw_cpu_node_pools.issubset(set(all_node_pools[0])):
8498
xpk_print(
8599
'Cluster needs to be created with `xpk create-pathways` to run'
@@ -263,8 +277,6 @@ def get_user_workload_for_pathways(
263277
spec:
264278
containers:
265279
{container}
266-
nodeSelector:
267-
cloud.google.com/gke-nodepool: cpu-np
268280
hostNetwork: true
269281
dnsPolicy: ClusterFirstWithHostNet
270282
restartPolicy: Never

src/xpk/parser/cluster.py

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -528,6 +528,12 @@ def add_shared_cluster_create_optional_arguments(parser: ArgumentParser):
528528
default='n2-standard-64',
529529
help='The CPU type for Pathways CPU nodepools',
530530
)
531+
# parser.add_argument(
532+
# '--pathways-highmem-gce-machine-type',
533+
# type=str,
534+
# default='c4-highmem-192',
535+
# help='The highmem CPU type for Pathways CPU nodepools',
536+
# )
531537
parser.add_argument(
532538
'--default-pool-cpu-machine-type',
533539
type=str,

0 commit comments

Comments (0)