Skip to content

Commit bf8b65a

Browse files
committed
Update Docker Resource Limits for Pathways Main Workload
1 parent 78d1a31 commit bf8b65a

File tree

5 files changed: 49 additions and 21 deletions.

src/xpk/core/cluster.py

Lines changed: 8 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -617,14 +617,14 @@ def update_cluster_with_workload_identity_if_necessary(args) -> int:
617617

618618
if is_workload_identity_enabled_on_cluster(args):
619619
return 0
620-
cluster_update_return_code = (
621-
update_gke_cluster_with_workload_identity_enabled(args)
622-
)
623-
if cluster_update_return_code > 0:
624-
xpk_print(
625-
'Updating GKE cluster to enable Workload Identity Federation failed!'
626-
)
627-
return cluster_update_return_code
620+
# cluster_update_return_code = (
621+
# update_gke_cluster_with_workload_identity_enabled(args)
622+
# )
623+
# if cluster_update_return_code > 0:
624+
# xpk_print(
625+
# 'Updating GKE cluster to enable Workload Identity Federation failed!'
626+
# )
627+
# return cluster_update_return_code
628628

629629
return 0
630630

src/xpk/core/docker_resources.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -34,8 +34,8 @@ def get_main_container_resources(
3434
Workload resources port as a YAML string
3535
"""
3636
# Resources requirements for Pathways workload containers are known.
37-
resources_yaml = """cpu: "24"
38-
memory: 100G"""
37+
resources_yaml = f"""cpu: "60"
38+
memory: 490G"""
3939
if args.use_pathways:
4040
return resources_yaml
4141

src/xpk/core/nodepool.py

Lines changed: 15 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -310,18 +310,26 @@ def run_gke_node_pool_create_command(
310310
create_commands.append(command)
311311
create_task_names.append(task)
312312

313-
desired_pw_cpu_node_pools = ['cpu-np']
313+
desired_pw_cpu_node_pools = ['cpu-np', 'highmem-cpu-np']
314314
if args.enable_pathways:
315315
# Pathways needs CPU nodepools in addition to TPU nodepools
316316
for node_pool_name in desired_pw_cpu_node_pools:
317317
if node_pool_name in existing_node_pool_names:
318318
continue
319-
command = (
320-
'gcloud beta container node-pools create'
321-
f' {node_pool_name} --node-version={gke_node_pool_version} --cluster={args.cluster} --project={args.project} --node-locations={args.zone} --region={zone_to_region(args.zone)} --num-nodes=1'
322-
f' --machine-type={args.pathways_gce_machine_type} --scopes=storage-full,gke-default,{CLOUD_PLATFORM_AUTH_SCOPE_URL} --enable-autoscaling'
323-
' --min-nodes=1 --max-nodes=20'
324-
)
319+
if node_pool_name == 'cpu-np':
320+
command = (
321+
'gcloud beta container node-pools create'
322+
f' {node_pool_name} --node-version={gke_node_pool_version} --cluster={args.cluster} --project={args.project} --node-locations={args.zone} --region={zone_to_region(args.zone)} --num-nodes=1'
323+
f' --machine-type={args.pathways_gce_machine_type} --scopes=storage-full,gke-default,{CLOUD_PLATFORM_AUTH_SCOPE_URL} --enable-autoscaling'
324+
' --min-nodes=1 --max-nodes=20'
325+
)
326+
else:
327+
command = (
328+
'gcloud beta container node-pools create'
329+
f' {node_pool_name} --node-version={gke_node_pool_version} --cluster={args.cluster} --project={args.project} --node-locations={args.zone} --region={zone_to_region(args.zone)} --num-nodes=1'
330+
f' --machine-type={args.pathways_highmem_gce_machine_type} --scopes=storage-full,gke-default,{CLOUD_PLATFORM_AUTH_SCOPE_URL} --enable-autoscaling'
331+
' --min-nodes=1 --max-nodes=20'
332+
)
325333
task = f'NodepoolCreate-{node_pool_name}'
326334
create_commands.append(command)
327335
create_task_names.append(task)

src/xpk/core/pathways.py

Lines changed: 18 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -32,6 +32,14 @@ def add_pw_resource_flavors(args):
3232
spec:
3333
nodeLabels:
3434
cloud.google.com/gke-nodepool: cpu-np
35+
---
36+
apiVersion: kueue.x-k8s.io/v1beta1
37+
kind: ResourceFlavor
38+
metadata:
39+
name: highmem-cpu-user
40+
spec:
41+
nodeLabels:
42+
cloud.google.com/gke-nodepool: highmem-cpu-np
3543
---"""
3644
if args.enable_pathways:
3745
return resource_flavor_yaml
@@ -47,7 +55,13 @@ def add_pw_resources_to_kueue(args):
4755
- name: "cpu"
4856
nominalQuota: 480
4957
- name: "memory"
50-
nominalQuota: 2000G"""
58+
nominalQuota: 2000G
59+
- name: highmem-cpu-user
60+
resources:
61+
- name: "cpu"
62+
nominalQuota: 480
63+
- name: "memory"
64+
nominalQuota: 4000G"""
5165
if args.enable_pathways:
5266
return resources_yaml
5367
return ''
@@ -79,7 +93,7 @@ def ensure_pathways_workload_prerequisites(args, system) -> bool:
7993

8094
# Ensure the cluster and CPU nodepools were created with create-pathways
8195
all_node_pools = get_all_nodepools_programmatic(args)
82-
desired_pw_cpu_node_pools = {'cpu-np'}
96+
desired_pw_cpu_node_pools = {'cpu-np', 'highmem-cpu-np'}
8397
if not desired_pw_cpu_node_pools.issubset(set(all_node_pools[0])):
8498
xpk_print(
8599
'Cluster needs to be created with `xpk create-pathways` to run'
@@ -263,11 +277,11 @@ def get_user_workload_for_pathways(
263277
spec:
264278
containers:
265279
{container}
266-
nodeSelector:
267-
cloud.google.com/gke-nodepool: cpu-np
268280
hostNetwork: true
269281
dnsPolicy: ClusterFirstWithHostNet
270282
restartPolicy: Never
283+
nodeSelector:
284+
cloud.google.com/gke-nodepool: highmem-cpu-np
271285
volumes:
272286
- hostPath:
273287
path: /tmp

src/xpk/parser/cluster.py

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -528,6 +528,12 @@ def add_shared_cluster_create_optional_arguments(parser: ArgumentParser):
528528
default='n2-standard-64',
529529
help='The CPU type for Pathways CPU nodepools',
530530
)
531+
parser.add_argument(
532+
'--pathways-highmem-gce-machine-type',
533+
type=str,
534+
default='c4-highmem-192',
535+
help='The highmem CPU type for Pathways CPU nodepools',
536+
)
531537
parser.add_argument(
532538
'--default-pool-cpu-machine-type',
533539
type=str,

0 commit comments

Comments
 (0)