From 183f7cf2e7b72f79ff90663323c6bc0c419486b7 Mon Sep 17 00:00:00 2001 From: gcie Date: Thu, 10 Jul 2025 15:01:58 +0000 Subject: [PATCH 1/7] fix yaml formatting for workloads with TPU and NAP --- src/xpk/core/capacity.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/xpk/core/capacity.py b/src/xpk/core/capacity.py index 08d17c09b..980ec0835 100644 --- a/src/xpk/core/capacity.py +++ b/src/xpk/core/capacity.py @@ -234,7 +234,7 @@ def get_capacity_node_selectors_from_capacity_type( case CapacityType.FLEX_START.name: node_selector = 'cloud.google.com/gke-queued="true"' case CapacityType.SPOT.name: - node_selector = 'cloud.google.com/gke-spot="true"' + node_selector = 'cloud.google.com/gke-spot: "true"' case CapacityType.RESERVATION.name: node_selector = f'cloud.google.com/reservation-name: {args.reservation}' case _: From a26b3cc06856fff3a37fc5d358f104215fd7c4a8 Mon Sep 17 00:00:00 2001 From: gcie Date: Thu, 10 Jul 2025 15:02:41 +0000 Subject: [PATCH 2/7] refactor tpu system characteristics --- src/xpk/core/system_characteristics.py | 1237 +++--------------------- 1 file changed, 157 insertions(+), 1080 deletions(-) diff --git a/src/xpk/core/system_characteristics.py b/src/xpk/core/system_characteristics.py index 48fd2c6f3..630c515e3 100644 --- a/src/xpk/core/system_characteristics.py +++ b/src/xpk/core/system_characteristics.py @@ -15,6 +15,8 @@ """ from dataclasses import dataclass +from functools import reduce +from operator import mul AcceleratorType = {'TPU': 1, 'GPU': 2, 'CPU': 3} @@ -91,6 +93,34 @@ def get_system_characteristics_by_device_type( return None, 1 +def get_tpu_system_characteristics_map( + prefix: str, + tensorcores_per_chip: int, + gke_accelerator: str, + machine_type: str, + supported_topologies: list[str], +) -> dict[str, SystemCharacteristics]: + system_characteristics_map = {} + for topology in supported_topologies: + total_chips = reduce(mul, (int(x) for x in topology.split('x')), 1) + num_tensorcores = total_chips * tensorcores_per_chip + chips_per_vm = 1 if total_chips == 1 else 4 + vms_per_slice = total_chips // chips_per_vm + system = SystemCharacteristics( + topology, + vms_per_slice, + gke_accelerator, + machine_type, + chips_per_vm, + AcceleratorType['TPU'], + f'{prefix}-{topology}', + ) + system_characteristics_map[f'{prefix}-{topology}'] = system + system_characteristics_map[f'{prefix}-{num_tensorcores}'] = system + + return system_characteristics_map + + ################### Subcommand Helper Functions ############################# """ !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! IF YOU MODIFY THE BELOW UserFacingNameToSystemCharacteristics MAP YOU SHOULD @@ -212,1098 +242,145 @@ def get_system_characteristics_by_device_type( 'h100-mega-80gb-8', ), # TPU system characteristics - # v6e - 'v6e-1': SystemCharacteristics( - '1x1', - 1, - 'tpu-v6e-slice', - 'ct6e-standard-1t', - 1, - AcceleratorType['TPU'], - 'v6e-1', + **get_tpu_system_characteristics_map( + 'v6e', 1, 'tpu-v6e-slice', 'ct6e-standard-1t', ['1x1'] ), - 'v6e-4': SystemCharacteristics( - '2x2', + **get_tpu_system_characteristics_map( + 'v6e', 1, 'tpu-v6e-slice', 'ct6e-standard-4t', - 4, - AcceleratorType['TPU'], - 'v6e-4', + ['2x2', '2x4', '4x4', '4x8', '8x8', '8x16', '16x16'], ), - 'v6e-8': SystemCharacteristics( - '2x4', + **get_tpu_system_characteristics_map( + 'v5p', 2, - 'tpu-v6e-slice', - 'ct6e-standard-4t', - 4, - AcceleratorType['TPU'], - 'v6e-8', - ), - 'v6e-16': SystemCharacteristics( - '4x4', - 4, - 'tpu-v6e-slice', - 'ct6e-standard-4t', - 4, - AcceleratorType['TPU'], - 'v6e-16', - ), - 'v6e-32': SystemCharacteristics( - '4x8', - 8, - 'tpu-v6e-slice', - 'ct6e-standard-4t', - 4, - AcceleratorType['TPU'], - 'v6e-32', - ), - 'v6e-64': SystemCharacteristics( - '8x8', - 16, - 'tpu-v6e-slice', - 'ct6e-standard-4t', - 4, - AcceleratorType['TPU'], - 'v6e-64', - ), - 'v6e-128': SystemCharacteristics( - '8x16', - 32, - 'tpu-v6e-slice', - 'ct6e-standard-4t', - 4, - AcceleratorType['TPU'], - 'v6e-128', - ), - 'v6e-256': SystemCharacteristics( - '16x16', - 64, - 'tpu-v6e-slice', - 'ct6e-standard-4t', - 4, - AcceleratorType['TPU'], - 'v6e-256', - ), - # v5p - 'v5p-8': SystemCharacteristics( - '2x2x1', - 1, 'tpu-v5p-slice', 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-8', + [ + '2x2x1', + '2x2x2', + '2x2x4', + '2x4x4', + '4x4x4', + '4x4x8', + '4x4x12', + '4x8x8', + '4x4x20', + '4x8x12', + '4x4x28', + '8x8x8', + '4x12x12', + '4x8x20', + '4x4x44', + '8x8x12', + '4x4x52', + '4x8x28', + '4x12x20', + '8x8x16', + '4x4x68', + '8x12x12', + '4x4x76', + '8x8x20', + '4x12x28', + '4x8x44', + '4x4x92', + '8x12x16', + '4x20x20', + '4x8x52', + '12x12x12', + '8x8x28', + '4x4x116', + '8x12x20', + '4x4x124', + '8x16x16', + '4x12x44', + '4x8x68', + '4x20x28', + '12x12x16', + '4x4x148', + '4x8x76', + '4x12x52', + '8x16x20', + '4x4x164', + '8x12x28', + '4x4x172', + '8x8x44', + '12x12x20', + '4x8x92', + '4x4x188', + '12x16x16', + '4x28x28', + '8x20x20', + '4x12x68', + '8x8x52', + '4x4x212', + '12x12x24', + '4x20x44', + '8x16x28', + '4x12x76', + '4x8x116', + '4x4x236', + '12x16x20', + '4x4x244', + '4x8x124', + '12x12x28', + '16x16x16', + '4x20x52', + '8x12x44', + '8x8x68', + '4x12x92', + '8x20x28', + '12x16x24', + '4x8x148', + '12x20x20', + '8x8x76', + '4x28x44', + '8x12x52', + '16x16x20', + '12x12x36', + '4x8x164', + '12x16x28', + '4x20x68', + '4x8x172', + '4x12x116', + '8x16x44', + '12x20x24', + '4x28x52', + '8x8x92', + '4x12x124', + '4x8x188', + '4x20x76', + '16x16x24', + '12x24x24', + '16x20x28', + ], + ), + **get_tpu_system_characteristics_map( + 'v5litepod', + 1, + 'tpu-v5-lite-podslice', + 'ct5lp-hightpu-4t', + ['2x4', '4x4', '4x8', '8x8', '8x16', '16x16'], ), - 'v5p-16': SystemCharacteristics( - '2x2x2', + **get_tpu_system_characteristics_map( + 'v4', 2, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-16', - ), - 'v5p-32': SystemCharacteristics( - '2x2x4', - 4, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-32', - ), - 'v5p-64': SystemCharacteristics( - '2x4x4', - 8, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-64', - ), - 'v5p-128': SystemCharacteristics( - '4x4x4', - 16, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-128', - ), - 'v5p-256': SystemCharacteristics( - '4x4x8', - 32, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-256', - ), - 'v5p-384': SystemCharacteristics( - '4x4x12', - 48, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-384', - ), - 'v5p-512': SystemCharacteristics( - '4x8x8', - 64, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-512', - ), - 'v5p-640': SystemCharacteristics( - '4x4x20', - 80, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-640', - ), - 'v5p-768': SystemCharacteristics( - '4x8x12', - 96, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-768', - ), - 'v5p-896': SystemCharacteristics( - '4x4x28', - 112, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-896', - ), - 'v5p-1024': SystemCharacteristics( - '8x8x8', - 128, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-1024', - ), - 'v5p-1152': SystemCharacteristics( - '4x12x12', - 144, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-1152', - ), - 'v5p-1280': SystemCharacteristics( - '4x8x20', - 160, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-1280', - ), - 'v5p-1408': SystemCharacteristics( - '4x4x44', - 176, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-1408', - ), - 'v5p-1536': SystemCharacteristics( - '8x8x12', - 192, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-1536', - ), - 'v5p-1664': SystemCharacteristics( - '4x4x52', - 208, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-1664', - ), - 'v5p-1792': SystemCharacteristics( - '4x8x28', - 224, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-1792', - ), - 'v5p-1920': SystemCharacteristics( - '4x12x20', - 240, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-1920', - ), - 'v5p-2048': SystemCharacteristics( - '8x8x16', - 256, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-2048', - ), - 'v5p-2176': SystemCharacteristics( - '4x4x68', - 272, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-2176', - ), - 'v5p-2304': SystemCharacteristics( - '8x12x12', - 288, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-2304', - ), - 'v5p-2432': SystemCharacteristics( - '4x4x76', - 304, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-2432', - ), - 'v5p-2560': SystemCharacteristics( - '8x8x20', - 320, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-2560', - ), - 'v5p-2688': SystemCharacteristics( - '4x12x28', - 336, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-2688', - ), - 'v5p-2816': SystemCharacteristics( - '4x8x44', - 352, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-2816', - ), - 'v5p-2944': SystemCharacteristics( - '4x4x92', - 368, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-2944', - ), - 'v5p-3072': SystemCharacteristics( - '8x12x16', - 384, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-3072', - ), - 'v5p-3200': SystemCharacteristics( - '4x20x20', - 400, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-3200', - ), - 'v5p-3328': SystemCharacteristics( - '4x8x52', - 416, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-3328', - ), - 'v5p-3456': SystemCharacteristics( - '12x12x12', - 432, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-3456', - ), - 'v5p-3584': SystemCharacteristics( - '8x8x28', - 448, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-3584', - ), - 'v5p-3712': SystemCharacteristics( - '4x4x116', - 464, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-3712', - ), - 'v5p-3840': SystemCharacteristics( - '8x12x20', - 480, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-3840', - ), - 'v5p-3968': SystemCharacteristics( - '4x4x124', - 496, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-3968', - ), - 'v5p-4096': SystemCharacteristics( - '8x16x16', - 512, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-4096', - ), - 'v5p-4224': SystemCharacteristics( - '4x12x44', - 528, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-4224', - ), - 'v5p-4352': SystemCharacteristics( - '4x8x68', - 544, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-4352', - ), - 'v5p-4480': SystemCharacteristics( - '4x20x28', - 560, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-4480', - ), - 'v5p-4608': SystemCharacteristics( - '12x12x16', - 576, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-4608', - ), - 'v5p-4736': SystemCharacteristics( - '4x4x148', - 592, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-4736', - ), - 'v5p-4864': SystemCharacteristics( - '4x8x76', - 608, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-4864', - ), - 'v5p-4992': SystemCharacteristics( - '4x12x52', - 624, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-4992', - ), - 'v5p-5120': SystemCharacteristics( - '8x16x20', - 640, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-5120', - ), - 'v5p-5248': SystemCharacteristics( - '4x4x164', - 656, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-5248', - ), - 'v5p-5376': SystemCharacteristics( - '8x12x28', - 672, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-5376', - ), - 'v5p-5504': SystemCharacteristics( - '4x4x172', - 688, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-5504', - ), - 'v5p-5632': SystemCharacteristics( - '8x8x44', - 704, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-5632', - ), - 'v5p-5760': SystemCharacteristics( - '12x12x20', - 720, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-5760', - ), - 'v5p-5888': SystemCharacteristics( - '4x8x92', - 736, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-5888', - ), - 'v5p-6016': SystemCharacteristics( - '4x4x188', - 752, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-6016', - ), - 'v5p-6144': SystemCharacteristics( - '12x16x16', - 768, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-6144', - ), - 'v5p-6272': SystemCharacteristics( - '4x28x28', - 784, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-6272', - ), - 'v5p-6400': SystemCharacteristics( - '8x20x20', - 800, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-6400', - ), - 'v5p-6528': SystemCharacteristics( - '4x12x68', - 816, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-6528', - ), - 'v5p-6656': SystemCharacteristics( - '8x8x52', - 832, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-6656', - ), - 'v5p-6784': SystemCharacteristics( - '4x4x212', - 848, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-6784', - ), - 'v5p-6912': SystemCharacteristics( - '12x12x24', - 864, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-6912', - ), - 'v5p-7040': SystemCharacteristics( - '4x20x44', - 880, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-7040', - ), - 'v5p-7168': SystemCharacteristics( - '8x16x28', - 896, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-7168', - ), - 'v5p-7296': SystemCharacteristics( - '4x12x76', - 912, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-7296', - ), - 'v5p-7424': SystemCharacteristics( - '4x8x116', - 928, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-7424', - ), - 'v5p-7552': SystemCharacteristics( - '4x4x236', - 944, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-7552', - ), - 'v5p-7680': SystemCharacteristics( - '12x16x20', - 960, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-7680', - ), - 'v5p-7808': SystemCharacteristics( - '4x4x244', - 976, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-7808', - ), - 'v5p-7936': SystemCharacteristics( - '4x8x124', - 992, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-7936', - ), - 'v5p-8064': SystemCharacteristics( - '12x12x28', - 1008, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-8064', - ), - 'v5p-8192': SystemCharacteristics( - '16x16x16', - 1024, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-8192', - ), - 'v5p-8320': SystemCharacteristics( - '4x20x52', - 1040, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-8320', - ), - 'v5p-8448': SystemCharacteristics( - '8x12x44', - 1056, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-8448', - ), - 'v5p-8704': SystemCharacteristics( - '8x8x68', - 1088, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-8704', - ), - 'v5p-8832': SystemCharacteristics( - '4x12x92', - 1104, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-8832', - ), - 'v5p-8960': SystemCharacteristics( - '8x20x28', - 1120, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-8960', - ), - 'v5p-9216': SystemCharacteristics( - '12x16x24', - 1152, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-9216', - ), - 'v5p-9472': SystemCharacteristics( - '4x8x148', - 1184, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-9472', - ), - 'v5p-9600': SystemCharacteristics( - '12x20x20', - 1200, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-9600', - ), - 'v5p-9728': SystemCharacteristics( - '8x8x76', - 1216, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-9728', - ), - 'v5p-9856': SystemCharacteristics( - '4x28x44', - 1232, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-9856', - ), - 'v5p-9984': SystemCharacteristics( - '8x12x52', - 1248, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-9984', - ), - 'v5p-10240': SystemCharacteristics( - '16x16x20', - 1280, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-10240', - ), - 'v5p-10368': SystemCharacteristics( - '12x12x36', - 1296, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-10368', - ), - 'v5p-10496': SystemCharacteristics( - '4x8x164', - 1312, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-10496', - ), - 'v5p-10752': SystemCharacteristics( - '12x16x28', - 1344, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-10752', - ), - 'v5p-10880': SystemCharacteristics( - '4x20x68', - 1360, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-10880', - ), - 'v5p-11008': SystemCharacteristics( - '4x8x172', - 1376, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-11008', - ), - 'v5p-11136': SystemCharacteristics( - '4x12x116', - 1392, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-11136', - ), - 'v5p-11264': SystemCharacteristics( - '8x16x44', - 1408, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-11264', - ), - 'v5p-11520': SystemCharacteristics( - '12x20x24', - 1440, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-11520', - ), - 'v5p-11648': SystemCharacteristics( - '4x28x52', - 1456, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-11648', - ), - 'v5p-11776': SystemCharacteristics( - '8x8x92', - 1472, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-11776', - ), - 'v5p-11904': SystemCharacteristics( - '4x12x124', - 1488, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-11904', - ), - 'v5p-12032': SystemCharacteristics( - '4x8x188', - 1504, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-12032', - ), - 'v5p-12160': SystemCharacteristics( - '4x20x76', - 1520, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-12160', - ), - 'v5p-12288': SystemCharacteristics( - '16x16x24', - 1536, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-12288', - ), - 'v5p-13824': SystemCharacteristics( - '12x24x24', - 1728, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-13824', - ), - 'v5p-17920': SystemCharacteristics( - '16x20x28', - 2240, - 'tpu-v5p-slice', - 'ct5p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5p-17920', - ), - # v5litepod - 'v5litepod-8': SystemCharacteristics( - '2x4', - 2, - 'tpu-v5-lite-podslice', - 'ct5lp-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5litepod-8', - ), - 'v5litepod-16': SystemCharacteristics( - '4x4', - 4, - 'tpu-v5-lite-podslice', - 'ct5lp-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5litepod-16', - ), - 'v5litepod-32': SystemCharacteristics( - '4x8', - 8, - 'tpu-v5-lite-podslice', - 'ct5lp-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5litepod-32', - ), - 'v5litepod-64': SystemCharacteristics( - '8x8', - 16, - 'tpu-v5-lite-podslice', - 'ct5lp-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5litepod-64', - ), - 'v5litepod-128': SystemCharacteristics( - '8x16', - 32, - 'tpu-v5-lite-podslice', - 'ct5lp-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5litepod-128', - ), - 'v5litepod-256': SystemCharacteristics( - '16x16', - 64, - 'tpu-v5-lite-podslice', - 'ct5lp-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v5litepod-256', - ), - # v4 - 'v4-8': SystemCharacteristics( - '2x2x1', - 1, - 'tpu-v4-podslice', - 'ct4p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v4-8', - ), - 'v4-16': SystemCharacteristics( - '2x2x2', - 2, - 'tpu-v4-podslice', - 'ct4p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v4-16', - ), - 'v4-32': SystemCharacteristics( - '2x2x4', - 4, - 'tpu-v4-podslice', - 'ct4p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v4-32', - ), - 'v4-64': SystemCharacteristics( - '2x4x4', - 8, - 'tpu-v4-podslice', - 'ct4p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v4-64', - ), - 'v4-128': SystemCharacteristics( - '4x4x4', - 16, - 'tpu-v4-podslice', - 'ct4p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v4-128', - ), - 'v4-256': SystemCharacteristics( - '4x4x8', - 32, - 'tpu-v4-podslice', - 'ct4p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v4-256', - ), - 'v4-512': SystemCharacteristics( - '4x8x8', - 64, 'tpu-v4-podslice', 'ct4p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v4-512', - ), - 'v4-1024': SystemCharacteristics( - '8x8x8', - 128, - 'tpu-v4-podslice', - 'ct4p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v4-1024', - ), - 'v4-1536': SystemCharacteristics( - '8x8x12', - 192, - 'tpu-v4-podslice', - 'ct4p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v4-1536', - ), - 'v4-2048': SystemCharacteristics( - '8x8x16', - 256, - 'tpu-v4-podslice', - 'ct4p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v4-2048', - ), - 'v4-4096': SystemCharacteristics( - '8x16x16', - 512, - 'tpu-v4-podslice', - 'ct4p-hightpu-4t', - 4, - AcceleratorType['TPU'], - 'v4-4096', + [ + '2x2x1', + '2x2x2', + '2x2x4', + '2x4x4', + '4x4x4', + '4x4x8', + '4x8x8', + '8x8x8', + '8x8x12', + '8x8x16', + '8x16x16', + ], ), # CPU system characteristics. # Note that chips_per_vm is actually the number of vCPUs in that CPU. From 3290f9ec7900b0faa0c06436dc193aae50883ab6 Mon Sep 17 00:00:00 2001 From: gcie Date: Fri, 11 Jul 2025 07:32:27 +0000 Subject: [PATCH 3/7] fix tests internal representation of TPU machines has changed, so now grep that used the old format fails --- .github/workflows/build_tests.yaml | 6 +++--- .github/workflows/reusable_cluster_create.yaml | 2 +- .github/workflows/reusable_cluster_private.yaml | 2 +- .github/workflows/reusable_storage_create.yaml | 2 +- .github/workflows/reusable_storage_tests.yaml | 2 +- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/build_tests.yaml b/.github/workflows/build_tests.yaml index b3359852f..b2dabbe79 100644 --- a/.github/workflows/build_tests.yaml +++ b/.github/workflows/build_tests.yaml @@ -20,10 +20,10 @@ on: tpu-type: description: 'TPU Type' required: true - default: 'v4-8' + default: 'v4-2x2x1' type: choice options: - - v4-8 + - v4-2x2x1 push: branches: ["main","develop"] pull_request: # By default this runs for types assigned, opened and synchronize. @@ -75,7 +75,7 @@ jobs: - name: set tpu-type id: set-tpu-type run: | - echo tpu-type=v4-8 >> $GITHUB_OUTPUT + echo tpu-type=v4-2x2x1 >> $GITHUB_OUTPUT - name: set location id: set-location run: | diff --git a/.github/workflows/reusable_cluster_create.yaml b/.github/workflows/reusable_cluster_create.yaml index 5c3d78097..97285313d 100644 --- a/.github/workflows/reusable_cluster_create.yaml +++ b/.github/workflows/reusable_cluster_create.yaml @@ -26,7 +26,7 @@ on: tpu-type: description: 'TPU Type' required: false - default: 'v4-8' + default: 'v4-2x2x1' type: string tpu-type-dws: description: 'TPU Type for DWS flex nodepool' diff --git a/.github/workflows/reusable_cluster_private.yaml b/.github/workflows/reusable_cluster_private.yaml index 61a208f53..7e5534325 100644 --- a/.github/workflows/reusable_cluster_private.yaml +++ b/.github/workflows/reusable_cluster_private.yaml @@ -23,7 +23,7 @@ on: tpu-type: description: 'TPU Type' required: false - default: 'v4-8' + default: 'v4-2x2x1' type: string zone: type: string diff --git a/.github/workflows/reusable_storage_create.yaml b/.github/workflows/reusable_storage_create.yaml index f448efb98..7fcfad507 100644 --- a/.github/workflows/reusable_storage_create.yaml +++ b/.github/workflows/reusable_storage_create.yaml @@ -21,7 +21,7 @@ on: tpu-type: description: 'TPU Type' required: false - default: 'v4-8' + default: 'v4-2x2x1' type: string zone: type: string diff --git a/.github/workflows/reusable_storage_tests.yaml b/.github/workflows/reusable_storage_tests.yaml index dad8e789e..f31dc41f3 100644 --- a/.github/workflows/reusable_storage_tests.yaml +++ b/.github/workflows/reusable_storage_tests.yaml @@ -23,7 +23,7 @@ on: tpu-type: description: 'TPU Type' required: false - default: 'v4-8' + default: 'v4-2x2x1' type: string zone: type: string From 692dc8fa64339f58011d10b622bfec0669276c4b Mon Sep 17 00:00:00 2001 From: Farhad Sharabiani Date: Tue, 29 Jul 2025 06:45:42 +0000 Subject: [PATCH 4/7] fix device_type issue --- src/xpk/core/scheduling.py | 2 +- src/xpk/core/system_characteristics.py | 12 +++--------- 2 files changed, 4 insertions(+), 10 deletions(-) diff --git a/src/xpk/core/scheduling.py b/src/xpk/core/scheduling.py index 8bc18c66d..d8957e133 100644 --- a/src/xpk/core/scheduling.py +++ b/src/xpk/core/scheduling.py @@ -49,7 +49,7 @@ def check_if_workload_can_schedule(args, system: SystemCharacteristics) -> bool: missing_gke_accelerator_type = False if not cluster_config_map.get(system.gke_accelerator): xpk_print( - f'Gke Accelerator Type Check: {args.workload} is requesting' + f'GKE Accelerator Type Check: {args.workload} is requesting' f' {system.gke_accelerator} but cluster only contains' f' {cluster_config_map.keys()}. ' ) diff --git a/src/xpk/core/system_characteristics.py b/src/xpk/core/system_characteristics.py index 630c515e3..d76ded2d0 100644 --- a/src/xpk/core/system_characteristics.py +++ b/src/xpk/core/system_characteristics.py @@ -95,7 +95,6 @@ def get_system_characteristics_by_device_type( def get_tpu_system_characteristics_map( prefix: str, - tensorcores_per_chip: int, gke_accelerator: str, machine_type: str, supported_topologies: list[str], @@ -103,7 +102,6 @@ def get_tpu_system_characteristics_map( system_characteristics_map = {} for topology in supported_topologies: total_chips = reduce(mul, (int(x) for x in topology.split('x')), 1) - num_tensorcores = total_chips * tensorcores_per_chip chips_per_vm = 1 if total_chips == 1 else 4 vms_per_slice = total_chips // chips_per_vm system = SystemCharacteristics( @@ -113,10 +111,10 @@ def get_tpu_system_characteristics_map( machine_type, chips_per_vm, AcceleratorType['TPU'], - f'{prefix}-{topology}', + f'{prefix}-{total_chips}', ) system_characteristics_map[f'{prefix}-{topology}'] = system - system_characteristics_map[f'{prefix}-{num_tensorcores}'] = system + system_characteristics_map[f'{prefix}-{total_chips}'] = system return system_characteristics_map @@ -243,18 +241,16 @@ def get_tpu_system_characteristics_map( ), # TPU system characteristics **get_tpu_system_characteristics_map( - 'v6e', 1, 'tpu-v6e-slice', 'ct6e-standard-1t', ['1x1'] + 'v6e', 'tpu-v6e-slice', 'ct6e-standard-1t', ['1x1'] ), **get_tpu_system_characteristics_map( 'v6e', - 1, 'tpu-v6e-slice', 'ct6e-standard-4t', ['2x2', '2x4', '4x4', '4x8', '8x8', '8x16', '16x16'], ), **get_tpu_system_characteristics_map( 'v5p', - 2, 'tpu-v5p-slice', 'ct5p-hightpu-4t', [ @@ -358,14 +354,12 @@ def get_tpu_system_characteristics_map( ), **get_tpu_system_characteristics_map( 'v5litepod', - 1, 'tpu-v5-lite-podslice', 'ct5lp-hightpu-4t', ['2x4', '4x4', '4x8', '8x8', '8x16', '16x16'], ), **get_tpu_system_characteristics_map( 'v4', - 2, 'tpu-v4-podslice', 'ct4p-hightpu-4t', [ From 060637ab008e959898b6265b8539471d75f9e8ad Mon Sep 17 00:00:00 2001 From: Farhad Sharabiani Date: Tue, 29 Jul 2025 07:22:40 +0000 Subject: [PATCH 5/7] bring back tensorcore count --- src/xpk/core/system_characteristics.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/xpk/core/system_characteristics.py b/src/xpk/core/system_characteristics.py index d76ded2d0..68a5b89c2 100644 --- a/src/xpk/core/system_characteristics.py +++ b/src/xpk/core/system_characteristics.py @@ -95,6 +95,7 @@ def get_system_characteristics_by_device_type( def get_tpu_system_characteristics_map( prefix: str, + tensorcores_per_chip: int, gke_accelerator: str, machine_type: str, supported_topologies: list[str], @@ -102,6 +103,7 @@ def get_tpu_system_characteristics_map( system_characteristics_map = {} for topology in supported_topologies: total_chips = reduce(mul, (int(x) for x in topology.split('x')), 1) + num_tensorcores = total_chips * tensorcores_per_chip chips_per_vm = 1 if total_chips == 1 else 4 vms_per_slice = total_chips // chips_per_vm system = SystemCharacteristics( @@ -111,10 +113,10 @@ def get_tpu_system_characteristics_map( machine_type, chips_per_vm, AcceleratorType['TPU'], - f'{prefix}-{total_chips}', + f'{prefix}-{num_tensorcores}', ) system_characteristics_map[f'{prefix}-{topology}'] = system - system_characteristics_map[f'{prefix}-{total_chips}'] = system + system_characteristics_map[f'{prefix}-{num_tensorcores}'] = system return system_characteristics_map @@ -241,16 +243,18 @@ def get_tpu_system_characteristics_map( ), # TPU system characteristics **get_tpu_system_characteristics_map( - 'v6e', 'tpu-v6e-slice', 'ct6e-standard-1t', ['1x1'] + 'v6e', 1, 'tpu-v6e-slice', 'ct6e-standard-1t', ['1x1'] ), **get_tpu_system_characteristics_map( 'v6e', + 1, 'tpu-v6e-slice', 'ct6e-standard-4t', ['2x2', '2x4', '4x4', '4x8', '8x8', '8x16', '16x16'], ), **get_tpu_system_characteristics_map( 'v5p', + 2, 'tpu-v5p-slice', 'ct5p-hightpu-4t', [ @@ -354,12 +358,14 @@ def get_tpu_system_characteristics_map( ), **get_tpu_system_characteristics_map( 'v5litepod', + 1, 'tpu-v5-lite-podslice', 'ct5lp-hightpu-4t', ['2x4', '4x4', '4x8', '8x8', '8x16', '16x16'], ), **get_tpu_system_characteristics_map( 'v4', + 2, 'tpu-v4-podslice', 'ct4p-hightpu-4t', [ From b030db2ddf3963fe09449f9ee8ad1f5e9a1a8956 Mon Sep 17 00:00:00 2001 From: Farhad Sharabiani Date: Tue, 29 Jul 2025 07:27:01 +0000 Subject: [PATCH 6/7] node_selector fix --- src/xpk/core/capacity.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/xpk/core/capacity.py b/src/xpk/core/capacity.py index 980ec0835..93f2d672c 100644 --- a/src/xpk/core/capacity.py +++ b/src/xpk/core/capacity.py @@ -232,7 +232,7 @@ def get_capacity_node_selectors_from_capacity_type( case CapacityType.ON_DEMAND.name: node_selector = '' case CapacityType.FLEX_START.name: - node_selector = 'cloud.google.com/gke-queued="true"' + node_selector = 'cloud.google.com/gke-queued: "true"' case CapacityType.SPOT.name: node_selector = 'cloud.google.com/gke-spot: "true"' case CapacityType.RESERVATION.name: From 4fb9771bd211486317e858586aa56a80d1a6b01e Mon Sep 17 00:00:00 2001 From: Farhad Sharabiani Date: Tue, 29 Jul 2025 13:33:13 +0000 Subject: [PATCH 7/7] fix tests --- .github/workflows/build_tests.yaml | 16 +++++++++++----- .github/workflows/reusable_cluster_create.yaml | 2 +- .github/workflows/reusable_cluster_private.yaml | 2 +- .github/workflows/reusable_storage_create.yaml | 2 +- .github/workflows/reusable_storage_tests.yaml | 2 +- .github/workflows/reusable_workload_tests.yaml | 5 ++++- 6 files changed, 19 insertions(+), 10 deletions(-) diff --git a/.github/workflows/build_tests.yaml b/.github/workflows/build_tests.yaml index b2dabbe79..58b795ca7 100644 --- a/.github/workflows/build_tests.yaml +++ b/.github/workflows/build_tests.yaml @@ -20,10 +20,10 @@ on: tpu-type: description: 'TPU Type' required: true - default: 'v4-2x2x1' + default: 'v4-8' type: choice options: - - v4-2x2x1 + - v4-8 push: branches: ["main","develop"] pull_request: # By default this runs for types assigned, opened and synchronize. @@ -40,6 +40,7 @@ jobs: group-name: ${{ steps.set-group-name.outputs.group-name }} zone: ${{ steps.set-zone.outputs.zone }} tpu-type: ${{ steps.set-tpu-type.outputs.tpu-type }} + tpu-type-topology: ${{ steps.set-tpu-type-topology.outputs.tpu-type-topology }} location: ${{steps.set-location.outputs.location}} run-id: ${{steps.set-run-id.outputs.run-id}} steps: @@ -75,7 +76,11 @@ jobs: - name: set tpu-type id: set-tpu-type run: | - echo tpu-type=v4-2x2x1 >> $GITHUB_OUTPUT + echo tpu-type=v4-8 >> $GITHUB_OUTPUT + - name: set tpu-type-topology + id: set-tpu-type-topology + run: | + echo tpu-type-topology=v4-2x2x1 >> $GITHUB_OUTPUT - name: set location id: set-location run: | @@ -152,7 +157,7 @@ jobs: with: run-id: '${{needs.set-variables.outputs.run-id}}' cluster-name: '${{needs.set-variables.outputs.cluster-name}}' - tpu-type: '${{needs.set-variables.outputs.tpu-type || inputs.tpu-type}}' + tpu-type: '${{needs.set-variables.outputs.tpu-type-topology || inputs.tpu-type}}' zone: '${{needs.set-variables.outputs.zone}}' location: '${{needs.set-variables.outputs.location}}' secrets: inherit @@ -165,7 +170,7 @@ jobs: with: cluster-name-dws: '${{needs.set-variables.outputs.cluster-name-dws}}' cluster-name: '${{needs.set-variables.outputs.cluster-name}}' - tpu-type: '${{needs.set-variables.outputs.tpu-type || inputs.tpu-type}}' + tpu-type: '${{needs.set-variables.outputs.tpu-type-topology || inputs.tpu-type}}' zone: '${{needs.set-variables.outputs.zone}}' location: '${{needs.set-variables.outputs.location}}' run-id: '${{needs.set-variables.outputs.run-id}}' @@ -180,6 +185,7 @@ jobs: cluster-name: ${{needs.set-variables.outputs.cluster-name}} cluster-name-dws: '${{needs.set-variables.outputs.cluster-name-dws}}' tpu-type: ${{needs.set-variables.outputs.tpu-type}} + tpu-type-topology: ${{needs.set-variables.outputs.tpu-type-topology}} zone: ${{needs.set-variables.outputs.zone}} run-id: '${{needs.set-variables.outputs.run-id}}' secrets: inherit diff --git a/.github/workflows/reusable_cluster_create.yaml b/.github/workflows/reusable_cluster_create.yaml index 97285313d..5c3d78097 100644 --- a/.github/workflows/reusable_cluster_create.yaml +++ b/.github/workflows/reusable_cluster_create.yaml @@ -26,7 +26,7 @@ on: tpu-type: description: 'TPU Type' required: false - default: 'v4-2x2x1' + default: 'v4-8' type: string tpu-type-dws: description: 'TPU Type for DWS flex nodepool' diff --git a/.github/workflows/reusable_cluster_private.yaml b/.github/workflows/reusable_cluster_private.yaml index 7e5534325..61a208f53 100644 --- a/.github/workflows/reusable_cluster_private.yaml +++ b/.github/workflows/reusable_cluster_private.yaml @@ -23,7 +23,7 @@ on: tpu-type: description: 'TPU Type' required: false - default: 'v4-2x2x1' + default: 'v4-8' type: string zone: type: string diff --git a/.github/workflows/reusable_storage_create.yaml b/.github/workflows/reusable_storage_create.yaml index 7fcfad507..f448efb98 100644 --- a/.github/workflows/reusable_storage_create.yaml +++ b/.github/workflows/reusable_storage_create.yaml @@ -21,7 +21,7 @@ on: tpu-type: description: 'TPU Type' required: false - default: 'v4-2x2x1' + default: 'v4-8' type: string zone: type: string diff --git a/.github/workflows/reusable_storage_tests.yaml b/.github/workflows/reusable_storage_tests.yaml index f31dc41f3..dad8e789e 100644 --- a/.github/workflows/reusable_storage_tests.yaml +++ b/.github/workflows/reusable_storage_tests.yaml @@ -23,7 +23,7 @@ on: tpu-type: description: 'TPU Type' required: false - default: 'v4-2x2x1' + default: 'v4-8' type: string zone: type: string diff --git a/.github/workflows/reusable_workload_tests.yaml b/.github/workflows/reusable_workload_tests.yaml index 6bf5f14d6..fad5034cd 100644 --- a/.github/workflows/reusable_workload_tests.yaml +++ b/.github/workflows/reusable_workload_tests.yaml @@ -24,6 +24,9 @@ on: tpu-type: required: true type: string + tpu-type-topology: + required: true + type: string tpu-type-dws: required: false type: string @@ -108,7 +111,7 @@ jobs: --docker-password='${{secrets.GCP_SA_KEY}}' \ --docker-email='${{secrets.GCP_SA_EMAIL}}' - name: Run workload with private image - run: python xpk.py workload create --cluster ${{inputs.cluster-name}} --workload $PRIVATE_IMAGE_WORKLOAD_NAME --command "echo foo" --tpu-type=${{inputs.tpu-type}} --num-slices=1 --zone=${{inputs.zone}} --docker-image=${{secrets.DOCKER_REPO_SERVER}}ubuntu2004 --docker-image-pull-secret=gcr-key + run: python xpk.py workload create --cluster ${{inputs.cluster-name}} --workload $PRIVATE_IMAGE_WORKLOAD_NAME --command "echo foo" --tpu-type=${{inputs.tpu-type-topology}} --num-slices=1 --zone=${{inputs.zone}} --docker-image=${{secrets.DOCKER_REPO_SERVER}}ubuntu2004 --docker-image-pull-secret=gcr-key - name: Wait for private image workload completion and confirm it succeeded run: python3 xpk.py workload list --cluster ${{inputs.cluster-name}} --zone=${{inputs.zone}} --wait-for-job-completion $PRIVATE_IMAGE_WORKLOAD_NAME --timeout 300 - name: Delete kubectl secret