Skip to content

Commit 95d7ddb

Browse files
authored
Merge branch 'develop' into wstcliyu/kueue-tpu-flavor
2 parents 1bb60af + cd17009 commit 95d7ddb

37 files changed

+1772
-299
lines changed

.github/workflows/build_tests.yaml

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,10 +20,10 @@ on:
2020
tpu-type:
2121
description: 'TPU Type'
2222
required: true
23-
default: 'v4-2x2x1'
23+
default: 'v4-8'
2424
type: choice
2525
options:
26-
- v4-2x2x1
26+
- v4-8
2727
push:
2828
branches: ["main","develop"]
2929
pull_request: # By default this runs for types assigned, opened and synchronize.
@@ -36,6 +36,7 @@ jobs:
3636
cancel-in-progress: true
3737
outputs:
3838
cluster-name: ${{ steps.set-cluster-name.outputs.cluster-name }}
39+
cluster-name-dws: ${{ steps.set-cluster-name-dws.outputs.cluster-name-dws }}
3940
group-name: ${{ steps.set-group-name.outputs.group-name }}
4041
zone: ${{ steps.set-zone.outputs.zone }}
4142
tpu-type: ${{ steps.set-tpu-type.outputs.tpu-type }}
@@ -59,6 +60,10 @@ jobs:
5960
id: set-cluster-name
6061
run: |
6162
echo cluster-name=build-xpk-2-nodepools-${{steps.set-run-id.outputs.run-id}} >> $GITHUB_OUTPUT
63+
- name: set cluster-name-dws
64+
id: set-cluster-name-dws
65+
run: |
66+
echo cluster-name-dws=build-xpk-2-nodepools-dws-${{steps.set-run-id.outputs.run-id}} >> $GITHUB_OUTPUT
6267
- name: set group-name
6368
id: set-group-name
6469
run: |
@@ -70,7 +75,7 @@ jobs:
7075
- name: set tpu-type
7176
id: set-tpu-type
7277
run: |
73-
echo tpu-type=v4-2x2x1 >> $GITHUB_OUTPUT
78+
echo tpu-type=v4-8 >> $GITHUB_OUTPUT
7479
- name: set location
7580
id: set-location
7681
run: |
@@ -158,6 +163,7 @@ jobs:
158163
cancel-in-progress: true
159164
uses: ./.github/workflows/reusable_cluster_create.yaml
160165
with:
166+
cluster-name-dws: '${{needs.set-variables.outputs.cluster-name-dws}}'
161167
cluster-name: '${{needs.set-variables.outputs.cluster-name}}'
162168
tpu-type: '${{needs.set-variables.outputs.tpu-type || inputs.tpu-type}}'
163169
zone: '${{needs.set-variables.outputs.zone}}'
@@ -172,6 +178,7 @@ jobs:
172178
cancel-in-progress: true
173179
with:
174180
cluster-name: ${{needs.set-variables.outputs.cluster-name}}
181+
cluster-name-dws: '${{needs.set-variables.outputs.cluster-name-dws}}'
175182
tpu-type: ${{needs.set-variables.outputs.tpu-type}}
176183
zone: ${{needs.set-variables.outputs.zone}}
177184
run-id: '${{needs.set-variables.outputs.run-id}}'
@@ -204,6 +211,7 @@ jobs:
204211
needs: [set-variables, storage-tests]
205212
uses: ./.github/workflows/reusable_cluster_delete.yaml
206213
with:
214+
cluster-name-dws: ${{needs.set-variables.outputs.cluster-name-dws}}
207215
cluster-name: ${{needs.set-variables.outputs.cluster-name}}
208216
run-id: ${{needs.set-variables.outputs.run-id}}
209217
zone: ${{needs.set-variables.outputs.zone}}

.github/workflows/reusable_cluster_create.yaml

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,14 +20,26 @@ on:
2020
cluster-name:
2121
type: string
2222
required: true
23+
cluster-name-dws:
24+
type: string
25+
required: true
2326
tpu-type:
2427
description: 'TPU Type'
2528
required: false
26-
default: 'v4-2x2x1'
29+
default: 'v4-8'
30+
type: string
31+
tpu-type-dws:
32+
description: 'TPU Type for DWS flex nodepool'
2733
type: string
34+
required: false
35+
default: v5p-8
2836
zone:
2937
type: string
3038
required: true
39+
zone-dws:
40+
type: string
41+
required: false
42+
default: us-east5-a
3143
location:
3244
type: string
3345
required: true
@@ -39,6 +51,7 @@ on:
3951
env:
4052
# Names must be unique in parallel running tests.
4153
CLUSTER_ARGUMENTS: "--network=${{secrets.NETWORK_NAME}} --subnetwork=${{secrets.SUBNETWORK_NAME}} --maintenance-window=23:50"
54+
CLUSTER_ARGUMENTS_DWS: "--network=${{secrets.NETWORK_NAME}} --subnetwork=${{secrets.SUBNETWORK_NAME_DWS}} --maintenance-window=23:50"
4255

4356
jobs:
4457
cluster-create:
@@ -76,6 +89,8 @@ jobs:
7689
gcloud config get compute/zone
7790
- name: Check xpk installation
7891
run: xpk --help
92+
- name: Create a DWS flex queued xpk cluster
93+
run: python xpk.py cluster create --cluster ${{inputs.cluster-name-dws}} --tpu-type=${{inputs.tpu-type-dws}} --num-slices=1 --zone=${{inputs.zone-dws}} --default-pool-cpu-num-nodes=2 --flex --enable-gcpfilestore-csi-driver --enable-gcsfuse-csi-driver --custom-cluster-arguments="${CLUSTER_ARGUMENTS_DWS}"
7994
- name: Create a Pathways-enabled XPK Cluster with 2x ${{inputs.tpu-type}} nodepools. Larger num-nodes to avoid master resizing.
8095
run: python xpk.py cluster create-pathways --cluster ${{inputs.cluster-name}} --tpu-type=${{inputs.tpu-type}} --num-slices=1 --zone=${{inputs.zone}} --default-pool-cpu-machine-type=n1-standard-16 --default-pool-cpu-num-nodes=2 --reservation='${{ secrets.GCP_TPU_V4_RESERVATION }}' --enable-gcpfilestore-csi-driver --enable-gcsfuse-csi-driver --custom-cluster-arguments="${CLUSTER_ARGUMENTS}"
8196

.github/workflows/reusable_cluster_delete.yaml

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,13 @@ on:
2020
zone:
2121
required: true
2222
type: string
23+
cluster-name-dws:
24+
required: true
25+
type: string
26+
zone-dws:
27+
required: false
28+
type: string
29+
default: us-east5-a
2330
run-id:
2431
required: true
2532
type: string
@@ -61,4 +68,7 @@ jobs:
6168
run: xpk --help
6269
- name: Delete the cluster created
6370
if: always()
64-
run: python xpk.py cluster delete --cluster ${{inputs.cluster-name}} --zone=${{inputs.zone}} --force
71+
run: python xpk.py cluster delete --cluster ${{inputs.cluster-name}} --zone=${{inputs.zone}} --force
72+
- name: Delete the cluster created
73+
if: always()
74+
run: python xpk.py cluster delete --cluster ${{inputs.cluster-name-dws}} --zone=${{inputs.zone-dws}} --force

.github/workflows/reusable_cluster_private.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ on:
2323
tpu-type:
2424
description: 'TPU Type'
2525
required: false
26-
default: 'v4-2x2x1'
26+
default: 'v4-8'
2727
type: string
2828
zone:
2929
type: string

.github/workflows/reusable_storage_create.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ on:
2121
tpu-type:
2222
description: 'TPU Type'
2323
required: false
24-
default: 'v4-2x2x1'
24+
default: 'v4-8'
2525
type: string
2626
zone:
2727
type: string

.github/workflows/reusable_storage_tests.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ on:
2323
tpu-type:
2424
description: 'TPU Type'
2525
required: false
26-
default: 'v4-2x2x1'
26+
default: 'v4-8'
2727
type: string
2828
zone:
2929
type: string

.github/workflows/reusable_workload_tests.yaml

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,18 +18,30 @@ on:
1818
cluster-name:
1919
required: true
2020
type: string
21+
cluster-name-dws:
22+
required: true
23+
type: string
2124
tpu-type:
2225
required: true
2326
type: string
27+
tpu-type-dws:
28+
required: false
29+
type: string
30+
default: v5p-8
2431
zone:
2532
required: true
2633
type: string
34+
zone-dws:
35+
required: false
36+
type: string
37+
default: us-east5-a
2738
run-id:
2839
required: true
2940
type: string
3041

3142
env:
3243
WORKLOAD_NAME: xpktest-build-${{ github.run_attempt }}
44+
WORKLOAD_NAME_DWS: xpktest-build-${{ github.run_attempt }}-dws
3345
PRIVATE_IMAGE_WORKLOAD_NAME: xpktest-build-priv-${{ github.run_attempt }}
3446
PATHWAYS_WORKLOAD_NAME: xpkpw-build-${{ github.run_attempt }}
3547

@@ -78,6 +90,10 @@ jobs:
7890
run: python3 xpk.py inspector --cluster ${{inputs.cluster-name}} --zone=${{inputs.zone}} --workload $WORKLOAD_NAME
7991
- name: Wait for workload completion and confirm it succeeded
8092
run: python3 xpk.py workload list --cluster ${{inputs.cluster-name}} --zone=${{inputs.zone}} --wait-for-job-completion $WORKLOAD_NAME --timeout 300
93+
- name: Run dws flex queued TPU workload
94+
run: python3 xpk.py workload create --workload $WORKLOAD_NAME_DWS --cluster ${{inputs.cluster-name-dws}} --zone=${{inputs.zone-dws}} --tpu-type=${{inputs.tpu-type-dws}} --flex --command "echo foo" --num-slices=1
95+
- name: Wait for workload completion and confirm it succeeded
96+
run: python3 xpk.py workload list --cluster ${{inputs.cluster-name-dws}} --zone=${{inputs.zone-dws}} --wait-for-job-completion $WORKLOAD_NAME_DWS --timeout 500
8197
- name: Run a Pathways workload on Ubuntu base image
8298
run: python xpk.py workload create-pathways --cluster ${{inputs.cluster-name}} --workload $PATHWAYS_WORKLOAD_NAME --docker-image='marketplace.gcr.io/google/ubuntu2004' --tpu-type=${{inputs.tpu-type}} --num-slices=1 --zone=${{inputs.zone}} --command "echo \"Hello world from a test script! \""
8399
- name: Wait for Pathways workload completion and confirm it succeeded

README.md

Lines changed: 44 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -227,6 +227,13 @@ all zones.
227227
--num-slices=4 --spot
228228
```
229229

230+
* Cluster Create (DWS flex queued capacity):
231+
```shell
232+
python3 xpk.py cluster create \
233+
--cluster xpk-test --tpu-type=v5litepod-16 \
234+
--num-slices=4 --flex
235+
```
236+
230237
* Cluster Create for Pathways:
231238
Pathways compatible cluster can be created using `cluster create-pathways`.
232239
```shell
@@ -463,6 +470,7 @@ Currently, the below flags/arguments are supported for A3 Mega, A3 Ultra and A4
463470
* `--reservation`
464471
* `--spot`
465472
* `--on-demand` (A3 Mega only)
473+
* `--flex`
466474

467475
## Running XPK on existing clusters
468476

@@ -486,9 +494,10 @@ Currently XPK supports the below types of storages:
486494
- [Google Cloud Filestore](#filestore)
487495
- [Google Cloud Parallelstore](#parallelstore)
488496
- [Google Cloud Block storages (Persistent Disk, Hyperdisk)](#block-storage-persistent-disk-hyperdisk)
497+
- [Google Cloud Managed Lustre](#managed-lustre)
489498

490499
### FUSE
491-
A FUSE adapter lets you mount and access Cloud Storage buckets as local file systems, so applications can read and write objects in your bucket using standard file system semantics.
500+
A FUSE adapter lets you mount and access Cloud Storage buckets as local file systems, so workloads can read and write objects in your bucket using standard file system semantics.
492501

493502
To use the GCS FUSE with XPK you need to create a [Storage Bucket](https://console.cloud.google.com/storage/).
494503

@@ -515,7 +524,7 @@ Parameters:
515524
516525
### Filestore
517526
518-
A Filestore adapter lets you mount and access [Filestore instances](https://cloud.google.com/filestore/) as local file systems, so applications can read and write files in your volumes using standard file system semantics.
527+
A Filestore adapter lets you mount and access [Filestore instances](https://cloud.google.com/filestore/) as local file systems, so workloads can read and write files in your volumes using standard file system semantics.
519528
520529
To create and attach a GCP Filestore instance to your cluster use `xpk storage create` command with `--type=gcpfilestore`:
521530
@@ -551,7 +560,7 @@ Commands `xpk storage create` and `xpk storage attach` with `--type=gcpfilestore
551560
552561
### Parallelstore
553562
554-
A Parallelstore adapter lets you mount and access [Parallelstore instances](https://cloud.google.com/parallelstore/) as local file systems, so applications can read and write files in your volumes using standard file system semantics.
563+
A Parallelstore adapter lets you mount and access [Parallelstore instances](https://cloud.google.com/parallelstore/) as local file systems, so workloads can read and write files in your volumes using standard file system semantics.
555564
556565
To use the GCS Parallelstore with XPK you need to create a [Parallelstore Instance](https://console.cloud.google.com/parallelstore/).
557566
@@ -575,7 +584,7 @@ Parameters:
575584

576585
### Block storage (Persistent Disk, Hyperdisk)
577586

578-
A PersistentDisk adapter lets you mount and access Google Cloud Block storage solutions ([Persistent Disk](https://cloud.google.com/kubernetes-engine/docs/concepts/storage-overview#pd), [Hyperdisk](https://cloud.google.com/kubernetes-engine/docs/concepts/storage-overview#hyperdisk)) as local file systems, so applications can read and write files in your volumes using standard file system semantics.
587+
A PersistentDisk adapter lets you mount and access Google Cloud Block storage solutions ([Persistent Disk](https://cloud.google.com/kubernetes-engine/docs/concepts/storage-overview#pd), [Hyperdisk](https://cloud.google.com/kubernetes-engine/docs/concepts/storage-overview#hyperdisk)) as local file systems, so workloads can read and write files in your volumes using standard file system semantics.
579588

580589
To use the GCE PersistentDisk with XPK you need to create a [disk in GCE](https://cloud.google.com/compute/docs/disks). Please make sure that the disk type you are creating is [compatible with the VMs](https://cloud.google.com/compute/docs/machine-resource#machine_type_comparison) in the default and accelerator nodepools.
581590

@@ -597,6 +606,30 @@ Parameters:
597606
- `--readonly` - if set to true, workload can only read from storage.
598607
- `--manifest` - path to the manifest file containing PersistentVolume and PersistentVolumeClaim definitions.
599608
609+
### Managed Lustre
610+
611+
A Managed Lustre adapter lets you mount and access [Google Cloud Managed Lustre instances](https://cloud.google.com/kubernetes-engine/docs/concepts/managed-lustre) as local file systems, so workloads can read and write files in your volumes using standard file system semantics.
612+
613+
To use GCP Managed Lustre with XPK you need to create [an instance](https://cloud.google.com/managed-lustre/docs/create-instance). Please make sure you enable GKE support when creating the instance (for example, with gcloud, pass the `--gke-support-enabled` flag).
614+
615+
Once it's ready, you can use the `xpk storage attach` command with `--type=lustre` to attach a Managed Lustre instance to your cluster. Currently, attaching a Managed Lustre instance is supported only by providing a manifest file.
616+
617+
```shell
618+
python3 xpk.py storage attach test-lustre-storage --type=lustre \
619+
--project=$PROJECT --cluster=$CLUSTER --zone=$ZONE \
620+
--mount-point='/test-mount-point' --readonly=false \
621+
--auto-mount=true \
622+
--manifest='./examples/storage/lustre-manifest-attach.yaml'
623+
```
624+
625+
Parameters:
626+
627+
- `--type` - type of the storage `lustre`
628+
- `--auto-mount` - if set to true all workloads will have this storage mounted by default.
629+
- `--mount-point` - the path on which this storage should be mounted for a workload.
630+
- `--readonly` - if set to true, workload can only read from storage.
631+
- `--manifest` - path to the manifest file containing PersistentVolume and PersistentVolumeClaim definitions.
632+
600633
### List attached storages
601634

602635
```shell
@@ -638,8 +671,14 @@ python3 xpk.py storage delete test-fs-instance \
638671
python3 xpk.py workload create \
639672
--workload xpk-test-workload --command "echo goodbye" \
640673
--cluster xpk-test \
641-
--tpu-type=v5litepod-16 --projet=$PROJECT
674+
--tpu-type=v5litepod-16 --project=$PROJECT
642675
```
676+
* Workload Create (DWS flex with queued provisioning):
677+
```shell
678+
python3 xpk.py workload create \
679+
--workload xpk-test-workload --command "echo goodbye" \
680+
--cluster xpk-test --flex \
681+
--tpu-type=v5litepod-16 --project=$PROJECT
```
643682
644683
* Workload Create for Pathways:
645684
Pathways workload can be submitted using `workload create-pathways` on a Pathways enabled cluster (created with `cluster create-pathways`)
Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
apiVersion: v1
2+
kind: PersistentVolume
3+
metadata:
4+
name: xpk-lustre-pv
5+
spec:
6+
storageClassName: ""
7+
capacity:
8+
storage: STORAGE_SIZE #ex. 36000Gi
9+
accessModes:
10+
- ReadWriteMany
11+
persistentVolumeReclaimPolicy: Retain
12+
volumeMode: Filesystem
13+
claimRef:
14+
namespace: default
15+
name: xpk-lustre-pvc
16+
csi:
17+
driver: lustre.csi.storage.gke.io
18+
volumeHandle: "PROJECT_ID/ZONE/INSTANCE_NAME"
19+
volumeAttributes:
20+
ip: IP_ADDRESS
21+
filesystem: FILE_SYSTEM
22+
---
23+
kind: PersistentVolumeClaim
24+
apiVersion: v1
25+
metadata:
26+
name: xpk-lustre-pvc
27+
spec:
28+
accessModes:
29+
- ReadWriteMany
30+
storageClassName: ""
31+
volumeName: xpk-lustre-pv
32+
resources:
33+
requests:
34+
storage: STORAGE_SIZE

src/xpk/blueprints/a3mega/kueue-xpk-configuration.yaml.tftpl

Lines changed: 28 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,30 @@ metadata:
1515
name: "1xh100-mega-80gb-8"
1616
spec:
1717
nodeLabels:
18-
cloud.google.com/gke-accelerator: nvidia-h100-mega-80gb
19-
${tas_name}
18+
cloud.google.com/gke-accelerator: "nvidia-h100-mega-80gb"
19+
%{~ if reservation==1 ~}
20+
topologyName: "gke-default"
21+
%{ endif }
22+
---
23+
apiVersion: kueue.x-k8s.io/v1beta1
24+
kind: ProvisioningRequestConfig
25+
metadata:
26+
name: dws-config
27+
spec:
28+
provisioningClassName: queued-provisioning.gke.io
29+
managedResources:
30+
- nvidia.com/gpu
31+
---
32+
apiVersion: kueue.x-k8s.io/v1beta1
33+
kind: AdmissionCheck
34+
metadata:
35+
name: dws-prov
36+
spec:
37+
controllerName: kueue.x-k8s.io/provisioning-request
38+
parameters:
39+
apiGroup: kueue.x-k8s.io
40+
kind: ProvisioningRequestConfig
41+
name: dws-config
2042
---
2143
apiVersion: kueue.x-k8s.io/v1beta1
2244
kind: ClusterQueue
@@ -35,6 +57,10 @@ spec:
3557
nominalQuota: 10000
3658
- name: "memory"
3759
nominalQuota: 10000Gi
60+
%{~ if flex_start==1 ~}
61+
admissionChecks:
62+
- dws-prov
63+
%{ endif }
3864
---
3965
apiVersion: kueue.x-k8s.io/v1beta1
4066
kind: LocalQueue

0 commit comments

Comments
 (0)