Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions clusterloader2/pkg/dependency/dra/dra.go
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,10 @@ func (d *draDependency) Setup(config *dependency.Config) error {
"Namespace": namespace,
"WorkerNodeCount": getWorkerCount(config),
}

if extendedResourceName, ok := config.Params["ExtendedResourceName"]; ok {
mapping["ExtendedResourceName"] = extendedResourceName
}
if err := config.ClusterFramework.ApplyTemplatedManifests(
manifestsFS,
"manifests/*.yaml",
Expand Down
5 changes: 4 additions & 1 deletion clusterloader2/pkg/dependency/dra/manifests/deviceclass.yaml
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
---
# Source: dra-example-driver/templates/deviceclass.yaml
apiVersion: resource.k8s.io/v1beta1
apiVersion: resource.k8s.io/v1
kind: DeviceClass
metadata:
name: gpu.example.com
spec:
selectors:
- cel:
expression: "device.driver == 'gpu.example.com'"
{{- if .ExtendedResourceName}}
extendedResourceName: "{{.ExtendedResourceName}}"
{{- end}}
17 changes: 16 additions & 1 deletion clusterloader2/testing/dra/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,15 @@
{{$smallJobSize := 1}}
{{$smallJobCompletions := 10}}
{{$jobRunningTime := DefaultParam .CL2_JOB_RUNNING_TIME "30s"}}
{{$ENABLE_EXTENDED_RESOURCES := DefaultParam .CL2_ENABLE_EXTENDED_RESOURCES false}}

{{$extendedResourceName := ""}}
{{if $ENABLE_EXTENDED_RESOURCES}}
{{$extendedResourceName = DefaultParam .CL2_EXTENDED_RESOURCE_NAME "example.com/gpu"}}
Comment on lines +28 to +30
Copy link
Contributor

@serathius serathius Oct 30, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
{{$extendedResourceName := ""}}
{{if $ENABLE_EXTENDED_RESOURCES}}
{{$extendedResourceName = DefaultParam .CL2_EXTENDED_RESOURCE_NAME "example.com/gpu"}}
{{$extendedResourceName = DefaultParam .CL2_EXTENDED_RESOURCE_NAME "example.com/gpu"}}
{{if $ENABLE_EXTENDED_RESOURCES}}

Nit: Always defaulting $extendedResourceName simplifies the code and has no bad side effects, since the variable is only ever used when $ENABLE_EXTENDED_RESOURCES is set.

name: dra-extended-resources-steady-state
{{else}}
name: dra-steady-state
{{end}}

namespace:
number: {{$namespaces}}
Expand All @@ -37,11 +44,15 @@ tuningSets:
qpsLoad:
qps: {{$STEADY_STATE_QPS}}


dependencies:
- name: Install dra-example-driver for test
Method: DRATestDriver
Params:
WorkerNodeCount: {{.Nodes}}
{{if $ENABLE_EXTENDED_RESOURCES}}
ExtendedResourceName: {{$extendedResourceName}}
{{end}}
Timeout: 5m

steps:
Expand Down Expand Up @@ -89,6 +100,7 @@ steps:
query: histogram_quantile(0.99, sum(rate(dra_grpc_operations_duration_seconds_bucket{method_name=~".*NodePrepareResources"}[%v])) by (le))
- name: p99_dra_grpc_node_unprepare_resources
query: histogram_quantile(0.99, sum(rate(dra_grpc_operations_duration_seconds_bucket{method_name=~".*NodeUnprepareResources"}[%v])) by (le))
{{if not $ENABLE_EXTENDED_RESOURCES}}
- name: Create ResourceClaimTemplates in namespaces
phases:
- namespaceRange:
Expand All @@ -99,6 +111,7 @@ steps:
objectBundle:
- basename: single-gpu
objectTemplatePath: "resourceclaimtemplate.yaml"
{{end}}
- name: Fill cluster to {{$fillPercentage}}% utilization
phases:
- namespaceRange:
Expand All @@ -113,6 +126,7 @@ steps:
Replicas: {{$longJobSize}}
Mode: {{$MODE}}
Sleep: {{$longJobRunningTime}}
ExtendedResource: {{ $ENABLE_EXTENDED_RESOURCES }}
- name: Wait for fill pods to be running
measurements:
- Identifier: WaitForControlledPodsRunning
Expand Down Expand Up @@ -188,6 +202,7 @@ steps:
CompletionReplicas: {{$smallJobCompletions}}
Mode: {{$MODE}}
Sleep: {{$jobRunningTime}}
ExtendedResource: {{ $ENABLE_EXTENDED_RESOURCES }}
- name: Wait for short-lived jobs to finish
measurements:
- Identifier: WaitForFinishedJobs
Expand Down Expand Up @@ -216,4 +231,4 @@ steps:
- Identifier: ChurnDRAMetrics
Method: GenericPrometheusQuery
Params:
action: gather
action: gather
7 changes: 6 additions & 1 deletion clusterloader2/testing/dra/job.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,13 @@ spec:
args:
- {{.Sleep}}
resources:
{{ if .ExtendedResource }}
limits:
example.com/gpu: "1"
{{ else }}
claims:
- name: gpu
resourceClaims:
- name: gpu
resourceClaimTemplateName: single-gpu-0
resourceClaimTemplateName: single-gpu-0
{{ end }}
7 changes: 6 additions & 1 deletion clusterloader2/testing/dra/long-running-job.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,13 @@ spec:
args:
- {{.Sleep}}
resources:
{{ if .ExtendedResource }}
limits:
example.com/gpu: "1"
{{ else }}
claims:
- name: gpu
resourceClaims:
- name: gpu
resourceClaimTemplateName: single-gpu-0
resourceClaimTemplateName: single-gpu-0
{{ end }}