From 037ce2ebd69213c2a64049e233725b5c922a7bc1 Mon Sep 17 00:00:00 2001
From: Oguz Ulgen
Date: Fri, 10 Oct 2025 23:38:18 -0700
Subject: [PATCH] [Benchmark] Move benchmark kernel sharding to dispatch

stack-info: PR: https://github.com/pytorch/helion/pull/905, branch: oulgen/stack/139
---
 .github/workflows/benchmark.yml                | 32 ++++----------
 .github/workflows/benchmark_dispatch.yml       | 12 +++----
 .../workflows/compute-benchmark-matrix.yml     | 28 ++++++++++++++--
 3 files changed, 39 insertions(+), 33 deletions(-)

diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index 9cba97c96..064da3260 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -21,18 +21,13 @@ on:
       alias:
         required: true
         type: string
-      num-shards:
+      kernels:
         required: true
-        type: number
-        description: "Number of shards benchmark is running on"
-      shard:
-        required: true
-        type: number
-        description: "Maximum parallel runners to determine shards"
+        type: string
 
 jobs:
   benchmark:
-    name: benchmark-${{ inputs.runtime-version }}-shard${{ inputs.shard }}-py${{ inputs.python-version }}-${{ inputs.alias }}
+    name: benchmark-${{ inputs.runtime-version }}-${{ inputs.kernels }}-py${{ inputs.python-version }}-${{ inputs.alias }}
 
     container:
       image: ${{ inputs.image }}
@@ -120,25 +115,12 @@ jobs:
 
           source .venv/bin/activate
 
-          KERNELS=("softmax" "jsd" "welford" "kl_div" "layer_norm" "layer_norm-bwd" "rms_norm" "rms_norm-bwd" "cross_entropy" "flash_attention" "gemm" "grouped_gemm")
-          NUMSHARDS=${{ inputs.num-shards }}
-          SHARD=${{ inputs.shard }}
-
-          SHARD_KERNELS=()
-          for ((i=0; i<${#KERNELS[@]}; i++)); do
-            if [ $((i % NUMSHARDS)) -eq $SHARD ]; then
-              SHARD_KERNELS+=("${KERNELS[i]}")
-            fi
-          done
-
-          KERNEL_LIST=$(IFS=','; echo "${SHARD_KERNELS[*]}")
-          echo "Running shard $SHARD of $NUMSHARDS with kernels: $KERNEL_LIST"
-
           TEST_REPORTS_DIR=$(pwd)/test/test-reports
           mkdir -p "$TEST_REPORTS_DIR"
           echo "$TEST_REPORTS_DIR"
 
-          for kernel in "${SHARD_KERNELS[@]}"; do
+          KERNEL_LIST="${{ inputs.kernels }}"
+          for kernel in ${KERNEL_LIST//,/ }; do
             echo "=========================================="
             echo "Running benchmark for kernel: $kernel"
             echo "=========================================="
@@ -217,7 +199,7 @@ jobs:
       - name: Upload the benchmark results to GitHub
         uses: actions/upload-artifact@v4
         with:
-          name: benchmark-results-${{ inputs.alias }}-${{ inputs.shard }}
+          name: benchmark-results-${{ inputs.alias }}-${{ inputs.kernels }}
           path: test/test-reports
 
   upload-benchmark-results:
@@ -227,7 +209,7 @@ jobs:
       id-token: write
       contents: read
     with:
-      benchmark-artifact: benchmark-results-${{ inputs.alias }}-${{ inputs.shard }}
+      benchmark-artifact: benchmark-results-${{ inputs.alias }}-${{ inputs.kernels }}
       benchmark-metadata: ${{ needs.benchmark.outputs.benchmark-metadata }}
       runners-info: ${{ needs.benchmark.outputs.runners-info }}
       dependencies: ${{ needs.benchmark.outputs.dependencies }}
diff --git a/.github/workflows/benchmark_dispatch.yml b/.github/workflows/benchmark_dispatch.yml
index 028e6bf98..5cd036ec7 100644
--- a/.github/workflows/benchmark_dispatch.yml
+++ b/.github/workflows/benchmark_dispatch.yml
@@ -27,6 +27,7 @@ jobs:
     uses: ./.github/workflows/compute-benchmark-matrix.yml
     with:
       max-runners: 12
+      kernels: "softmax,jsd,welford,kl_div,layer_norm,layer_norm-bwd,rms_norm,rms_norm-bwd,cross_entropy,flash_attention,gemm,grouped_gemm"
 
   run-h100:
     needs: gen-matrix-h100
@@ -44,14 +45,14 @@ jobs:
       runtime-version: cu128
       container-options: --gpus all
       alias: h100
-      num-shards: ${{ matrix.num_shards }}
-      shard: ${{ matrix.shard }}
+      kernels: ${{ matrix.kernels }}
 
   gen-matrix-b200:
     uses: ./.github/workflows/compute-benchmark-matrix.yml
     if: ${{ github.event.inputs.run_b200 == 'true' || github.event_name == 'schedule' }}
     with:
       max-runners: 12
+      kernels: "softmax,jsd,welford,kl_div,layer_norm,layer_norm-bwd,rms_norm,rms_norm-bwd,cross_entropy,flash_attention,gemm,grouped_gemm"
 
   run-b200:
     needs: gen-matrix-b200
@@ -69,14 +70,14 @@ jobs:
      runtime-version: cu130
      container-options: --gpus all
      alias: b200
-      num-shards: ${{ matrix.num_shards }}
-      shard: ${{ matrix.shard }}
+      kernels: ${{ matrix.kernels }}
 
   gen-matrix-mi325x:
     uses: ./.github/workflows/compute-benchmark-matrix.yml
     if: ${{ github.event.inputs.run_mi325x == 'true' || github.event_name == 'schedule' }}
     with:
       max-runners: 6
+      kernels: "softmax,jsd,welford,kl_div,layer_norm,layer_norm-bwd,rms_norm,rms_norm-bwd,cross_entropy,flash_attention,gemm,grouped_gemm"
 
   run-mi325x:
     needs: gen-matrix-mi325x
@@ -94,5 +95,4 @@ jobs:
       runtime-version: rocm6.4
       container-options: --device=/dev/kfd --device=/dev/dri
       alias: mi325x
-      num-shards: ${{ matrix.num_shards }}
-      shard: ${{ matrix.shard }}
+      kernels: ${{ matrix.kernels }}
diff --git a/.github/workflows/compute-benchmark-matrix.yml b/.github/workflows/compute-benchmark-matrix.yml
index 160210814..0bb1d3c26 100644
--- a/.github/workflows/compute-benchmark-matrix.yml
+++ b/.github/workflows/compute-benchmark-matrix.yml
@@ -6,6 +6,9 @@ on:
       max-runners:
         required: true
         type: string
+      kernels:
+        required: true
+        type: string
     outputs:
       matrix:
         description: "The generated matrix for sharding"
@@ -20,5 +23,26 @@ jobs:
       - id: gen
         run: |
           n="${{ inputs.max-runners }}"
-          shards=$(seq 0 $((n-1)) | paste -sd, -)
-          echo "matrix={\"shard\": [${shards}], \"num_shards\": [${n}]}" >> $GITHUB_OUTPUT
+          IFS=',' read -ra K <<< "${{ inputs.kernels }}"
+          total_kernels=${#K[@]}
+          jobs=$(( total_kernels < n ? total_kernels : n ))
+
+          declare -a BUCKETS
+          for ((i=0; i> $GITHUB_OUTPUT
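
Note: the tail of the new `gen` step is truncated above, so the exact bucket-filling lines from this PR are not recoverable here. Purely as an illustrative sketch (not the PR's code), the script below shows one way such a step could split the `kernels` input, distribute it round-robin across at most `max-runners` buckets, and emit a `kernels` matrix key, which is the shape that `${{ matrix.kernels }}` in benchmark_dispatch.yml and the comma-splitting loop in benchmark.yml expect. The variable names (BUCKETS, kernels_json) and the JSON-assembly details are assumptions.

#!/usr/bin/env bash
# Illustrative sketch only: round-robin bucketing of a comma-separated kernel
# list into at most MAX_RUNNERS matrix entries, plus the consumer-side loop
# that benchmark.yml now runs over its `kernels` input.
set -euo pipefail

KERNELS="softmax,jsd,welford,kl_div,layer_norm,layer_norm-bwd,rms_norm,rms_norm-bwd,cross_entropy,flash_attention,gemm,grouped_gemm"
MAX_RUNNERS=12

# Split the comma-separated input into an array.
IFS=',' read -ra K <<< "$KERNELS"
total_kernels=${#K[@]}
jobs=$(( total_kernels < MAX_RUNNERS ? total_kernels : MAX_RUNNERS ))

# Distribute kernels round-robin into $jobs buckets.
declare -a BUCKETS
for ((i=0; i<total_kernels; i++)); do
  idx=$((i % jobs))
  if [ -z "${BUCKETS[idx]:-}" ]; then
    BUCKETS[idx]="${K[i]}"
  else
    BUCKETS[idx]="${BUCKETS[idx]},${K[i]}"
  fi
done

# Emit a JSON matrix with one comma-separated kernel list per runner
# (in the workflow this line would be appended to $GITHUB_OUTPUT).
kernels_json=$(printf '"%s",' "${BUCKETS[@]}")
echo "matrix={\"kernels\": [${kernels_json%,}]}"

# Consumer side (benchmark.yml): each job receives one bucket via
# ${{ inputs.kernels }} and iterates over it by replacing commas with spaces.
KERNEL_LIST="${BUCKETS[0]}"
for kernel in ${KERNEL_LIST//,/ }; do
  echo "Running benchmark for kernel: $kernel"
done

Running the sketch locally prints the matrix JSON followed by the per-kernel loop for the first bucket; with max-runners equal to the kernel count (as in benchmark_dispatch.yml), each matrix entry carries exactly one kernel.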