From 037ce2ebd69213c2a64049e233725b5c922a7bc1 Mon Sep 17 00:00:00 2001
From: Oguz Ulgen
Date: Fri, 10 Oct 2025 23:38:18 -0700
Subject: [PATCH] [Benchmark] Move benchmark kernel sharding to dispatch

stack-info: PR: https://github.com/pytorch/helion/pull/905, branch: oulgen/stack/139
---
 .github/workflows/benchmark.yml                | 32 ++++----------
 .github/workflows/benchmark_dispatch.yml       | 12 +++----
 .../workflows/compute-benchmark-matrix.yml     | 28 ++++++++++++++--
 3 files changed, 39 insertions(+), 33 deletions(-)

diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index 9cba97c96..064da3260 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -21,18 +21,13 @@ on:
       alias:
         required: true
         type: string
-      num-shards:
+      kernels:
         required: true
-        type: number
-        description: "Number of shards benchmark is running on"
-      shard:
-        required: true
-        type: number
-        description: "Maximum parallel runners to determine shards"
+        type: string
 
 jobs:
   benchmark:
-    name: benchmark-${{ inputs.runtime-version }}-shard${{ inputs.shard }}-py${{ inputs.python-version }}-${{ inputs.alias }}
+    name: benchmark-${{ inputs.runtime-version }}-${{ inputs.kernels }}-py${{ inputs.python-version }}-${{ inputs.alias }}
 
     container:
       image: ${{ inputs.image }}
@@ -120,25 +115,12 @@ jobs:
 
           source .venv/bin/activate
 
-          KERNELS=("softmax" "jsd" "welford" "kl_div" "layer_norm" "layer_norm-bwd" "rms_norm" "rms_norm-bwd" "cross_entropy" "flash_attention" "gemm" "grouped_gemm")
-          NUMSHARDS=${{ inputs.num-shards }}
-          SHARD=${{ inputs.shard }}
-
-          SHARD_KERNELS=()
-          for ((i=0; i<${#KERNELS[@]}; i++)); do
-            if [ $((i % NUMSHARDS)) -eq $SHARD ]; then
-              SHARD_KERNELS+=("${KERNELS[i]}")
-            fi
-          done
-
-          KERNEL_LIST=$(IFS=','; echo "${SHARD_KERNELS[*]}")
-          echo "Running shard $SHARD of $NUMSHARDS with kernels: $KERNEL_LIST"
-
           TEST_REPORTS_DIR=$(pwd)/test/test-reports
           mkdir -p "$TEST_REPORTS_DIR"
           echo "$TEST_REPORTS_DIR"
 
-          for kernel in "${SHARD_KERNELS[@]}"; do
+          KERNEL_LIST="${{ inputs.kernels }}"
+          for kernel in ${KERNEL_LIST//,/ }; do
             echo "=========================================="
             echo "Running benchmark for kernel: $kernel"
             echo "=========================================="
@@ -217,7 +199,7 @@ jobs:
       - name: Upload the benchmark results to GitHub
         uses: actions/upload-artifact@v4
         with:
-          name: benchmark-results-${{ inputs.alias }}-${{ inputs.shard }}
+          name: benchmark-results-${{ inputs.alias }}-${{ inputs.kernels }}
           path: test/test-reports
 
   upload-benchmark-results:
@@ -227,7 +209,7 @@ jobs:
       id-token: write
       contents: read
     with:
-      benchmark-artifact: benchmark-results-${{ inputs.alias }}-${{ inputs.shard }}
+      benchmark-artifact: benchmark-results-${{ inputs.alias }}-${{ inputs.kernels }}
       benchmark-metadata: ${{ needs.benchmark.outputs.benchmark-metadata }}
       runners-info: ${{ needs.benchmark.outputs.runners-info }}
       dependencies: ${{ needs.benchmark.outputs.dependencies }}
diff --git a/.github/workflows/benchmark_dispatch.yml b/.github/workflows/benchmark_dispatch.yml
index 028e6bf98..5cd036ec7 100644
--- a/.github/workflows/benchmark_dispatch.yml
+++ b/.github/workflows/benchmark_dispatch.yml
@@ -27,6 +27,7 @@ jobs:
     uses: ./.github/workflows/compute-benchmark-matrix.yml
     with:
       max-runners: 12
+      kernels: "softmax,jsd,welford,kl_div,layer_norm,layer_norm-bwd,rms_norm,rms_norm-bwd,cross_entropy,flash_attention,gemm,grouped_gemm"
 
   run-h100:
     needs: gen-matrix-h100
@@ -44,14 +45,14 @@ jobs:
       runtime-version: cu128
       container-options: --gpus all
       alias: h100
-      num-shards: ${{ matrix.num_shards }}
-      shard: ${{ matrix.shard }}
+      kernels: ${{ matrix.kernels }}
 
   gen-matrix-b200:
     uses: ./.github/workflows/compute-benchmark-matrix.yml
     if: ${{ github.event.inputs.run_b200 == 'true' || github.event_name == 'schedule' }}
     with:
       max-runners: 12
+      kernels: "softmax,jsd,welford,kl_div,layer_norm,layer_norm-bwd,rms_norm,rms_norm-bwd,cross_entropy,flash_attention,gemm,grouped_gemm"
 
   run-b200:
     needs: gen-matrix-b200
@@ -69,14 +70,14 @@ jobs:
      runtime-version: cu130
      container-options: --gpus all
      alias: b200
-      num-shards: ${{ matrix.num_shards }}
-      shard: ${{ matrix.shard }}
+      kernels: ${{ matrix.kernels }}
 
   gen-matrix-mi325x:
     uses: ./.github/workflows/compute-benchmark-matrix.yml
     if: ${{ github.event.inputs.run_mi325x == 'true' || github.event_name == 'schedule' }}
     with:
       max-runners: 6
+      kernels: "softmax,jsd,welford,kl_div,layer_norm,layer_norm-bwd,rms_norm,rms_norm-bwd,cross_entropy,flash_attention,gemm,grouped_gemm"
 
   run-mi325x:
     needs: gen-matrix-mi325x
@@ -94,5 +95,4 @@ jobs:
       runtime-version: rocm6.4
       container-options: --device=/dev/kfd --device=/dev/dri
       alias: mi325x
-      num-shards: ${{ matrix.num_shards }}
-      shard: ${{ matrix.shard }}
+      kernels: ${{ matrix.kernels }}
diff --git a/.github/workflows/compute-benchmark-matrix.yml b/.github/workflows/compute-benchmark-matrix.yml
index 160210814..0bb1d3c26 100644
--- a/.github/workflows/compute-benchmark-matrix.yml
+++ b/.github/workflows/compute-benchmark-matrix.yml
@@ -6,6 +6,9 @@ on:
       max-runners:
         required: true
         type: string
+      kernels:
+        required: true
+        type: string
     outputs:
       matrix:
         description: "The generated matrix for sharding"
@@ -20,5 +23,26 @@ jobs:
       - id: gen
         run: |
           n="${{ inputs.max-runners }}"
-          shards=$(seq 0 $((n-1)) | paste -sd, -)
-          echo "matrix={\"shard\": [${shards}], \"num_shards\": [${n}]}" >> $GITHUB_OUTPUT
+          IFS=',' read -ra K <<< "${{ inputs.kernels }}"
+          total_kernels=${#K[@]}
+          jobs=$(( total_kernels < n ? total_kernels : n ))
+
+          declare -a BUCKETS
+          for ((i=0; i> $GITHUB_OUTPUT
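
Note: the tail of the new `gen` step is truncated above, so the exact bucket-filling lines from this PR are not recoverable here. Purely as an illustrative sketch (not the PR's code), the script below shows one way such a step could split the `kernels` input, distribute it round-robin across at most `max-runners` buckets, and emit a `kernels` matrix key, which is the shape that `${{ matrix.kernels }}` in benchmark_dispatch.yml and the comma-splitting loop in benchmark.yml expect. The variable names (BUCKETS, kernels_json) and the JSON-assembly details are assumptions.

#!/usr/bin/env bash
# Illustrative sketch only: round-robin bucketing of a comma-separated kernel
# list into at most MAX_RUNNERS matrix entries, plus the consumer-side loop
# that benchmark.yml now runs over its `kernels` input.
set -euo pipefail

KERNELS="softmax,jsd,welford,kl_div,layer_norm,layer_norm-bwd,rms_norm,rms_norm-bwd,cross_entropy,flash_attention,gemm,grouped_gemm"
MAX_RUNNERS=12

# Split the comma-separated input into an array.
IFS=',' read -ra K <<< "$KERNELS"
total_kernels=${#K[@]}
jobs=$(( total_kernels < MAX_RUNNERS ? total_kernels : MAX_RUNNERS ))

# Distribute kernels round-robin into $jobs buckets.
declare -a BUCKETS
for ((i=0; i<total_kernels; i++)); do
  idx=$((i % jobs))
  if [ -z "${BUCKETS[idx]:-}" ]; then
    BUCKETS[idx]="${K[i]}"
  else
    BUCKETS[idx]="${BUCKETS[idx]},${K[i]}"
  fi
done

# Emit a JSON matrix with one comma-separated kernel list per runner
# (in the workflow this line would be appended to $GITHUB_OUTPUT).
kernels_json=$(printf '"%s",' "${BUCKETS[@]}")
echo "matrix={\"kernels\": [${kernels_json%,}]}"

# Consumer side (benchmark.yml): each job receives one bucket via
# ${{ inputs.kernels }} and iterates over it by replacing commas with spaces.
KERNEL_LIST="${BUCKETS[0]}"
for kernel in ${KERNEL_LIST//,/ }; do
  echo "Running benchmark for kernel: $kernel"
done

Running the sketch locally prints the matrix JSON followed by the per-kernel loop for the first bucket; with max-runners equal to the kernel count (as in benchmark_dispatch.yml), each matrix entry carries exactly one kernel.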