66 changes: 57 additions & 9 deletions .github/workflows/third-party-benchmarks.yml
@@ -90,7 +90,13 @@ jobs:
          ./scripts/test-triton.sh --install-sglang --skip-pip-install --skip-pytorch-install
          cd benchmarks/third_party/sglang
          python scaled_mm_benchmark.py --reports $REPORTS
          python ../vllm/transform_results.py $REPORTS/scaled_mm_benchmark.csv $REPORTS/scaled-mm-int8-report.csv --tag $TAG --benchmark scaled-mm-int8 --param_cols="M,N,K" --bgroup sglang
          python ../vllm/transform_results.py \
            $REPORTS/scaled_mm_benchmark.csv \
            $REPORTS/scaled-mm-int8-report.csv \
            --tag $TAG \
            --bgroup sglang \
            --benchmark scaled-mm-int8 \
            --param_cols="M,N,K"

      - name: Run sglang benchmark with fp8
        if: ${{ steps.install-benchmarks.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'sglang')) }}
@@ -99,29 +105,68 @@

          cd benchmarks/third_party/sglang
          FP8="1" python scaled_mm_benchmark.py --reports $REPORTS
          python ../vllm/transform_results.py $REPORTS/scaled_mm_benchmark.csv $REPORTS/scaled-mm-fp8-report.csv --tag $TAG --benchmark scaled-mm-fp8 --param_cols="M,N,K" --bgroup sglang
          python ../vllm/transform_results.py \
            $REPORTS/scaled_mm_benchmark.csv \
            $REPORTS/scaled-mm-fp8-report.csv \
            --tag $TAG \
            --bgroup sglang \
            --benchmark scaled-mm-fp8 \
            --param_cols="M,N,K"

      - name: Run vllm benchmarks bf16
      - name: Install vllm
        id: install-vllm
        if: ${{ steps.install-benchmarks.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'vllm')) }}
        run: |
          source ./scripts/capture-hw-details.sh

          ./scripts/test-triton.sh --install-vllm --skip-pip-install --skip-pytorch-install

      - name: Run vllm unified attention bf16
        if: ${{ steps.install-vllm.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'vllm')) }}
        run: |
          source ./scripts/capture-hw-details.sh

          cd benchmarks/third_party/vllm
          python unified_attention_benchmark.py --reports $REPORTS
          python transform_results.py \
            $REPORTS/unified-attention-performance.csv \
            $REPORTS/unified-attention-report.csv \
            --tag $TAG \
            --bgroup "vllm" \
            --benchmark "unified-attn-bf16" \
            --param_cols "q_heads,k_heads,head_size,dtype,qdtype,seq_lens,sliding_window,soft_cap,num_blocks,block_size"

      - name: Run vllm batched moe bf16
        if: ${{ steps.install-vllm.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'vllm')) }}
        run: |
          source ./scripts/capture-hw-details.sh

          cp -r vllm/tests benchmarks/third_party/vllm/tests

          cd benchmarks/third_party/vllm
          python batched_moe_benchmark.py --reports $REPORTS
          python transform_results.py $REPORTS/moe-gemm-performance.csv $REPORTS/moe-gemm-report.csv --tag $TAG --benchmark moe-bf16-benchmark --param_cols="num_experts,max_tokens_per_expert,K,N" --bgroup vllm
          python transform_results.py \
            $REPORTS/moe-gemm-performance.csv \
            $REPORTS/moe-gemm-report.csv \
            --tag $TAG \
            --bgroup vllm \
            --benchmark moe-bf16-benchmark \
            --param_cols="num_experts,max_tokens_per_expert,K,N"


      - name: Run vllm benchmarks fp8
        if: ${{ steps.install-benchmarks.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'vllm')) }}
      - name: Run vllm batched moe fp8
        if: ${{ steps.install-vllm.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'vllm')) }}
        run: |
          source ./scripts/capture-hw-details.sh

          cd benchmarks/third_party/vllm
          FP8="1" python batched_moe_benchmark.py --reports $REPORTS
          python transform_results.py $REPORTS/moe-gemm-performance.csv $REPORTS/moe-gemm-fp8-report.csv --tag $TAG --benchmark moe-fp8-benchmark --param_cols="num_experts,max_tokens_per_expert,K,N" --bgroup vllm
          python transform_results.py \
            $REPORTS/moe-gemm-performance.csv \
            $REPORTS/moe-gemm-fp8-report.csv \
            --tag $TAG \
            --bgroup vllm \
            --benchmark moe-fp8-benchmark \
            --param_cols="num_experts,max_tokens_per_expert,K,N"


      - name: Run Liger-Kernel benchmarks
@@ -136,7 +181,10 @@
          bash benchmarks/third_party/liger/run_benchmarks.sh || RET_CODE=$?

          cp Liger-Kernel/benchmark/data/all_benchmark_data.csv $REPORTS/liger-raw.csv
          python benchmarks/third_party/liger/transform.py $REPORTS/liger-raw.csv $REPORTS/liger-report.csv --tag $TAG
          python benchmarks/third_party/liger/transform.py \
            $REPORTS/liger-raw.csv \
            $REPORTS/liger-report.csv \
            --tag $TAG

          # Return the captured return code at the end
          exit "$RET_CODE"
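The structural change in this workflow is that vllm installation now lives in its own step with an id, so each benchmark step can be gated on steps.install-vllm.outcome == 'success' while !cancelled() lets it run even when an unrelated earlier step has failed. A minimal sketch of that gating pattern, with illustrative step names and hypothetical scripts (not taken from this workflow):

jobs:
  benchmarks:
    steps:
      - name: Install framework
        id: install-framework
        run: ./install.sh   # hypothetical install script

      # Runs only if the install step succeeded and the workflow was not
      # cancelled, even when some other earlier step has failed.
      - name: Run benchmark
        if: ${{ steps.install-framework.outcome == 'success' && !cancelled() }}
        run: ./run_benchmark.sh   # hypothetical benchmark script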
13 changes: 11 additions & 2 deletions benchmarks/third_party/vllm/batched_moe_benchmark.py
@@ -622,11 +622,20 @@ def triton_fn():
    # Calculate performance metrics
    # Memory bandwidth: A (E*M*K*2) + B (E*K*N*2) + C (E*M*N*4) bytes
    # Compute: E * M * N * K * 2 FLOPs (multiply-add)
    num_activated_experts = num_expert_tokens.ne(0).sum().item()
    num_tokens = num_expert_tokens.sum().item()

    def gbps(ms):
        n_bytes = 1 if fp8 else 2
        total_bytes = num_experts * (max_tokens_per_expert * K * n_bytes + K * N * n_bytes +
                                     max_tokens_per_expert * N * 2)
        # In practice, token counts per expert follow a uniform distribution, so on
        # average only half of the maximum number of tokens is active; take that into account.
        total_bytes = (
            # B matrix: only the weights of activated experts have to be loaded
            num_activated_experts * (K * N * n_bytes) +
            # A matrix (activations): only the actual tokens are loaded
            num_tokens * K * n_bytes +
            # C matrix (outputs): only the actual tokens are loaded/stored
            num_tokens * N * 2)
        return total_bytes * (1e-9) / (ms * 1e-3)

    def tflops(ms):
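For intuition about the revised bandwidth model: the old estimate charged every expert for max_tokens_per_expert tokens, while the new one counts only activated experts' weights and the tokens that actually exist. A small self-contained sketch comparing the two estimates, with hypothetical shapes (not taken from the benchmark's configs):

import torch

# Hypothetical shapes; bf16 activations and weights (2 bytes each).
num_experts, max_tokens_per_expert, K, N = 8, 512, 1024, 2048
n_bytes = 2

# Token counts per expert drawn uniformly, mirroring the benchmark's assumption.
num_expert_tokens = torch.randint(0, max_tokens_per_expert + 1, (num_experts, ))
num_activated_experts = num_expert_tokens.ne(0).sum().item()
num_tokens = num_expert_tokens.sum().item()

# Old model: every expert processes the maximum number of tokens.
old_bytes = num_experts * (max_tokens_per_expert * K * n_bytes + K * N * n_bytes +
                           max_tokens_per_expert * N * 2)

# New model: only activated experts' weights and the actual tokens are counted,
# so on average roughly half the activation traffic of the old estimate.
new_bytes = (num_activated_experts * K * N * n_bytes +
             num_tokens * K * n_bytes +
             num_tokens * N * 2)

print(f'old estimate: {old_bytes / 1e9:.3f} GB, new estimate: {new_bytes / 1e9:.3f} GB')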
31 changes: 20 additions & 11 deletions benchmarks/third_party/vllm/transform_results.py
@@ -31,29 +31,38 @@ def parse_csv(csv_file_path, tag, bench_group, benchmark, param_cols):
    run_uuid = uuid.uuid4().hex
    current_datetime = datetime.now().isoformat()

    # Create params for all rows vectorized
    df['params'] = df.apply(lambda row: json.dumps({p: int(row[p]) for p in param_cols}), axis=1)

    def serialize_params(row):
        param2val = {}
        for p in param_cols:
            try:
                param2val[p] = int(row[p])
            except ValueError:
                param2val[p] = str(row[p])
        return json.dumps(param2val)

    # Define compiler columns
    compilers = [('triton', 'triton-TFlops'), ('pytorch', 'pytorch-TFlops'), ('triton-td', 'triton-td-TFlops')]

    df['params'] = df.apply(serialize_params, axis=1)

    compilers = ['pytorch', 'triton', 'triton-td']

    # Create list of dataframes for each compiler
    dfs = []
    for compiler_name, tflops_col in compilers:
        if tflops_col in df.columns:
    for compiler_name in compilers:
        for value_name in ['TFlops', 'GB/s']:
            col = f'{compiler_name}-{value_name}'
            if col not in df.columns:
                continue
            # Filter out NaN values
            valid_rows = df[df[tflops_col].notna()].copy()
            valid_rows = df[df[col].notna()].copy()
            if len(valid_rows) > 0:
                valid_rows['run_uuid'] = run_uuid
                valid_rows['ts'] = current_datetime
                valid_rows['benchmark_group'] = bench_group
                valid_rows['benchmark'] = benchmark
                valid_rows['compiler'] = compiler_name
                valid_rows['value_name'] = 'tflops'
                valid_rows['value'] = valid_rows[tflops_col].astype(float)
                # GB/s -> gbps
                valid_rows['value_name'] = value_name.lower().replace('/', 'p')
                valid_rows['value'] = valid_rows[col].astype(float)
                valid_rows['tag'] = tag

                # Select only needed columns
                result_df = valid_rows[[
                    'run_uuid', 'ts', 'benchmark_group', 'benchmark', 'compiler', 'value_name', 'value', 'params', 'tag'
                ]]
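To make the new melt logic in parse_csv concrete, here is a standalone sketch of what one raw benchmark row becomes; the values are made up and the row construction is simplified (the real script also stamps run_uuid, ts, benchmark_group, benchmark and tag):

import json

import pandas as pd

# Toy benchmark CSV: one row, with both TFlops and GB/s columns for triton.
df = pd.DataFrame({
    'M': [1024], 'N': [2048], 'K': [4096],
    'triton-TFlops': [42.0],
    'triton-GB/s': [910.0],
    'pytorch-TFlops': [float('nan')],  # NaN values are filtered out
})

rows = []
for compiler in ['pytorch', 'triton', 'triton-td']:
    for value_name in ['TFlops', 'GB/s']:
        col = f'{compiler}-{value_name}'
        if col not in df.columns:
            continue
        for _, r in df[df[col].notna()].iterrows():
            rows.append({
                'compiler': compiler,
                'value_name': value_name.lower().replace('/', 'p'),  # GB/s -> gbps
                'value': float(r[col]),
                'params': json.dumps({p: int(r[p]) for p in ['M', 'N', 'K']}),
            })

# Yields one 'tflops' row and one 'gbps' row for triton; the pytorch NaN is dropped.
print(json.dumps(rows, indent=2))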