
Commit 1448853

updated workshop examples (#17)
* updated workshop examples
* update
1 parent 1b96e9e commit 1448853

4 files changed: 328 additions, 9 deletions


docs/AMD_workshop/README.md

Lines changed: 48 additions & 9 deletions
@@ -23,15 +23,46 @@ powershell -ExecutionPolicy Bypass -Command "iwr -UseBasicParsing https://raw.gi
 popcorn-cli register github
 ```
 
-3. **Submit your solution:**
-```bash
-popcorn-cli submit --gpu MI300 --leaderboard amd-fp8-mm --mode test example.py
-```
-
-4. **Interactive mode** (choose GPU and options):
-```bash
-popcorn-cli submit example.py
-```
+## 🏃 Run Examples
+
+Try out the example implementations to get familiar with the system:
+
+### For Linux/macOS:
+```bash
+# Download and test v1.py (reference implementation)
+wget https://raw.githubusercontent.com/gpu-mode/popcorn-cli/main/docs/AMD_workshop/v1.py
+popcorn-cli submit --gpu MI300 --leaderboard amd-fp8-mm --mode test v1.py
+
+# Download and test v2.py (basic optimization)
+wget https://raw.githubusercontent.com/gpu-mode/popcorn-cli/main/docs/AMD_workshop/v2.py
+popcorn-cli submit --gpu MI300 --leaderboard amd-fp8-mm --mode test v2.py
+
+# Download and test v3.py (advanced optimization)
+wget https://raw.githubusercontent.com/gpu-mode/popcorn-cli/main/docs/AMD_workshop/v3.py
+popcorn-cli submit --gpu MI300 --leaderboard amd-fp8-mm --mode test v3.py
+```
+
+### For Windows (PowerShell):
+```powershell
+# Download and test v1.py (reference implementation)
+Invoke-WebRequest -Uri "https://raw.githubusercontent.com/gpu-mode/popcorn-cli/main/docs/AMD_workshop/v1.py" -OutFile "v1.py"
+popcorn-cli submit --gpu MI300 --leaderboard amd-fp8-mm --mode test v1.py
+
+# Download and test v2.py (basic optimization)
+Invoke-WebRequest -Uri "https://raw.githubusercontent.com/gpu-mode/popcorn-cli/main/docs/AMD_workshop/v2.py" -OutFile "v2.py"
+popcorn-cli submit --gpu MI300 --leaderboard amd-fp8-mm --mode test v2.py
+
+# Download and test v3.py (advanced optimization)
+Invoke-WebRequest -Uri "https://raw.githubusercontent.com/gpu-mode/popcorn-cli/main/docs/AMD_workshop/v3.py" -OutFile "v3.py"
+popcorn-cli submit --gpu MI300 --leaderboard amd-fp8-mm --mode test v3.py
+```
+
+### 💡 Pro Tips:
+- Start with **v1.py** (reference implementation) to understand the baseline
+- Try **v2.py** for basic optimizations
+- Challenge yourself with **v3.py** for advanced Triton optimizations
+- Use `--mode benchmark` instead of `--mode test` to see performance metrics
+
 
 ## 🛠️ Manual Installation
 
@@ -58,3 +89,11 @@ If the scripts don't work, you can manually install:
 - Run `popcorn-cli --help` for usage information
 - Check the [main repository](https://github.com/gpu-mode/popcorn-cli) and open an issue
 - Join the [GPU Mode Discord](https://discord.gg/gpumode) and ask a question in #amd-competition
+
+## 🧑‍🎓 Learn more from our favorite writeups
+
+* https://github.com/luongthecong123/fp8-quant-matmul
+* https://seb-v.github.io/optimization/update/2025/01/20/Fast-GPU-Matrix-multiplication.html
+* https://akashkarnatak.github.io/amd-challenge/
+* https://www.bilibili.com/read/cv41954307/?opus_fallback=1
+* https://github.com/Snektron/gpumode-amd-fp8-mm
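The Pro Tips added above suggest `--mode benchmark` for timing; for instance, benchmarking v1.py uses the same submit command as the test runs, with only the mode flag changed:

```bash
# Same submission flow as the test commands above, but reports performance metrics
popcorn-cli submit --gpu MI300 --leaderboard amd-fp8-mm --mode benchmark v1.py
```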

docs/AMD_workshop/example.py renamed to docs/AMD_workshop/v1.py

Lines changed: 3 additions & 0 deletions
@@ -1,3 +1,6 @@
+#!POPCORN leaderboard amd-fp8-mm
+#!POPCORN gpu MI300
+
 import torch
 from task import input_t, output_t
 
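The two `#!POPCORN` directives added here embed the leaderboard and target GPU in the file header, so the file carries its submission target with it. A submission file for this challenge then follows the shape sketched below; the function body is a placeholder for illustration, not the actual contents of v1.py:

```python
#!POPCORN leaderboard amd-fp8-mm
#!POPCORN gpu MI300

import torch
from task import input_t, output_t


def custom_kernel(data: input_t) -> output_t:
    # v2.py/v3.py in this commit unpack the input tuple like this; v1.py is
    # assumed to follow the same convention: fp8 A, fp8 B, their block scales, C.
    a, b, a_scale, b_scale, c = data
    # ... compute C = A @ B^T with block-wise dequantization here ...
    return c
```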

docs/AMD_workshop/v2.py

Lines changed: 125 additions & 0 deletions
@@ -0,0 +1,125 @@
#!POPCORN leaderboard amd-fp8-mm
#!POPCORN gpu MI300

from task import input_t, output_t
import torch
import triton
import triton.language as tl


@triton.jit
def kernel(
    A_ptr,
    B_ptr,
    A_scale_ptr,
    B_scale_ptr,
    C_ptr,
    M: tl.constexpr,
    N: tl.constexpr,
    K: tl.constexpr,
    BLOCK_M: tl.constexpr,
    BLOCK_N: tl.constexpr,
    BLOCK_K: tl.constexpr,
    BLOCK_Q: tl.constexpr = 128,
):
    program_id = tl.program_id(0)
    num_pid_across_n = tl.cdiv(N, BLOCK_N)

    program_id_m = program_id // num_pid_across_n
    program_id_n = program_id % num_pid_across_n

    # Simple stride assumptions (no transpose)
    A_stride_m, A_stride_k = 1, M
    B_stride_n, B_stride_k = 1, N
    C_stride_m, C_stride_n = N, 1

    # Scale matrices: A is 1x128, B is 128x128 chunks
    A_scale_stride_m, A_scale_stride_k = 1, M
    B_scale_stride_n, B_scale_stride_k = 1, tl.cdiv(N, BLOCK_Q)

    # Calculate output block position
    offset_m = program_id_m * BLOCK_M
    offset_n = program_id_n * BLOCK_N

    # Create block offset arrays
    block_offsets_m = offset_m + tl.arange(0, BLOCK_M)
    block_offsets_n = offset_n + tl.arange(0, BLOCK_N)
    block_offsets_k = tl.arange(0, BLOCK_K)

    # Create pointers for A and B blocks
    A_block_ptrs = A_ptr + (
        block_offsets_m[:, None] * A_stride_m + block_offsets_k[None, :] * A_stride_k
    )
    B_block_ptrs = B_ptr + (
        block_offsets_k[:, None] * B_stride_k + block_offsets_n[None, :] * B_stride_n
    )

    # Scale pointers
    A_scale_block_ptrs = A_scale_ptr + (block_offsets_m[:, None] * A_scale_stride_m)
    B_scale_block_ptrs = B_scale_ptr + (offset_n // BLOCK_Q) * B_scale_stride_n

    # Main accumulator
    master_accumulator = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)

    # Process K dimension in BLOCK_Q chunks (128 elements at a time)
    num_k_iters = K // BLOCK_Q
    for _ in range(0, num_k_iters):
        # Inner accumulator for current 128-element K chunk
        inner_accumulator = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)

        # Process the 128-element chunk in smaller BLOCK_K pieces
        for _ in tl.range(0, BLOCK_Q // BLOCK_K):
            A_block = tl.load(A_block_ptrs)  # (BLOCK_M, BLOCK_K)
            B_block = tl.load(B_block_ptrs)  # (BLOCK_K, BLOCK_N)
            inner_accumulator = tl.dot(A_block, B_block, inner_accumulator)

            # Move to next K chunk
            A_block_ptrs += BLOCK_K * A_stride_k
            B_block_ptrs += BLOCK_K * B_stride_k

        # Load scales and apply to inner result
        A_scales = tl.load(A_scale_block_ptrs)  # (BLOCK_M, 1)
        B_scales = tl.load(B_scale_block_ptrs)  # scalar
        master_accumulator += inner_accumulator * (A_scales * B_scales)

        # Move to next scale block
        A_scale_block_ptrs += A_scale_stride_k
        B_scale_block_ptrs += B_scale_stride_k

    # Store final result
    block_offsets_m = (program_id_m * BLOCK_M + tl.arange(0, BLOCK_M)[:, None])
    block_offsets_n = (program_id_n * BLOCK_N + tl.arange(0, BLOCK_N)[None, :])
    mask = (block_offsets_m < M) & (block_offsets_n < N)
    C_block_ptrs = C_ptr + (block_offsets_m * C_stride_m + block_offsets_n * C_stride_n)
    tl.store(C_block_ptrs, master_accumulator, mask=mask)


def custom_kernel(data: input_t) -> output_t:
    A_tensor, B_tensor, A_scale_tensor, B_scale_tensor, C_tensor = data

    M, K = A_tensor.shape
    N, _ = B_tensor.shape

    # Fixed, simple configuration - no dynamic tuning
    BLOCK_M = 64
    BLOCK_N = 64
    BLOCK_K = 32

    # Launch grid
    num_blocks = triton.cdiv(M, BLOCK_M) * triton.cdiv(N, BLOCK_N)

    kernel[(num_blocks,)](
        A_tensor,
        B_tensor,
        A_scale_tensor,
        B_scale_tensor,
        C_tensor,
        M, N, K,
        BLOCK_M=BLOCK_M,
        BLOCK_N=BLOCK_N,
        BLOCK_K=BLOCK_K,
        num_warps=4,
        num_stages=2,
    )

    return C_tensor
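For intuition about what this kernel computes, here is a plain PyTorch sketch of the same block-scaled FP8 matmul, assuming the layout implied above (A is (M, K) and B is (N, K) in FP8, A scales cover 1x128 blocks, B scales cover 128x128 blocks, and C = A @ Bᵀ). The helper below is illustrative only and is not part of the commit:

```python
import torch

BLOCK = 128  # scale block size, matching BLOCK_Q in the kernel above


def fp8_mm_reference(a, b, a_scale, b_scale):
    # Dequantize-then-matmul sketch. Shapes are assumed, not guaranteed:
    #   a: (M, K) fp8, a_scale: (M, ceil(K/128)) fp32  -> one scale per 1x128 block
    #   b: (N, K) fp8, b_scale: (ceil(N/128), ceil(K/128)) fp32 -> one scale per 128x128 block
    M, K = a.shape
    N, _ = b.shape
    # Broadcast each scale across its block, trim to the exact size, dequantize.
    a_deq = a.to(torch.float32) * a_scale.repeat_interleave(BLOCK, dim=1)[:, :K]
    b_deq = b.to(torch.float32) * (
        b_scale.repeat_interleave(BLOCK, dim=0)[:N].repeat_interleave(BLOCK, dim=1)[:, :K]
    )
    # C = A @ B^T, accumulated in float32 (the Triton kernel casts on store).
    return a_deq @ b_deq.T
```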

docs/AMD_workshop/v3.py

Lines changed: 152 additions & 0 deletions
@@ -0,0 +1,152 @@
#!POPCORN leaderboard amd-fp8-mm
#!POPCORN gpu MI300

from task import input_t, output_t
import torch
import triton
import triton.language as tl

NUM_SMS = torch.cuda.get_device_properties("cuda").multi_processor_count


@triton.jit
def kernel(
    A_ptr,
    B_ptr,
    A_scale_ptr,
    B_scale_ptr,
    C_ptr,
    M: tl.constexpr,
    N: tl.constexpr,
    K: tl.constexpr,
    BLOCK_M: tl.constexpr,
    BLOCK_N: tl.constexpr,
    BLOCK_K: tl.constexpr,
    BLOCK_Q: tl.constexpr = 128,
    TRANSPOSE: tl.constexpr = False,
):
    program_id = tl.program_id(0)
    num_pid_across_n = tl.cdiv(N, BLOCK_N)

    program_id_m = program_id // num_pid_across_n
    program_id_n = program_id % num_pid_across_n

    if not TRANSPOSE:
        A_stride_m, A_stride_k = 1, M
        B_stride_n, B_stride_k = 1, N
    else:
        A_stride_m, A_stride_k = K, 1
        B_stride_n, B_stride_k = K, 1
    C_stride_m, C_stride_n = N, 1
    # Scale matrices are stored in column-major order, with A being 1x128 and B being 128x128 chunks
    # BLOCK_Q is 128
    A_scale_stride_m, A_scale_stride_k = 1, M
    B_scale_stride_n, B_scale_stride_k = 1, tl.cdiv(N, BLOCK_Q)

    # Calculate the row and column indices in the output matrix for the current pid
    offset_m = program_id_m * BLOCK_M
    offset_n = program_id_n * BLOCK_N

    # Arange to make row and column ptrs
    block_offsets_m = offset_m + tl.arange(0, BLOCK_M)
    block_offsets_n = offset_n + tl.arange(0, BLOCK_N)
    block_offsets_k = tl.arange(0, BLOCK_K)

    # ptrs for BLOCK_M rows of A and BLOCK_N columns of B
    A_block_ptrs = A_ptr + (
        block_offsets_m[:, None] * A_stride_m + block_offsets_k[None, :] * A_stride_k
    )
    B_block_ptrs = B_ptr + (
        block_offsets_k[:, None] * B_stride_k + block_offsets_n[None, :] * B_stride_n
    )
    # since a_scales are 1x128, a_scale_ptrs need to be of shape (BLOCK_M, 1)
    # since N, K <= BLOCK_Q, b_scale_ptrs is always a scalar ptr
    A_scale_block_ptrs = A_scale_ptr + (block_offsets_m[:, None] * A_scale_stride_m)
    B_scale_block_ptrs = B_scale_ptr + (offset_n // BLOCK_Q) * B_scale_stride_n

    # Initialize accumulator for the current pid (responsible for BLOCK_M * BLOCK_N elements)
    master_accumulator = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)

    # In each iteration we load BLOCK_Q elements from the K dimension for BLOCK_M rows, resp. BLOCK_N columns
    # We choose this to use only 1 scale per iteration
    num_k_iters = K // BLOCK_Q
    for _ in range(0, num_k_iters):
        # Initialize accumulator for the current k iteration
        inner_accumulator = tl.zeros((BLOCK_M, BLOCK_N), dtype=tl.float32)
        # In each iteration we load BLOCK_K elements from the K dimension for BLOCK_M rows, resp. BLOCK_N columns
        # We choose this to use small `tl.dot` for the inner accumulator
        for _ in tl.range(0, BLOCK_Q // BLOCK_K):
            A_block = tl.load(A_block_ptrs)  # (BLOCK_M, BLOCK_K)
            B_block = tl.load(B_block_ptrs)  # (BLOCK_K, BLOCK_N)
            inner_accumulator = tl.dot(
                A_block, B_block, inner_accumulator
            )  # (BLOCK_M, BLOCK_N)

            # Move along the K dimension of A, B
            A_block_ptrs += BLOCK_K * A_stride_k
            B_block_ptrs += BLOCK_K * B_stride_k

        A_scales = tl.load(A_scale_block_ptrs)  # (BLOCK_M, 1)
        B_scales = tl.load(B_scale_block_ptrs)  # ()
        master_accumulator += inner_accumulator * (A_scales * B_scales)

        # Move along the K dimension of A, B scales
        A_scale_block_ptrs += A_scale_stride_k
        B_scale_block_ptrs += B_scale_stride_k

    # Store the result for the current pid
    block_offsets_m = (
        program_id_m * BLOCK_M + tl.arange(0, BLOCK_M)[:, None]
    )  # (BLOCK_M, 1)
    block_offsets_n = (
        program_id_n * BLOCK_N + tl.arange(0, BLOCK_N)[None, :]
    )  # (1, BLOCK_N)
    mask = (block_offsets_m < M) & (block_offsets_n < N)  # (BLOCK_M, BLOCK_N)
    C_block_ptrs = C_ptr + (block_offsets_m * C_stride_m + block_offsets_n * C_stride_n)
    tl.store(C_block_ptrs, master_accumulator, mask=mask)


@torch.compile(dynamic=False, mode="max-autotune-no-cudagraphs")
def contiguous(x):
    return x.contiguous()


def get_config(M, N, K):
    num_blocks_ref = (M // 128) * (N // 128)
    TRANSPOSE = False
    matrix_instr_nonkdim = 16
    BLOCK_M, BLOCK_N, BLOCK_K = (128, 128, 64)
    if num_blocks_ref * 8 < NUM_SMS:  # 2 and 7
        BLOCK_M, BLOCK_N, BLOCK_K = (32, 64, 128)
        matrix_instr_nonkdim = 16
    elif num_blocks_ref < NUM_SMS:
        BLOCK_M, BLOCK_N, BLOCK_K = (64, 64, 64)

    config = dict(
        BLOCK_M=BLOCK_M,
        BLOCK_N=BLOCK_N,
        BLOCK_K=BLOCK_K,
        waves_per_eu=2,
        matrix_instr_nonkdim=matrix_instr_nonkdim,
        num_warps=4,
        num_stages=2,
        TRANSPOSE=TRANSPOSE,
    )
    return config


def custom_kernel(data: input_t) -> output_t:
    A_tensor, B_tensor, A_scale_tensor, B_scale_tensor, C_tensor = data

    M, K = A_tensor.shape
    N, _ = B_tensor.shape

    # heuristic
    config = get_config(M, N, K)

    num_blocks = triton.cdiv(M, config["BLOCK_M"]) * triton.cdiv(N, config["BLOCK_N"])
    kernel[(num_blocks,)](
        A_tensor, B_tensor, A_scale_tensor, B_scale_tensor, C_tensor, M, N, K, **config
    )

    return C_tensor
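The `get_config` heuristic above trades tile size against occupancy: when the default 128x128 output tiling would launch far fewer programs than the GPU has compute units (`NUM_SMS`, roughly 300 on MI300X), it shrinks the tiles so more programs can run in parallel. A standalone illustration of that decision, with the CU count hard-coded as an assumption rather than queried from the device:

```python
NUM_SMS = 304  # assumed MI300X compute-unit count; v3.py queries it via torch at runtime


def pick_tiles(M, N):
    # Mirrors get_config above: count 128x128 output tiles and shrink the tile
    # when that count would leave most compute units idle.
    num_blocks_ref = (M // 128) * (N // 128)
    if num_blocks_ref * 8 < NUM_SMS:
        return (32, 64, 128)   # very few tiles: small tiles, deeper K blocks
    elif num_blocks_ref < NUM_SMS:
        return (64, 64, 64)    # mid-sized problems
    return (128, 128, 64)      # enough tiles to fill the GPU: keep big tiles


# e.g. a 1024x1536 output has 8 * 12 = 96 reference tiles:
# 96 * 8 = 768 >= 304, but 96 < 304, so (BLOCK_M, BLOCK_N, BLOCK_K) = (64, 64, 64).
print(pick_tiles(1024, 1536))  # -> (64, 64, 64)
```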
