Skip to content

Commit 7aafc3b

Browse files
committed
Allow opt-out of implicit bounds-checking
KernelAbstractions currently creates kernels that look like:
```
if __validindex(ctx)
    # Body
end
```
This is problematic due to the convergence requirement on `@synchronize`.
1 parent 3bb80ac commit 7aafc3b

18 files changed

+1835
-7
lines changed

benchmark/gemm/Project.toml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
[deps]
2+
AMDGPU = "21141c5a-9bdb-4563-92ae-f87d6854732e"
3+
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
4+
KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"

benchmark/gemm/amd_gemm.jl

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
using AMDGPU
2+
3+
include("ka_gemm.jl")
4+
5+
"""
    gemm!(A, B, C)

Device-side matrix product kernel written directly against the AMDGPU indexing
intrinsics.

Each work-item derives one `(row, col)` pair from its workgroup and work-item
indices and stores the dot product of row `row` of `A` with column `col` of `B`
into `C[row, col]`. Work-items whose pair lies outside `size(A, 1)` by
`size(B, 2)` write nothing. Always returns `nothing`.
"""
function gemm!(A, B, C)
    # 1-based global position of this work-item along x (rows) and y (columns).
    r = workitemIdx().x + (workgroupIdx().x - 1) * workgroupDim().x
    c = workitemIdx().y + (workgroupIdx().y - 1) * workgroupDim().y

    acc = zero(eltype(C))

    # The launch grid may overhang the matrix; surplus work-items exit here.
    (r <= size(A, 1) && c <= size(B, 2)) || return

    for k = 1:size(A, 2)
        @inbounds acc += A[r, k] * B[k, c]
    end
    @inbounds C[r, c] = acc

    return
end
20+
21+
"""
    measure(T = Float32, Ns = (512, 1024, 2048, 4096, 6144, 8192, 12288, 14336); nrep = 10)

Time the KernelAbstractions kernel `ka_gemm` against the hand-written `gemm!`
kernel on the ROCm backend for square `N`-by-`N` matrices with element type `T`,
for every `N` in `Ns`.

Both kernels are launched once before timing so that first-launch compilation
is not counted. Each kernel is then launched `nrep` times, synchronizing after
every launch, and the mean time per launch and the derived GFLOP/s
(`2N^3` operations per product) are reported through `@info`.

# Keywords
- `nrep`: number of timed launches per kernel and problem size (default `10`).
"""
function measure(T = Float32, Ns = (512, 1024, 2048, 4096, 6144, 8192, 12288, 14336);
                 nrep = 10)
    backend = ROCBackend()
    kernel = ka_gemm(backend)
    BLOCK_SIZE = 32

    for N in Ns
        A = AMDGPU.ROCArray{T,2}(undef, N, N)
        B = AMDGPU.ROCArray{T,2}(undef, N, N)
        C = AMDGPU.zeros(T, N, N)

        AMDGPU.rand!(A)
        AMDGPU.rand!(B)

        grid_rows = cld(N, BLOCK_SIZE)
        grid_cols = cld(N, BLOCK_SIZE)
        grid = (grid_rows, grid_cols)
        threads = (BLOCK_SIZE, BLOCK_SIZE)

        # Warm-up: the first launch of each kernel triggers compilation, which
        # would otherwise be folded into the timings below.
        kernel(A, B, C, ndrange=size(C), workgroupsize=(BLOCK_SIZE, BLOCK_SIZE))
        KernelAbstractions.synchronize(backend)
        AMDGPU.@roc groupsize = threads gridsize = grid gemm!(A, B, C)
        AMDGPU.synchronize()

        t_ka = @elapsed for _ in 1:nrep
            kernel(A, B, C, ndrange=size(C), workgroupsize=(BLOCK_SIZE, BLOCK_SIZE))
            KernelAbstractions.synchronize(backend)
        end
        t_amd = @elapsed for _ in 1:nrep
            AMDGPU.@roc groupsize = threads gridsize = grid gemm!(A, B, C)
            AMDGPU.synchronize()
        end

        # One N x N product costs 2N^3 floating-point operations.
        gflops_amd = ((2 * N * N * N) * 1E-9) / (t_amd / nrep)
        gflops_ka = ((2 * N * N * N) * 1E-9) / (t_ka / nrep)
        @info "Measuring" N grid threads t_ka=t_ka/nrep t_amd=t_amd/nrep gflops_amd gflops_ka
    end
end

benchmark/gemm/cuda_gemm.jl

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
using CUDA
2+
3+
include("ka_gemm.jl")
4+
5+
"""
    gemm!(A, B, C)

Device-side matrix product kernel written directly against the CUDA indexing
intrinsics.

Each thread derives one `(row, col)` pair from its block and thread indices and
stores the dot product of row `row` of `A` with column `col` of `B` into
`C[row, col]`. Threads whose pair lies outside `size(A, 1)` by `size(B, 2)`
write nothing. Always returns `nothing`.
"""
function gemm!(A, B, C)
    # 1-based global position of this thread along x (rows) and y (columns).
    r = CUDA.threadIdx().x + (CUDA.blockIdx().x - 1) * CUDA.blockDim().x
    c = CUDA.threadIdx().y + (CUDA.blockIdx().y - 1) * CUDA.blockDim().y

    acc = zero(eltype(C))

    # The launch grid may overhang the matrix; surplus threads exit here.
    (r <= size(A, 1) && c <= size(B, 2)) || return

    for k = 1:size(A, 2)
        @inbounds acc += A[r, k] * B[k, c]
    end
    @inbounds C[r, c] = acc

    return
end
20+
21+
"""
    measure(T = Float32, Ns = (512, 1024, 2048, 4096, 6144, 8192, 12288, 14336))

Profile the hand-written `gemm!` kernel and the KernelAbstractions kernel
`ka_gemm` on the CUDA backend for square `N`-by-`N` matrices with element type
`T`, for every `N` in `Ns`.

For each size, both kernels are launched ten times (synchronizing after every
launch) inside a single `CUDA.@profile` region, and the resulting profile is
shown with `display`.
"""
function measure(T = Float32, Ns = (512, 1024, 2048, 4096, 6144, 8192, 12288, 14336))
    # Bind the backend once: it is needed both to instantiate the kernel and to
    # synchronize after each KernelAbstractions launch.
    backend = CUDABackend()
    kernel = ka_gemm(backend)
    BLOCK_SIZE = 32

    for N in Ns
        A = CUDA.CuArray{T,2}(undef, N, N)
        B = CUDA.CuArray{T,2}(undef, N, N)
        C = CUDA.zeros(T, N, N)

        CUDA.rand!(A)
        CUDA.rand!(B)

        grid_rows = cld(N, BLOCK_SIZE)
        grid_cols = cld(N, BLOCK_SIZE)
        blocks = (grid_rows, grid_cols)
        threads = (BLOCK_SIZE, BLOCK_SIZE)

        @info "Measuring" N blocks threads
        p = CUDA.@profile begin
            for _ in 1:10
                CUDA.@cuda threads = threads blocks = blocks gemm!(A, B, C)
                CUDA.synchronize()

                kernel(A, B, C, ndrange=size(C), workgroupsize=(BLOCK_SIZE, BLOCK_SIZE))
                KernelAbstractions.synchronize(backend)
            end
        end
        display(p)
    end
end

benchmark/gemm/ka_gemm.jl

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
using KernelAbstractions
2+
3+
# KernelAbstractions matrix product: the work-item at global index (row, col)
# stores the dot product of row `row` of A and column `col` of B in C[row, col].
# No explicit range check is written in the body.
@kernel function ka_gemm(A, B, C)
    row, col = @index(Global, NTuple)

    # Accumulate in the element type of the output matrix.
    acc = zero(eltype(C))
    for k = 1:size(A, 2)
        @inbounds acc += A[row, k] * B[k, col]
    end
    @inbounds C[row, col] = acc
end

benchmark/tune.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
[{"Julia":"1.10.0-beta3","BenchmarkTools":"1.0.0"},[["BenchmarkGroup",{"data":{"saxpy":["BenchmarkGroup",{"data":{"default":["BenchmarkGroup",{"data":{"Float32":["BenchmarkGroup",{"data":{"4096":["Parameters",{"gctrial":true,"time_tolerance":0.05,"samples":10000,"evals":5,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"256":["Parameters",{"gctrial":true,"time_tolerance":0.05,"samples":10000,"evals":10,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"1024":["Parameters",{"gctrial":true,"time_tolerance":0.05,"samples":10000,"evals":55,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"1048576":["Parameters",{"gctrial":true,"time_tolerance":0.05,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"512":["Parameters",{"gctrial":true,"time_tolerance":0.05,"samples":10000,"evals":10,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"64":["Parameters",{"gctrial":true,"time_tolerance":0.05,"samples":10000,"evals":11,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"2048":["Parameters",{"gctrial":true,"time_tolerance":0.05,"samples":10000,"evals":7,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"16384":["Parameters",{"gctrial":true,"time_tolerance":0.05,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"32768":["Parameters",{"gctrial":true,"time_tolerance":0.05,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"65536":["Parameters",{"gctrial":true,"time_tolerance":0.05,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"262144":["Parameters",{"gctrial":true,"time_tolerance":0.05,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}]},"tags":[]}],"Float64":["BenchmarkGroup",{"data":{"4096":["Parameters",{"gctr
ial":true,"time_tolerance":0.05,"samples":10000,"evals":5,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"256":["Parameters",{"gctrial":true,"time_tolerance":0.05,"samples":10000,"evals":10,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"1024":["Parameters",{"gctrial":true,"time_tolerance":0.05,"samples":10000,"evals":39,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"1048576":["Parameters",{"gctrial":true,"time_tolerance":0.05,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"512":["Parameters",{"gctrial":true,"time_tolerance":0.05,"samples":10000,"evals":10,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"64":["Parameters",{"gctrial":true,"time_tolerance":0.05,"samples":10000,"evals":10,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"2048":["Parameters",{"gctrial":true,"time_tolerance":0.05,"samples":10000,"evals":7,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"16384":["Parameters",{"gctrial":true,"time_tolerance":0.05,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"32768":["Parameters",{"gctrial":true,"time_tolerance":0.05,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"65536":["Parameters",{"gctrial":true,"time_tolerance":0.05,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"262144":["Parameters",{"gctrial":true,"time_tolerance":0.05,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}]},"tags":[]}],"Float16":["BenchmarkGroup",{"data":{"4096":["Parameters",{"gctrial":true,"time_tolerance":0.05,"samples":10000,"evals":4,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"256":["Parameters",{"gctrial":true,"time_tolerance":0.05,"samples":10000,"evals":10,
"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"1024":["Parameters",{"gctrial":true,"time_tolerance":0.05,"samples":10000,"evals":10,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"1048576":["Parameters",{"gctrial":true,"time_tolerance":0.05,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"512":["Parameters",{"gctrial":true,"time_tolerance":0.05,"samples":10000,"evals":10,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"64":["Parameters",{"gctrial":true,"time_tolerance":0.05,"samples":10000,"evals":10,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"2048":["Parameters",{"gctrial":true,"time_tolerance":0.05,"samples":10000,"evals":5,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"16384":["Parameters",{"gctrial":true,"time_tolerance":0.05,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"32768":["Parameters",{"gctrial":true,"time_tolerance":0.05,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"65536":["Parameters",{"gctrial":true,"time_tolerance":0.05,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"262144":["Parameters",{"gctrial":true,"time_tolerance":0.05,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}]},"tags":[]}]},"tags":[]}],"static 
workgroup=(1024,)":["BenchmarkGroup",{"data":{"Float32":["BenchmarkGroup",{"data":{"4096":["Parameters",{"gctrial":true,"time_tolerance":0.05,"samples":10000,"evals":4,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"256":["Parameters",{"gctrial":true,"time_tolerance":0.05,"samples":10000,"evals":9,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"1024":["Parameters",{"gctrial":true,"time_tolerance":0.05,"samples":10000,"evals":10,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"1048576":["Parameters",{"gctrial":true,"time_tolerance":0.05,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"512":["Parameters",{"gctrial":true,"time_tolerance":0.05,"samples":10000,"evals":9,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"64":["Parameters",{"gctrial":true,"time_tolerance":0.05,"samples":10000,"evals":9,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"2048":["Parameters",{"gctrial":true,"time_tolerance":0.05,"samples":10000,"evals":6,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"16384":["Parameters",{"gctrial":true,"time_tolerance":0.05,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"32768":["Parameters",{"gctrial":true,"time_tolerance":0.05,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"65536":["Parameters",{"gctrial":true,"time_tolerance":0.05,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"262144":["Parameters",{"gctrial":true,"time_tolerance":0.05,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}]},"tags":[]}],"Float64":["BenchmarkGroup",{"data":{"4096":["Parameters",{"gctrial":true,"time_tolerance":0.05,"samples":10000,"evals":3,"gcsample":false,"seconds":5.0,"overhead":0.0,"mem
ory_tolerance":0.01}],"256":["Parameters",{"gctrial":true,"time_tolerance":0.05,"samples":10000,"evals":9,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"1024":["Parameters",{"gctrial":true,"time_tolerance":0.05,"samples":10000,"evals":10,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"1048576":["Parameters",{"gctrial":true,"time_tolerance":0.05,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"512":["Parameters",{"gctrial":true,"time_tolerance":0.05,"samples":10000,"evals":9,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"64":["Parameters",{"gctrial":true,"time_tolerance":0.05,"samples":10000,"evals":9,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"2048":["Parameters",{"gctrial":true,"time_tolerance":0.05,"samples":10000,"evals":5,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"16384":["Parameters",{"gctrial":true,"time_tolerance":0.05,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"32768":["Parameters",{"gctrial":true,"time_tolerance":0.05,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"65536":["Parameters",{"gctrial":true,"time_tolerance":0.05,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"262144":["Parameters",{"gctrial":true,"time_tolerance":0.05,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}]},"tags":[]}],"Float16":["BenchmarkGroup",{"data":{"4096":["Parameters",{"gctrial":true,"time_tolerance":0.05,"samples":10000,"evals":4,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"256":["Parameters",{"gctrial":true,"time_tolerance":0.05,"samples":10000,"evals":9,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"1024":["Parameters",{"gctrial":true,"ti
me_tolerance":0.05,"samples":10000,"evals":10,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"1048576":["Parameters",{"gctrial":true,"time_tolerance":0.05,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"512":["Parameters",{"gctrial":true,"time_tolerance":0.05,"samples":10000,"evals":9,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"64":["Parameters",{"gctrial":true,"time_tolerance":0.05,"samples":10000,"evals":9,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"2048":["Parameters",{"gctrial":true,"time_tolerance":0.05,"samples":10000,"evals":5,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"16384":["Parameters",{"gctrial":true,"time_tolerance":0.05,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"32768":["Parameters",{"gctrial":true,"time_tolerance":0.05,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"65536":["Parameters",{"gctrial":true,"time_tolerance":0.05,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}],"262144":["Parameters",{"gctrial":true,"time_tolerance":0.05,"samples":10000,"evals":1,"gcsample":false,"seconds":5.0,"overhead":0.0,"memory_tolerance":0.01}]},"tags":[]}]},"tags":[]}]},"tags":[]}]},"tags":[]}]]]

csp.jl

Lines changed: 47 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,47 @@
1+
# Sketch enumerating the places a `@synchronize` can appear in a kernel body:
# in straight-line code, inside a `for` loop, inside an `if`, and inside a
# `while`. The "Region N" comments mark the code segments delimited by the
# synchronization points.
# NOTE(review): `cond` is not defined in this block, so this looks like
# pseudo-code rather than a runnable function. Confirm before relying on it.
function kernel()
    # Region 1
    @synchronize
    # Region 2
    for i in 1:10
        # Region 3
        @synchronize
        # Region 4
    end
    if cond
        @synchronize
    end
    while cond
        @synchronize
    end
end
17+
18+
"""
    csp_kernel(jump)

Sketch of a kernel body split into labelled regions, entered by number.

- `jump == 1` enters region 1, which returns `2`.
- `jump == 2` enters region 2, which returns `-1`.
- `jump == -1` returns `nothing` immediately.
- Any other value raises `error("Unreachable")`.

Regions 3 and 4 are placeholders that no `jump` value currently enters.

NOTE(review): the return value presumably names the next region to resume at,
with `-1` meaning "finished". Confirm against the intended design.
"""
function csp_kernel(jump)
    # Dispatch to the requested region.
    if jump == 1
        @goto region1
    elseif jump == 2
        @goto region2
    elseif jump == -1
        return
    else
        error("Unreachable")
    end

    @label region1
    let
        # Region 1
        return 2
    end
    @label region2
    let
        # Region 2
        return -1
    end
    @label region3
    let
        for i in 1:10
            # Region 3
            # TODO: Save state
        end
    end
    @label region4
    # Explicit return so the trailing label is not the function's last expression.
    return
end

examples/histogram.jl

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ function create_histogram(input)
1313
end
1414

1515
# This is a 1D histogram kernel where the histogramming happens on shmem
16-
@kernel function histogram_kernel!(histogram_output, input)
16+
@kernel implicit_validindex = false function histogram_kernel!(histogram_output, input)
1717
tid = @index(Global, Linear)
1818
lid = @index(Local, Linear)
1919

log

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
The latest version of Julia in the `1.10` channel is 1.10.7+0.x64.linux.gnu. You currently have `1.10.6+0.x64.linux.gnu` installed. Run:
2+
3+
juliaup update
4+
5+
in your terminal shell to install Julia 1.10.7+0.x64.linux.gnu and update the `1.10` channel to that version.
6+
; SPIR-V
7+
; Version: 1.0
8+
; Generator: Khronos LLVM/SPIR-V Translator; 14
9+
; Bound: 6
10+
; Schema: 0
11+
OpCapability Addresses
12+
OpCapability Linkage
13+
OpCapability Kernel
14+
%1 = OpExtInstImport "OpenCL.std"
15+
OpMemoryModel Physical64 OpenCL
16+
OpSource OpenCL_C 200000
17+
OpName %julia_f_379 "julia_f_379"
18+
OpName %top "top"
19+
OpDecorate %julia_f_379 LinkageAttributes "julia_f_379" Export
20+
%void = OpTypeVoid
21+
%3 = OpTypeFunction %void
22+
%julia_f_379 = OpFunction %void None %3
23+
%top = OpLabel
24+
OpReturn
25+
OpFunctionEnd
26+
; @ /home/vchuravy/src/KernelAbstractions/test.jl:3 within `f`
27+
define void @julia_f_291() local_unnamed_addr {
28+
top:
29+
; @ /home/vchuravy/src/KernelAbstractions/test.jl:4 within `f`
30+
ret void
31+
}

0 commit comments

Comments
 (0)