-
Notifications
You must be signed in to change notification settings - Fork 5
Open
Description
Ref FluxML/NNlib.jl#487.
Consider this kernel:
# Scatter kernel: each work-item combines one element of `src` into `dst`
# at the position given by the matching element of `idxs`.
#   op   - binary operator handed to `Atomix.modify!` (e.g. `+`, `-`)
#   dst  - destination array, updated in place
#   src  - source array, read at the work-item's global index
#   idxs - array of indices; `idxs[i]` selects the target position in `dst`
@kernel function _scatter!(op::OP, dst, src, idxs) where OP
# Global index of this work-item.
i = @index(Global)
# Convert the stored index to a tuple so it can address `dst`.
idx = Tuple(idxs[i])
# Update `dst` at `idx` with `op` and `src[i]` through an Atomix reference.
Atomix.modify!(Atomix.IndexableRef(dst, idx), op, src[i])
end
On Julia 1.9-rc2 with CUDA, this fails for the `-` op, but works for `+`, `*`, `/`, `min`, and `max`:
julia> using CUDA, NNlib
julia> x = CUDA.ones(Float32, 3, 4);
julia> idxs= cu([1 2 3 4; 4 3 2 1; 3 5 5 3]);
julia> y = NNlib.scatter(+, x, idxs);
julia> y = NNlib.scatter(-, x, idxs);
ERROR: LLVM error: Cannot select: 0x817b8d0: f32,ch = <<Unknown DAG Node>><(load store seq_cst (s32) on %ir.33, addrspace 1)> 0x817bad8:1, 0x7efa020, 0x817bad8, /home/pxl-th/.julia/packages/LLVM/TLGyi/src/interop/base.jl:40 @[ /home/pxl-th/.julia/packages/UnsafeAtomicsLLVM/xcnBP/src/atomics.jl:255 @[ /home/pxl-th/.julia/packages/UnsafeAtomicsLLVM/xcnBP/src/atomics.jl:255 @[ /home/pxl-th/.julia/packages/UnsafeAtomicsLLVM/xcnBP/src/atomics.jl:359 @[ /home/pxl-th/.julia/packages/UnsafeAtomicsLLVM/xcnBP/src/internal.jl:20 @[ /home/pxl-th/.julia/packages/Atomix/F9VIX/src/core.jl:33 @[ /home/pxl-th/.julia/packages/Atomix/F9VIX/src/generic.jl:120 @[ /home/pxl-th/code/NNlib.jl/src/scatter.jl:112 @[ /home/pxl-th/.julia/packages/KernelAbstractions/XhtMv/src/macros.jl:81 @[ none:0 ] ] ] ] ] ] ] ] ]
0x7efa020: i64 = add 0x7ef93f0, Constant:i64<-4>, /home/pxl-th/.julia/packages/LLVM/TLGyi/src/interop/pointer.jl:114 @[ /home/pxl-th/.julia/packages/CUDA/is36v/src/device/array.jl:52 @[ /home/pxl-th/.julia/packages/Atomix/F9VIX/src/references.jl:99 @[ /home/pxl-th/.julia/packages/Atomix/F9VIX/src/core.jl:30 @[ /home/pxl-th/.julia/packages/Atomix/F9VIX/src/generic.jl:120 @[ /home/pxl-th/code/NNlib.jl/src/scatter.jl:112 @[ /home/pxl-th/.julia/packages/KernelAbstractions/XhtMv/src/macros.jl:81 @[ none:0 ] ] ] ] ] ] ]
0x7ef93f0: i64 = add 0x817b798, 0x7c80840, /home/pxl-th/.julia/packages/LLVM/TLGyi/src/interop/pointer.jl:114 @[ /home/pxl-th/.julia/packages/CUDA/is36v/src/device/array.jl:52 @[ /home/pxl-th/.julia/packages/Atomix/F9VIX/src/references.jl:99 @[ /home/pxl-th/.julia/packages/Atomix/F9VIX/src/core.jl:30 @[ /home/pxl-th/.julia/packages/Atomix/F9VIX/src/generic.jl:120 @[ /home/pxl-th/code/NNlib.jl/src/scatter.jl:112 @[ /home/pxl-th/.julia/packages/KernelAbstractions/XhtMv/src/macros.jl:81 @[ none:0 ] ] ] ] ] ] ]
0x817b798: i64,ch = CopyFromReg 0x7569e78, Register:i64 %0, /home/pxl-th/.julia/packages/LLVM/TLGyi/src/interop/pointer.jl:114 @[ /home/pxl-th/.julia/packages/CUDA/is36v/src/device/array.jl:52 @[ /home/pxl-th/.julia/packages/Atomix/F9VIX/src/references.jl:99 @[ /home/pxl-th/.julia/packages/Atomix/F9VIX/src/core.jl:30 @[ /home/pxl-th/.julia/packages/Atomix/F9VIX/src/generic.jl:120 @[ /home/pxl-th/code/NNlib.jl/src/scatter.jl:112 @[ /home/pxl-th/.julia/packages/KernelAbstractions/XhtMv/src/macros.jl:81 @[ none:0 ] ] ] ] ] ] ]
0x817bd48: i64 = Register %0
0x7c80840: i64 = shl 0x817b528, Constant:i32<2>, int.jl:88 @[ abstractarray.jl:1247 @[ /home/pxl-th/.julia/packages/CUDA/is36v/src/device/array.jl:52 @[ /home/pxl-th/.julia/packages/Atomix/F9VIX/src/references.jl:99 @[ /home/pxl-th/.julia/packages/Atomix/F9VIX/src/core.jl:30 @[ /home/pxl-th/.julia/packages/Atomix/F9VIX/src/generic.jl:120 @[ /home/pxl-th/code/NNlib.jl/src/scatter.jl:112 @[ /home/pxl-th/.julia/packages/KernelAbstractions/XhtMv/src/macros.jl:81 @[ none:0 ] ] ] ] ] ] ] ]
0x817b528: i64,ch = CopyFromReg 0x7569e78, Register:i64 %9, int.jl:88 @[ abstractarray.jl:1247 @[ /home/pxl-th/.julia/packages/CUDA/is36v/src/device/array.jl:52 @[ /home/pxl-th/.julia/packages/Atomix/F9VIX/src/references.jl:99 @[ /home/pxl-th/.julia/packages/Atomix/F9VIX/src/core.jl:30 @[ /home/pxl-th/.julia/packages/Atomix/F9VIX/src/generic.jl:120 @[ /home/pxl-th/code/NNlib.jl/src/scatter.jl:112 @[ /home/pxl-th/.julia/packages/KernelAbstractions/XhtMv/src/macros.jl:81 @[ none:0 ] ] ] ] ] ] ] ]
0x817bc10: i64 = Register %9
0x817bc78: i32 = Constant<2>
0x817b868: i64 = Constant<-4>
0x817bad8: f32,ch = load<(load (s32) from %ir.28, !tbaa !203, addrspace 1)> 0x7569e78, 0x817be80, undef:i64, /home/pxl-th/.julia/packages/LLVM/TLGyi/src/interop/base.jl:40 @[ /home/pxl-th/.julia/packages/LLVM/TLGyi/src/interop/pointer.jl:9 @[ /home/pxl-th/.julia/packages/LLVM/TLGyi/src/interop/pointer.jl:9 @[ /home/pxl-th/.julia/packages/LLVM/TLGyi/src/interop/pointer.jl:85 @[ /home/pxl-th/.julia/packages/CUDA/is36v/src/device/array.jl:91 @[ /home/pxl-th/.julia/packages/CUDA/is36v/src/device/array.jl:85 @[ /home/pxl-th/.julia/packages/CUDA/is36v/src/device/array.jl:164 @[ /home/pxl-th/code/NNlib.jl/src/scatter.jl:112 @[ /home/pxl-th/.julia/packages/KernelAbstractions/XhtMv/src/macros.jl:81 @[ none:0 ] ] ] ] ] ] ] ] ]
0x817be80: i64 = add nuw 0x7ef9f50, Constant:i64<4>, /home/pxl-th/.julia/packages/LLVM/TLGyi/src/interop/base.jl:40 @[ /home/pxl-th/.julia/packages/LLVM/TLGyi/src/interop/pointer.jl:9 @[ /home/pxl-th/.julia/packages/LLVM/TLGyi/src/interop/pointer.jl:9 @[ /home/pxl-th/.julia/packages/LLVM/TLGyi/src/interop/pointer.jl:85 @[ /home/pxl-th/.julia/packages/CUDA/is36v/src/device/array.jl:91 @[ /home/pxl-th/.julia/packages/CUDA/is36v/src/device/array.jl:85 @[ /home/pxl-th/.julia/packages/CUDA/is36v/src/device/array.jl:164 @[ /home/pxl-th/code/NNlib.jl/src/scatter.jl:112 @[ /home/pxl-th/.julia/packages/KernelAbstractions/XhtMv/src/macros.jl:81 @[ none:0 ] ] ] ] ] ] ] ] ]
0x7ef9f50: i64 = add 0x7ef9e80, 0x7ef9db0, /home/pxl-th/.julia/packages/LLVM/TLGyi/src/interop/base.jl:40 @[ /home/pxl-th/.julia/packages/LLVM/TLGyi/src/interop/pointer.jl:9 @[ /home/pxl-th/.julia/packages/LLVM/TLGyi/src/interop/pointer.jl:9 @[ /home/pxl-th/.julia/packages/LLVM/TLGyi/src/interop/pointer.jl:85 @[ /home/pxl-th/.julia/packages/CUDA/is36v/src/device/array.jl:91 @[ /home/pxl-th/.julia/packages/CUDA/is36v/src/device/array.jl:85 @[ /home/pxl-th/.julia/packages/CUDA/is36v/src/device/array.jl:164 @[ /home/pxl-th/code/NNlib.jl/src/scatter.jl:112 @[ /home/pxl-th/.julia/packages/KernelAbstractions/XhtMv/src/macros.jl:81 @[ none:0 ] ] ] ] ] ] ] ] ]
0x7ef9e80: i64,ch = CopyFromReg 0x7569e78, Register:i64 %1, /home/pxl-th/.julia/packages/KernelAbstractions/XhtMv/src/macros.jl:80 @[ none:0 ]
0x817b9a0: i64 = Register %1
0x7ef9db0: i64 = shl 0x7efa0f0, Constant:i32<2>, /home/pxl-th/.julia/packages/LLVM/TLGyi/src/interop/base.jl:40 @[ /home/pxl-th/.julia/packages/LLVM/TLGyi/src/interop/pointer.jl:9 @[ /home/pxl-th/.julia/packages/LLVM/TLGyi/src/interop/pointer.jl:9 @[ /home/pxl-th/.julia/packages/LLVM/TLGyi/src/interop/pointer.jl:85 @[ /home/pxl-th/.julia/packages/CUDA/is36v/src/device/array.jl:91 @[ /home/pxl-th/.julia/packages/CUDA/is36v/src/device/array.jl:85 @[ /home/pxl-th/.julia/packages/CUDA/is36v/src/device/array.jl:164 @[ /home/pxl-th/code/NNlib.jl/src/scatter.jl:112 @[ /home/pxl-th/.julia/packages/KernelAbstractions/XhtMv/src/macros.jl:81 @[ none:0 ] ] ] ] ] ] ] ] ]
0x7efa0f0: i64,ch = CopyFromReg 0x7569e78, Register:i64 %8, /home/pxl-th/.julia/packages/LLVM/TLGyi/src/interop/base.jl:40 @[ /home/pxl-th/.julia/packages/LLVM/TLGyi/src/interop/pointer.jl:9 @[ /home/pxl-th/.julia/packages/LLVM/TLGyi/src/interop/pointer.jl:9 @[ /home/pxl-th/.julia/packages/LLVM/TLGyi/src/interop/pointer.jl:85 @[ /home/pxl-th/.julia/packages/CUDA/is36v/src/device/array.jl:91 @[ /home/pxl-th/.julia/packages/CUDA/is36v/src/device/array.jl:85 @[ /home/pxl-th/.julia/packages/CUDA/is36v/src/device/array.jl:164 @[ /home/pxl-th/code/NNlib.jl/src/scatter.jl:112 @[ /home/pxl-th/.julia/packages/KernelAbstractions/XhtMv/src/macros.jl:81 @[ none:0 ] ] ] ] ] ] ] ] ]
0x817bf50: i64 = Register %8
0x817bc78: i32 = Constant<2>
0x817b4c0: i64 = Constant<4>
0x817b5f8: i64 = undef
In function: _Z13gpu__scatter_16CompilerMetadataI11DynamicSize12DynamicCheckv16CartesianIndicesILi1E5TupleI5OneToI5Int64EEE7NDRangeILi1ES0_S0_S2_ILi1ES3_IS4_IS5_EEES2_ILi1ES3_IS4_IS5_EEEEE1_13CuDeviceArrayI7Float32Li1ELi1EES8_IS9_Li2ELi1EES8_IS5_Li2ELi1EE
Stacktrace:
[1] handle_error(reason::Cstring)
@ LLVM ~/.julia/packages/LLVM/TLGyi/src/core/context.jl:118
[2] LLVMTargetMachineEmitToMemoryBuffer
@ ~/.julia/packages/LLVM/TLGyi/lib/13/libLLVM_h.jl:947 [inlined]
[3] emit(tm::LLVM.TargetMachine, mod::LLVM.Module, filetype::LLVM.API.LLVMCodeGenFileType)
@ LLVM ~/.julia/packages/LLVM/TLGyi/src/targetmachine.jl:45
[4] mcgen(job::GPUCompiler.CompilerJob, mod::LLVM.Module, format::LLVM.API.LLVMCodeGenFileType)
@ GPUCompiler ~/.julia/packages/GPUCompiler/HQBY9/src/mcgen.jl:73
[5] macro expansion
@ ~/.julia/packages/TimerOutputs/LHjFw/src/TimerOutput.jl:253 [inlined]
[6] macro expansion
@ ~/.julia/packages/GPUCompiler/HQBY9/src/driver.jl:424 [inlined]
[7] macro expansion
@ ~/.julia/packages/TimerOutputs/LHjFw/src/TimerOutput.jl:253 [inlined]
[8] macro expansion
@ ~/.julia/packages/GPUCompiler/HQBY9/src/driver.jl:421 [inlined]
[9] emit_asm(job::GPUCompiler.CompilerJob, ir::LLVM.Module; strip::Bool, validate::Bool, format::LLVM.API.LLVMCodeGenFileType)
@ GPUCompiler ~/.julia/packages/GPUCompiler/HQBY9/src/utils.jl:83
[10] emit_asm
@ ~/.julia/packages/GPUCompiler/HQBY9/src/utils.jl:77 [inlined]
[11] codegen(output::Symbol, job::GPUCompiler.CompilerJob; libraries::Bool, deferred_codegen::Bool, optimize::Bool, cleanup::Bool, strip::Bool, validate::Bool, only_entry::Bool, parent_job::Nothing, ctx::LLVM.ThreadSafeContext)
@ GPUCompiler ~/.julia/packages/GPUCompiler/HQBY9/src/driver.jl:140
[12] codegen
@ ~/.julia/packages/GPUCompiler/HQBY9/src/driver.jl:94 [inlined]
[13] compile(target::Symbol, job::GPUCompiler.CompilerJob; libraries::Bool, deferred_codegen::Bool, optimize::Bool, cleanup::Bool, strip::Bool, validate::Bool, only_entry::Bool, ctx::LLVM.ThreadSafeContext)
@ GPUCompiler ~/.julia/packages/GPUCompiler/HQBY9/src/driver.jl:90
[14] compile
@ ~/.julia/packages/GPUCompiler/HQBY9/src/driver.jl:81 [inlined]
[15] compile(job::GPUCompiler.CompilerJob, ctx::LLVM.ThreadSafeContext)
@ CUDA ~/.julia/packages/CUDA/is36v/src/compiler/compilation.jl:105
[16] #203
@ ~/.julia/packages/CUDA/is36v/src/compiler/compilation.jl:100 [inlined]
[17] LLVM.ThreadSafeContext(f::CUDA.var"#203#204"{GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}})
@ LLVM ~/.julia/packages/LLVM/TLGyi/src/executionengine/ts_module.jl:14
[18] JuliaContext(f::CUDA.var"#203#204"{GPUCompiler.CompilerJob{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}})
@ GPUCompiler ~/.julia/packages/GPUCompiler/HQBY9/src/driver.jl:35
[19] compile
@ ~/.julia/packages/CUDA/is36v/src/compiler/compilation.jl:99 [inlined]
[20] actual_compilation(cache::Dict{UInt64, Any}, key::UInt64, cfg::GPUCompiler.CompilerConfig{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}, ft::Type, tt::Type, compiler::typeof(CUDA.compile), linker::typeof(CUDA.link))
@ GPUCompiler ~/.julia/packages/GPUCompiler/HQBY9/src/cache.jl:53
[21] cached_compilation(cache::Dict{UInt64, Any}, cfg::GPUCompiler.CompilerConfig{GPUCompiler.PTXCompilerTarget, CUDA.CUDACompilerParams}, ft::Type, tt::Type, compiler::Function, linker::Function)
@ GPUCompiler ~/.julia/packages/GPUCompiler/HQBY9/src/cache.jl:37
[22] macro expansion
@ ~/.julia/packages/CUDA/is36v/src/compiler/execution.jl:310 [inlined]
[23] macro expansion
@ ./lock.jl:267 [inlined]
[24] cufunction(f::typeof(NNlib.gpu__scatter!), tt::Type{Tuple{KernelAbstractions.CompilerMetadata{KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicCheck, Nothing, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, KernelAbstractions.NDIteration.NDRange{1, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicSize, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}, CartesianIndices{1, Tuple{Base.OneTo{Int64}}}}}, typeof(-), CuDeviceVector{Float32, 1}, CuDeviceMatrix{Float32, 1}, CuDeviceMatrix{Int64, 1}}}; kwargs::Base.Pairs{Symbol, Union{Nothing, Bool}, Tuple{Symbol, Symbol}, NamedTuple{(:always_inline, :maxthreads), Tuple{Bool, Nothing}}})
@ CUDA ~/.julia/packages/CUDA/is36v/src/compiler/execution.jl:306
[25] macro expansion
@ ~/.julia/packages/CUDA/is36v/src/compiler/execution.jl:104 [inlined]
[26] (::KernelAbstractions.Kernel{CUDABackend, KernelAbstractions.NDIteration.DynamicSize, KernelAbstractions.NDIteration.DynamicSize, typeof(NNlib.gpu__scatter!)})(::Function, ::Vararg{Any}; ndrange::Int64, workgroupsize::Nothing)
@ CUDA.CUDAKernels ~/.julia/packages/CUDA/is36v/src/CUDAKernels.jl:116
[27] Kernel
@ ~/.julia/packages/CUDA/is36v/src/CUDAKernels.jl:102 [inlined]
[28] scatter!
@ ~/code/NNlib.jl/src/scatter.jl:104 [inlined]
[29] scatter(op::typeof(-), src::CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, idx::CuArray{Int64, 2, CUDA.Mem.DeviceBuffer}; init::Nothing, dstsize::Nothing)
@ NNlib ~/code/NNlib.jl/src/scatter.jl:177
[30] scatter(op::typeof(-), src::CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, idx::CuArray{Int64, 2, CUDA.Mem.DeviceBuffer})
@ NNlib ~/code/NNlib.jl/src/scatter.jl:168
[31] top-level scope
@ REPL[7]:1
[32] top-level scope
@ ~/.julia/packages/CUDA/is36v/src/initialization.jl:162
Additionally, irrespective of backend, replacing:
Atomix.modify!(Atomix.IndexableRef(dst, idx), op, src[i])
with:
@atomic dst[idx...] = op(dst[idx...], src[i])
does not perform `op` atomically.
nick4f42
Metadata
Metadata
Assignees
Labels
No labels