-
Notifications
You must be signed in to change notification settings - Fork 11
Open
Description
It would be very nice to have high-level methods of norm
and dot
for JuliaGPU/GPUArrays.jl#66 and JuliaGPU/GPUArrays.jl#122.
It seems to be possible to get some working version of high-level functions by adding to highlevel.jl:
## NRM2
import Base.LinAlg.BLAS: nrm2
# Generate one high-level `nrm2(n, x, incx; queue)` method per element type,
# each forwarding to the clBLAS routine named by `func`.
#
# Arguments of the generated method:
#   n     - number of elements of `x` to include in the norm
#   x     - device vector (`CLArray`) holding the input data
#   incx  - stride between consecutive elements of `x`
#   queue - command queue to run on (defaults to the queue of `x`)
#
# Returns the first (and only) element read back from the result buffer.
#
# NOTE(review): for the two complex element types the result is read back as
# `$elty` as well; confirm against the clBLAS nrm2 documentation that the
# routine really writes a value of that type rather than a real scalar.
for (func, elty) in [(:clblasSnrm2, Float32), (:clblasDnrm2, Float64),
                     (:clblasCnrm2, CL_float2), (:clblasZnrm2, CL_double2)]
    @eval function nrm2(n::Integer, x::CLArray{$elty}, incx::Integer;
                        queue=cl.queue(x))
        ctx = cl.context(x)
        # The clBLAS routine writes its result into a one-element device buffer
        # and needs an additional scratch buffer for intermediate values.
        norm2_buff = cl.Buffer($elty, ctx, :w, 1)
        scratch_buff = cl.Buffer($elty, ctx, :rw, 2*length(x))
        try
            $func(Csize_t(n), pointer(norm2_buff), Csize_t(0), pointer(x), Csize_t(0),
                  Cint(incx), pointer(scratch_buff), [queue])
            # Blocking read (last argument `true`), so `result` is valid on return.
            result = Vector{$elty}(1)
            cl.enqueue_read_buffer(queue, norm2_buff, result, Csize_t(0), nothing, true)
            return result[1]
        finally
            # Run the temporaries' finalizers right away instead of waiting for
            # the garbage collector: calling this method repeatedly otherwise
            # piles up device allocations until OpenCL reports
            # CL_MEM_OBJECT_ALLOCATION_FAILURE.
            # NOTE(review): presumably the finalizer registered by OpenCL.jl
            # releases the underlying cl_mem object -- confirm.
            finalize(scratch_buff)
            finalize(norm2_buff)
        end
    end
end
However, this seems to be far from optimal: the corresponding clBLAS functions require a temporary buffer, scratch_buff,
which has to be allocated on every call. Here are some benchmarks using the implementation above (I did not open a PR because I do not think it is good enough yet):
julia> using CuArrays, CLArrays, GPUArrays, BenchmarkTools
julia> v = rand(Float32, 100^3); dvu = CuArray(v); dvl = CLArray(v);
julia> @benchmark norm($v)
BenchmarkTools.Trial:
memory estimate: 0 bytes
allocs estimate: 0
--------------
minimum time: 225.994 μs (0.00% GC)
median time: 226.209 μs (0.00% GC)
mean time: 226.688 μs (0.00% GC)
maximum time: 337.346 μs (0.00% GC)
--------------
samples: 10000
evals/sample: 1
julia> @benchmark norm($dvu)
BenchmarkTools.Trial:
memory estimate: 128 bytes
allocs estimate: 2
--------------
minimum time: 55.020 μs (0.00% GC)
median time: 64.425 μs (0.00% GC)
mean time: 62.889 μs (0.00% GC)
maximum time: 582.534 μs (0.00% GC)
--------------
samples: 10000
evals/sample: 1
julia> @benchmark sqrt(mapreduce(abs2, +, dvl))
BenchmarkTools.Trial:
memory estimate: 13.03 KiB
allocs estimate: 298
--------------
minimum time: 226.478 μs (0.00% GC)
median time: 251.947 μs (0.00% GC)
mean time: 261.367 μs (1.16% GC)
maximum time: 17.595 ms (29.41% GC)
--------------
samples: 10000
evals/sample: 1
julia> @time LinAlg.BLAS.nrm2(length(dvl), GPUArrays.blasbuffer(dvl), 1)
0.001456 seconds (83 allocations: 2.438 KiB)
577.3568f0
julia> @time LinAlg.BLAS.nrm2(length(dvl), GPUArrays.blasbuffer(dvl), 1)
0.000734 seconds (83 allocations: 2.438 KiB)
577.3568f0
julia> @time LinAlg.BLAS.nrm2(length(dvl), GPUArrays.blasbuffer(dvl), 1)
0.001154 seconds (83 allocations: 2.438 KiB)
577.3568f0
julia> @time LinAlg.BLAS.nrm2(length(dvl), GPUArrays.blasbuffer(dvl), 1)
0.001699 seconds (83 allocations: 2.438 KiB)
577.3568f0
julia> @time LinAlg.BLAS.nrm2(length(dvl), GPUArrays.blasbuffer(dvl), 1)
0.000760 seconds (83 allocations: 2.438 KiB)
577.3568f0
I have used @time
for the last tests, since I get the following error if I run @benchmark
on LinAlg.BLAS.nrm2(length(dvl), GPUArrays.blasbuffer(dvl), 1)
:
julia> @benchmark LinAlg.BLAS.nrm2(length($dvl), GPUArrays.blasbuffer($dvl), 1)
ERROR: CLError(code=-4, CL_MEM_OBJECT_ALLOCATION_FAILURE)
Stacktrace:
[1] #clblasSnrm2#119(::Array{Ptr{Void},1}, ::Function, ::UInt64, ::Ptr{Void}, ::UInt64, ::Ptr{Void}, ::UInt64, ::Int32, ::Ptr{Void}, ::Array{OpenCL.cl.CmdQueue,1}) at /home/.../.julia/v0.6/CLBLAS/src/macros.jl:132
[2] #nrm2#451(::OpenCL.cl.CmdQueue, ::Function, ::Int64, ::OpenCL.cl.CLArray{Float32,1}, ::Int64) at /home/.../.julia/v0.6/CLBLAS/src/highlevel.jl:57
[3] ##core#743(::CLArrays.CLArray{Float32,1}, ::CLArrays.CLArray{Float32,1}) at /home/.../.julia/v0.6/BenchmarkTools/src/execution.jl:316
[4] ##sample#744(::BenchmarkTools.Parameters) at /home/.../.julia/v0.6/BenchmarkTools/src/execution.jl:324
[5] #_lineartrial#23(::Int64, ::Array{Any,1}, ::Function, ::BenchmarkTools.Benchmark{Symbol("##benchmark#742")}, ::BenchmarkTools.Parameters) at /home/.../.julia/v0.6/BenchmarkTools/src/execution.jl:92
[6] _lineartrial(::BenchmarkTools.Benchmark{Symbol("##benchmark#742")}, ::BenchmarkTools.Parameters) at /home/.../.julia/v0.6/BenchmarkTools/src/execution.jl:84
[7] #lineartrial#20(::Array{Any,1}, ::Function, ::BenchmarkTools.Benchmark{Symbol("##benchmark#742")}, ::BenchmarkTools.Parameters) at /home/.../.julia/v0.6/BenchmarkTools/src/execution.jl:47
[8] #tune!#26(::Bool, ::String, ::Array{Any,1}, ::Function, ::BenchmarkTools.Benchmark{Symbol("##benchmark#742")}, ::BenchmarkTools.Parameters) at /home/.../.julia/v0.6/BenchmarkTools/src/execution.jl:156
[9] tune!(::BenchmarkTools.Benchmark{Symbol("##benchmark#742")}) at /home/.../.julia/v0.6/BenchmarkTools/src/execution.jl:155
chriscoey
Metadata
Metadata
Assignees
Labels
No labels