Skip to content

High-level API for norms and inner products #49

@ranocha

Description

@ranocha

It would be very nice to have high-level methods of norm and dot for JuliaGPU/GPUArrays.jl#66 and JuliaGPU/GPUArrays.jl#122.

It seems to be possible to get some working version of high-level functions by adding to highlevel.jl:

## NRM2
import Base.LinAlg.BLAS: nrm2

# High-level `nrm2(n, x, incx; queue)` methods, one per supported element type, each
# forwarding to the matching low-level clBLAS routine.
#
# Arguments
#   n     - number of elements of `x` to include in the norm
#   x     - device array holding the vector
#   incx  - stride between consecutive elements of `x`
#   queue - command queue to enqueue on (defaults to the queue of `x`)
#
# Returns the value read back from the one-element device result buffer.
#
# NOTE(review): for the complex element types the clBLAS C API names the routines
# `clblasScnrm2`/`clblasDznrm2` and yields a real-valued result; confirm that the
# wrapper names and the `$elty`-typed result buffer used below are right for them.
for (func, elty) in [(:clblasSnrm2, Float32), (:clblasDnrm2, Float64),
                    (:clblasCnrm2, CL_float2), (:clblasZnrm2, CL_double2)]

    @eval function nrm2(n::Integer, x::CLArray{$elty}, incx::Integer;
                        queue=cl.queue(x))
        ctx = cl.context(x)

        # Device-side temporaries: a one-element result slot and the scratch space
        # clBLAS asks for (at least 2*n elements). Sizing by `n` instead of
        # `length(x)` avoids over-allocating for strided input; the lower bound of
        # one element keeps the buffer request non-empty when `n == 0`.
        norm2_buff = cl.Buffer($elty, ctx, :w, 1)
        scratch_buff = cl.Buffer($elty, ctx, :rw, 2 * max(n, 1))

        try
            $func(Csize_t(n), pointer(norm2_buff), Csize_t(0), pointer(x), Csize_t(0),
                  Cint(incx), pointer(scratch_buff), [queue])

            # Blocking read of the single result element back to the host.
            result = Vector{$elty}(1)
            cl.enqueue_read_buffer(queue, norm2_buff, result, Csize_t(0), nothing, true)
            @inbounds norm2 = result[1]

            return norm2
        finally
            # Release the temporaries right away instead of waiting for the GC;
            # otherwise repeated calls in a tight loop can exhaust device memory
            # (presumably the cause of CL_MEM_OBJECT_ALLOCATION_FAILURE under
            # `@benchmark`). Assumes the buffer type registers a finalizer that
            # releases the underlying memory object - TODO confirm.
            finalize(scratch_buff)
            finalize(norm2_buff)
        end
    end

end

However, this seems to be far from optimal. The corresponding clBLAS functions use a temporary buffer scratch_buff that has to be allocated for each call. Here are some benchmarks using the implementation above (I did not open a PR because I do not think this implementation is good enough):

julia> using CuArrays, CLArrays, GPUArrays, BenchmarkTools

julia> v = rand(Float32, 100^3); dvu = CuArray(v); dvl = CLArray(v);

julia> @benchmark norm($v)
BenchmarkTools.Trial: 
  memory estimate:  0 bytes
  allocs estimate:  0
  --------------
  minimum time:     225.994 μs (0.00% GC)
  median time:      226.209 μs (0.00% GC)
  mean time:        226.688 μs (0.00% GC)
  maximum time:     337.346 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1

julia> @benchmark norm($dvu)
BenchmarkTools.Trial: 
  memory estimate:  128 bytes
  allocs estimate:  2
  --------------
  minimum time:     55.020 μs (0.00% GC)
  median time:      64.425 μs (0.00% GC)
  mean time:        62.889 μs (0.00% GC)
  maximum time:     582.534 μs (0.00% GC)
  --------------
  samples:          10000
  evals/sample:     1

julia> @benchmark sqrt(mapreduce(abs2, +, dvl))
BenchmarkTools.Trial: 
  memory estimate:  13.03 KiB
  allocs estimate:  298
  --------------
  minimum time:     226.478 μs (0.00% GC)
  median time:      251.947 μs (0.00% GC)
  mean time:        261.367 μs (1.16% GC)
  maximum time:     17.595 ms (29.41% GC)
  --------------
  samples:          10000
  evals/sample:     1

julia> @time LinAlg.BLAS.nrm2(length(dvl), GPUArrays.blasbuffer(dvl), 1)
  0.001456 seconds (83 allocations: 2.438 KiB)
577.3568f0

julia> @time LinAlg.BLAS.nrm2(length(dvl), GPUArrays.blasbuffer(dvl), 1)
  0.000734 seconds (83 allocations: 2.438 KiB)
577.3568f0

julia> @time LinAlg.BLAS.nrm2(length(dvl), GPUArrays.blasbuffer(dvl), 1)
  0.001154 seconds (83 allocations: 2.438 KiB)
577.3568f0

julia> @time LinAlg.BLAS.nrm2(length(dvl), GPUArrays.blasbuffer(dvl), 1)
  0.001699 seconds (83 allocations: 2.438 KiB)
577.3568f0

julia> @time LinAlg.BLAS.nrm2(length(dvl), GPUArrays.blasbuffer(dvl), 1)
  0.000760 seconds (83 allocations: 2.438 KiB)
577.3568f0

I have used @time for the last tests, since I get the following error if I run @benchmark on LinAlg.BLAS.nrm2(length(dvl), GPUArrays.blasbuffer(dvl), 1):

julia> @benchmark LinAlg.BLAS.nrm2(length($dvl), GPUArrays.blasbuffer($dvl), 1)
ERROR: CLError(code=-4, CL_MEM_OBJECT_ALLOCATION_FAILURE)
Stacktrace:
 [1] #clblasSnrm2#119(::Array{Ptr{Void},1}, ::Function, ::UInt64, ::Ptr{Void}, ::UInt64, ::Ptr{Void}, ::UInt64, ::Int32, ::Ptr{Void}, ::Array{OpenCL.cl.CmdQueue,1}) at /home/.../.julia/v0.6/CLBLAS/src/macros.jl:132
 [2] #nrm2#451(::OpenCL.cl.CmdQueue, ::Function, ::Int64, ::OpenCL.cl.CLArray{Float32,1}, ::Int64) at /home/.../.julia/v0.6/CLBLAS/src/highlevel.jl:57
 [3] ##core#743(::CLArrays.CLArray{Float32,1}, ::CLArrays.CLArray{Float32,1}) at /home/.../.julia/v0.6/BenchmarkTools/src/execution.jl:316
 [4] ##sample#744(::BenchmarkTools.Parameters) at /home/.../.julia/v0.6/BenchmarkTools/src/execution.jl:324
 [5] #_lineartrial#23(::Int64, ::Array{Any,1}, ::Function, ::BenchmarkTools.Benchmark{Symbol("##benchmark#742")}, ::BenchmarkTools.Parameters) at /home/.../.julia/v0.6/BenchmarkTools/src/execution.jl:92
 [6] _lineartrial(::BenchmarkTools.Benchmark{Symbol("##benchmark#742")}, ::BenchmarkTools.Parameters) at /home/.../.julia/v0.6/BenchmarkTools/src/execution.jl:84
 [7] #lineartrial#20(::Array{Any,1}, ::Function, ::BenchmarkTools.Benchmark{Symbol("##benchmark#742")}, ::BenchmarkTools.Parameters) at /home/.../.julia/v0.6/BenchmarkTools/src/execution.jl:47
 [8] #tune!#26(::Bool, ::String, ::Array{Any,1}, ::Function, ::BenchmarkTools.Benchmark{Symbol("##benchmark#742")}, ::BenchmarkTools.Parameters) at /home/.../.julia/v0.6/BenchmarkTools/src/execution.jl:156
 [9] tune!(::BenchmarkTools.Benchmark{Symbol("##benchmark#742")}) at /home/.../.julia/v0.6/BenchmarkTools/src/execution.jl:155

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions