From 449ab06472ef31f38193d393001ed4f79141e328 Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Wed, 12 Mar 2025 11:01:55 +0100 Subject: [PATCH 1/3] Add Enzyme as a normal test dependency --- test/Project.toml | 3 ++- test/runtests.jl | 10 ++++------ 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/test/Project.toml b/test/Project.toml index c845b2df0..1c67b9dc4 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -1,10 +1,11 @@ [deps] Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e" +Enzyme = "7da242da-08ed-463a-9acd-ee780be4f1d9" InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" SparseArrays = "2f01184e-e22b-5df5-ae63-d93ebab69eaf" SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b" StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" -pocl_jll = "627d6b7a-bbe6-5189-83e7-98cc0a5aeadd" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" +pocl_jll = "627d6b7a-bbe6-5189-83e7-98cc0a5aeadd" diff --git a/test/runtests.jl b/test/runtests.jl index 9dba56e78..8b779d13b 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -36,9 +36,7 @@ struct NewBackend <: KernelAbstractions.GPU end end -# include("extensions/enzyme.jl") -# @static if VERSION >= v"1.7.0" -# @testset "Enzyme" begin -# enzyme_testsuite(CPU, Array) -# end -# end +include("extensions/enzyme.jl") +@testset "Enzyme" begin + enzyme_testsuite(CPU, Array) +end From 2716056faabfc2aa37e3338758c3ccf9885248df Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Wed, 12 Mar 2025 13:24:04 +0100 Subject: [PATCH 2/3] treat Enzyme more as a normal dependency --- .buildkite/pipeline.yml | 9 +++------ .github/workflows/ci.yml | 18 ------------------ 2 files changed, 3 insertions(+), 24 deletions(-) diff --git a/.buildkite/pipeline.yml b/.buildkite/pipeline.yml index 43999a1df..1918a8113 100644 --- a/.buildkite/pipeline.yml +++ b/.buildkite/pipeline.yml @@ -40,12 +40,9 @@ steps: command: | 
julia -e 'println("--- :julia: Instantiating project") using Pkg - try - Pkg.develop([PackageSpec(; path=pwd()), PackageSpec("Enzyme"), PackageSpec("EnzymeCore"), PackageSpec("CUDA")]) - catch err - Pkg.develop(; path=pwd()) - Pkg.add(["CUDA", "Enzyme"]) - end' || exit 3 + Pkg.develop(; path=pwd()) + Pkg.add(["CUDA", "Enzyme"]) + ' || exit 3 julia -e 'println("+++ :julia: Running tests") using CUDA diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 9ed61b671..2b89f1f19 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -128,24 +128,6 @@ jobs: end' echo '[pocl_jll]' > test/LocalPreferences.toml echo 'libpocl_path="${{ github.workspace }}/target/lib/libpocl.so"' >> test/LocalPreferences.toml - - name: "Co-develop Enzyme and KA" - run: | - julia -e ' - using Pkg - withenv("JULIA_PKG_PRECOMPILE_AUTO" => 0) do - Pkg.activate("test") - Pkg.add(["Enzyme", "EnzymeCore"]) - - # to check compatibility, also add Enzyme to the main environment - # (or Pkg.test, which merges both environments, could fail) - Pkg.activate(".") - # Try to co-develop Enzyme and KA - try - Pkg.develop([PackageSpec("Enzyme"), PackageSpec("EnzymeCore")]) - catch err - end - end - ' - uses: julia-actions/julia-buildpkg@v1 - uses: julia-actions/julia-runtest@v1 with: From 043270b6d5f1b4c173560baa8d82ad9459fed75e Mon Sep 17 00:00:00 2001 From: Valentin Churavy Date: Wed, 12 Mar 2025 13:46:18 +0100 Subject: [PATCH 3/3] remove special handling of CPU ---
Review notes (commentary placed after the `---` separator; not part of the commit message):
 * ext/EnzymeCore08Ext.jl: the former `gpu_aug_fwd` is renamed to `aug_fwd`, the name that `_create_tape_kernel` passes to `similar(kernel, aug_fwd)` in this same patch and the name used in ext/EnzymeCore07Ext.jl. Renaming it to `fwd` would leave `aug_fwd` undefined and add an unrelated method to the forward-mode `fwd`.
 * ext/EnzymeCore08Ext.jl: `_create_tape_kernel` now dispatches on `Kernel` instead of `Kernel{<:GPU}`, because this patch deletes the `Kernel{CPU}` method; this matches ext/EnzymeCore07Ext.jl and the `_enzyme_mkcontext` / `_augmented_return` helpers in the same file.
 * NOTE(review): the six unchanged lines between the first two ext/EnzymeCore08Ext.jl hunks (old lines 35-40) presumably still contain `fwd_kernel = similar(kernel, gpu_fwd)`. `gpu_fwd` no longer exists after this patch; confirm against the file and rename it to `fwd`, as is done for ext/EnzymeCore07Ext.jl.
 ext/EnzymeCore07Ext.jl | 126 ++++------------------------------------- ext/EnzymeCore08Ext.jl | 125 +++------------------------------------- 2 files changed, 21 insertions(+), 230 deletions(-) diff --git a/ext/EnzymeCore07Ext.jl b/ext/EnzymeCore07Ext.jl index 93159886c..fe10b76c6 100644 --- a/ext/EnzymeCore07Ext.jl +++ b/ext/EnzymeCore07Ext.jl @@ -1,32 +1,10 @@ -# https://github.com/EnzymeAD/Enzyme.jl/issues/1516 -# On the CPU `autodiff_deferred` can deadlock. 
-# Hence a specialized CPU version -function cpu_fwd(ctx, f, args...) - EnzymeCore.autodiff(Forward, Const(f), Const{Nothing}, Const(ctx), args...) - return nothing -end - -function gpu_fwd(ctx, f, args...) +function fwd(ctx, f, args...) EnzymeCore.autodiff_deferred(Forward, Const(f), Const{Nothing}, Const(ctx), args...) return nothing end function EnzymeRules.forward( - func::Const{<:Kernel{CPU}}, - ::Type{Const{Nothing}}, - args...; - ndrange = nothing, - workgroupsize = nothing, - ) - kernel = func.val - f = kernel.f - fwd_kernel = similar(kernel, cpu_fwd) - - return fwd_kernel(f, args...; ndrange, workgroupsize) -end - -function EnzymeRules.forward( - func::Const{<:Kernel{<:GPU}}, + func::Const{<:Kernel}, ::Type{Const{Nothing}}, args...; ndrange = nothing, @@ -34,48 +12,19 @@ function EnzymeRules.forward( ) kernel = func.val f = kernel.f - fwd_kernel = similar(kernel, gpu_fwd) + fwd_kernel = similar(kernel, fwd) return fwd_kernel(f, args...; ndrange, workgroupsize) end -_enzyme_mkcontext(kernel::Kernel{CPU}, ndrange, iterspace, dynamic) = - mkcontext(kernel, first(blocks(iterspace)), ndrange, iterspace, dynamic) -_enzyme_mkcontext(kernel::Kernel{<:GPU}, ndrange, iterspace, dynamic) = +_enzyme_mkcontext(kernel::Kernel, ndrange, iterspace, dynamic) = mkcontext(kernel, ndrange, iterspace) -_augmented_return(::Kernel{CPU}, subtape, arg_refs, tape_type) = - AugmentedReturn{Nothing, Nothing, Tuple{Array, typeof(arg_refs), typeof(tape_type)}}( - nothing, - nothing, - (subtape, arg_refs, tape_type), -) -_augmented_return(::Kernel{<:GPU}, subtape, arg_refs, tape_type) = +_augmented_return(::Kernel, subtape, arg_refs, tape_type) = AugmentedReturn{Nothing, Nothing, Any}(nothing, nothing, (subtape, arg_refs, tape_type)) function _create_tape_kernel( - kernel::Kernel{CPU}, - ModifiedBetween, - FT, - ctxTy, - ndrange, - iterspace, - args2..., - ) - TapeType = EnzymeCore.tape_type( - ReverseSplitModified(ReverseSplitWithPrimal, ModifiedBetween), - FT, - Const{Nothing}, - 
Const{ctxTy}, - map(Core.Typeof, args2)..., - ) - subtape = Array{TapeType}(undef, size(blocks(iterspace))) - aug_kernel = similar(kernel, cpu_aug_fwd) - return TapeType, subtape, aug_kernel -end - -function _create_tape_kernel( - kernel::Kernel{<:GPU}, + kernel::Kernel, ModifiedBetween, FT, ctxTy, @@ -104,60 +53,11 @@ function _create_tape_kernel( # Allocate per thread subtape = allocate(backend(kernel), TapeType, prod(ndrange)) - aug_kernel = similar(kernel, gpu_aug_fwd) + aug_kernel = similar(kernel, aug_fwd) return TapeType, subtape, aug_kernel end -_create_rev_kernel(kernel::Kernel{CPU}) = similar(kernel, cpu_rev) -_create_rev_kernel(kernel::Kernel{<:GPU}) = similar(kernel, gpu_rev) - -function cpu_aug_fwd( - ctx, - f::FT, - ::Val{ModifiedBetween}, - subtape, - ::Val{TapeType}, - args..., - ) where {ModifiedBetween, FT, TapeType} - # A2 = Const{Nothing} -- since f->Nothing - forward, _ = EnzymeCore.autodiff_thunk( - ReverseSplitModified(ReverseSplitWithPrimal, Val(ModifiedBetween)), - Const{Core.Typeof(f)}, - Const{Nothing}, - Const{Core.Typeof(ctx)}, - map(Core.Typeof, args)..., - ) - - # On the CPU: F is a per block function - # On the CPU: subtape::Vector{Vector} - I = __index_Group_Cartesian(ctx, CartesianIndex(1, 1)) #=fake=# - subtape[I] = forward(Const(f), Const(ctx), args...)[1] - return nothing -end - -function cpu_rev( - ctx, - f::FT, - ::Val{ModifiedBetween}, - subtape, - ::Val{TapeType}, - args..., - ) where {ModifiedBetween, FT, TapeType} - _, reverse = EnzymeCore.autodiff_thunk( - ReverseSplitModified(ReverseSplitWithPrimal, Val(ModifiedBetween)), - Const{Core.Typeof(f)}, - Const{Nothing}, - Const{Core.Typeof(ctx)}, - map(Core.Typeof, args)..., - ) - I = __index_Group_Cartesian(ctx, CartesianIndex(1, 1)) #=fake=# - tp = subtape[I] - reverse(Const(f), Const(ctx), args..., tp) - return nothing -end - -# GPU support -function gpu_aug_fwd( +function aug_fwd( ctx, f::FT, ::Val{ModifiedBetween}, @@ -184,7 +84,7 @@ function gpu_aug_fwd( return nothing 
end -function gpu_rev( +function rev( ctx, f::FT, ::Val{ModifiedBetween}, @@ -232,11 +132,7 @@ function EnzymeRules.augmented_primal( arg_refs = ntuple(Val(N)) do i Base.@_inline_meta if args[i] isa Active - if func.val isa Kernel{<:GPU} - error("Active kernel arguments not supported on GPU") - else - Ref(EnzymeCore.make_zero(args[i].val)) - end + error("Active kernel arguments not supported") else nothing end @@ -292,7 +188,7 @@ function EnzymeRules.reverse( ModifiedBetween = Val((overwritten(config)[1], false, overwritten(config)[2:end]...)) - rev_kernel = _create_rev_kernel(kernel) + rev_kernel = similar(kernel, rev) rev_kernel( f, ModifiedBetween, diff --git a/ext/EnzymeCore08Ext.jl b/ext/EnzymeCore08Ext.jl index 1fda85120..77bc004ae 100644 --- a/ext/EnzymeCore08Ext.jl +++ b/ext/EnzymeCore08Ext.jl @@ -1,34 +1,11 @@ -# https://github.com/EnzymeAD/Enzyme.jl/issues/1516 -# On the CPU `autodiff_deferred` can deadlock. -# Hence a specialized CPU version -function cpu_fwd(ctx, config, f, args...) - EnzymeCore.autodiff(EnzymeCore.set_runtime_activity(Forward, config), Const(f), Const{Nothing}, Const(ctx), args...) - return nothing -end - -function gpu_fwd(ctx, config, f, args...) +function fwd(ctx, config, f, args...) EnzymeCore.autodiff_deferred(EnzymeCore.set_runtime_activity(Forward, config), Const(f), Const{Nothing}, Const(ctx), args...) 
return nothing end function EnzymeRules.forward( config, - func::Const{<:Kernel{CPU}}, - ::Type{Const{Nothing}}, - args...; - ndrange = nothing, - workgroupsize = nothing, - ) - kernel = func.val - f = kernel.f - fwd_kernel = similar(kernel, cpu_fwd) - - return fwd_kernel(config, f, args...; ndrange, workgroupsize) -end - -function EnzymeRules.forward( - config, - func::Const{<:Kernel{<:GPU}}, + func::Const{<:Kernel}, ::Type{Const{Nothing}}, args...; ndrange = nothing, @@ -41,41 +18,12 @@ function EnzymeRules.forward( return fwd_kernel(config, f, args...; ndrange, workgroupsize) end -_enzyme_mkcontext(kernel::Kernel{CPU}, ndrange, iterspace, dynamic) = - mkcontext(kernel, first(blocks(iterspace)), ndrange, iterspace, dynamic) -_enzyme_mkcontext(kernel::Kernel{<:GPU}, ndrange, iterspace, dynamic) = +_enzyme_mkcontext(kernel::Kernel, ndrange, iterspace, dynamic) = mkcontext(kernel, ndrange, iterspace) -_augmented_return(::Kernel{CPU}, subtape, arg_refs, tape_type) = - AugmentedReturn{Nothing, Nothing, Tuple{Array, typeof(arg_refs), typeof(tape_type)}}( - nothing, - nothing, - (subtape, arg_refs, tape_type), -) -_augmented_return(::Kernel{<:GPU}, subtape, arg_refs, tape_type) = +_augmented_return(::Kernel, subtape, arg_refs, tape_type) = AugmentedReturn{Nothing, Nothing, Any}(nothing, nothing, (subtape, arg_refs, tape_type)) -function _create_tape_kernel( - kernel::Kernel{CPU}, - Mode, - FT, - ctxTy, - ndrange, - iterspace, - args2..., - ) - TapeType = EnzymeCore.tape_type( - Mode, - FT, - Const{Nothing}, - Const{ctxTy}, - map(Core.Typeof, args2)..., - ) - subtape = Array{TapeType}(undef, size(blocks(iterspace))) - aug_kernel = similar(kernel, cpu_aug_fwd) - return TapeType, subtape, aug_kernel -end - function _create_tape_kernel( - kernel::Kernel{<:GPU}, + kernel::Kernel, Mode, @@ -106,60 +54,11 @@ function _create_tape_kernel( # Allocate per thread subtape = allocate(backend(kernel), TapeType, prod(ndrange)) - aug_kernel = similar(kernel, gpu_aug_fwd) + aug_kernel = 
similar(kernel, 
aug_fwd) return TapeType, subtape, aug_kernel end -_create_rev_kernel(kernel::Kernel{CPU}) = similar(kernel, cpu_rev) -_create_rev_kernel(kernel::Kernel{<:GPU}) = similar(kernel, gpu_rev) - -function cpu_aug_fwd( - ctx, - f::FT, - mode::Mode, - subtape, - ::Val{TapeType}, - args..., - ) where {Mode, FT, TapeType} - # A2 = Const{Nothing} -- since f->Nothing - forward, _ = EnzymeCore.autodiff_thunk( - mode, - Const{Core.Typeof(f)}, - Const{Nothing}, - Const{Core.Typeof(ctx)}, - map(Core.Typeof, args)..., - ) - - # On the CPU: F is a per block function - # On the CPU: subtape::Vector{Vector} - I = __index_Group_Cartesian(ctx, CartesianIndex(1, 1)) #=fake=# - subtape[I] = forward(Const(f), Const(ctx), args...)[1] - return nothing -end - -function cpu_rev( - ctx, - f::FT, - mode::Mode, - subtape, - ::Val{TapeType}, - args..., - ) where {Mode, FT, TapeType} - _, reverse = EnzymeCore.autodiff_thunk( - mode, - Const{Core.Typeof(f)}, - Const{Nothing}, - Const{Core.Typeof(ctx)}, - map(Core.Typeof, args)..., - ) - I = __index_Group_Cartesian(ctx, CartesianIndex(1, 1)) #=fake=# - tp = subtape[I] - reverse(Const(f), Const(ctx), args..., tp) - return nothing -end - -# GPU support -function gpu_aug_fwd( +function aug_fwd( ctx, f::FT, mode::Mode, @@ -186,7 +85,7 @@ function gpu_aug_fwd( return nothing end -function gpu_rev( +function rev( ctx, f::FT, mode::Mode, @@ -234,11 +133,7 @@ function EnzymeRules.augmented_primal( arg_refs = ntuple(Val(N)) do i Base.@_inline_meta if args[i] isa Active - if func.val isa Kernel{<:GPU} - error("Active kernel arguments not supported on GPU") - else - Ref(EnzymeCore.make_zero(args[i].val)) - end + error("Active kernel arguments not supported") else nothing end @@ -294,7 +189,7 @@ function EnzymeRules.reverse( ModifiedBetween = Val((overwritten(config)[1], false, overwritten(config)[2:end]...)) Mode = EnzymeCore.set_runtime_activity(ReverseSplitModified(ReverseSplitWithPrimal, ModifiedBetween), config) - rev_kernel = _create_rev_kernel(kernel) + 
rev_kernel = similar(kernel, rev) rev_kernel( f, Mode,