From 772bd94c2560158ab72764bfca6e9518933182dc Mon Sep 17 00:00:00 2001 From: Collin R Warner Date: Sat, 8 Apr 2023 20:32:43 -0400 Subject: [PATCH 01/14] add GPUCompiler precompilation caching --- src/GPUCompiler.jl | 3 +++ src/jlgen.jl | 20 ++++++++++++++++++++ 2 files changed, 23 insertions(+) diff --git a/src/GPUCompiler.jl b/src/GPUCompiler.jl index 34f3fbd6..809b50d5 100644 --- a/src/GPUCompiler.jl +++ b/src/GPUCompiler.jl @@ -40,7 +40,10 @@ include("cache.jl") include("execution.jl") include("reflection.jl") + include("precompile.jl") +include("precompile_native.jl") + _precompile_() function __init__() diff --git a/src/jlgen.jl b/src/jlgen.jl index d8b7ca4f..47da7579 100644 --- a/src/jlgen.jl +++ b/src/jlgen.jl @@ -254,8 +254,28 @@ struct CodeCache dict::IdDict{MethodInstance,Vector{CodeInstance}} CodeCache() = new(Dict{MethodInstance,Vector{CodeInstance}}()) + CodeCache(cache::CodeCache) = new(GPUCompiler.copyAndFilter(cache.dict)) +end + +function copyAndFilter(dict::IdDict) + out= IdDict() + for key in keys(dict) + useKey = true + # why is it an array of code instances, can there be more than 1? + for ci in dict[key] + if ci.max_world < typemax(typeof(ci.max_world)) + useKey = false + break + end + end + if useKey + out[key] = dict[key] + end + end + return out end + function Base.show(io::IO, ::MIME"text/plain", cc::CodeCache) print(io, "CodeCache with $(mapreduce(length, +, values(cc.dict); init=0)) entries") if !isempty(cc.dict) From a4bad27c437caa7b92564794afb7e7757a7eb495 Mon Sep 17 00:00:00 2001 From: Collin R Warner Date: Sun, 9 Apr 2023 14:27:27 -0400 Subject: [PATCH 02/14] add precompile file --- src/precompile_native.jl | 110 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 110 insertions(+) create mode 100644 src/precompile_native.jl diff --git a/src/precompile_native.jl b/src/precompile_native.jl new file mode 100644 index 00000000..f560dadc --- /dev/null +++ b/src/precompile_native.jl @@ -0,0 +1,110 @@ +const CACHE_NAME = gensym(:CACHE) # is now a const symbol (not a variable) +is_precompiling() = ccall(:jl_generating_output, Cint, ()) != 0 + +struct NativeCompilerParams <: AbstractCompilerParams end +export @declare_cache, @snapshot_cache, @reinit_cache, @get_cache + +macro declare_cache() + var = esc(CACHE_NAME) #this will esc variable from our const symbol + quote + #const $esc(CACHE_NAME) function esc is executed when macro is executed, not when code is defined + # dollar sign means will have the value of esc cachename here + const $var = $IdDict() + end +end + +macro snapshot_cache() + var = esc(CACHE_NAME) + quote + $snapshot_cache($var) + end +end + +macro reinit_cache() + var = esc(CACHE_NAME) + quote + # will need to keep track of this is CUDA so that GPUCompiler caches are not overfilled + $reinit_cache($var) + end +end + +macro get_cache() + var = esc(CACHE_NAME) + quote + $var + end +end + +""" +Given a function and param types caches the function to the global cache +""" +function precompile_gpucompiler(job) + # populate the cache + cache = GPUCompiler.ci_cache(job) + mt = GPUCompiler.method_table(job) + interp = GPUCompiler.get_interpreter(job) + if GPUCompiler.ci_cache_lookup(cache, job.source, job.world, typemax(Cint)) === nothing + GPUCompiler.ci_cache_populate(interp, cache, mt, job.source, job.world, typemax(Cint)) + end +end + +function get_code_cache_i(i) + for (j, cc) in enumerate(GPUCompiler.GLOBAL_CI_CACHES) + if j == i + return cc + end + end +end +""" +Reloads Global Cache from global variable which stores the previous +cached results +""" +function reinit_cache(LOCAL_CACHE) + if !is_precompiling() + # need to merge caches at the code instance level + for key in keys(LOCAL_CACHE) + if haskey(GPUCompiler.GLOBAL_CI_CACHES, key) + global_cache = GPUCompiler.GLOBAL_CI_CACHES[key] + local_cache = LOCAL_CACHE[key] + for (mi, civ) in (local_cache.dict) + # this should be one since there is only one range that is infinite + @assert length(civ) == 1 + # add all code instances to global cache + # could move truncating code to set index + ci = civ[1] + if haskey(global_cache.dict, mi) + gciv = global_cache.dict[mi] + # truncation cod3 + # sort by min world age, then make sure no age ranges overlap // this part is uneeded + sort(gciv, by=x->x.min_world) + if ci.min_world > gciv[length(gciv)].min_world + invalidate_code_cache(global_cache, mi, ci.min_world - 1) + Core.Compiler.setindex!(global_cache, ci, mi) + else + println("Should not get here?") + @assert false + end + else + # occurs if we kill everything in the parent and then need to store in child + Core.Compiler.setindex!(global_cache, ci, mi) + end + end + else + # no conflict at cache level + GPUCompiler.GLOBAL_CI_CACHES[key] = LOCAL_CACHE[key] + end + end + end +end + +""" +Takes a snapshot of the current status of the cache + +The cache returned is a deep copy with finite world age endings removed +""" +function snapshot_cache(LOCAL_CACHE) + cleaned_cache_to_save = IdDict() + for key in keys(GPUCompiler.GLOBAL_CI_CACHES) + # Will only keep those elements with infinite ranges + merge!(LOCAL_CACHE, cleaned_cache_to_save) +end From 4de3f62bdf3ec5470381d3995276fb14c7a7306f Mon Sep 17 00:00:00 2001 From: Collin R Warner Date: Sun, 9 Apr 2023 16:46:43 -0400 Subject: [PATCH 03/14] fix accidental deletion --- src/precompile_native.jl | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/precompile_native.jl b/src/precompile_native.jl index f560dadc..ff3f69d1 100644 --- a/src/precompile_native.jl +++ b/src/precompile_native.jl @@ -106,5 +106,10 @@ function snapshot_cache(LOCAL_CACHE) cleaned_cache_to_save = IdDict() for key in keys(GPUCompiler.GLOBAL_CI_CACHES) # Will only keep those elements with infinite ranges + cleaned_cache_to_save[key] = GPUCompiler.CodeCache(GPUCompiler.GLOBAL_CI_CACHES[key]) + end + global MY_CACHE #technically don't need the global + #empty insert + empty!(LOCAL_CACHE) merge!(LOCAL_CACHE, cleaned_cache_to_save) end From 11007f2726284d070f4c491fe277e816220089ce Mon Sep 17 00:00:00 2001 From: Collin R Warner Date: Tue, 11 Apr 2023 13:31:27 -0400 Subject: [PATCH 04/14] Add examples detailing functionality --- examples/Example/Manifest.toml | 194 ++++++++++++++++++++++++++++ examples/Example/Project.toml | 8 ++ examples/Example/src/Example.jl | 15 +++ examples/SimpleGPU/Manifest.toml | 188 +++++++++++++++++++++++++++ examples/SimpleGPU/Project.toml | 7 + examples/SimpleGPU/src/SimpleGPU.jl | 28 ++++ 6 files changed, 440 insertions(+) create mode 100644 examples/Example/Manifest.toml create mode 100644 examples/Example/Project.toml create mode 100644 examples/Example/src/Example.jl create mode 100644 examples/SimpleGPU/Manifest.toml create mode 100644 examples/SimpleGPU/Project.toml create mode 100644 examples/SimpleGPU/src/SimpleGPU.jl diff --git a/examples/Example/Manifest.toml b/examples/Example/Manifest.toml new file mode 100644 index 00000000..a5b1f1c4 --- /dev/null +++ b/examples/Example/Manifest.toml @@ -0,0 +1,194 @@ +# This file is machine-generated - editing it directly is not advised + +julia_version = "1.10.0-DEV" +manifest_format = "2.0" +project_hash = "6afd6f1a57af520013070870c6f183d98c839ff4" + +[[deps.ArgTools]] +uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f" +version = "1.1.1" + +[[deps.Artifacts]] +uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" + +[[deps.Base64]] +uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" + +[[deps.CEnum]] +git-tree-sha1 = "eb4cb44a499229b3b8426dcfb5dd85333951ff90" +uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82" +version = "0.4.2" + +[[deps.Dates]] +deps = ["Printf"] +uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" + +[[deps.Downloads]] +deps = ["ArgTools", "FileWatching", "LibCURL", "NetworkOptions"] +uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6" +version = "1.6.0" + +[[deps.ExprTools]] +git-tree-sha1 = "c1d06d129da9f55715c6c212866f5b1bddc5fa00" +uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04" +version = "0.1.9" + +[[deps.FileWatching]] +uuid = "7b1f6079-737a-58dc-b8bc-7a2ca5c1b5ee" + +[[deps.GPUCompiler]] +deps = ["ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "Scratch", "TimerOutputs", "UUIDs"] +path = "/home/collinw/.julia/dev/GPUCompiler" +uuid = "61eb1bfa-7361-4325-ad38-22787b887f55" +version = "0.19.0" + +[[deps.InteractiveUtils]] +deps = ["Markdown"] +uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" + +[[deps.JLLWrappers]] +deps = ["Preferences"] +git-tree-sha1 = "abc9885a7ca2052a736a600f7fa66209f96506e1" +uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210" +version = "1.4.1" + +[[deps.LLVM]] +deps = ["CEnum", "LLVMExtra_jll", "Libdl", "Printf", "Unicode"] +git-tree-sha1 = "a8960cae30b42b66dd41808beb76490519f6f9e2" +uuid = "929cbde3-209d-540e-8aea-75f648917ca0" +version = "5.0.0" + +[[deps.LLVMExtra_jll]] +deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "TOML"] +git-tree-sha1 = "09b7505cc0b1cee87e5d4a26eea61d2e1b0dcd35" +uuid = "dad2f222-ce93-54a1-a47d-0025e8a3acab" +version = "0.0.21+0" + +[[deps.LazyArtifacts]] +deps = ["Artifacts", "Pkg"] +uuid = "4af54fe1-eca0-43a8-85a7-787d91b784e3" + +[[deps.LibCURL]] +deps = ["LibCURL_jll", "MozillaCACerts_jll"] +uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21" +version = "0.6.3" + +[[deps.LibCURL_jll]] +deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"] +uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0" +version = "8.0.1+0" + +[[deps.LibGit2]] +deps = ["Base64", "NetworkOptions", "Printf", "SHA"] +uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" + +[[deps.LibSSH2_jll]] +deps = ["Artifacts", "Libdl", "MbedTLS_jll"] +uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8" +version = "1.10.2+0" + +[[deps.Libdl]] +uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" + +[[deps.Logging]] +uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" + +[[deps.Markdown]] +deps = ["Base64"] +uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" + +[[deps.MbedTLS_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1" +version = "2.28.2+0" + +[[deps.MozillaCACerts_jll]] +uuid = "14a3606d-f60d-562e-9121-12d972cd8159" +version = "2023.1.10" + +[[deps.NetworkOptions]] +uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908" +version = "1.2.0" + +[[deps.Pkg]] +deps = ["Artifacts", "Dates", "Downloads", "FileWatching", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"] +uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" +version = "1.10.0" + +[[deps.Preferences]] +deps = ["TOML"] +git-tree-sha1 = "47e5f437cc0e7ef2ce8406ce1e7e24d44915f88d" +uuid = "21216c6a-2e73-6563-6e65-726566657250" +version = "1.3.0" + +[[deps.Printf]] +deps = ["Unicode"] +uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7" + +[[deps.REPL]] +deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"] +uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" + +[[deps.Random]] +deps = ["SHA", "Serialization"] +uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" + +[[deps.SHA]] +uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" +version = "0.7.0" + +[[deps.Scratch]] +deps = ["Dates"] +git-tree-sha1 = "30449ee12237627992a99d5e30ae63e4d78cd24a" +uuid = "6c6a2e73-6563-6170-7368-637461726353" +version = "1.2.0" + +[[deps.Serialization]] +uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" + +[[deps.SimpleGPU]] +deps = ["GPUCompiler"] +path = "../SimpleGPU" +uuid = "0f92ac95-628b-4f27-9a96-2faf96da70df" +version = "0.1.0" + +[[deps.Sockets]] +uuid = "6462fe0b-24de-5631-8697-dd941f90decc" + +[[deps.TOML]] +deps = ["Dates"] +uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76" +version = "1.0.3" + +[[deps.Tar]] +deps = ["ArgTools", "SHA"] +uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e" +version = "1.10.0" + +[[deps.TimerOutputs]] +deps = ["ExprTools", "Printf"] +git-tree-sha1 = "f2fd3f288dfc6f507b0c3a2eb3bac009251e548b" +uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" +version = "0.5.22" + +[[deps.UUIDs]] +deps = ["Random", "SHA"] +uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" + +[[deps.Unicode]] +uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" + +[[deps.Zlib_jll]] +deps = ["Libdl"] +uuid = "83775a58-1f1d-513f-b197-d71354ab007a" +version = "1.2.13+0" + +[[deps.nghttp2_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d" +version = "1.52.0+0" + +[[deps.p7zip_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0" +version = "17.4.0+0" diff --git a/examples/Example/Project.toml b/examples/Example/Project.toml new file mode 100644 index 00000000..22ffa2ea --- /dev/null +++ b/examples/Example/Project.toml @@ -0,0 +1,8 @@ +name = "Example" +uuid = "3a86cd2f-4474-4e46-89c8-15adf66897e9" +authors = ["collinw "] +version = "0.1.0" + +[deps] +GPUCompiler = "61eb1bfa-7361-4325-ad38-22787b887f55" +SimpleGPU = "0f92ac95-628b-4f27-9a96-2faf96da70df" diff --git a/examples/Example/src/Example.jl b/examples/Example/src/Example.jl new file mode 100644 index 00000000..776793ec --- /dev/null +++ b/examples/Example/src/Example.jl @@ -0,0 +1,15 @@ +module Example +using GPUCompiler +using SimpleGPU +SimpleGPU.@declare_cache() + +f(x) = 1 +SimpleGPU.precompile_simple(f, (Int, )) + +function __init__() + SimpleGPU.@reinit_cache() +end + +SimpleGPU.@snapshot_cache() + +end # module Example diff --git a/examples/SimpleGPU/Manifest.toml b/examples/SimpleGPU/Manifest.toml new file mode 100644 index 00000000..564425c9 --- /dev/null +++ b/examples/SimpleGPU/Manifest.toml @@ -0,0 +1,188 @@ +# This file is machine-generated - editing it directly is not advised + +julia_version = "1.10.0-DEV" +manifest_format = "2.0" +project_hash = "7e4bd5a8a18c1099e483003348b1afb869b6d01e" + +[[deps.ArgTools]] +uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f" +version = "1.1.1" + +[[deps.Artifacts]] +uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" + +[[deps.Base64]] +uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" + +[[deps.CEnum]] +git-tree-sha1 = "eb4cb44a499229b3b8426dcfb5dd85333951ff90" +uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82" +version = "0.4.2" + +[[deps.Dates]] +deps = ["Printf"] +uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" + +[[deps.Downloads]] +deps = ["ArgTools", "FileWatching", "LibCURL", "NetworkOptions"] +uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6" +version = "1.6.0" + +[[deps.ExprTools]] +git-tree-sha1 = "c1d06d129da9f55715c6c212866f5b1bddc5fa00" +uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04" +version = "0.1.9" + +[[deps.FileWatching]] +uuid = "7b1f6079-737a-58dc-b8bc-7a2ca5c1b5ee" + +[[deps.GPUCompiler]] +deps = ["ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "Scratch", "TimerOutputs", "UUIDs"] +path = "/home/collinw/.julia/dev/GPUCompiler" +uuid = "61eb1bfa-7361-4325-ad38-22787b887f55" +version = "0.19.0" + +[[deps.InteractiveUtils]] +deps = ["Markdown"] +uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" + +[[deps.JLLWrappers]] +deps = ["Preferences"] +git-tree-sha1 = "abc9885a7ca2052a736a600f7fa66209f96506e1" +uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210" +version = "1.4.1" + +[[deps.LLVM]] +deps = ["CEnum", "LLVMExtra_jll", "Libdl", "Printf", "Unicode"] +git-tree-sha1 = "a8960cae30b42b66dd41808beb76490519f6f9e2" +uuid = "929cbde3-209d-540e-8aea-75f648917ca0" +version = "5.0.0" + +[[deps.LLVMExtra_jll]] +deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "TOML"] +git-tree-sha1 = "09b7505cc0b1cee87e5d4a26eea61d2e1b0dcd35" +uuid = "dad2f222-ce93-54a1-a47d-0025e8a3acab" +version = "0.0.21+0" + +[[deps.LazyArtifacts]] +deps = ["Artifacts", "Pkg"] +uuid = "4af54fe1-eca0-43a8-85a7-787d91b784e3" + +[[deps.LibCURL]] +deps = ["LibCURL_jll", "MozillaCACerts_jll"] +uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21" +version = "0.6.3" + +[[deps.LibCURL_jll]] +deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"] +uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0" +version = "8.0.1+0" + +[[deps.LibGit2]] +deps = ["Base64", "NetworkOptions", "Printf", "SHA"] +uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" + +[[deps.LibSSH2_jll]] +deps = ["Artifacts", "Libdl", "MbedTLS_jll"] +uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8" +version = "1.10.2+0" + +[[deps.Libdl]] +uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" + +[[deps.Logging]] +uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" + +[[deps.Markdown]] +deps = ["Base64"] +uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" + +[[deps.MbedTLS_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1" +version = "2.28.2+0" + +[[deps.MozillaCACerts_jll]] +uuid = "14a3606d-f60d-562e-9121-12d972cd8159" +version = "2023.1.10" + +[[deps.NetworkOptions]] +uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908" +version = "1.2.0" + +[[deps.Pkg]] +deps = ["Artifacts", "Dates", "Downloads", "FileWatching", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"] +uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" +version = "1.10.0" + +[[deps.Preferences]] +deps = ["TOML"] +git-tree-sha1 = "47e5f437cc0e7ef2ce8406ce1e7e24d44915f88d" +uuid = "21216c6a-2e73-6563-6e65-726566657250" +version = "1.3.0" + +[[deps.Printf]] +deps = ["Unicode"] +uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7" + +[[deps.REPL]] +deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"] +uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" + +[[deps.Random]] +deps = ["SHA", "Serialization"] +uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" + +[[deps.SHA]] +uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" +version = "0.7.0" + +[[deps.Scratch]] +deps = ["Dates"] +git-tree-sha1 = "30449ee12237627992a99d5e30ae63e4d78cd24a" +uuid = "6c6a2e73-6563-6170-7368-637461726353" +version = "1.2.0" + +[[deps.Serialization]] +uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" + +[[deps.Sockets]] +uuid = "6462fe0b-24de-5631-8697-dd941f90decc" + +[[deps.TOML]] +deps = ["Dates"] +uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76" +version = "1.0.3" + +[[deps.Tar]] +deps = ["ArgTools", "SHA"] +uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e" +version = "1.10.0" + +[[deps.TimerOutputs]] +deps = ["ExprTools", "Printf"] +git-tree-sha1 = "f2fd3f288dfc6f507b0c3a2eb3bac009251e548b" +uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" +version = "0.5.22" + +[[deps.UUIDs]] +deps = ["Random", "SHA"] +uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" + +[[deps.Unicode]] +uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" + +[[deps.Zlib_jll]] +deps = ["Libdl"] +uuid = "83775a58-1f1d-513f-b197-d71354ab007a" +version = "1.2.13+0" + +[[deps.nghttp2_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d" +version = "1.52.0+0" + +[[deps.p7zip_jll]] +deps = ["Artifacts", "Libdl"] +uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0" +version = "17.4.0+0" diff --git a/examples/SimpleGPU/Project.toml b/examples/SimpleGPU/Project.toml new file mode 100644 index 00000000..247624c5 --- /dev/null +++ b/examples/SimpleGPU/Project.toml @@ -0,0 +1,7 @@ +name = "SimpleGPU" +uuid = "0f92ac95-628b-4f27-9a96-2faf96da70df" +authors = ["collinw "] +version = "0.1.0" + +[deps] +GPUCompiler = "61eb1bfa-7361-4325-ad38-22787b887f55" diff --git a/examples/SimpleGPU/src/SimpleGPU.jl b/examples/SimpleGPU/src/SimpleGPU.jl new file mode 100644 index 00000000..894291e9 --- /dev/null +++ b/examples/SimpleGPU/src/SimpleGPU.jl @@ -0,0 +1,28 @@ +module SimpleGPU +using GPUCompiler +struct NativeCompilerParams <: AbstractCompilerParams + entry_safepoint::Bool + method_table + + NativeCompilerParams(entry_safepoint::Bool=false, method_table=test_method_table) = + new(entry_safepoint, method_table) +end + +const test_method_table = nothing + +function native_job(@nospecialize(func), @nospecialize(types); kernel::Bool=false, + entry_abi=:specfunc, entry_safepoint::Bool=false, always_inline=false, + method_table=test_method_table, kwargs...) + source = methodinstance(typeof(func), Base.to_tuple_type(types)) + target = NativeCompilerTarget() + params = NativeCompilerParams(entry_safepoint, method_table) + config = CompilerConfig(target, params; kernel, entry_abi, always_inline) + CompilerJob(source, config), kwargs +end + +function precompile_simple(f, t) + job, _ = native_job(f, t) + GPUCompiler.precompile_gpucompiler(job) +end + +end # module SimpleGPU From 3dbe9d5b7c7c5f56f18553f0e4d4bd9c2bdcaca5 Mon Sep 17 00:00:00 2001 From: Collin R Warner Date: Thu, 13 Apr 2023 17:32:33 -0400 Subject: [PATCH 05/14] remove debugging function --- src/precompile_native.jl | 7 ------- 1 file changed, 7 deletions(-) diff --git a/src/precompile_native.jl b/src/precompile_native.jl index ff3f69d1..2ca10062 100644 --- a/src/precompile_native.jl +++ b/src/precompile_native.jl @@ -48,13 +48,6 @@ function precompile_gpucompiler(job) end end -function get_code_cache_i(i) - for (j, cc) in enumerate(GPUCompiler.GLOBAL_CI_CACHES) - if j == i - return cc - end - end -end """ Reloads Global Cache from global variable which stores the previous cached results From db12163f02578705b1f723ad17a3f7d4e4c13e94 Mon Sep 17 00:00:00 2001 From: Collin R Warner Date: Thu, 13 Apr 2023 17:41:24 -0400 Subject: [PATCH 06/14] Change name to precompilation_cache --- src/GPUCompiler.jl | 2 +- src/precompilation_cache.jl | 108 ++++++++++++++++++++++++++++++++++++ 2 files changed, 109 insertions(+), 1 deletion(-) create mode 100644 src/precompilation_cache.jl diff --git a/src/GPUCompiler.jl b/src/GPUCompiler.jl index 809b50d5..8d477705 100644 --- a/src/GPUCompiler.jl +++ b/src/GPUCompiler.jl @@ -42,7 +42,7 @@ include("reflection.jl") include("precompile.jl") -include("precompile_native.jl") +include("precompilation_cache.jl") _precompile_() diff --git a/src/precompilation_cache.jl b/src/precompilation_cache.jl new file mode 100644 index 00000000..2ca10062 --- /dev/null +++ b/src/precompilation_cache.jl @@ -0,0 +1,108 @@ +const CACHE_NAME = gensym(:CACHE) # is now a const symbol (not a variable) +is_precompiling() = ccall(:jl_generating_output, Cint, ()) != 0 + +struct NativeCompilerParams <: AbstractCompilerParams end +export @declare_cache, @snapshot_cache, @reinit_cache, @get_cache + +macro declare_cache() + var = esc(CACHE_NAME) #this will esc variable from our const symbol + quote + #const $esc(CACHE_NAME) function esc is executed when macro is executed, not when code is defined + # dollar sign means will have the value of esc cachename here + const $var = $IdDict() + end +end + +macro snapshot_cache() + var = esc(CACHE_NAME) + quote + $snapshot_cache($var) + end +end + +macro reinit_cache() + var = esc(CACHE_NAME) + quote + # will need to keep track of this is CUDA so that GPUCompiler caches are not overfilled + $reinit_cache($var) + end +end + +macro get_cache() + var = esc(CACHE_NAME) + quote + $var + end +end + +""" +Given a function and param types caches the function to the global cache +""" +function precompile_gpucompiler(job) + # populate the cache + cache = GPUCompiler.ci_cache(job) + mt = GPUCompiler.method_table(job) + interp = GPUCompiler.get_interpreter(job) + if GPUCompiler.ci_cache_lookup(cache, job.source, job.world, typemax(Cint)) === nothing + GPUCompiler.ci_cache_populate(interp, cache, mt, job.source, job.world, typemax(Cint)) + end +end + +""" +Reloads Global Cache from global variable which stores the previous +cached results +""" +function reinit_cache(LOCAL_CACHE) + if !is_precompiling() + # need to merge caches at the code instance level + for key in keys(LOCAL_CACHE) + if haskey(GPUCompiler.GLOBAL_CI_CACHES, key) + global_cache = GPUCompiler.GLOBAL_CI_CACHES[key] + local_cache = LOCAL_CACHE[key] + for (mi, civ) in (local_cache.dict) + # this should be one since there is only one range that is infinite + @assert length(civ) == 1 + # add all code instances to global cache + # could move truncating code to set index + ci = civ[1] + if haskey(global_cache.dict, mi) + gciv = global_cache.dict[mi] + # truncation cod3 + # sort by min world age, then make sure no age ranges overlap // this part is uneeded + sort(gciv, by=x->x.min_world) + if ci.min_world > gciv[length(gciv)].min_world + invalidate_code_cache(global_cache, mi, ci.min_world - 1) + Core.Compiler.setindex!(global_cache, ci, mi) + else + println("Should not get here?") + @assert false + end + else + # occurs if we kill everything in the parent and then need to store in child + Core.Compiler.setindex!(global_cache, ci, mi) + end + end + else + # no conflict at cache level + GPUCompiler.GLOBAL_CI_CACHES[key] = LOCAL_CACHE[key] + end + end + end +end + +""" +Takes a snapshot of the current status of the cache + +The cache returned is a deep copy with finite world age endings removed +""" +function snapshot_cache(LOCAL_CACHE) + cleaned_cache_to_save = IdDict() + for key in keys(GPUCompiler.GLOBAL_CI_CACHES) + # Will only keep those elements with infinite ranges + cleaned_cache_to_save[key] = GPUCompiler.CodeCache(GPUCompiler.GLOBAL_CI_CACHES[key]) + end + global MY_CACHE #technically don't need the global + #empty insert + empty!(LOCAL_CACHE) + merge!(LOCAL_CACHE, cleaned_cache_to_save) +end From 9bfdceee40cc6022ce3ce7f5eb5f8bda1df6e012 Mon Sep 17 00:00:00 2001 From: Collin R Warner Date: Thu, 13 Apr 2023 17:43:05 -0400 Subject: [PATCH 07/14] remove uneeded code --- src/precompilation_cache.jl | 1 - src/precompile_native.jl | 108 ------------------------------------ 2 files changed, 109 deletions(-) delete mode 100644 src/precompile_native.jl diff --git a/src/precompilation_cache.jl b/src/precompilation_cache.jl index 2ca10062..bdfcf4dc 100644 --- a/src/precompilation_cache.jl +++ b/src/precompilation_cache.jl @@ -1,7 +1,6 @@ const CACHE_NAME = gensym(:CACHE) # is now a const symbol (not a variable) is_precompiling() = ccall(:jl_generating_output, Cint, ()) != 0 -struct NativeCompilerParams <: AbstractCompilerParams end export @declare_cache, @snapshot_cache, @reinit_cache, @get_cache macro declare_cache() diff --git a/src/precompile_native.jl b/src/precompile_native.jl deleted file mode 100644 index 2ca10062..00000000 --- a/src/precompile_native.jl +++ /dev/null @@ -1,108 +0,0 @@ -const CACHE_NAME = gensym(:CACHE) # is now a const symbol (not a variable) -is_precompiling() = ccall(:jl_generating_output, Cint, ()) != 0 - -struct NativeCompilerParams <: AbstractCompilerParams end -export @declare_cache, @snapshot_cache, @reinit_cache, @get_cache - -macro declare_cache() - var = esc(CACHE_NAME) #this will esc variable from our const symbol - quote - #const $esc(CACHE_NAME) function esc is executed when macro is executed, not when code is defined - # dollar sign means will have the value of esc cachename here - const $var = $IdDict() - end -end - -macro snapshot_cache() - var = esc(CACHE_NAME) - quote - $snapshot_cache($var) - end -end - -macro reinit_cache() - var = esc(CACHE_NAME) - quote - # will need to keep track of this is CUDA so that GPUCompiler caches are not overfilled - $reinit_cache($var) - end -end - -macro get_cache() - var = esc(CACHE_NAME) - quote - $var - end -end - -""" -Given a function and param types caches the function to the global cache -""" -function precompile_gpucompiler(job) - # populate the cache - cache = GPUCompiler.ci_cache(job) - mt = GPUCompiler.method_table(job) - interp = GPUCompiler.get_interpreter(job) - if GPUCompiler.ci_cache_lookup(cache, job.source, job.world, typemax(Cint)) === nothing - GPUCompiler.ci_cache_populate(interp, cache, mt, job.source, job.world, typemax(Cint)) - end -end - -""" -Reloads Global Cache from global variable which stores the previous -cached results -""" -function reinit_cache(LOCAL_CACHE) - if !is_precompiling() - # need to merge caches at the code instance level - for key in keys(LOCAL_CACHE) - if haskey(GPUCompiler.GLOBAL_CI_CACHES, key) - global_cache = GPUCompiler.GLOBAL_CI_CACHES[key] - local_cache = LOCAL_CACHE[key] - for (mi, civ) in (local_cache.dict) - # this should be one since there is only one range that is infinite - @assert length(civ) == 1 - # add all code instances to global cache - # could move truncating code to set index - ci = civ[1] - if haskey(global_cache.dict, mi) - gciv = global_cache.dict[mi] - # truncation cod3 - # sort by min world age, then make sure no age ranges overlap // this part is uneeded - sort(gciv, by=x->x.min_world) - if ci.min_world > gciv[length(gciv)].min_world - invalidate_code_cache(global_cache, mi, ci.min_world - 1) - Core.Compiler.setindex!(global_cache, ci, mi) - else - println("Should not get here?") - @assert false - end - else - # occurs if we kill everything in the parent and then need to store in child - Core.Compiler.setindex!(global_cache, ci, mi) - end - end - else - # no conflict at cache level - GPUCompiler.GLOBAL_CI_CACHES[key] = LOCAL_CACHE[key] - end - end - end -end - -""" -Takes a snapshot of the current status of the cache - -The cache returned is a deep copy with finite world age endings removed -""" -function snapshot_cache(LOCAL_CACHE) - cleaned_cache_to_save = IdDict() - for key in keys(GPUCompiler.GLOBAL_CI_CACHES) - # Will only keep those elements with infinite ranges - cleaned_cache_to_save[key] = GPUCompiler.CodeCache(GPUCompiler.GLOBAL_CI_CACHES[key]) - end - global MY_CACHE #technically don't need the global - #empty insert - empty!(LOCAL_CACHE) - merge!(LOCAL_CACHE, cleaned_cache_to_save) -end From 44c5a7e7840028fe0f9f9a89ccc8eb2e51d884ab Mon Sep 17 00:00:00 2001 From: Collin R Warner Date: Sun, 16 Apr 2023 16:56:36 -0400 Subject: [PATCH 08/14] switch from macros --- src/precompilation_cache.jl | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/precompilation_cache.jl b/src/precompilation_cache.jl index bdfcf4dc..3de68d6b 100644 --- a/src/precompilation_cache.jl +++ b/src/precompilation_cache.jl @@ -2,6 +2,7 @@ const CACHE_NAME = gensym(:CACHE) # is now a const symbol (not a variable) is_precompiling() = ccall(:jl_generating_output, Cint, ()) != 0 export @declare_cache, @snapshot_cache, @reinit_cache, @get_cache +export reinit_cache, snapshot_cache macro declare_cache() var = esc(CACHE_NAME) #this will esc variable from our const symbol @@ -34,6 +35,10 @@ macro get_cache() end end +function declare_cache() + return IdDict() +end + """ Given a function and param types caches the function to the global cache """ From 09d05dfd4d4dddb1bde3529818c5f301ea9214d6 Mon Sep 17 00:00:00 2001 From: Collin R Warner Date: Wed, 19 Apr 2023 17:09:54 -0400 Subject: [PATCH 09/14] change api --- src/precompilation_cache.jl | 116 ++++++++++++++++++++++++++++-------- test/EnzymeTest.jl | 19 ++++++ 2 files changed, 109 insertions(+), 26 deletions(-) create mode 100644 test/EnzymeTest.jl diff --git a/src/precompilation_cache.jl b/src/precompilation_cache.jl index 3de68d6b..2268853f 100644 --- a/src/precompilation_cache.jl +++ b/src/precompilation_cache.jl @@ -1,43 +1,107 @@ const CACHE_NAME = gensym(:CACHE) # is now a const symbol (not a variable) is_precompiling() = ccall(:jl_generating_output, Cint, ()) != 0 -export @declare_cache, @snapshot_cache, @reinit_cache, @get_cache -export reinit_cache, snapshot_cache +export ci_cache_snapshot, ci_cache_delta, ci_cache_insert -macro declare_cache() - var = esc(CACHE_NAME) #this will esc variable from our const symbol - quote - #const $esc(CACHE_NAME) function esc is executed when macro is executed, not when code is defined - # dollar sign means will have the value of esc cachename here - const $var = $IdDict() +function ci_cache_snapshot() + cleaned_cache_to_save = IdDict() + for key in keys(GPUCompiler.GLOBAL_CI_CACHES) + # Will only keep those elements with infinite ranges + cleaned_cache_to_save[key] = GPUCompiler.CodeCache(GPUCompiler.GLOBAL_CI_CACHES[key]) end + println("cleaned cache to save") + @show cleaned_cache_to_save + return cleaned_cache_to_save end -macro snapshot_cache() - var = esc(CACHE_NAME) - quote - $snapshot_cache($var) - end -end +function ci_cache_delta(previous_snapshot) + current_snapshot = ci_cache_snapshot() + println("current snapshot") + @show current_snapshot + delta_snapshot = IdDict{Tuple{DataType, Core.Compiler.InferenceParams, Core.Compiler.OptimizationParams}, GPUCompiler.CodeCache}() + for (cachekey, cache) in current_snapshot + if cachekey in keys(previous_snapshot) + for (mi, civ) in cache + if mi in keys(previous_snapshot[cachekey]) + for ci in civ + if !(ci in previous_snapshot[cachekey][mi]) + if !(cachekey in delta_snapshot) + delta_snapshot[cachekey] = GPUCompiler.CodeCache() + delta_snapshot[cachekey][mi] = Vector{CodeInstance}() + elseif !(mi in delta_snapshot[cachekey]) + delta_snapshot[cachekey][mi] = Vector{CodeInstance}() + end -macro reinit_cache() - var = esc(CACHE_NAME) - quote - # will need to keep track of this is CUDA so that GPUCompiler caches are not overfilled - $reinit_cache($var) + append!(delta_snapshot[cachekey][mi], ci) + end + end + else + # this whole cache is not present in the previous snapshot, can add all + if !(cachekey in delta_snapshot) + delta_snapshot[cachekey] = GPUCompiler.CodeCache() + end + delta_snapshot[cachekey][mi] = civ + end + end + else + delta_snapshot[cachekey] = current_snapshot[cachekey] + end end + println("delta snapshot") + @show delta_snapshot + return delta_snapshot end -macro get_cache() - var = esc(CACHE_NAME) - quote - $var +function ci_cache_insert(caches) + empty!(GPUCompiler.GLOBAL_CI_CACHES) + for (key, cache) in caches + GPUCompiler.GLOBAL_CI_CACHES[key] = GPUCompiler.CodeCache(cache) end end -function declare_cache() - return IdDict() -end +#=function ci_cache_insert(cache) + if !is_precompiling() + # need to merge caches at the code instance level + for key in keys(cache) + if haskey(GPUCompiler.GLOBAL_CI_CACHES, key) + global_cache = GPUCompiler.GLOBAL_CI_CACHES[key] + local_cache = cache[key] + for (mi, civ) in (local_cache.dict) + # this should be one since there is only one range that is infinite + @assert length(civ) == 1 + # add all code instances to global cache + # could move truncating code to set index + ci = civ[1] + if haskey(global_cache.dict, mi) + gciv = global_cache.dict[mi] + # truncation cod3 + # sort by min world age, then make sure no age ranges overlap // this part is uneeded + sort(gciv, by=x->x.min_world) + if ci.min_world > gciv[length(gciv)].min_world + println("invalidating mi [$mi] in world age [$(ci.min_world-1)]") + println("adding ci [$ci]") + invalidate_code_cache(global_cache, mi, ci.min_world - 1) + Core.Compiler.setindex!(global_cache, ci, mi) + else + println("Should not get here?") + @assert false + end + else + println("adding method instance [$mi] code instance [$ci]") + # occurs if we kill everything in the parent and then need to store in child + Core.Compiler.setindex!(global_cache, ci, mi) + end + end + else + # no conflict at cache level + println("no conflictt adding cache $(cache[key])") + GPUCompiler.GLOBAL_CI_CACHES[key] = cache[key] + end + end + println("global cache post insert") + @show GPUCompiler.GLOBAL_CI_CACHES + end +end=# """ Given a function and param types caches the function to the global cache diff --git a/test/EnzymeTest.jl b/test/EnzymeTest.jl new file mode 100644 index 00000000..05723bc8 --- /dev/null +++ b/test/EnzymeTest.jl @@ -0,0 +1,19 @@ +module EnzymeTest +using GPUCompiler +using Enzyme + +f1(x) = x*x +autodiff_wrapper(f) = first(autodiff(Reverse, f, Active(1.0))) + +println("precompilation!") + + + +const cache = let + cache_snapshot = GPUCompiler.ci_cache_snapshot() + autodiff_wrapper(f1) + GPUCompiler.ci_cache_delta(cache_snapshot) +end + +__init__() = GPUCompiler.ci_cache_insert(cache) +end # module EnzymeTest From c0d25a3782ebef81dd693c5453da449be9fad6eb Mon Sep 17 00:00:00 2001 From: Collin R Warner Date: Wed, 19 Apr 2023 18:44:51 -0400 Subject: [PATCH 10/14] debugging --- src/precompilation_cache.jl | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/src/precompilation_cache.jl b/src/precompilation_cache.jl index 2268853f..187b6606 100644 --- a/src/precompilation_cache.jl +++ b/src/precompilation_cache.jl @@ -1,7 +1,7 @@ const CACHE_NAME = gensym(:CACHE) # is now a const symbol (not a variable) is_precompiling() = ccall(:jl_generating_output, Cint, ()) != 0 -export ci_cache_snapshot, ci_cache_delta, ci_cache_insert +export ci_cache_snapshot, ci_cache_delta, ci_cache_insert, precompile_gpucompiler function ci_cache_snapshot() cleaned_cache_to_save = IdDict() @@ -9,15 +9,11 @@ function ci_cache_snapshot() # Will only keep those elements with infinite ranges cleaned_cache_to_save[key] = GPUCompiler.CodeCache(GPUCompiler.GLOBAL_CI_CACHES[key]) end - println("cleaned cache to save") - @show cleaned_cache_to_save return cleaned_cache_to_save end function ci_cache_delta(previous_snapshot) current_snapshot = ci_cache_snapshot() - println("current snapshot") - @show current_snapshot delta_snapshot = IdDict{Tuple{DataType, Core.Compiler.InferenceParams, Core.Compiler.OptimizationParams}, GPUCompiler.CodeCache}() for (cachekey, cache) in current_snapshot if cachekey in keys(previous_snapshot) @@ -47,19 +43,17 @@ function ci_cache_delta(previous_snapshot) delta_snapshot[cachekey] = current_snapshot[cachekey] end end - println("delta snapshot") - @show delta_snapshot return delta_snapshot end -function ci_cache_insert(caches) +#=function ci_cache_insert(caches) empty!(GPUCompiler.GLOBAL_CI_CACHES) for (key, cache) in caches GPUCompiler.GLOBAL_CI_CACHES[key] = GPUCompiler.CodeCache(cache) end -end +end=# -#=function ci_cache_insert(cache) +function ci_cache_insert(cache) if !is_precompiling() # need to merge caches at the code instance level for key in keys(cache) @@ -101,7 +95,7 @@ end println("global cache post insert") @show GPUCompiler.GLOBAL_CI_CACHES end -end=# +end """ Given a function and param types caches the function to the global cache From 845be17550beac14d5588e1068d186747f4d4f64 Mon Sep 17 00:00:00 2001 From: Collin R Warner Date: Fri, 21 Apr 2023 22:08:46 -0400 Subject: [PATCH 11/14] modifying to get orig --- examples/Example/Manifest.toml | 194 ---------------------------- examples/Example/Project.toml | 8 -- examples/Example/src/Example.jl | 15 --- examples/SimpleGPU/Manifest.toml | 188 --------------------------- examples/SimpleGPU/Project.toml | 7 - examples/SimpleGPU/src/SimpleGPU.jl | 28 ---- src/precompilation_cache.jl | 54 +++++--- test/EnzymeTest.jl | 19 --- test/Project.toml | 1 + 9 files changed, 36 insertions(+), 478 deletions(-) delete mode 100644 examples/Example/Manifest.toml delete mode 100644 examples/Example/Project.toml delete mode 100644 examples/Example/src/Example.jl delete mode 100644 examples/SimpleGPU/Manifest.toml delete mode 100644 examples/SimpleGPU/Project.toml delete mode 100644 examples/SimpleGPU/src/SimpleGPU.jl delete mode 100644 test/EnzymeTest.jl diff --git a/examples/Example/Manifest.toml b/examples/Example/Manifest.toml deleted file mode 100644 index a5b1f1c4..00000000 --- a/examples/Example/Manifest.toml +++ /dev/null @@ -1,194 +0,0 @@ -# This file is machine-generated - editing it directly is not advised - -julia_version = "1.10.0-DEV" -manifest_format = "2.0" -project_hash = "6afd6f1a57af520013070870c6f183d98c839ff4" - -[[deps.ArgTools]] -uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f" -version = "1.1.1" - -[[deps.Artifacts]] -uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" - -[[deps.Base64]] -uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" - -[[deps.CEnum]] -git-tree-sha1 = "eb4cb44a499229b3b8426dcfb5dd85333951ff90" -uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82" -version = "0.4.2" - -[[deps.Dates]] -deps = ["Printf"] -uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" - -[[deps.Downloads]] -deps = ["ArgTools", "FileWatching", "LibCURL", "NetworkOptions"] -uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6" -version = "1.6.0" - -[[deps.ExprTools]] -git-tree-sha1 = "c1d06d129da9f55715c6c212866f5b1bddc5fa00" -uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04" -version = "0.1.9" - -[[deps.FileWatching]] -uuid = "7b1f6079-737a-58dc-b8bc-7a2ca5c1b5ee" - -[[deps.GPUCompiler]] -deps = ["ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "Scratch", "TimerOutputs", "UUIDs"] -path = "/home/collinw/.julia/dev/GPUCompiler" -uuid = "61eb1bfa-7361-4325-ad38-22787b887f55" -version = "0.19.0" - -[[deps.InteractiveUtils]] -deps = ["Markdown"] -uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" - -[[deps.JLLWrappers]] -deps = ["Preferences"] -git-tree-sha1 = "abc9885a7ca2052a736a600f7fa66209f96506e1" -uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210" -version = "1.4.1" - -[[deps.LLVM]] -deps = ["CEnum", "LLVMExtra_jll", "Libdl", "Printf", "Unicode"] -git-tree-sha1 = "a8960cae30b42b66dd41808beb76490519f6f9e2" -uuid = "929cbde3-209d-540e-8aea-75f648917ca0" -version = "5.0.0" - -[[deps.LLVMExtra_jll]] -deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "TOML"] -git-tree-sha1 = "09b7505cc0b1cee87e5d4a26eea61d2e1b0dcd35" -uuid = "dad2f222-ce93-54a1-a47d-0025e8a3acab" -version = "0.0.21+0" - -[[deps.LazyArtifacts]] -deps = ["Artifacts", "Pkg"] -uuid = "4af54fe1-eca0-43a8-85a7-787d91b784e3" - -[[deps.LibCURL]] -deps = ["LibCURL_jll", "MozillaCACerts_jll"] -uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21" -version = "0.6.3" - -[[deps.LibCURL_jll]] -deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"] -uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0" -version = "8.0.1+0" - -[[deps.LibGit2]] -deps = ["Base64", "NetworkOptions", "Printf", "SHA"] -uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" - -[[deps.LibSSH2_jll]] -deps = ["Artifacts", "Libdl", "MbedTLS_jll"] -uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8" -version = "1.10.2+0" - -[[deps.Libdl]] -uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" - -[[deps.Logging]] -uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" - -[[deps.Markdown]] -deps = ["Base64"] -uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" - -[[deps.MbedTLS_jll]] -deps = ["Artifacts", "Libdl"] -uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1" -version = "2.28.2+0" - -[[deps.MozillaCACerts_jll]] -uuid = "14a3606d-f60d-562e-9121-12d972cd8159" -version = "2023.1.10" - -[[deps.NetworkOptions]] -uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908" -version = "1.2.0" - -[[deps.Pkg]] -deps = ["Artifacts", "Dates", "Downloads", "FileWatching", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"] -uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" -version = "1.10.0" - -[[deps.Preferences]] -deps = ["TOML"] -git-tree-sha1 = "47e5f437cc0e7ef2ce8406ce1e7e24d44915f88d" -uuid = "21216c6a-2e73-6563-6e65-726566657250" -version = "1.3.0" - -[[deps.Printf]] -deps = ["Unicode"] -uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7" - -[[deps.REPL]] -deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"] -uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" - -[[deps.Random]] -deps = ["SHA", "Serialization"] -uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" - -[[deps.SHA]] -uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" -version = "0.7.0" - -[[deps.Scratch]] -deps = ["Dates"] -git-tree-sha1 = "30449ee12237627992a99d5e30ae63e4d78cd24a" -uuid = "6c6a2e73-6563-6170-7368-637461726353" -version = "1.2.0" - -[[deps.Serialization]] -uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" - -[[deps.SimpleGPU]] -deps = ["GPUCompiler"] -path = "../SimpleGPU" -uuid = "0f92ac95-628b-4f27-9a96-2faf96da70df" -version = "0.1.0" - -[[deps.Sockets]] -uuid = "6462fe0b-24de-5631-8697-dd941f90decc" - -[[deps.TOML]] -deps = ["Dates"] -uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76" -version = "1.0.3" - -[[deps.Tar]] -deps = ["ArgTools", "SHA"] -uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e" -version = "1.10.0" - -[[deps.TimerOutputs]] -deps = ["ExprTools", "Printf"] -git-tree-sha1 = "f2fd3f288dfc6f507b0c3a2eb3bac009251e548b" -uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" -version = "0.5.22" - -[[deps.UUIDs]] -deps = ["Random", "SHA"] -uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" - -[[deps.Unicode]] -uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" - -[[deps.Zlib_jll]] -deps = ["Libdl"] -uuid = "83775a58-1f1d-513f-b197-d71354ab007a" -version = "1.2.13+0" - -[[deps.nghttp2_jll]] -deps = ["Artifacts", "Libdl"] -uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d" -version = "1.52.0+0" - -[[deps.p7zip_jll]] -deps = ["Artifacts", "Libdl"] -uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0" -version = "17.4.0+0" diff --git a/examples/Example/Project.toml b/examples/Example/Project.toml deleted file mode 100644 index 22ffa2ea..00000000 --- a/examples/Example/Project.toml +++ /dev/null @@ -1,8 +0,0 @@ -name = "Example" -uuid = "3a86cd2f-4474-4e46-89c8-15adf66897e9" -authors = ["collinw "] -version = "0.1.0" - -[deps] -GPUCompiler = "61eb1bfa-7361-4325-ad38-22787b887f55" -SimpleGPU = "0f92ac95-628b-4f27-9a96-2faf96da70df" diff --git a/examples/Example/src/Example.jl b/examples/Example/src/Example.jl deleted file mode 100644 index 776793ec..00000000 --- a/examples/Example/src/Example.jl +++ /dev/null @@ -1,15 +0,0 @@ -module Example -using GPUCompiler -using SimpleGPU -SimpleGPU.@declare_cache() - -f(x) = 1 -SimpleGPU.precompile_simple(f, (Int, )) - -function __init__() - SimpleGPU.@reinit_cache() -end - -SimpleGPU.@snapshot_cache() - -end # module Example diff --git a/examples/SimpleGPU/Manifest.toml b/examples/SimpleGPU/Manifest.toml deleted file mode 100644 index 564425c9..00000000 --- a/examples/SimpleGPU/Manifest.toml +++ /dev/null @@ -1,188 +0,0 @@ -# This file is machine-generated - editing it directly is not advised - -julia_version = "1.10.0-DEV" -manifest_format = "2.0" -project_hash = "7e4bd5a8a18c1099e483003348b1afb869b6d01e" - -[[deps.ArgTools]] -uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f" -version = "1.1.1" - -[[deps.Artifacts]] -uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33" - -[[deps.Base64]] -uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f" - -[[deps.CEnum]] -git-tree-sha1 = "eb4cb44a499229b3b8426dcfb5dd85333951ff90" -uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82" -version = "0.4.2" - -[[deps.Dates]] -deps = ["Printf"] -uuid = "ade2ca70-3891-5945-98fb-dc099432e06a" - -[[deps.Downloads]] -deps = ["ArgTools", "FileWatching", "LibCURL", "NetworkOptions"] -uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6" -version = "1.6.0" - -[[deps.ExprTools]] -git-tree-sha1 = "c1d06d129da9f55715c6c212866f5b1bddc5fa00" -uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04" -version = "0.1.9" - -[[deps.FileWatching]] -uuid = "7b1f6079-737a-58dc-b8bc-7a2ca5c1b5ee" - -[[deps.GPUCompiler]] -deps = ["ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "Scratch", "TimerOutputs", "UUIDs"] -path = "/home/collinw/.julia/dev/GPUCompiler" -uuid = "61eb1bfa-7361-4325-ad38-22787b887f55" -version = "0.19.0" - -[[deps.InteractiveUtils]] -deps = ["Markdown"] -uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" - -[[deps.JLLWrappers]] -deps = ["Preferences"] -git-tree-sha1 = "abc9885a7ca2052a736a600f7fa66209f96506e1" -uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210" -version = "1.4.1" - -[[deps.LLVM]] -deps = ["CEnum", "LLVMExtra_jll", "Libdl", "Printf", "Unicode"] -git-tree-sha1 = "a8960cae30b42b66dd41808beb76490519f6f9e2" -uuid = "929cbde3-209d-540e-8aea-75f648917ca0" -version = "5.0.0" - -[[deps.LLVMExtra_jll]] -deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "TOML"] -git-tree-sha1 = "09b7505cc0b1cee87e5d4a26eea61d2e1b0dcd35" -uuid = "dad2f222-ce93-54a1-a47d-0025e8a3acab" -version = "0.0.21+0" - -[[deps.LazyArtifacts]] -deps = ["Artifacts", "Pkg"] -uuid = "4af54fe1-eca0-43a8-85a7-787d91b784e3" - -[[deps.LibCURL]] -deps = ["LibCURL_jll", "MozillaCACerts_jll"] -uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21" -version = "0.6.3" - -[[deps.LibCURL_jll]] -deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"] -uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0" -version = "8.0.1+0" - -[[deps.LibGit2]] -deps = ["Base64", "NetworkOptions", "Printf", "SHA"] -uuid = "76f85450-5226-5b5a-8eaa-529ad045b433" - -[[deps.LibSSH2_jll]] -deps = ["Artifacts", "Libdl", "MbedTLS_jll"] -uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8" -version = "1.10.2+0" - -[[deps.Libdl]] -uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb" - -[[deps.Logging]] -uuid = "56ddb016-857b-54e1-b83d-db4d58db5568" - -[[deps.Markdown]] -deps = ["Base64"] -uuid = "d6f4376e-aef5-505a-96c1-9c027394607a" - -[[deps.MbedTLS_jll]] -deps = ["Artifacts", "Libdl"] -uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1" -version = "2.28.2+0" - -[[deps.MozillaCACerts_jll]] -uuid = "14a3606d-f60d-562e-9121-12d972cd8159" -version = "2023.1.10" - -[[deps.NetworkOptions]] -uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908" -version = "1.2.0" - -[[deps.Pkg]] -deps = ["Artifacts", "Dates", "Downloads", "FileWatching", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"] -uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" -version = "1.10.0" - -[[deps.Preferences]] -deps = ["TOML"] -git-tree-sha1 = "47e5f437cc0e7ef2ce8406ce1e7e24d44915f88d" -uuid = "21216c6a-2e73-6563-6e65-726566657250" -version = "1.3.0" - -[[deps.Printf]] -deps = ["Unicode"] -uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7" - -[[deps.REPL]] -deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"] -uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb" - -[[deps.Random]] -deps = ["SHA", "Serialization"] -uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" - -[[deps.SHA]] -uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" -version = "0.7.0" - -[[deps.Scratch]] -deps = ["Dates"] -git-tree-sha1 = "30449ee12237627992a99d5e30ae63e4d78cd24a" -uuid = "6c6a2e73-6563-6170-7368-637461726353" -version = "1.2.0" - -[[deps.Serialization]] -uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b" - -[[deps.Sockets]] -uuid = "6462fe0b-24de-5631-8697-dd941f90decc" - -[[deps.TOML]] -deps = ["Dates"] -uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76" -version = "1.0.3" - -[[deps.Tar]] -deps = ["ArgTools", "SHA"] -uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e" -version = "1.10.0" - -[[deps.TimerOutputs]] -deps = ["ExprTools", "Printf"] -git-tree-sha1 = "f2fd3f288dfc6f507b0c3a2eb3bac009251e548b" -uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" -version = "0.5.22" - -[[deps.UUIDs]] -deps = ["Random", "SHA"] -uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" - -[[deps.Unicode]] -uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5" - -[[deps.Zlib_jll]] -deps = ["Libdl"] -uuid = "83775a58-1f1d-513f-b197-d71354ab007a" -version = "1.2.13+0" - -[[deps.nghttp2_jll]] -deps = ["Artifacts", "Libdl"] -uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d" -version = "1.52.0+0" - -[[deps.p7zip_jll]] -deps = ["Artifacts", "Libdl"] -uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0" -version = "17.4.0+0" diff --git a/examples/SimpleGPU/Project.toml b/examples/SimpleGPU/Project.toml deleted file mode 100644 index 247624c5..00000000 --- a/examples/SimpleGPU/Project.toml +++ /dev/null @@ -1,7 +0,0 @@ -name = "SimpleGPU" -uuid = "0f92ac95-628b-4f27-9a96-2faf96da70df" -authors = ["collinw "] -version = "0.1.0" - -[deps] -GPUCompiler = "61eb1bfa-7361-4325-ad38-22787b887f55" diff --git a/examples/SimpleGPU/src/SimpleGPU.jl b/examples/SimpleGPU/src/SimpleGPU.jl deleted file mode 100644 index 894291e9..00000000 --- a/examples/SimpleGPU/src/SimpleGPU.jl +++ /dev/null @@ -1,28 +0,0 @@ -module SimpleGPU -using GPUCompiler -struct NativeCompilerParams <: AbstractCompilerParams - entry_safepoint::Bool - method_table - - NativeCompilerParams(entry_safepoint::Bool=false, method_table=test_method_table) = - new(entry_safepoint, method_table) -end - -const test_method_table = nothing - -function native_job(@nospecialize(func), @nospecialize(types); kernel::Bool=false, - entry_abi=:specfunc, entry_safepoint::Bool=false, always_inline=false, - method_table=test_method_table, kwargs...) - source = methodinstance(typeof(func), Base.to_tuple_type(types)) - target = NativeCompilerTarget() - params = NativeCompilerParams(entry_safepoint, method_table) - config = CompilerConfig(target, params; kernel, entry_abi, always_inline) - CompilerJob(source, config), kwargs -end - -function precompile_simple(f, t) - job, _ = native_job(f, t) - GPUCompiler.precompile_gpucompiler(job) -end - -end # module SimpleGPU diff --git a/src/precompilation_cache.jl b/src/precompilation_cache.jl index 187b6606..d9ba0e84 100644 --- a/src/precompilation_cache.jl +++ b/src/precompilation_cache.jl @@ -15,28 +15,28 @@ end function ci_cache_delta(previous_snapshot) current_snapshot = ci_cache_snapshot() delta_snapshot = IdDict{Tuple{DataType, Core.Compiler.InferenceParams, Core.Compiler.OptimizationParams}, GPUCompiler.CodeCache}() - for (cachekey, cache) in current_snapshot + for (cachekey, codecache) in current_snapshot if cachekey in keys(previous_snapshot) - for (mi, civ) in cache - if mi in keys(previous_snapshot[cachekey]) + for (mi, civ) in codecache.dict + if mi in keys(previous_snapshot[cachekey].dict) for ci in civ - if !(ci in previous_snapshot[cachekey][mi]) - if !(cachekey in delta_snapshot) + if !(ci in previous_snapshot[cachekey].dict[mi]) + if !(cachekey in keys(delta_snapshot)) delta_snapshot[cachekey] = GPUCompiler.CodeCache() - delta_snapshot[cachekey][mi] = Vector{CodeInstance}() - elseif !(mi in delta_snapshot[cachekey]) - delta_snapshot[cachekey][mi] = Vector{CodeInstance}() + delta_snapshot[cachekey].dict[mi] = Vector{CodeInstance}() + elseif !(mi in keys(delta_snapshot[cachekey].dict)) + delta_snapshot[cachekey].dict[mi] = Vector{CodeInstance}() end - append!(delta_snapshot[cachekey][mi], ci) + push!(delta_snapshot[cachekey].dict[mi], ci) end end else # this whole cache is not present in the previous snapshot, can add all - if !(cachekey in delta_snapshot) + if !(cachekey in keys(delta_snapshot)) delta_snapshot[cachekey] = GPUCompiler.CodeCache() end - delta_snapshot[cachekey][mi] = civ + delta_snapshot[cachekey].dict[mi] = civ end end else @@ -55,11 +55,33 @@ end=# function ci_cache_insert(cache) if !is_precompiling() + #first clean the cache + cleaned_cache = IdDict() + for (key, c) in cache + usedCache = false + newCodeCache = GPUCompiler.CodeCache() + for (mi, civ) in c.dict + new_civ = Vector() + for ci in civ + if ci.min_world <= ci.max_world + push!(new_civ, ci) + end + end + if length(new_civ) > 0 + usedCache = true + newCodeCache.dict[mi] = new_civ + end + end + if usedCache + cleaned_cache[key] = newCodeCache + end + end + # need to merge caches at the code instance level - for key in keys(cache) + for (key, local_cache) in cleaned_cache if haskey(GPUCompiler.GLOBAL_CI_CACHES, key) global_cache = GPUCompiler.GLOBAL_CI_CACHES[key] - local_cache = cache[key] + #local_cache = cache[key] for (mi, civ) in (local_cache.dict) # this should be one since there is only one range that is infinite @assert length(civ) == 1 @@ -72,8 +94,6 @@ function ci_cache_insert(cache) # sort by min world age, then make sure no age ranges overlap // this part is uneeded sort(gciv, by=x->x.min_world) if ci.min_world > gciv[length(gciv)].min_world - println("invalidating mi [$mi] in world age [$(ci.min_world-1)]") - println("adding ci [$ci]") invalidate_code_cache(global_cache, mi, ci.min_world - 1) Core.Compiler.setindex!(global_cache, ci, mi) else @@ -81,19 +101,15 @@ function ci_cache_insert(cache) @assert false end else - println("adding method instance [$mi] code instance [$ci]") # occurs if we kill everything in the parent and then need to store in child Core.Compiler.setindex!(global_cache, ci, mi) end end else # no conflict at cache level - println("no conflictt adding cache $(cache[key])") GPUCompiler.GLOBAL_CI_CACHES[key] = cache[key] end end - println("global cache post insert") - @show GPUCompiler.GLOBAL_CI_CACHES end end diff --git a/test/EnzymeTest.jl b/test/EnzymeTest.jl deleted file mode 100644 index 05723bc8..00000000 --- a/test/EnzymeTest.jl +++ /dev/null @@ -1,19 +0,0 @@ -module EnzymeTest -using GPUCompiler -using Enzyme - -f1(x) = x*x -autodiff_wrapper(f) = first(autodiff(Reverse, f, Active(1.0))) - -println("precompilation!") - - - -const cache = let - cache_snapshot = GPUCompiler.ci_cache_snapshot() - autodiff_wrapper(f1) - GPUCompiler.ci_cache_delta(cache_snapshot) -end - -__init__() = GPUCompiler.ci_cache_insert(cache) -end # module EnzymeTest diff --git a/test/Project.toml b/test/Project.toml index e602d235..002ebb1c 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -1,5 +1,6 @@ [deps] Cthulhu = "f68482b8-f384-11e8-15f7-abe071a5a75f" +GPUCompiler = "61eb1bfa-7361-4325-ad38-22787b887f55" InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240" LLVM = "929cbde3-209d-540e-8aea-75f648917ca0" Metal_LLVM_Tools_jll = "0418c028-ff8c-56b8-a53e-0f9676ed36fc" From a6bd41aaa497753b70d2b33cee11cd59ce3e6b64 Mon Sep 17 00:00:00 2001 From: Collin R Warner Date: Sat, 22 Apr 2023 20:14:32 -0400 Subject: [PATCH 12/14] Add persistent Cache example --- test/ExamplePersistentCache/GPUKernel.jl | 26 ++++++++++++++++++++++ test/ExamplePersistentCache/README.txt | 20 +++++++++++++++++ test/ExamplePersistentCache/TestRuntime.jl | 8 +++++++ 3 files changed, 54 insertions(+) create mode 100644 test/ExamplePersistentCache/GPUKernel.jl create mode 100644 test/ExamplePersistentCache/README.txt create mode 100644 test/ExamplePersistentCache/TestRuntime.jl diff --git a/test/ExamplePersistentCache/GPUKernel.jl b/test/ExamplePersistentCache/GPUKernel.jl new file mode 100644 index 00000000..628e50f1 --- /dev/null +++ b/test/ExamplePersistentCache/GPUKernel.jl @@ -0,0 +1,26 @@ +module GPUKernel +using GPUCompiler +using TestRuntime +snapshot = GPUCompiler.ci_cache_snapshot() + +struct TestCompilerParams <: AbstractCompilerParams end +GPUCompiler.runtime_module(::CompilerJob{<:Any,TestCompilerParams}) = TestRuntime + +kernel() = nothing +function main() + source = methodinstance(typeof(kernel), Tuple{}) + target = NativeCompilerTarget() + params = TestCompilerParams() + config = CompilerConfig(target, params) + job = CompilerJob(source, config) + + println(GPUCompiler.compile(:asm, job)[1]) +end + +main() +const persistent_cache = GPUCompiler.ci_cache_delta(snapshot) + +function __init__() + GPUCompiler.ci_cache_insert(persistent_cache) +end +end # module GPUKernel diff --git a/test/ExamplePersistentCache/README.txt b/test/ExamplePersistentCache/README.txt new file mode 100644 index 00000000..80462fd5 --- /dev/null +++ b/test/ExamplePersistentCache/README.txt @@ -0,0 +1,20 @@ +Persistent Cache api: + +GPUCompiler.ci_cache_snapshot() -> cache: returns a snapshot of GLOBAL_CI_CACHES used +as a base point for what will be persistently cached. + +GPUCompiler.ci_cache_delta(snapshot::cache) -> cache: takes a snapshot and returns +the cache that represents the difference between (current GLOBAL_CI_CACHES - snapshot) + +GPUCompiler.ci_cache_insert(snapshot::cache): inserts snapshot into GLOBAL_CI_CACHES + + +Usage: +snapshot = GPUCompiler.ci_cache_snapshot() +... precompile work ... +const persistent_cache = GPUCompiler.ci_cache_delta(snapshot) + +function __init__() + GPUCompiler.ci_cache_insert(persistent_cache) + ... rest of init logic ... +end diff --git a/test/ExamplePersistentCache/TestRuntime.jl b/test/ExamplePersistentCache/TestRuntime.jl new file mode 100644 index 00000000..1d29e4ba --- /dev/null +++ b/test/ExamplePersistentCache/TestRuntime.jl @@ -0,0 +1,8 @@ +module TestRuntime + signal_exception() = return + malloc(sz) = C_NULL + report_oom(sz) = return + report_exception(ex) = return + report_exception_name(ex) = return + report_exception_frame(idx, func, file, line) = return +end # module TestRuntime From cc34d2117bb85a9f3ce34ac8906514ecb52daa5b Mon Sep 17 00:00:00 2001 From: Collin R Warner Date: Sun, 23 Apr 2023 15:26:31 -0400 Subject: [PATCH 13/14] Remove dead code --- src/precompilation_cache.jl | 66 ------------------------------------- 1 file changed, 66 deletions(-) diff --git a/src/precompilation_cache.jl b/src/precompilation_cache.jl index d9ba0e84..1bc5c19a 100644 --- a/src/precompilation_cache.jl +++ b/src/precompilation_cache.jl @@ -46,13 +46,6 @@ function ci_cache_delta(previous_snapshot) return delta_snapshot end -#=function ci_cache_insert(caches) - empty!(GPUCompiler.GLOBAL_CI_CACHES) - for (key, cache) in caches - GPUCompiler.GLOBAL_CI_CACHES[key] = GPUCompiler.CodeCache(cache) - end -end=# - function ci_cache_insert(cache) if !is_precompiling() #first clean the cache @@ -125,62 +118,3 @@ function precompile_gpucompiler(job) GPUCompiler.ci_cache_populate(interp, cache, mt, job.source, job.world, typemax(Cint)) end end - -""" -Reloads Global Cache from global variable which stores the previous -cached results -""" -function reinit_cache(LOCAL_CACHE) - if !is_precompiling() - # need to merge caches at the code instance level - for key in keys(LOCAL_CACHE) - if haskey(GPUCompiler.GLOBAL_CI_CACHES, key) - global_cache = GPUCompiler.GLOBAL_CI_CACHES[key] - local_cache = LOCAL_CACHE[key] - for (mi, civ) in (local_cache.dict) - # this should be one since there is only one range that is infinite - @assert length(civ) == 1 - # add all code instances to global cache - # could move truncating code to set index - ci = civ[1] - if haskey(global_cache.dict, mi) - gciv = global_cache.dict[mi] - # truncation cod3 - # sort by min world age, then make sure no age ranges overlap // this part is uneeded - sort(gciv, by=x->x.min_world) - if ci.min_world > gciv[length(gciv)].min_world - invalidate_code_cache(global_cache, mi, ci.min_world - 1) - Core.Compiler.setindex!(global_cache, ci, mi) - else - println("Should not get here?") - @assert false - end - else - # occurs if we kill everything in the parent and then need to store in child - Core.Compiler.setindex!(global_cache, ci, mi) - end - end - else - # no conflict at cache level - GPUCompiler.GLOBAL_CI_CACHES[key] = LOCAL_CACHE[key] - end - end - end -end - -""" -Takes a snapshot of the current status of the cache - -The cache returned is a deep copy with finite world age endings removed -""" -function snapshot_cache(LOCAL_CACHE) - cleaned_cache_to_save = IdDict() - for key in keys(GPUCompiler.GLOBAL_CI_CACHES) - # Will only keep those elements with infinite ranges - cleaned_cache_to_save[key] = GPUCompiler.CodeCache(GPUCompiler.GLOBAL_CI_CACHES[key]) - end - global MY_CACHE #technically don't need the global - #empty insert - empty!(LOCAL_CACHE) - merge!(LOCAL_CACHE, cleaned_cache_to_save) -end From 195108754ed25587311484e7dda53339b5df7e74 Mon Sep 17 00:00:00 2001 From: collinwarner Date: Thu, 18 May 2023 18:40:31 -0400 Subject: [PATCH 14/14] add native caching --- src/cache.jl | 12 +++-- src/jlgen.jl | 11 ++--- src/precompilation_cache.jl | 88 +++++++++++++++++++------------------ 3 files changed, 60 insertions(+), 51 deletions(-) diff --git a/src/cache.jl b/src/cache.jl index fa71ab19..7a38a88e 100644 --- a/src/cache.jl +++ b/src/cache.jl @@ -26,7 +26,6 @@ function cached_compilation(cache::AbstractDict{UInt,V}, key = hash(tt, key) key = hash(world, key) key = hash(cfg, key) - # NOTE: no use of lock(::Function)/@lock/get! to avoid try/catch and closure overhead lock(cache_lock) obj = get(cache, key, nothing) @@ -36,6 +35,7 @@ function cached_compilation(cache::AbstractDict{UInt,V}, if obj === nothing || compile_hook[] !== nothing obj = actual_compilation(cache, key, cfg, ft, tt, compiler, linker)::V end + return obj::V end @@ -45,10 +45,14 @@ end src = methodinstance(ft, tt) job = CompilerJob(src, cfg) + global_cache = ci_cache(job) asm = nothing - # TODO: consider loading the assembly from an on-disk cache here - # compile + # read asm from persistent offline cache + if haskey(global_cache.asm, src) + asm = global_cache.asm[src] + end + if asm === nothing asm = compiler(job) end @@ -57,7 +61,7 @@ end # in which case the cache will already be populated) lock(cache_lock) do haskey(cache, key) && return cache[key] - + global_cache.asm[src] = asm obj = linker(job, asm) cache[key] = obj obj diff --git a/src/jlgen.jl b/src/jlgen.jl index 74c0fd4e..44e95a0a 100644 --- a/src/jlgen.jl +++ b/src/jlgen.jl @@ -255,16 +255,19 @@ using Core.Compiler: CodeInstance, MethodInstance, InferenceParams, Optimization struct CodeCache dict::IdDict{MethodInstance,Vector{CodeInstance}} + asm::IdDict{MethodInstance, NamedTuple{(:image, :entry, :external_gvars), Tuple{Vector{UInt8}, String, Vector{String}}}} - CodeCache() = new(Dict{MethodInstance,Vector{CodeInstance}}()) - CodeCache(cache::CodeCache) = new(GPUCompiler.copyAndFilter(cache.dict)) + CodeCache() = new(Dict{MethodInstance,Vector{CodeInstance}}(), + Dict{MethodInstance, NamedTuple{(:image, :entry, :external_gvars), Tuple{Vector{UInt8}, String, Vector{String}}}}()) + + CodeCache(cache::CodeCache) = new(GPUCompiler.copyAndFilter(cache.dict), cache.asm) end function copyAndFilter(dict::IdDict) out= IdDict() for key in keys(dict) useKey = true - # why is it an array of code instances, can there be more than 1? + for ci in dict[key] if ci.max_world < typemax(typeof(ci.max_world)) useKey = false @@ -590,7 +593,6 @@ end function ci_cache_populate(interp, cache, mt, mi, min_world, max_world) src = Core.Compiler.typeinf_ext_toplevel(interp, mi) - # inference populates the cache, so we don't need to jl_get_method_inferred wvc = WorldView(cache, min_world, max_world) @assert Core.Compiler.haskey(wvc, mi) @@ -622,7 +624,6 @@ function ci_cache_lookup(cache, mi, min_world, max_world) return ci end - ## interface # for platforms without @cfunction-with-closure support diff --git a/src/precompilation_cache.jl b/src/precompilation_cache.jl index 1bc5c19a..138d850e 100644 --- a/src/precompilation_cache.jl +++ b/src/precompilation_cache.jl @@ -7,25 +7,33 @@ function ci_cache_snapshot() cleaned_cache_to_save = IdDict() for key in keys(GPUCompiler.GLOBAL_CI_CACHES) # Will only keep those elements with infinite ranges + # copy constructor cleaned_cache_to_save[key] = GPUCompiler.CodeCache(GPUCompiler.GLOBAL_CI_CACHES[key]) end + return cleaned_cache_to_save end function ci_cache_delta(previous_snapshot) current_snapshot = ci_cache_snapshot() delta_snapshot = IdDict{Tuple{DataType, Core.Compiler.InferenceParams, Core.Compiler.OptimizationParams}, GPUCompiler.CodeCache}() - for (cachekey, codecache) in current_snapshot + for (cachekey, codecache) in current_snapshot # iterate through all caches if cachekey in keys(previous_snapshot) - for (mi, civ) in codecache.dict + for (mi, civ) in codecache.dict # iterate through all mi if mi in keys(previous_snapshot[cachekey].dict) for ci in civ if !(ci in previous_snapshot[cachekey].dict[mi]) if !(cachekey in keys(delta_snapshot)) delta_snapshot[cachekey] = GPUCompiler.CodeCache() delta_snapshot[cachekey].dict[mi] = Vector{CodeInstance}() + if haskey(codecache.asm, mi) + delta_snapshot[cachekey].asm[mi] = codecache.asm[mi] + end elseif !(mi in keys(delta_snapshot[cachekey].dict)) delta_snapshot[cachekey].dict[mi] = Vector{CodeInstance}() + if haskey(codecache.asm, mi) + delta_snapshot[cachekey].asm[mi] = codecache.asm[mi] + end end push!(delta_snapshot[cachekey].dict[mi], ci) @@ -36,6 +44,10 @@ function ci_cache_delta(previous_snapshot) if !(cachekey in keys(delta_snapshot)) delta_snapshot[cachekey] = GPUCompiler.CodeCache() end + + if haskey(codecache.asm, mi) + delta_snapshot[cachekey].asm[mi] = codecache.asm[mi] + end delta_snapshot[cachekey].dict[mi] = civ end end @@ -43,59 +55,34 @@ function ci_cache_delta(previous_snapshot) delta_snapshot[cachekey] = current_snapshot[cachekey] end end + return delta_snapshot end +function print_keys(caches) + println("************") + for (key, cache) in caches + for (mi, civ) in cache.dict + println("$mi -> $(length(civ))") + end + end + println("************") +end function ci_cache_insert(cache) if !is_precompiling() - #first clean the cache - cleaned_cache = IdDict() - for (key, c) in cache - usedCache = false - newCodeCache = GPUCompiler.CodeCache() - for (mi, civ) in c.dict - new_civ = Vector() - for ci in civ - if ci.min_world <= ci.max_world - push!(new_civ, ci) - end - end - if length(new_civ) > 0 - usedCache = true - newCodeCache.dict[mi] = new_civ - end - end - if usedCache - cleaned_cache[key] = newCodeCache - end - end - # need to merge caches at the code instance level - for (key, local_cache) in cleaned_cache + for (key, local_cache) in cache if haskey(GPUCompiler.GLOBAL_CI_CACHES, key) global_cache = GPUCompiler.GLOBAL_CI_CACHES[key] - #local_cache = cache[key] for (mi, civ) in (local_cache.dict) # this should be one since there is only one range that is infinite @assert length(civ) == 1 # add all code instances to global cache # could move truncating code to set index - ci = civ[1] - if haskey(global_cache.dict, mi) - gciv = global_cache.dict[mi] - # truncation cod3 - # sort by min world age, then make sure no age ranges overlap // this part is uneeded - sort(gciv, by=x->x.min_world) - if ci.min_world > gciv[length(gciv)].min_world - invalidate_code_cache(global_cache, mi, ci.min_world - 1) - Core.Compiler.setindex!(global_cache, ci, mi) - else - println("Should not get here?") - @assert false - end - else - # occurs if we kill everything in the parent and then need to store in child - Core.Compiler.setindex!(global_cache, ci, mi) + Core.Compiler.setindex!(global_cache, civ[1], mi) + #@assert haskey(local_cache.asm, mi) + if haskey(local_cache.asm, mi) + global_cache.asm[mi] = local_cache.asm[mi] end end else @@ -118,3 +105,20 @@ function precompile_gpucompiler(job) GPUCompiler.ci_cache_populate(interp, cache, mt, job.source, job.world, typemax(Cint)) end end + +""" +Generate a precompile file for the current state of the cache +""" +function generate_precompilation_file(snapshot, filename, precompilation_function) + method_instances = [] + for (cachekey, cache) in snapshot + for (mi, civ) in cache.dict + push!(method_instances, mi) + end + end + + precompile_statements = join(["$precompilation_function($(mi.specTypes.parameters[1]), Core.$(mi.specTypes.parameters[2:length(mi.specTypes.parameters)]))" for mi in method_instances], '\n') + open(filename, "w") do file + write(file, precompile_statements) + end +end