From 772bd94c2560158ab72764bfca6e9518933182dc Mon Sep 17 00:00:00 2001
From: Collin R Warner <collinw@amdci2.julia.csail.mit.edu>
Date: Sat, 8 Apr 2023 20:32:43 -0400
Subject: [PATCH 01/14] add GPUCompiler precompilation caching

---
 src/GPUCompiler.jl |  3 +++
 src/jlgen.jl       | 20 ++++++++++++++++++++
 2 files changed, 23 insertions(+)

diff --git a/src/GPUCompiler.jl b/src/GPUCompiler.jl
index 34f3fbd6..809b50d5 100644
--- a/src/GPUCompiler.jl
+++ b/src/GPUCompiler.jl
@@ -40,7 +40,10 @@ include("cache.jl")
 include("execution.jl")
 include("reflection.jl")
 
+
 include("precompile.jl")
+include("precompile_native.jl")
+
 _precompile_()
 
 function __init__()
diff --git a/src/jlgen.jl b/src/jlgen.jl
index d8b7ca4f..47da7579 100644
--- a/src/jlgen.jl
+++ b/src/jlgen.jl
@@ -254,8 +254,28 @@ struct CodeCache
     dict::IdDict{MethodInstance,Vector{CodeInstance}}
 
     CodeCache() = new(Dict{MethodInstance,Vector{CodeInstance}}())
+    CodeCache(cache::CodeCache) = new(GPUCompiler.copyAndFilter(cache.dict))
+end
+
+function copyAndFilter(dict::IdDict)
+    out= IdDict()
+    for key in keys(dict)
+        useKey = true
+        # why is it an array of code instances, can there be more than 1?
+        for ci in dict[key]
+            if ci.max_world < typemax(typeof(ci.max_world))
+                useKey = false
+                break
+            end
+        end
+        if useKey
+            out[key] = dict[key]
+        end
+    end
+    return out
 end
 
+
 function Base.show(io::IO, ::MIME"text/plain", cc::CodeCache)
     print(io, "CodeCache with $(mapreduce(length, +, values(cc.dict); init=0)) entries")
     if !isempty(cc.dict)

From a4bad27c437caa7b92564794afb7e7757a7eb495 Mon Sep 17 00:00:00 2001
From: Collin R Warner <collinw@amdci2.julia.csail.mit.edu>
Date: Sun, 9 Apr 2023 14:27:27 -0400
Subject: [PATCH 02/14] add precompile file

---
 src/precompile_native.jl | 110 +++++++++++++++++++++++++++++++++++++++
 1 file changed, 110 insertions(+)
 create mode 100644 src/precompile_native.jl

diff --git a/src/precompile_native.jl b/src/precompile_native.jl
new file mode 100644
index 00000000..f560dadc
--- /dev/null
+++ b/src/precompile_native.jl
@@ -0,0 +1,110 @@
+const CACHE_NAME = gensym(:CACHE) # is now a const symbol (not a variable)
+is_precompiling() = ccall(:jl_generating_output, Cint, ()) != 0
+
+struct NativeCompilerParams <: AbstractCompilerParams end
+export @declare_cache, @snapshot_cache, @reinit_cache, @get_cache
+
+macro declare_cache()
+    var = esc(CACHE_NAME) #this will esc variable from our const symbol
+    quote
+        #const $esc(CACHE_NAME) function esc is executed when macro is executed, not when code is defined
+        # dollar sign means will have the value of esc cachename here
+        const $var = $IdDict()
+    end
+end
+
+macro snapshot_cache()
+    var = esc(CACHE_NAME)
+    quote
+        $snapshot_cache($var)
+    end
+end
+
+macro reinit_cache()
+    var = esc(CACHE_NAME)
+    quote
+        # will need to keep track of this is CUDA so that GPUCompiler caches are not overfilled
+        $reinit_cache($var)
+    end
+end
+
+macro get_cache()
+    var = esc(CACHE_NAME)
+    quote
+        $var
+    end
+end
+
+"""
+Given a function and param types caches the function to the global cache
+"""
+function precompile_gpucompiler(job)
+    # populate the cache
+    cache = GPUCompiler.ci_cache(job)
+    mt = GPUCompiler.method_table(job)
+    interp = GPUCompiler.get_interpreter(job)
+    if GPUCompiler.ci_cache_lookup(cache, job.source, job.world, typemax(Cint)) === nothing
+        GPUCompiler.ci_cache_populate(interp, cache, mt, job.source, job.world, typemax(Cint))
+    end
+end
+
+function get_code_cache_i(i)
+    for (j, cc) in enumerate(GPUCompiler.GLOBAL_CI_CACHES)
+        if j == i
+            return cc
+        end
+    end
+end
+"""
+Reloads Global Cache from global variable which stores the previous
+cached results
+"""
+function reinit_cache(LOCAL_CACHE)
+    if !is_precompiling()
+        # need to merge caches at the code instance level
+        for key in keys(LOCAL_CACHE)
+            if haskey(GPUCompiler.GLOBAL_CI_CACHES, key)
+                global_cache = GPUCompiler.GLOBAL_CI_CACHES[key]
+                local_cache = LOCAL_CACHE[key]
+                for (mi, civ) in (local_cache.dict)
+                    # this should be one since there is only one range that is infinite
+                    @assert length(civ) == 1
+                    # add all code instances to global cache
+                    # could move truncating code to set index
+                    ci = civ[1]
+                    if haskey(global_cache.dict, mi)
+                        gciv = global_cache.dict[mi]
+                        # truncation cod3
+                        # sort by min world age, then make sure no age ranges overlap // this part is uneeded
+                        sort(gciv, by=x->x.min_world)
+                        if ci.min_world > gciv[length(gciv)].min_world
+                            invalidate_code_cache(global_cache, mi, ci.min_world - 1)
+                            Core.Compiler.setindex!(global_cache, ci, mi)
+                        else
+                            println("Should not get here?")
+                            @assert false
+                        end
+                    else
+                        # occurs if we kill everything in the parent and then need to store in child
+                        Core.Compiler.setindex!(global_cache, ci, mi)
+                    end
+                end
+            else
+                # no conflict at cache level
+                GPUCompiler.GLOBAL_CI_CACHES[key] = LOCAL_CACHE[key]
+            end
+        end
+    end
+end
+
+"""
+Takes a snapshot of the current status of the cache
+
+The cache returned is a deep copy with finite world age endings removed
+"""
+function snapshot_cache(LOCAL_CACHE)
+    cleaned_cache_to_save = IdDict()
+    for key in keys(GPUCompiler.GLOBAL_CI_CACHES)
+        # Will only keep those elements with infinite ranges
+    merge!(LOCAL_CACHE, cleaned_cache_to_save)
+end

From 4de3f62bdf3ec5470381d3995276fb14c7a7306f Mon Sep 17 00:00:00 2001
From: Collin R Warner <collinw@amdci2.julia.csail.mit.edu>
Date: Sun, 9 Apr 2023 16:46:43 -0400
Subject: [PATCH 03/14] fix accidental deletion

---
 src/precompile_native.jl | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/precompile_native.jl b/src/precompile_native.jl
index f560dadc..ff3f69d1 100644
--- a/src/precompile_native.jl
+++ b/src/precompile_native.jl
@@ -106,5 +106,10 @@ function snapshot_cache(LOCAL_CACHE)
     cleaned_cache_to_save = IdDict()
     for key in keys(GPUCompiler.GLOBAL_CI_CACHES)
         # Will only keep those elements with infinite ranges
+        cleaned_cache_to_save[key] = GPUCompiler.CodeCache(GPUCompiler.GLOBAL_CI_CACHES[key])
+    end
+    global MY_CACHE #technically don't need the global
+    #empty insert
+    empty!(LOCAL_CACHE)
     merge!(LOCAL_CACHE, cleaned_cache_to_save)
 end

From 11007f2726284d070f4c491fe277e816220089ce Mon Sep 17 00:00:00 2001
From: Collin R Warner <collinw@amdci2.julia.csail.mit.edu>
Date: Tue, 11 Apr 2023 13:31:27 -0400
Subject: [PATCH 04/14] Add examples detailing functionality

---
 examples/Example/Manifest.toml      | 194 ++++++++++++++++++++++++++++
 examples/Example/Project.toml       |   8 ++
 examples/Example/src/Example.jl     |  15 +++
 examples/SimpleGPU/Manifest.toml    | 188 +++++++++++++++++++++++++++
 examples/SimpleGPU/Project.toml     |   7 +
 examples/SimpleGPU/src/SimpleGPU.jl |  28 ++++
 6 files changed, 440 insertions(+)
 create mode 100644 examples/Example/Manifest.toml
 create mode 100644 examples/Example/Project.toml
 create mode 100644 examples/Example/src/Example.jl
 create mode 100644 examples/SimpleGPU/Manifest.toml
 create mode 100644 examples/SimpleGPU/Project.toml
 create mode 100644 examples/SimpleGPU/src/SimpleGPU.jl

diff --git a/examples/Example/Manifest.toml b/examples/Example/Manifest.toml
new file mode 100644
index 00000000..a5b1f1c4
--- /dev/null
+++ b/examples/Example/Manifest.toml
@@ -0,0 +1,194 @@
+# This file is machine-generated - editing it directly is not advised
+
+julia_version = "1.10.0-DEV"
+manifest_format = "2.0"
+project_hash = "6afd6f1a57af520013070870c6f183d98c839ff4"
+
+[[deps.ArgTools]]
+uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f"
+version = "1.1.1"
+
+[[deps.Artifacts]]
+uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33"
+
+[[deps.Base64]]
+uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f"
+
+[[deps.CEnum]]
+git-tree-sha1 = "eb4cb44a499229b3b8426dcfb5dd85333951ff90"
+uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82"
+version = "0.4.2"
+
+[[deps.Dates]]
+deps = ["Printf"]
+uuid = "ade2ca70-3891-5945-98fb-dc099432e06a"
+
+[[deps.Downloads]]
+deps = ["ArgTools", "FileWatching", "LibCURL", "NetworkOptions"]
+uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6"
+version = "1.6.0"
+
+[[deps.ExprTools]]
+git-tree-sha1 = "c1d06d129da9f55715c6c212866f5b1bddc5fa00"
+uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04"
+version = "0.1.9"
+
+[[deps.FileWatching]]
+uuid = "7b1f6079-737a-58dc-b8bc-7a2ca5c1b5ee"
+
+[[deps.GPUCompiler]]
+deps = ["ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "Scratch", "TimerOutputs", "UUIDs"]
+path = "/home/collinw/.julia/dev/GPUCompiler"
+uuid = "61eb1bfa-7361-4325-ad38-22787b887f55"
+version = "0.19.0"
+
+[[deps.InteractiveUtils]]
+deps = ["Markdown"]
+uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
+
+[[deps.JLLWrappers]]
+deps = ["Preferences"]
+git-tree-sha1 = "abc9885a7ca2052a736a600f7fa66209f96506e1"
+uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210"
+version = "1.4.1"
+
+[[deps.LLVM]]
+deps = ["CEnum", "LLVMExtra_jll", "Libdl", "Printf", "Unicode"]
+git-tree-sha1 = "a8960cae30b42b66dd41808beb76490519f6f9e2"
+uuid = "929cbde3-209d-540e-8aea-75f648917ca0"
+version = "5.0.0"
+
+[[deps.LLVMExtra_jll]]
+deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "TOML"]
+git-tree-sha1 = "09b7505cc0b1cee87e5d4a26eea61d2e1b0dcd35"
+uuid = "dad2f222-ce93-54a1-a47d-0025e8a3acab"
+version = "0.0.21+0"
+
+[[deps.LazyArtifacts]]
+deps = ["Artifacts", "Pkg"]
+uuid = "4af54fe1-eca0-43a8-85a7-787d91b784e3"
+
+[[deps.LibCURL]]
+deps = ["LibCURL_jll", "MozillaCACerts_jll"]
+uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21"
+version = "0.6.3"
+
+[[deps.LibCURL_jll]]
+deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"]
+uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0"
+version = "8.0.1+0"
+
+[[deps.LibGit2]]
+deps = ["Base64", "NetworkOptions", "Printf", "SHA"]
+uuid = "76f85450-5226-5b5a-8eaa-529ad045b433"
+
+[[deps.LibSSH2_jll]]
+deps = ["Artifacts", "Libdl", "MbedTLS_jll"]
+uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8"
+version = "1.10.2+0"
+
+[[deps.Libdl]]
+uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb"
+
+[[deps.Logging]]
+uuid = "56ddb016-857b-54e1-b83d-db4d58db5568"
+
+[[deps.Markdown]]
+deps = ["Base64"]
+uuid = "d6f4376e-aef5-505a-96c1-9c027394607a"
+
+[[deps.MbedTLS_jll]]
+deps = ["Artifacts", "Libdl"]
+uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1"
+version = "2.28.2+0"
+
+[[deps.MozillaCACerts_jll]]
+uuid = "14a3606d-f60d-562e-9121-12d972cd8159"
+version = "2023.1.10"
+
+[[deps.NetworkOptions]]
+uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908"
+version = "1.2.0"
+
+[[deps.Pkg]]
+deps = ["Artifacts", "Dates", "Downloads", "FileWatching", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"]
+uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
+version = "1.10.0"
+
+[[deps.Preferences]]
+deps = ["TOML"]
+git-tree-sha1 = "47e5f437cc0e7ef2ce8406ce1e7e24d44915f88d"
+uuid = "21216c6a-2e73-6563-6e65-726566657250"
+version = "1.3.0"
+
+[[deps.Printf]]
+deps = ["Unicode"]
+uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7"
+
+[[deps.REPL]]
+deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"]
+uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb"
+
+[[deps.Random]]
+deps = ["SHA", "Serialization"]
+uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
+
+[[deps.SHA]]
+uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce"
+version = "0.7.0"
+
+[[deps.Scratch]]
+deps = ["Dates"]
+git-tree-sha1 = "30449ee12237627992a99d5e30ae63e4d78cd24a"
+uuid = "6c6a2e73-6563-6170-7368-637461726353"
+version = "1.2.0"
+
+[[deps.Serialization]]
+uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
+
+[[deps.SimpleGPU]]
+deps = ["GPUCompiler"]
+path = "../SimpleGPU"
+uuid = "0f92ac95-628b-4f27-9a96-2faf96da70df"
+version = "0.1.0"
+
+[[deps.Sockets]]
+uuid = "6462fe0b-24de-5631-8697-dd941f90decc"
+
+[[deps.TOML]]
+deps = ["Dates"]
+uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76"
+version = "1.0.3"
+
+[[deps.Tar]]
+deps = ["ArgTools", "SHA"]
+uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e"
+version = "1.10.0"
+
+[[deps.TimerOutputs]]
+deps = ["ExprTools", "Printf"]
+git-tree-sha1 = "f2fd3f288dfc6f507b0c3a2eb3bac009251e548b"
+uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f"
+version = "0.5.22"
+
+[[deps.UUIDs]]
+deps = ["Random", "SHA"]
+uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
+
+[[deps.Unicode]]
+uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"
+
+[[deps.Zlib_jll]]
+deps = ["Libdl"]
+uuid = "83775a58-1f1d-513f-b197-d71354ab007a"
+version = "1.2.13+0"
+
+[[deps.nghttp2_jll]]
+deps = ["Artifacts", "Libdl"]
+uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d"
+version = "1.52.0+0"
+
+[[deps.p7zip_jll]]
+deps = ["Artifacts", "Libdl"]
+uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0"
+version = "17.4.0+0"
diff --git a/examples/Example/Project.toml b/examples/Example/Project.toml
new file mode 100644
index 00000000..22ffa2ea
--- /dev/null
+++ b/examples/Example/Project.toml
@@ -0,0 +1,8 @@
+name = "Example"
+uuid = "3a86cd2f-4474-4e46-89c8-15adf66897e9"
+authors = ["collinw "]
+version = "0.1.0"
+
+[deps]
+GPUCompiler = "61eb1bfa-7361-4325-ad38-22787b887f55"
+SimpleGPU = "0f92ac95-628b-4f27-9a96-2faf96da70df"
diff --git a/examples/Example/src/Example.jl b/examples/Example/src/Example.jl
new file mode 100644
index 00000000..776793ec
--- /dev/null
+++ b/examples/Example/src/Example.jl
@@ -0,0 +1,15 @@
+module Example
+using GPUCompiler
+using SimpleGPU
+SimpleGPU.@declare_cache()
+
+f(x) = 1
+SimpleGPU.precompile_simple(f, (Int, ))
+
+function __init__()
+    SimpleGPU.@reinit_cache()
+end
+
+SimpleGPU.@snapshot_cache()
+
+end # module Example
diff --git a/examples/SimpleGPU/Manifest.toml b/examples/SimpleGPU/Manifest.toml
new file mode 100644
index 00000000..564425c9
--- /dev/null
+++ b/examples/SimpleGPU/Manifest.toml
@@ -0,0 +1,188 @@
+# This file is machine-generated - editing it directly is not advised
+
+julia_version = "1.10.0-DEV"
+manifest_format = "2.0"
+project_hash = "7e4bd5a8a18c1099e483003348b1afb869b6d01e"
+
+[[deps.ArgTools]]
+uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f"
+version = "1.1.1"
+
+[[deps.Artifacts]]
+uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33"
+
+[[deps.Base64]]
+uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f"
+
+[[deps.CEnum]]
+git-tree-sha1 = "eb4cb44a499229b3b8426dcfb5dd85333951ff90"
+uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82"
+version = "0.4.2"
+
+[[deps.Dates]]
+deps = ["Printf"]
+uuid = "ade2ca70-3891-5945-98fb-dc099432e06a"
+
+[[deps.Downloads]]
+deps = ["ArgTools", "FileWatching", "LibCURL", "NetworkOptions"]
+uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6"
+version = "1.6.0"
+
+[[deps.ExprTools]]
+git-tree-sha1 = "c1d06d129da9f55715c6c212866f5b1bddc5fa00"
+uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04"
+version = "0.1.9"
+
+[[deps.FileWatching]]
+uuid = "7b1f6079-737a-58dc-b8bc-7a2ca5c1b5ee"
+
+[[deps.GPUCompiler]]
+deps = ["ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "Scratch", "TimerOutputs", "UUIDs"]
+path = "/home/collinw/.julia/dev/GPUCompiler"
+uuid = "61eb1bfa-7361-4325-ad38-22787b887f55"
+version = "0.19.0"
+
+[[deps.InteractiveUtils]]
+deps = ["Markdown"]
+uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
+
+[[deps.JLLWrappers]]
+deps = ["Preferences"]
+git-tree-sha1 = "abc9885a7ca2052a736a600f7fa66209f96506e1"
+uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210"
+version = "1.4.1"
+
+[[deps.LLVM]]
+deps = ["CEnum", "LLVMExtra_jll", "Libdl", "Printf", "Unicode"]
+git-tree-sha1 = "a8960cae30b42b66dd41808beb76490519f6f9e2"
+uuid = "929cbde3-209d-540e-8aea-75f648917ca0"
+version = "5.0.0"
+
+[[deps.LLVMExtra_jll]]
+deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "TOML"]
+git-tree-sha1 = "09b7505cc0b1cee87e5d4a26eea61d2e1b0dcd35"
+uuid = "dad2f222-ce93-54a1-a47d-0025e8a3acab"
+version = "0.0.21+0"
+
+[[deps.LazyArtifacts]]
+deps = ["Artifacts", "Pkg"]
+uuid = "4af54fe1-eca0-43a8-85a7-787d91b784e3"
+
+[[deps.LibCURL]]
+deps = ["LibCURL_jll", "MozillaCACerts_jll"]
+uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21"
+version = "0.6.3"
+
+[[deps.LibCURL_jll]]
+deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"]
+uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0"
+version = "8.0.1+0"
+
+[[deps.LibGit2]]
+deps = ["Base64", "NetworkOptions", "Printf", "SHA"]
+uuid = "76f85450-5226-5b5a-8eaa-529ad045b433"
+
+[[deps.LibSSH2_jll]]
+deps = ["Artifacts", "Libdl", "MbedTLS_jll"]
+uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8"
+version = "1.10.2+0"
+
+[[deps.Libdl]]
+uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb"
+
+[[deps.Logging]]
+uuid = "56ddb016-857b-54e1-b83d-db4d58db5568"
+
+[[deps.Markdown]]
+deps = ["Base64"]
+uuid = "d6f4376e-aef5-505a-96c1-9c027394607a"
+
+[[deps.MbedTLS_jll]]
+deps = ["Artifacts", "Libdl"]
+uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1"
+version = "2.28.2+0"
+
+[[deps.MozillaCACerts_jll]]
+uuid = "14a3606d-f60d-562e-9121-12d972cd8159"
+version = "2023.1.10"
+
+[[deps.NetworkOptions]]
+uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908"
+version = "1.2.0"
+
+[[deps.Pkg]]
+deps = ["Artifacts", "Dates", "Downloads", "FileWatching", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"]
+uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
+version = "1.10.0"
+
+[[deps.Preferences]]
+deps = ["TOML"]
+git-tree-sha1 = "47e5f437cc0e7ef2ce8406ce1e7e24d44915f88d"
+uuid = "21216c6a-2e73-6563-6e65-726566657250"
+version = "1.3.0"
+
+[[deps.Printf]]
+deps = ["Unicode"]
+uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7"
+
+[[deps.REPL]]
+deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"]
+uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb"
+
+[[deps.Random]]
+deps = ["SHA", "Serialization"]
+uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
+
+[[deps.SHA]]
+uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce"
+version = "0.7.0"
+
+[[deps.Scratch]]
+deps = ["Dates"]
+git-tree-sha1 = "30449ee12237627992a99d5e30ae63e4d78cd24a"
+uuid = "6c6a2e73-6563-6170-7368-637461726353"
+version = "1.2.0"
+
+[[deps.Serialization]]
+uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
+
+[[deps.Sockets]]
+uuid = "6462fe0b-24de-5631-8697-dd941f90decc"
+
+[[deps.TOML]]
+deps = ["Dates"]
+uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76"
+version = "1.0.3"
+
+[[deps.Tar]]
+deps = ["ArgTools", "SHA"]
+uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e"
+version = "1.10.0"
+
+[[deps.TimerOutputs]]
+deps = ["ExprTools", "Printf"]
+git-tree-sha1 = "f2fd3f288dfc6f507b0c3a2eb3bac009251e548b"
+uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f"
+version = "0.5.22"
+
+[[deps.UUIDs]]
+deps = ["Random", "SHA"]
+uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
+
+[[deps.Unicode]]
+uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"
+
+[[deps.Zlib_jll]]
+deps = ["Libdl"]
+uuid = "83775a58-1f1d-513f-b197-d71354ab007a"
+version = "1.2.13+0"
+
+[[deps.nghttp2_jll]]
+deps = ["Artifacts", "Libdl"]
+uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d"
+version = "1.52.0+0"
+
+[[deps.p7zip_jll]]
+deps = ["Artifacts", "Libdl"]
+uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0"
+version = "17.4.0+0"
diff --git a/examples/SimpleGPU/Project.toml b/examples/SimpleGPU/Project.toml
new file mode 100644
index 00000000..247624c5
--- /dev/null
+++ b/examples/SimpleGPU/Project.toml
@@ -0,0 +1,7 @@
+name = "SimpleGPU"
+uuid = "0f92ac95-628b-4f27-9a96-2faf96da70df"
+authors = ["collinw "]
+version = "0.1.0"
+
+[deps]
+GPUCompiler = "61eb1bfa-7361-4325-ad38-22787b887f55"
diff --git a/examples/SimpleGPU/src/SimpleGPU.jl b/examples/SimpleGPU/src/SimpleGPU.jl
new file mode 100644
index 00000000..894291e9
--- /dev/null
+++ b/examples/SimpleGPU/src/SimpleGPU.jl
@@ -0,0 +1,28 @@
+module SimpleGPU
+using GPUCompiler
+struct NativeCompilerParams <: AbstractCompilerParams
+    entry_safepoint::Bool
+    method_table
+
+    NativeCompilerParams(entry_safepoint::Bool=false, method_table=test_method_table) =
+        new(entry_safepoint, method_table)
+end
+
+const test_method_table = nothing
+
+function native_job(@nospecialize(func), @nospecialize(types); kernel::Bool=false,
+                    entry_abi=:specfunc, entry_safepoint::Bool=false, always_inline=false,
+                    method_table=test_method_table, kwargs...)
+    source = methodinstance(typeof(func), Base.to_tuple_type(types))
+    target = NativeCompilerTarget()
+    params = NativeCompilerParams(entry_safepoint, method_table)
+    config = CompilerConfig(target, params; kernel, entry_abi, always_inline)
+    CompilerJob(source, config), kwargs
+end
+
+function precompile_simple(f, t)
+    job, _ = native_job(f, t)
+    GPUCompiler.precompile_gpucompiler(job)
+end
+
+end # module SimpleGPU

From 3dbe9d5b7c7c5f56f18553f0e4d4bd9c2bdcaca5 Mon Sep 17 00:00:00 2001
From: Collin R Warner <collinw@amdci2.julia.csail.mit.edu>
Date: Thu, 13 Apr 2023 17:32:33 -0400
Subject: [PATCH 05/14] remove debugging function

---
 src/precompile_native.jl | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/src/precompile_native.jl b/src/precompile_native.jl
index ff3f69d1..2ca10062 100644
--- a/src/precompile_native.jl
+++ b/src/precompile_native.jl
@@ -48,13 +48,6 @@ function precompile_gpucompiler(job)
     end
 end
 
-function get_code_cache_i(i)
-    for (j, cc) in enumerate(GPUCompiler.GLOBAL_CI_CACHES)
-        if j == i
-            return cc
-        end
-    end
-end
 """
 Reloads Global Cache from global variable which stores the previous
 cached results

From db12163f02578705b1f723ad17a3f7d4e4c13e94 Mon Sep 17 00:00:00 2001
From: Collin R Warner <collinw@amdci2.julia.csail.mit.edu>
Date: Thu, 13 Apr 2023 17:41:24 -0400
Subject: [PATCH 06/14] Change name to precompilation_cache

---
 src/GPUCompiler.jl          |   2 +-
 src/precompilation_cache.jl | 108 ++++++++++++++++++++++++++++++++++++
 2 files changed, 109 insertions(+), 1 deletion(-)
 create mode 100644 src/precompilation_cache.jl

diff --git a/src/GPUCompiler.jl b/src/GPUCompiler.jl
index 809b50d5..8d477705 100644
--- a/src/GPUCompiler.jl
+++ b/src/GPUCompiler.jl
@@ -42,7 +42,7 @@ include("reflection.jl")
 
 
 include("precompile.jl")
-include("precompile_native.jl")
+include("precompilation_cache.jl")
 
 _precompile_()
 
diff --git a/src/precompilation_cache.jl b/src/precompilation_cache.jl
new file mode 100644
index 00000000..2ca10062
--- /dev/null
+++ b/src/precompilation_cache.jl
@@ -0,0 +1,108 @@
+const CACHE_NAME = gensym(:CACHE) # is now a const symbol (not a variable)
+is_precompiling() = ccall(:jl_generating_output, Cint, ()) != 0
+
+struct NativeCompilerParams <: AbstractCompilerParams end
+export @declare_cache, @snapshot_cache, @reinit_cache, @get_cache
+
+macro declare_cache()
+    var = esc(CACHE_NAME) #this will esc variable from our const symbol
+    quote
+        #const $esc(CACHE_NAME) function esc is executed when macro is executed, not when code is defined
+        # dollar sign means will have the value of esc cachename here
+        const $var = $IdDict()
+    end
+end
+
+macro snapshot_cache()
+    var = esc(CACHE_NAME)
+    quote
+        $snapshot_cache($var)
+    end
+end
+
+macro reinit_cache()
+    var = esc(CACHE_NAME)
+    quote
+        # will need to keep track of this is CUDA so that GPUCompiler caches are not overfilled
+        $reinit_cache($var)
+    end
+end
+
+macro get_cache()
+    var = esc(CACHE_NAME)
+    quote
+        $var
+    end
+end
+
+"""
+Given a function and param types caches the function to the global cache
+"""
+function precompile_gpucompiler(job)
+    # populate the cache
+    cache = GPUCompiler.ci_cache(job)
+    mt = GPUCompiler.method_table(job)
+    interp = GPUCompiler.get_interpreter(job)
+    if GPUCompiler.ci_cache_lookup(cache, job.source, job.world, typemax(Cint)) === nothing
+        GPUCompiler.ci_cache_populate(interp, cache, mt, job.source, job.world, typemax(Cint))
+    end
+end
+
+"""
+Reloads Global Cache from global variable which stores the previous
+cached results
+"""
+function reinit_cache(LOCAL_CACHE)
+    if !is_precompiling()
+        # need to merge caches at the code instance level
+        for key in keys(LOCAL_CACHE)
+            if haskey(GPUCompiler.GLOBAL_CI_CACHES, key)
+                global_cache = GPUCompiler.GLOBAL_CI_CACHES[key]
+                local_cache = LOCAL_CACHE[key]
+                for (mi, civ) in (local_cache.dict)
+                    # this should be one since there is only one range that is infinite
+                    @assert length(civ) == 1
+                    # add all code instances to global cache
+                    # could move truncating code to set index
+                    ci = civ[1]
+                    if haskey(global_cache.dict, mi)
+                        gciv = global_cache.dict[mi]
+                        # truncation cod3
+                        # sort by min world age, then make sure no age ranges overlap // this part is uneeded
+                        sort(gciv, by=x->x.min_world)
+                        if ci.min_world > gciv[length(gciv)].min_world
+                            invalidate_code_cache(global_cache, mi, ci.min_world - 1)
+                            Core.Compiler.setindex!(global_cache, ci, mi)
+                        else
+                            println("Should not get here?")
+                            @assert false
+                        end
+                    else
+                        # occurs if we kill everything in the parent and then need to store in child
+                        Core.Compiler.setindex!(global_cache, ci, mi)
+                    end
+                end
+            else
+                # no conflict at cache level
+                GPUCompiler.GLOBAL_CI_CACHES[key] = LOCAL_CACHE[key]
+            end
+        end
+    end
+end
+
+"""
+Takes a snapshot of the current status of the cache
+
+The cache returned is a deep copy with finite world age endings removed
+"""
+function snapshot_cache(LOCAL_CACHE)
+    cleaned_cache_to_save = IdDict()
+    for key in keys(GPUCompiler.GLOBAL_CI_CACHES)
+        # Will only keep those elements with infinite ranges
+        cleaned_cache_to_save[key] = GPUCompiler.CodeCache(GPUCompiler.GLOBAL_CI_CACHES[key])
+    end
+    global MY_CACHE #technically don't need the global
+    #empty insert
+    empty!(LOCAL_CACHE)
+    merge!(LOCAL_CACHE, cleaned_cache_to_save)
+end

From 9bfdceee40cc6022ce3ce7f5eb5f8bda1df6e012 Mon Sep 17 00:00:00 2001
From: Collin R Warner <collinw@amdci2.julia.csail.mit.edu>
Date: Thu, 13 Apr 2023 17:43:05 -0400
Subject: [PATCH 07/14] remove unneeded code

---
 src/precompilation_cache.jl |   1 -
 src/precompile_native.jl    | 108 ------------------------------------
 2 files changed, 109 deletions(-)
 delete mode 100644 src/precompile_native.jl

diff --git a/src/precompilation_cache.jl b/src/precompilation_cache.jl
index 2ca10062..bdfcf4dc 100644
--- a/src/precompilation_cache.jl
+++ b/src/precompilation_cache.jl
@@ -1,7 +1,6 @@
 const CACHE_NAME = gensym(:CACHE) # is now a const symbol (not a variable)
 is_precompiling() = ccall(:jl_generating_output, Cint, ()) != 0
 
-struct NativeCompilerParams <: AbstractCompilerParams end
 export @declare_cache, @snapshot_cache, @reinit_cache, @get_cache
 
 macro declare_cache()
diff --git a/src/precompile_native.jl b/src/precompile_native.jl
deleted file mode 100644
index 2ca10062..00000000
--- a/src/precompile_native.jl
+++ /dev/null
@@ -1,108 +0,0 @@
-const CACHE_NAME = gensym(:CACHE) # is now a const symbol (not a variable)
-is_precompiling() = ccall(:jl_generating_output, Cint, ()) != 0
-
-struct NativeCompilerParams <: AbstractCompilerParams end
-export @declare_cache, @snapshot_cache, @reinit_cache, @get_cache
-
-macro declare_cache()
-    var = esc(CACHE_NAME) #this will esc variable from our const symbol
-    quote
-        #const $esc(CACHE_NAME) function esc is executed when macro is executed, not when code is defined
-        # dollar sign means will have the value of esc cachename here
-        const $var = $IdDict()
-    end
-end
-
-macro snapshot_cache()
-    var = esc(CACHE_NAME)
-    quote
-        $snapshot_cache($var)
-    end
-end
-
-macro reinit_cache()
-    var = esc(CACHE_NAME)
-    quote
-        # will need to keep track of this is CUDA so that GPUCompiler caches are not overfilled
-        $reinit_cache($var)
-    end
-end
-
-macro get_cache()
-    var = esc(CACHE_NAME)
-    quote
-        $var
-    end
-end
-
-"""
-Given a function and param types caches the function to the global cache
-"""
-function precompile_gpucompiler(job)
-    # populate the cache
-    cache = GPUCompiler.ci_cache(job)
-    mt = GPUCompiler.method_table(job)
-    interp = GPUCompiler.get_interpreter(job)
-    if GPUCompiler.ci_cache_lookup(cache, job.source, job.world, typemax(Cint)) === nothing
-        GPUCompiler.ci_cache_populate(interp, cache, mt, job.source, job.world, typemax(Cint))
-    end
-end
-
-"""
-Reloads Global Cache from global variable which stores the previous
-cached results
-"""
-function reinit_cache(LOCAL_CACHE)
-    if !is_precompiling()
-        # need to merge caches at the code instance level
-        for key in keys(LOCAL_CACHE)
-            if haskey(GPUCompiler.GLOBAL_CI_CACHES, key)
-                global_cache = GPUCompiler.GLOBAL_CI_CACHES[key]
-                local_cache = LOCAL_CACHE[key]
-                for (mi, civ) in (local_cache.dict)
-                    # this should be one since there is only one range that is infinite
-                    @assert length(civ) == 1
-                    # add all code instances to global cache
-                    # could move truncating code to set index
-                    ci = civ[1]
-                    if haskey(global_cache.dict, mi)
-                        gciv = global_cache.dict[mi]
-                        # truncation cod3
-                        # sort by min world age, then make sure no age ranges overlap // this part is uneeded
-                        sort(gciv, by=x->x.min_world)
-                        if ci.min_world > gciv[length(gciv)].min_world
-                            invalidate_code_cache(global_cache, mi, ci.min_world - 1)
-                            Core.Compiler.setindex!(global_cache, ci, mi)
-                        else
-                            println("Should not get here?")
-                            @assert false
-                        end
-                    else
-                        # occurs if we kill everything in the parent and then need to store in child
-                        Core.Compiler.setindex!(global_cache, ci, mi)
-                    end
-                end
-            else
-                # no conflict at cache level
-                GPUCompiler.GLOBAL_CI_CACHES[key] = LOCAL_CACHE[key]
-            end
-        end
-    end
-end
-
-"""
-Takes a snapshot of the current status of the cache
-
-The cache returned is a deep copy with finite world age endings removed
-"""
-function snapshot_cache(LOCAL_CACHE)
-    cleaned_cache_to_save = IdDict()
-    for key in keys(GPUCompiler.GLOBAL_CI_CACHES)
-        # Will only keep those elements with infinite ranges
-        cleaned_cache_to_save[key] = GPUCompiler.CodeCache(GPUCompiler.GLOBAL_CI_CACHES[key])
-    end
-    global MY_CACHE #technically don't need the global
-    #empty insert
-    empty!(LOCAL_CACHE)
-    merge!(LOCAL_CACHE, cleaned_cache_to_save)
-end

From 44c5a7e7840028fe0f9f9a89ccc8eb2e51d884ab Mon Sep 17 00:00:00 2001
From: Collin R Warner <collinw@amdci2.julia.csail.mit.edu>
Date: Sun, 16 Apr 2023 16:56:36 -0400
Subject: [PATCH 08/14] switch from macros

---
 src/precompilation_cache.jl | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/src/precompilation_cache.jl b/src/precompilation_cache.jl
index bdfcf4dc..3de68d6b 100644
--- a/src/precompilation_cache.jl
+++ b/src/precompilation_cache.jl
@@ -2,6 +2,7 @@ const CACHE_NAME = gensym(:CACHE) # is now a const symbol (not a variable)
 is_precompiling() = ccall(:jl_generating_output, Cint, ()) != 0
 
 export @declare_cache, @snapshot_cache, @reinit_cache, @get_cache
+export reinit_cache, snapshot_cache
 
 macro declare_cache()
     var = esc(CACHE_NAME) #this will esc variable from our const symbol
@@ -34,6 +35,10 @@ macro get_cache()
     end
 end
 
+function declare_cache()
+    return IdDict()
+end
+
 """
 Given a function and param types caches the function to the global cache
 """

From 09d05dfd4d4dddb1bde3529818c5f301ea9214d6 Mon Sep 17 00:00:00 2001
From: Collin R Warner <collinw@amdci2.julia.csail.mit.edu>
Date: Wed, 19 Apr 2023 17:09:54 -0400
Subject: [PATCH 09/14] change api

---
 src/precompilation_cache.jl | 116 ++++++++++++++++++++++++++++--------
 test/EnzymeTest.jl          |  19 ++++++
 2 files changed, 109 insertions(+), 26 deletions(-)
 create mode 100644 test/EnzymeTest.jl

diff --git a/src/precompilation_cache.jl b/src/precompilation_cache.jl
index 3de68d6b..2268853f 100644
--- a/src/precompilation_cache.jl
+++ b/src/precompilation_cache.jl
@@ -1,43 +1,107 @@
 const CACHE_NAME = gensym(:CACHE) # is now a const symbol (not a variable)
 is_precompiling() = ccall(:jl_generating_output, Cint, ()) != 0
 
-export @declare_cache, @snapshot_cache, @reinit_cache, @get_cache
-export reinit_cache, snapshot_cache
+export ci_cache_snapshot, ci_cache_delta, ci_cache_insert
 
-macro declare_cache()
-    var = esc(CACHE_NAME) #this will esc variable from our const symbol
-    quote
-        #const $esc(CACHE_NAME) function esc is executed when macro is executed, not when code is defined
-        # dollar sign means will have the value of esc cachename here
-        const $var = $IdDict()
+function ci_cache_snapshot()
+    cleaned_cache_to_save = IdDict()
+    for key in keys(GPUCompiler.GLOBAL_CI_CACHES)
+        # Will only keep those elements with infinite ranges
+        cleaned_cache_to_save[key] = GPUCompiler.CodeCache(GPUCompiler.GLOBAL_CI_CACHES[key])
     end
+    println("cleaned cache to save")
+    @show cleaned_cache_to_save
+    return cleaned_cache_to_save
 end
 
-macro snapshot_cache()
-    var = esc(CACHE_NAME)
-    quote
-        $snapshot_cache($var)
-    end
-end
+function ci_cache_delta(previous_snapshot)
+    current_snapshot = ci_cache_snapshot()
+    println("current snapshot")
+    @show current_snapshot
+    delta_snapshot = IdDict{Tuple{DataType, Core.Compiler.InferenceParams, Core.Compiler.OptimizationParams}, GPUCompiler.CodeCache}()
+    for (cachekey, cache) in current_snapshot
+        if cachekey in keys(previous_snapshot)
+            for (mi, civ) in cache
+                if mi in keys(previous_snapshot[cachekey])
+                    for ci in civ
+                        if !(ci in previous_snapshot[cachekey][mi])
+                            if !(cachekey in delta_snapshot)
+                                delta_snapshot[cachekey] = GPUCompiler.CodeCache()
+                                delta_snapshot[cachekey][mi] = Vector{CodeInstance}()
+                            elseif !(mi in delta_snapshot[cachekey])
+                                delta_snapshot[cachekey][mi] = Vector{CodeInstance}()
+                            end
 
-macro reinit_cache()
-    var = esc(CACHE_NAME)
-    quote
-        # will need to keep track of this is CUDA so that GPUCompiler caches are not overfilled
-        $reinit_cache($var)
+                            append!(delta_snapshot[cachekey][mi], ci)
+                        end
+                    end
+                else
+                    # this whole cache is not present in the previous snapshot, can add all
+                    if !(cachekey in delta_snapshot)
+                        delta_snapshot[cachekey] = GPUCompiler.CodeCache()
+                    end
+                    delta_snapshot[cachekey][mi] = civ
+                end
+            end
+        else
+            delta_snapshot[cachekey] = current_snapshot[cachekey]
+        end
     end
+    println("delta snapshot")
+    @show delta_snapshot
+    return delta_snapshot
 end
 
-macro get_cache()
-    var = esc(CACHE_NAME)
-    quote
-        $var
+function ci_cache_insert(caches)
+    empty!(GPUCompiler.GLOBAL_CI_CACHES)
+    for (key, cache) in caches
+        GPUCompiler.GLOBAL_CI_CACHES[key] = GPUCompiler.CodeCache(cache)
     end
 end
 
-function declare_cache()
-    return IdDict()
-end
+#=function ci_cache_insert(cache)
+    if !is_precompiling()
+        # need to merge caches at the code instance level
+        for key in keys(cache)
+            if haskey(GPUCompiler.GLOBAL_CI_CACHES, key)
+                global_cache = GPUCompiler.GLOBAL_CI_CACHES[key]
+                local_cache = cache[key]
+                for (mi, civ) in (local_cache.dict)
+                    # this should be one since there is only one range that is infinite
+                    @assert length(civ) == 1
+                    # add all code instances to global cache
+                    # could move truncating code to set index
+                    ci = civ[1]
+                    if haskey(global_cache.dict, mi)
+                        gciv = global_cache.dict[mi]
+                        # truncation cod3
+                        # sort by min world age, then make sure no age ranges overlap // this part is uneeded
+                        sort(gciv, by=x->x.min_world)
+                        if ci.min_world > gciv[length(gciv)].min_world
+                            println("invalidating mi [$mi] in world age [$(ci.min_world-1)]")
+                            println("adding ci [$ci]")
+                            invalidate_code_cache(global_cache, mi, ci.min_world - 1)
+                            Core.Compiler.setindex!(global_cache, ci, mi)
+                        else
+                            println("Should not get here?")
+                            @assert false
+                        end
+                    else
+                        println("adding method instance [$mi] code instance [$ci]")
+                        # occurs if we kill everything in the parent and then need to store in child
+                        Core.Compiler.setindex!(global_cache, ci, mi)
+                    end
+                end
+            else
+                # no conflict at cache level
+                println("no conflictt adding cache $(cache[key])")
+                GPUCompiler.GLOBAL_CI_CACHES[key] = cache[key]
+            end
+        end
+        println("global cache post insert")
+        @show GPUCompiler.GLOBAL_CI_CACHES
+    end
+end=#
 
 """
 Given a function and param types caches the function to the global cache
diff --git a/test/EnzymeTest.jl b/test/EnzymeTest.jl
new file mode 100644
index 00000000..05723bc8
--- /dev/null
+++ b/test/EnzymeTest.jl
@@ -0,0 +1,19 @@
+module EnzymeTest
+using GPUCompiler
+using Enzyme
+
+f1(x) = x*x
+autodiff_wrapper(f) = first(autodiff(Reverse, f, Active(1.0)))
+
+println("precompilation!")
+
+
+
+const cache = let
+    cache_snapshot = GPUCompiler.ci_cache_snapshot()
+    autodiff_wrapper(f1)
+    GPUCompiler.ci_cache_delta(cache_snapshot)
+end
+
+__init__() = GPUCompiler.ci_cache_insert(cache)
+end # module EnzymeTest

From c0d25a3782ebef81dd693c5453da449be9fad6eb Mon Sep 17 00:00:00 2001
From: Collin R Warner <collinw@amdci2.julia.csail.mit.edu>
Date: Wed, 19 Apr 2023 18:44:51 -0400
Subject: [PATCH 10/14] debugging

---
 src/precompilation_cache.jl | 16 +++++-----------
 1 file changed, 5 insertions(+), 11 deletions(-)

diff --git a/src/precompilation_cache.jl b/src/precompilation_cache.jl
index 2268853f..187b6606 100644
--- a/src/precompilation_cache.jl
+++ b/src/precompilation_cache.jl
@@ -1,7 +1,7 @@
 const CACHE_NAME = gensym(:CACHE) # is now a const symbol (not a variable)
 is_precompiling() = ccall(:jl_generating_output, Cint, ()) != 0
 
-export ci_cache_snapshot, ci_cache_delta, ci_cache_insert
+export ci_cache_snapshot, ci_cache_delta, ci_cache_insert, precompile_gpucompiler
 
 function ci_cache_snapshot()
     cleaned_cache_to_save = IdDict()
@@ -9,15 +9,11 @@ function ci_cache_snapshot()
         # Will only keep those elements with infinite ranges
         cleaned_cache_to_save[key] = GPUCompiler.CodeCache(GPUCompiler.GLOBAL_CI_CACHES[key])
     end
-    println("cleaned cache to save")
-    @show cleaned_cache_to_save
     return cleaned_cache_to_save
 end
 
 function ci_cache_delta(previous_snapshot)
     current_snapshot = ci_cache_snapshot()
-    println("current snapshot")
-    @show current_snapshot
     delta_snapshot = IdDict{Tuple{DataType, Core.Compiler.InferenceParams, Core.Compiler.OptimizationParams}, GPUCompiler.CodeCache}()
     for (cachekey, cache) in current_snapshot
         if cachekey in keys(previous_snapshot)
@@ -47,19 +43,17 @@ function ci_cache_delta(previous_snapshot)
             delta_snapshot[cachekey] = current_snapshot[cachekey]
         end
     end
-    println("delta snapshot")
-    @show delta_snapshot
     return delta_snapshot
 end
 
-function ci_cache_insert(caches)
+#=function ci_cache_insert(caches)
     empty!(GPUCompiler.GLOBAL_CI_CACHES)
     for (key, cache) in caches
         GPUCompiler.GLOBAL_CI_CACHES[key] = GPUCompiler.CodeCache(cache)
     end
-end
+end=#
 
-#=function ci_cache_insert(cache)
+function ci_cache_insert(cache)
     if !is_precompiling()
         # need to merge caches at the code instance level
         for key in keys(cache)
@@ -101,7 +95,7 @@ end
         println("global cache post insert")
         @show GPUCompiler.GLOBAL_CI_CACHES
     end
-end=#
+end
 
 """
 Given a function and param types caches the function to the global cache

From 845be17550beac14d5588e1068d186747f4d4f64 Mon Sep 17 00:00:00 2001
From: Collin R Warner <collinw@amdci2.julia.csail.mit.edu>
Date: Fri, 21 Apr 2023 22:08:46 -0400
Subject: [PATCH 11/14] modifying to get orig

---
 examples/Example/Manifest.toml      | 194 ----------------------------
 examples/Example/Project.toml       |   8 --
 examples/Example/src/Example.jl     |  15 ---
 examples/SimpleGPU/Manifest.toml    | 188 ---------------------------
 examples/SimpleGPU/Project.toml     |   7 -
 examples/SimpleGPU/src/SimpleGPU.jl |  28 ----
 src/precompilation_cache.jl         |  54 +++++---
 test/EnzymeTest.jl                  |  19 ---
 test/Project.toml                   |   1 +
 9 files changed, 36 insertions(+), 478 deletions(-)
 delete mode 100644 examples/Example/Manifest.toml
 delete mode 100644 examples/Example/Project.toml
 delete mode 100644 examples/Example/src/Example.jl
 delete mode 100644 examples/SimpleGPU/Manifest.toml
 delete mode 100644 examples/SimpleGPU/Project.toml
 delete mode 100644 examples/SimpleGPU/src/SimpleGPU.jl
 delete mode 100644 test/EnzymeTest.jl

diff --git a/examples/Example/Manifest.toml b/examples/Example/Manifest.toml
deleted file mode 100644
index a5b1f1c4..00000000
--- a/examples/Example/Manifest.toml
+++ /dev/null
@@ -1,194 +0,0 @@
-# This file is machine-generated - editing it directly is not advised
-
-julia_version = "1.10.0-DEV"
-manifest_format = "2.0"
-project_hash = "6afd6f1a57af520013070870c6f183d98c839ff4"
-
-[[deps.ArgTools]]
-uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f"
-version = "1.1.1"
-
-[[deps.Artifacts]]
-uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33"
-
-[[deps.Base64]]
-uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f"
-
-[[deps.CEnum]]
-git-tree-sha1 = "eb4cb44a499229b3b8426dcfb5dd85333951ff90"
-uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82"
-version = "0.4.2"
-
-[[deps.Dates]]
-deps = ["Printf"]
-uuid = "ade2ca70-3891-5945-98fb-dc099432e06a"
-
-[[deps.Downloads]]
-deps = ["ArgTools", "FileWatching", "LibCURL", "NetworkOptions"]
-uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6"
-version = "1.6.0"
-
-[[deps.ExprTools]]
-git-tree-sha1 = "c1d06d129da9f55715c6c212866f5b1bddc5fa00"
-uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04"
-version = "0.1.9"
-
-[[deps.FileWatching]]
-uuid = "7b1f6079-737a-58dc-b8bc-7a2ca5c1b5ee"
-
-[[deps.GPUCompiler]]
-deps = ["ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "Scratch", "TimerOutputs", "UUIDs"]
-path = "/home/collinw/.julia/dev/GPUCompiler"
-uuid = "61eb1bfa-7361-4325-ad38-22787b887f55"
-version = "0.19.0"
-
-[[deps.InteractiveUtils]]
-deps = ["Markdown"]
-uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
-
-[[deps.JLLWrappers]]
-deps = ["Preferences"]
-git-tree-sha1 = "abc9885a7ca2052a736a600f7fa66209f96506e1"
-uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210"
-version = "1.4.1"
-
-[[deps.LLVM]]
-deps = ["CEnum", "LLVMExtra_jll", "Libdl", "Printf", "Unicode"]
-git-tree-sha1 = "a8960cae30b42b66dd41808beb76490519f6f9e2"
-uuid = "929cbde3-209d-540e-8aea-75f648917ca0"
-version = "5.0.0"
-
-[[deps.LLVMExtra_jll]]
-deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "TOML"]
-git-tree-sha1 = "09b7505cc0b1cee87e5d4a26eea61d2e1b0dcd35"
-uuid = "dad2f222-ce93-54a1-a47d-0025e8a3acab"
-version = "0.0.21+0"
-
-[[deps.LazyArtifacts]]
-deps = ["Artifacts", "Pkg"]
-uuid = "4af54fe1-eca0-43a8-85a7-787d91b784e3"
-
-[[deps.LibCURL]]
-deps = ["LibCURL_jll", "MozillaCACerts_jll"]
-uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21"
-version = "0.6.3"
-
-[[deps.LibCURL_jll]]
-deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"]
-uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0"
-version = "8.0.1+0"
-
-[[deps.LibGit2]]
-deps = ["Base64", "NetworkOptions", "Printf", "SHA"]
-uuid = "76f85450-5226-5b5a-8eaa-529ad045b433"
-
-[[deps.LibSSH2_jll]]
-deps = ["Artifacts", "Libdl", "MbedTLS_jll"]
-uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8"
-version = "1.10.2+0"
-
-[[deps.Libdl]]
-uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb"
-
-[[deps.Logging]]
-uuid = "56ddb016-857b-54e1-b83d-db4d58db5568"
-
-[[deps.Markdown]]
-deps = ["Base64"]
-uuid = "d6f4376e-aef5-505a-96c1-9c027394607a"
-
-[[deps.MbedTLS_jll]]
-deps = ["Artifacts", "Libdl"]
-uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1"
-version = "2.28.2+0"
-
-[[deps.MozillaCACerts_jll]]
-uuid = "14a3606d-f60d-562e-9121-12d972cd8159"
-version = "2023.1.10"
-
-[[deps.NetworkOptions]]
-uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908"
-version = "1.2.0"
-
-[[deps.Pkg]]
-deps = ["Artifacts", "Dates", "Downloads", "FileWatching", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"]
-uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
-version = "1.10.0"
-
-[[deps.Preferences]]
-deps = ["TOML"]
-git-tree-sha1 = "47e5f437cc0e7ef2ce8406ce1e7e24d44915f88d"
-uuid = "21216c6a-2e73-6563-6e65-726566657250"
-version = "1.3.0"
-
-[[deps.Printf]]
-deps = ["Unicode"]
-uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7"
-
-[[deps.REPL]]
-deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"]
-uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb"
-
-[[deps.Random]]
-deps = ["SHA", "Serialization"]
-uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
-
-[[deps.SHA]]
-uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce"
-version = "0.7.0"
-
-[[deps.Scratch]]
-deps = ["Dates"]
-git-tree-sha1 = "30449ee12237627992a99d5e30ae63e4d78cd24a"
-uuid = "6c6a2e73-6563-6170-7368-637461726353"
-version = "1.2.0"
-
-[[deps.Serialization]]
-uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
-
-[[deps.SimpleGPU]]
-deps = ["GPUCompiler"]
-path = "../SimpleGPU"
-uuid = "0f92ac95-628b-4f27-9a96-2faf96da70df"
-version = "0.1.0"
-
-[[deps.Sockets]]
-uuid = "6462fe0b-24de-5631-8697-dd941f90decc"
-
-[[deps.TOML]]
-deps = ["Dates"]
-uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76"
-version = "1.0.3"
-
-[[deps.Tar]]
-deps = ["ArgTools", "SHA"]
-uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e"
-version = "1.10.0"
-
-[[deps.TimerOutputs]]
-deps = ["ExprTools", "Printf"]
-git-tree-sha1 = "f2fd3f288dfc6f507b0c3a2eb3bac009251e548b"
-uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f"
-version = "0.5.22"
-
-[[deps.UUIDs]]
-deps = ["Random", "SHA"]
-uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
-
-[[deps.Unicode]]
-uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"
-
-[[deps.Zlib_jll]]
-deps = ["Libdl"]
-uuid = "83775a58-1f1d-513f-b197-d71354ab007a"
-version = "1.2.13+0"
-
-[[deps.nghttp2_jll]]
-deps = ["Artifacts", "Libdl"]
-uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d"
-version = "1.52.0+0"
-
-[[deps.p7zip_jll]]
-deps = ["Artifacts", "Libdl"]
-uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0"
-version = "17.4.0+0"
diff --git a/examples/Example/Project.toml b/examples/Example/Project.toml
deleted file mode 100644
index 22ffa2ea..00000000
--- a/examples/Example/Project.toml
+++ /dev/null
@@ -1,8 +0,0 @@
-name = "Example"
-uuid = "3a86cd2f-4474-4e46-89c8-15adf66897e9"
-authors = ["collinw "]
-version = "0.1.0"
-
-[deps]
-GPUCompiler = "61eb1bfa-7361-4325-ad38-22787b887f55"
-SimpleGPU = "0f92ac95-628b-4f27-9a96-2faf96da70df"
diff --git a/examples/Example/src/Example.jl b/examples/Example/src/Example.jl
deleted file mode 100644
index 776793ec..00000000
--- a/examples/Example/src/Example.jl
+++ /dev/null
@@ -1,15 +0,0 @@
-module Example
-using GPUCompiler
-using SimpleGPU
-SimpleGPU.@declare_cache()
-
-f(x) = 1
-SimpleGPU.precompile_simple(f, (Int, ))
-
-function __init__()
-    SimpleGPU.@reinit_cache()
-end
-
-SimpleGPU.@snapshot_cache()
-
-end # module Example
diff --git a/examples/SimpleGPU/Manifest.toml b/examples/SimpleGPU/Manifest.toml
deleted file mode 100644
index 564425c9..00000000
--- a/examples/SimpleGPU/Manifest.toml
+++ /dev/null
@@ -1,188 +0,0 @@
-# This file is machine-generated - editing it directly is not advised
-
-julia_version = "1.10.0-DEV"
-manifest_format = "2.0"
-project_hash = "7e4bd5a8a18c1099e483003348b1afb869b6d01e"
-
-[[deps.ArgTools]]
-uuid = "0dad84c5-d112-42e6-8d28-ef12dabb789f"
-version = "1.1.1"
-
-[[deps.Artifacts]]
-uuid = "56f22d72-fd6d-98f1-02f0-08ddc0907c33"
-
-[[deps.Base64]]
-uuid = "2a0f44e3-6c83-55bd-87e4-b1978d98bd5f"
-
-[[deps.CEnum]]
-git-tree-sha1 = "eb4cb44a499229b3b8426dcfb5dd85333951ff90"
-uuid = "fa961155-64e5-5f13-b03f-caf6b980ea82"
-version = "0.4.2"
-
-[[deps.Dates]]
-deps = ["Printf"]
-uuid = "ade2ca70-3891-5945-98fb-dc099432e06a"
-
-[[deps.Downloads]]
-deps = ["ArgTools", "FileWatching", "LibCURL", "NetworkOptions"]
-uuid = "f43a241f-c20a-4ad4-852c-f6b1247861c6"
-version = "1.6.0"
-
-[[deps.ExprTools]]
-git-tree-sha1 = "c1d06d129da9f55715c6c212866f5b1bddc5fa00"
-uuid = "e2ba6199-217a-4e67-a87a-7c52f15ade04"
-version = "0.1.9"
-
-[[deps.FileWatching]]
-uuid = "7b1f6079-737a-58dc-b8bc-7a2ca5c1b5ee"
-
-[[deps.GPUCompiler]]
-deps = ["ExprTools", "InteractiveUtils", "LLVM", "Libdl", "Logging", "Scratch", "TimerOutputs", "UUIDs"]
-path = "/home/collinw/.julia/dev/GPUCompiler"
-uuid = "61eb1bfa-7361-4325-ad38-22787b887f55"
-version = "0.19.0"
-
-[[deps.InteractiveUtils]]
-deps = ["Markdown"]
-uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
-
-[[deps.JLLWrappers]]
-deps = ["Preferences"]
-git-tree-sha1 = "abc9885a7ca2052a736a600f7fa66209f96506e1"
-uuid = "692b3bcd-3c85-4b1f-b108-f13ce0eb3210"
-version = "1.4.1"
-
-[[deps.LLVM]]
-deps = ["CEnum", "LLVMExtra_jll", "Libdl", "Printf", "Unicode"]
-git-tree-sha1 = "a8960cae30b42b66dd41808beb76490519f6f9e2"
-uuid = "929cbde3-209d-540e-8aea-75f648917ca0"
-version = "5.0.0"
-
-[[deps.LLVMExtra_jll]]
-deps = ["Artifacts", "JLLWrappers", "LazyArtifacts", "Libdl", "TOML"]
-git-tree-sha1 = "09b7505cc0b1cee87e5d4a26eea61d2e1b0dcd35"
-uuid = "dad2f222-ce93-54a1-a47d-0025e8a3acab"
-version = "0.0.21+0"
-
-[[deps.LazyArtifacts]]
-deps = ["Artifacts", "Pkg"]
-uuid = "4af54fe1-eca0-43a8-85a7-787d91b784e3"
-
-[[deps.LibCURL]]
-deps = ["LibCURL_jll", "MozillaCACerts_jll"]
-uuid = "b27032c2-a3e7-50c8-80cd-2d36dbcbfd21"
-version = "0.6.3"
-
-[[deps.LibCURL_jll]]
-deps = ["Artifacts", "LibSSH2_jll", "Libdl", "MbedTLS_jll", "Zlib_jll", "nghttp2_jll"]
-uuid = "deac9b47-8bc7-5906-a0fe-35ac56dc84c0"
-version = "8.0.1+0"
-
-[[deps.LibGit2]]
-deps = ["Base64", "NetworkOptions", "Printf", "SHA"]
-uuid = "76f85450-5226-5b5a-8eaa-529ad045b433"
-
-[[deps.LibSSH2_jll]]
-deps = ["Artifacts", "Libdl", "MbedTLS_jll"]
-uuid = "29816b5a-b9ab-546f-933c-edad1886dfa8"
-version = "1.10.2+0"
-
-[[deps.Libdl]]
-uuid = "8f399da3-3557-5675-b5ff-fb832c97cbdb"
-
-[[deps.Logging]]
-uuid = "56ddb016-857b-54e1-b83d-db4d58db5568"
-
-[[deps.Markdown]]
-deps = ["Base64"]
-uuid = "d6f4376e-aef5-505a-96c1-9c027394607a"
-
-[[deps.MbedTLS_jll]]
-deps = ["Artifacts", "Libdl"]
-uuid = "c8ffd9c3-330d-5841-b78e-0817d7145fa1"
-version = "2.28.2+0"
-
-[[deps.MozillaCACerts_jll]]
-uuid = "14a3606d-f60d-562e-9121-12d972cd8159"
-version = "2023.1.10"
-
-[[deps.NetworkOptions]]
-uuid = "ca575930-c2e3-43a9-ace4-1e988b2c1908"
-version = "1.2.0"
-
-[[deps.Pkg]]
-deps = ["Artifacts", "Dates", "Downloads", "FileWatching", "LibGit2", "Libdl", "Logging", "Markdown", "Printf", "REPL", "Random", "SHA", "Serialization", "TOML", "Tar", "UUIDs", "p7zip_jll"]
-uuid = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
-version = "1.10.0"
-
-[[deps.Preferences]]
-deps = ["TOML"]
-git-tree-sha1 = "47e5f437cc0e7ef2ce8406ce1e7e24d44915f88d"
-uuid = "21216c6a-2e73-6563-6e65-726566657250"
-version = "1.3.0"
-
-[[deps.Printf]]
-deps = ["Unicode"]
-uuid = "de0858da-6303-5e67-8744-51eddeeeb8d7"
-
-[[deps.REPL]]
-deps = ["InteractiveUtils", "Markdown", "Sockets", "Unicode"]
-uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb"
-
-[[deps.Random]]
-deps = ["SHA", "Serialization"]
-uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
-
-[[deps.SHA]]
-uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce"
-version = "0.7.0"
-
-[[deps.Scratch]]
-deps = ["Dates"]
-git-tree-sha1 = "30449ee12237627992a99d5e30ae63e4d78cd24a"
-uuid = "6c6a2e73-6563-6170-7368-637461726353"
-version = "1.2.0"
-
-[[deps.Serialization]]
-uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
-
-[[deps.Sockets]]
-uuid = "6462fe0b-24de-5631-8697-dd941f90decc"
-
-[[deps.TOML]]
-deps = ["Dates"]
-uuid = "fa267f1f-6049-4f14-aa54-33bafae1ed76"
-version = "1.0.3"
-
-[[deps.Tar]]
-deps = ["ArgTools", "SHA"]
-uuid = "a4e569a6-e804-4fa4-b0f3-eef7a1d5b13e"
-version = "1.10.0"
-
-[[deps.TimerOutputs]]
-deps = ["ExprTools", "Printf"]
-git-tree-sha1 = "f2fd3f288dfc6f507b0c3a2eb3bac009251e548b"
-uuid = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f"
-version = "0.5.22"
-
-[[deps.UUIDs]]
-deps = ["Random", "SHA"]
-uuid = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
-
-[[deps.Unicode]]
-uuid = "4ec0a83e-493e-50e2-b9ac-8f72acf5a8f5"
-
-[[deps.Zlib_jll]]
-deps = ["Libdl"]
-uuid = "83775a58-1f1d-513f-b197-d71354ab007a"
-version = "1.2.13+0"
-
-[[deps.nghttp2_jll]]
-deps = ["Artifacts", "Libdl"]
-uuid = "8e850ede-7688-5339-a07c-302acd2aaf8d"
-version = "1.52.0+0"
-
-[[deps.p7zip_jll]]
-deps = ["Artifacts", "Libdl"]
-uuid = "3f19e933-33d8-53b3-aaab-bd5110c3b7a0"
-version = "17.4.0+0"
diff --git a/examples/SimpleGPU/Project.toml b/examples/SimpleGPU/Project.toml
deleted file mode 100644
index 247624c5..00000000
--- a/examples/SimpleGPU/Project.toml
+++ /dev/null
@@ -1,7 +0,0 @@
-name = "SimpleGPU"
-uuid = "0f92ac95-628b-4f27-9a96-2faf96da70df"
-authors = ["collinw "]
-version = "0.1.0"
-
-[deps]
-GPUCompiler = "61eb1bfa-7361-4325-ad38-22787b887f55"
diff --git a/examples/SimpleGPU/src/SimpleGPU.jl b/examples/SimpleGPU/src/SimpleGPU.jl
deleted file mode 100644
index 894291e9..00000000
--- a/examples/SimpleGPU/src/SimpleGPU.jl
+++ /dev/null
@@ -1,28 +0,0 @@
-module SimpleGPU
-using GPUCompiler
-struct NativeCompilerParams <: AbstractCompilerParams
-    entry_safepoint::Bool
-    method_table
-
-    NativeCompilerParams(entry_safepoint::Bool=false, method_table=test_method_table) =
-        new(entry_safepoint, method_table)
-end
-
-const test_method_table = nothing
-
-function native_job(@nospecialize(func), @nospecialize(types); kernel::Bool=false,
-                    entry_abi=:specfunc, entry_safepoint::Bool=false, always_inline=false,
-                    method_table=test_method_table, kwargs...)
-    source = methodinstance(typeof(func), Base.to_tuple_type(types))
-    target = NativeCompilerTarget()
-    params = NativeCompilerParams(entry_safepoint, method_table)
-    config = CompilerConfig(target, params; kernel, entry_abi, always_inline)
-    CompilerJob(source, config), kwargs
-end
-
-function precompile_simple(f, t)
-    job, _ = native_job(f, t)
-    GPUCompiler.precompile_gpucompiler(job)
-end
-
-end # module SimpleGPU
diff --git a/src/precompilation_cache.jl b/src/precompilation_cache.jl
index 187b6606..d9ba0e84 100644
--- a/src/precompilation_cache.jl
+++ b/src/precompilation_cache.jl
@@ -15,28 +15,28 @@ end
 function ci_cache_delta(previous_snapshot)
     current_snapshot = ci_cache_snapshot()
     delta_snapshot = IdDict{Tuple{DataType, Core.Compiler.InferenceParams, Core.Compiler.OptimizationParams}, GPUCompiler.CodeCache}()
-    for (cachekey, cache) in current_snapshot
+    for (cachekey, codecache) in current_snapshot
         if cachekey in keys(previous_snapshot)
-            for (mi, civ) in cache
-                if mi in keys(previous_snapshot[cachekey])
+            for (mi, civ) in codecache.dict
+                if mi in keys(previous_snapshot[cachekey].dict)
                     for ci in civ
-                        if !(ci in previous_snapshot[cachekey][mi])
-                            if !(cachekey in delta_snapshot)
+                        if !(ci in previous_snapshot[cachekey].dict[mi])
+                            if !(cachekey in keys(delta_snapshot))
                                 delta_snapshot[cachekey] = GPUCompiler.CodeCache()
-                                delta_snapshot[cachekey][mi] = Vector{CodeInstance}()
-                            elseif !(mi in delta_snapshot[cachekey])
-                                delta_snapshot[cachekey][mi] = Vector{CodeInstance}()
+                                delta_snapshot[cachekey].dict[mi] = Vector{CodeInstance}()
+                            elseif !(mi in keys(delta_snapshot[cachekey].dict))
+                                delta_snapshot[cachekey].dict[mi] = Vector{CodeInstance}()
                             end
 
-                            append!(delta_snapshot[cachekey][mi], ci)
+                            push!(delta_snapshot[cachekey].dict[mi], ci)
                         end
                     end
                 else
                     # this whole cache is not present in the previous snapshot, can add all
-                    if !(cachekey in delta_snapshot)
+                    if !(cachekey in keys(delta_snapshot))
                         delta_snapshot[cachekey] = GPUCompiler.CodeCache()
                     end
-                    delta_snapshot[cachekey][mi] = civ
+                    delta_snapshot[cachekey].dict[mi] = civ
                 end
             end
         else
@@ -55,11 +55,33 @@ end=#
 
 function ci_cache_insert(cache)
     if !is_precompiling()
+        #first clean the cache
+        cleaned_cache = IdDict()
+        for (key, c) in cache
+            usedCache = false
+            newCodeCache = GPUCompiler.CodeCache()
+            for (mi, civ) in c.dict
+                new_civ = Vector()
+                for ci in civ
+                    if ci.min_world <= ci.max_world
+                        push!(new_civ, ci)
+                    end
+                end
+                if length(new_civ) > 0
+                    usedCache = true
+                    newCodeCache.dict[mi] = new_civ
+                end
+            end
+            if usedCache
+                cleaned_cache[key] = newCodeCache
+            end
+        end
+
         # need to merge caches at the code instance level
-        for key in keys(cache)
+        for (key, local_cache) in cleaned_cache
             if haskey(GPUCompiler.GLOBAL_CI_CACHES, key)
                 global_cache = GPUCompiler.GLOBAL_CI_CACHES[key]
-                local_cache = cache[key]
+                #local_cache = cache[key]
                 for (mi, civ) in (local_cache.dict)
                     # this should be one since there is only one range that is infinite
                     @assert length(civ) == 1
@@ -72,8 +94,6 @@ function ci_cache_insert(cache)
                         # sort by min world age, then make sure no age ranges overlap // this part is uneeded
                         sort(gciv, by=x->x.min_world)
                         if ci.min_world > gciv[length(gciv)].min_world
-                            println("invalidating mi [$mi] in world age [$(ci.min_world-1)]")
-                            println("adding ci [$ci]")
                             invalidate_code_cache(global_cache, mi, ci.min_world - 1)
                             Core.Compiler.setindex!(global_cache, ci, mi)
                         else
@@ -81,19 +101,15 @@ function ci_cache_insert(cache)
                             @assert false
                         end
                     else
-                        println("adding method instance [$mi] code instance [$ci]")
                         # occurs if we kill everything in the parent and then need to store in child
                         Core.Compiler.setindex!(global_cache, ci, mi)
                     end
                 end
             else
                 # no conflict at cache level
-                println("no conflictt adding cache $(cache[key])")
                 GPUCompiler.GLOBAL_CI_CACHES[key] = cache[key]
             end
         end
-        println("global cache post insert")
-        @show GPUCompiler.GLOBAL_CI_CACHES
     end
 end
 
diff --git a/test/EnzymeTest.jl b/test/EnzymeTest.jl
deleted file mode 100644
index 05723bc8..00000000
--- a/test/EnzymeTest.jl
+++ /dev/null
@@ -1,19 +0,0 @@
-module EnzymeTest
-using GPUCompiler
-using Enzyme
-
-f1(x) = x*x
-autodiff_wrapper(f) = first(autodiff(Reverse, f, Active(1.0)))
-
-println("precompilation!")
-
-
-
-const cache = let
-    cache_snapshot = GPUCompiler.ci_cache_snapshot()
-    autodiff_wrapper(f1)
-    GPUCompiler.ci_cache_delta(cache_snapshot)
-end
-
-__init__() = GPUCompiler.ci_cache_insert(cache)
-end # module EnzymeTest
diff --git a/test/Project.toml b/test/Project.toml
index e602d235..002ebb1c 100644
--- a/test/Project.toml
+++ b/test/Project.toml
@@ -1,5 +1,6 @@
 [deps]
 Cthulhu = "f68482b8-f384-11e8-15f7-abe071a5a75f"
+GPUCompiler = "61eb1bfa-7361-4325-ad38-22787b887f55"
 InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240"
 LLVM = "929cbde3-209d-540e-8aea-75f648917ca0"
 Metal_LLVM_Tools_jll = "0418c028-ff8c-56b8-a53e-0f9676ed36fc"

From a6bd41aaa497753b70d2b33cee11cd59ce3e6b64 Mon Sep 17 00:00:00 2001
From: Collin R Warner <collinw@amdci2.julia.csail.mit.edu>
Date: Sat, 22 Apr 2023 20:14:32 -0400
Subject: [PATCH 12/14] Add persistent Cache example

---
 test/ExamplePersistentCache/GPUKernel.jl   | 26 ++++++++++++++++++++++
 test/ExamplePersistentCache/README.txt     | 20 +++++++++++++++++
 test/ExamplePersistentCache/TestRuntime.jl |  8 +++++++
 3 files changed, 54 insertions(+)
 create mode 100644 test/ExamplePersistentCache/GPUKernel.jl
 create mode 100644 test/ExamplePersistentCache/README.txt
 create mode 100644 test/ExamplePersistentCache/TestRuntime.jl

diff --git a/test/ExamplePersistentCache/GPUKernel.jl b/test/ExamplePersistentCache/GPUKernel.jl
new file mode 100644
index 00000000..628e50f1
--- /dev/null
+++ b/test/ExamplePersistentCache/GPUKernel.jl
@@ -0,0 +1,26 @@
+module GPUKernel
+using GPUCompiler
+using TestRuntime
+snapshot = GPUCompiler.ci_cache_snapshot()
+
+struct TestCompilerParams <: AbstractCompilerParams end
+GPUCompiler.runtime_module(::CompilerJob{<:Any,TestCompilerParams}) = TestRuntime
+
+kernel() = nothing
+function main()
+    source = methodinstance(typeof(kernel), Tuple{})
+    target = NativeCompilerTarget()
+    params = TestCompilerParams()
+    config = CompilerConfig(target, params)
+    job = CompilerJob(source, config)
+
+    println(GPUCompiler.compile(:asm, job)[1])
+end
+
+main()
+const persistent_cache = GPUCompiler.ci_cache_delta(snapshot)
+
+function __init__()
+    GPUCompiler.ci_cache_insert(persistent_cache)
+end
+end # module GPUKernel
diff --git a/test/ExamplePersistentCache/README.txt b/test/ExamplePersistentCache/README.txt
new file mode 100644
index 00000000..80462fd5
--- /dev/null
+++ b/test/ExamplePersistentCache/README.txt
@@ -0,0 +1,20 @@
+Persistent cache API:
+
+GPUCompiler.ci_cache_snapshot() -> cache: returns a snapshot of GLOBAL_CI_CACHES, used
+as the baseline for determining what will be persistently cached.
+
+GPUCompiler.ci_cache_delta(snapshot::cache) -> cache: takes a snapshot and returns
+a cache containing the difference (current GLOBAL_CI_CACHES - snapshot).
+
+GPUCompiler.ci_cache_insert(snapshot::cache): inserts the given cache (typically a delta) into GLOBAL_CI_CACHES.
+
+
+Usage:
+snapshot = GPUCompiler.ci_cache_snapshot()
+... precompile work ...
+const persistent_cache = GPUCompiler.ci_cache_delta(snapshot)
+
+function __init__()
+    GPUCompiler.ci_cache_insert(persistent_cache)
+    ... rest of init logic ...
+end
diff --git a/test/ExamplePersistentCache/TestRuntime.jl b/test/ExamplePersistentCache/TestRuntime.jl
new file mode 100644
index 00000000..1d29e4ba
--- /dev/null
+++ b/test/ExamplePersistentCache/TestRuntime.jl
@@ -0,0 +1,8 @@
+module TestRuntime  # stub runtime module; GPUKernel.jl returns it from its GPUCompiler.runtime_module method
+    signal_exception() = return  # no-op stub
+    malloc(sz) = C_NULL  # stub: always returns a null pointer
+    report_oom(sz) = return  # no-op stub
+    report_exception(ex) = return  # no-op stub
+    report_exception_name(ex) = return  # no-op stub
+    report_exception_frame(idx, func, file, line) = return  # no-op stub
+end # module TestRuntime

From cc34d2117bb85a9f3ce34ac8906514ecb52daa5b Mon Sep 17 00:00:00 2001
From: Collin R Warner <collinw@amdci2.julia.csail.mit.edu>
Date: Sun, 23 Apr 2023 15:26:31 -0400
Subject: [PATCH 13/14] Remove dead code

---
 src/precompilation_cache.jl | 66 -------------------------------------
 1 file changed, 66 deletions(-)

diff --git a/src/precompilation_cache.jl b/src/precompilation_cache.jl
index d9ba0e84..1bc5c19a 100644
--- a/src/precompilation_cache.jl
+++ b/src/precompilation_cache.jl
@@ -46,13 +46,6 @@ function ci_cache_delta(previous_snapshot)
     return delta_snapshot
 end
 
-#=function ci_cache_insert(caches)
-    empty!(GPUCompiler.GLOBAL_CI_CACHES)
-    for (key, cache) in caches
-        GPUCompiler.GLOBAL_CI_CACHES[key] = GPUCompiler.CodeCache(cache)
-    end
-end=#
-
 function ci_cache_insert(cache)
     if !is_precompiling()
         #first clean the cache
@@ -125,62 +118,3 @@ function precompile_gpucompiler(job)
         GPUCompiler.ci_cache_populate(interp, cache, mt, job.source, job.world, typemax(Cint))
     end
 end
-
-"""
-Reloads Global Cache from global variable which stores the previous
-cached results
-"""
-function reinit_cache(LOCAL_CACHE)
-    if !is_precompiling()
-        # need to merge caches at the code instance level
-        for key in keys(LOCAL_CACHE)
-            if haskey(GPUCompiler.GLOBAL_CI_CACHES, key)
-                global_cache = GPUCompiler.GLOBAL_CI_CACHES[key]
-                local_cache = LOCAL_CACHE[key]
-                for (mi, civ) in (local_cache.dict)
-                    # this should be one since there is only one range that is infinite
-                    @assert length(civ) == 1
-                    # add all code instances to global cache
-                    # could move truncating code to set index
-                    ci = civ[1]
-                    if haskey(global_cache.dict, mi)
-                        gciv = global_cache.dict[mi]
-                        # truncation cod3
-                        # sort by min world age, then make sure no age ranges overlap // this part is uneeded
-                        sort(gciv, by=x->x.min_world)
-                        if ci.min_world > gciv[length(gciv)].min_world
-                            invalidate_code_cache(global_cache, mi, ci.min_world - 1)
-                            Core.Compiler.setindex!(global_cache, ci, mi)
-                        else
-                            println("Should not get here?")
-                            @assert false
-                        end
-                    else
-                        # occurs if we kill everything in the parent and then need to store in child
-                        Core.Compiler.setindex!(global_cache, ci, mi)
-                    end
-                end
-            else
-                # no conflict at cache level
-                GPUCompiler.GLOBAL_CI_CACHES[key] = LOCAL_CACHE[key]
-            end
-        end
-    end
-end
-
-"""
-Takes a snapshot of the current status of the cache
-
-The cache returned is a deep copy with finite world age endings removed
-"""
-function snapshot_cache(LOCAL_CACHE)
-    cleaned_cache_to_save = IdDict()
-    for key in keys(GPUCompiler.GLOBAL_CI_CACHES)
-        # Will only keep those elements with infinite ranges
-        cleaned_cache_to_save[key] = GPUCompiler.CodeCache(GPUCompiler.GLOBAL_CI_CACHES[key])
-    end
-    global MY_CACHE #technically don't need the global
-    #empty insert
-    empty!(LOCAL_CACHE)
-    merge!(LOCAL_CACHE, cleaned_cache_to_save)
-end

From 195108754ed25587311484e7dda53339b5df7e74 Mon Sep 17 00:00:00 2001
From: collinwarner <collinwarner@cyclops.juliacomputing.io>
Date: Thu, 18 May 2023 18:40:31 -0400
Subject: [PATCH 14/14] add native caching

---
 src/cache.jl                | 12 +++--
 src/jlgen.jl                | 11 ++---
 src/precompilation_cache.jl | 88 +++++++++++++++++++------------------
 3 files changed, 60 insertions(+), 51 deletions(-)

diff --git a/src/cache.jl b/src/cache.jl
index fa71ab19..7a38a88e 100644
--- a/src/cache.jl
+++ b/src/cache.jl
@@ -26,7 +26,6 @@ function cached_compilation(cache::AbstractDict{UInt,V},
     key = hash(tt, key)
     key = hash(world, key)
     key = hash(cfg, key)
-
     # NOTE: no use of lock(::Function)/@lock/get! to avoid try/catch and closure overhead
     lock(cache_lock)
     obj = get(cache, key, nothing)
@@ -36,6 +35,7 @@ function cached_compilation(cache::AbstractDict{UInt,V},
     if obj === nothing || compile_hook[] !== nothing
         obj = actual_compilation(cache, key, cfg, ft, tt, compiler, linker)::V
     end
+
     return obj::V
 end
 
@@ -45,10 +45,14 @@ end
     src = methodinstance(ft, tt)
     job = CompilerJob(src, cfg)
 
+    global_cache = ci_cache(job)
     asm = nothing
-    # TODO: consider loading the assembly from an on-disk cache here
 
-    # compile
+    # read asm from persistent offline cache
+    if haskey(global_cache.asm, src)
+        asm = global_cache.asm[src]
+    end
+
     if asm === nothing
         asm = compiler(job)
     end
@@ -57,7 +61,7 @@ end
     # in which case the cache will already be populated)
     lock(cache_lock) do
         haskey(cache, key) && return cache[key]
-
+        global_cache.asm[src] = asm
         obj = linker(job, asm)
         cache[key] = obj
         obj
diff --git a/src/jlgen.jl b/src/jlgen.jl
index 74c0fd4e..44e95a0a 100644
--- a/src/jlgen.jl
+++ b/src/jlgen.jl
@@ -255,16 +255,19 @@ using Core.Compiler: CodeInstance, MethodInstance, InferenceParams, Optimization
 
 struct CodeCache
     dict::IdDict{MethodInstance,Vector{CodeInstance}}
+    asm::IdDict{MethodInstance, NamedTuple{(:image, :entry, :external_gvars), Tuple{Vector{UInt8}, String, Vector{String}}}}  # per-MethodInstance record: image (bytes), entry (string), external_gvars (strings)
 
-    CodeCache() = new(Dict{MethodInstance,Vector{CodeInstance}}())
-    CodeCache(cache::CodeCache) = new(GPUCompiler.copyAndFilter(cache.dict))
+    CodeCache() = new(Dict{MethodInstance,Vector{CodeInstance}}(), 
+    Dict{MethodInstance, NamedTuple{(:image, :entry, :external_gvars), Tuple{Vector{UInt8}, String, Vector{String}}}}())
+
+    CodeCache(cache::CodeCache) = new(GPUCompiler.copyAndFilter(cache.dict), cache.asm)  # NOTE(review): `dict` is rebuilt via copyAndFilter, but `asm` is passed through uncopied, so both caches share one asm dict -- confirm this is intended
 end
 
 function copyAndFilter(dict::IdDict)
     out= IdDict()
     for key in keys(dict)
         useKey = true
-        # why is it an array of code instances, can there be more than 1?
+
         for ci in dict[key]
             if ci.max_world < typemax(typeof(ci.max_world))
                 useKey = false
@@ -590,7 +593,6 @@ end
 
 function ci_cache_populate(interp, cache, mt, mi, min_world, max_world)
     src = Core.Compiler.typeinf_ext_toplevel(interp, mi)
-
     # inference populates the cache, so we don't need to jl_get_method_inferred
     wvc = WorldView(cache, min_world, max_world)
     @assert Core.Compiler.haskey(wvc, mi)
@@ -622,7 +624,6 @@ function ci_cache_lookup(cache, mi, min_world, max_world)
     return ci
 end
 
-
 ## interface
 
 # for platforms without @cfunction-with-closure support
diff --git a/src/precompilation_cache.jl b/src/precompilation_cache.jl
index 1bc5c19a..138d850e 100644
--- a/src/precompilation_cache.jl
+++ b/src/precompilation_cache.jl
@@ -7,25 +7,33 @@ function ci_cache_snapshot()
     cleaned_cache_to_save = IdDict()
     for key in keys(GPUCompiler.GLOBAL_CI_CACHES)
         # Will only keep those elements with infinite ranges
+        # copy constructor
         cleaned_cache_to_save[key] = GPUCompiler.CodeCache(GPUCompiler.GLOBAL_CI_CACHES[key])
     end
+
     return cleaned_cache_to_save
 end
 
 function ci_cache_delta(previous_snapshot)
     current_snapshot = ci_cache_snapshot()
     delta_snapshot = IdDict{Tuple{DataType, Core.Compiler.InferenceParams, Core.Compiler.OptimizationParams}, GPUCompiler.CodeCache}()
-    for (cachekey, codecache) in current_snapshot
+    for (cachekey, codecache) in current_snapshot # iterate through all caches
         if cachekey in keys(previous_snapshot)
-            for (mi, civ) in codecache.dict
+            for (mi, civ) in codecache.dict # iterate through all mi
                 if mi in keys(previous_snapshot[cachekey].dict)
                     for ci in civ
                         if !(ci in previous_snapshot[cachekey].dict[mi])
                             if !(cachekey in keys(delta_snapshot))
                                 delta_snapshot[cachekey] = GPUCompiler.CodeCache()
                                 delta_snapshot[cachekey].dict[mi] = Vector{CodeInstance}()
+                                if haskey(codecache.asm, mi)
+                                    delta_snapshot[cachekey].asm[mi] = codecache.asm[mi]
+                                end
                             elseif !(mi in keys(delta_snapshot[cachekey].dict))
                                 delta_snapshot[cachekey].dict[mi] = Vector{CodeInstance}()
+                                if haskey(codecache.asm, mi)
+                                    delta_snapshot[cachekey].asm[mi] = codecache.asm[mi]
+                                end
                             end
 
                             push!(delta_snapshot[cachekey].dict[mi], ci)
@@ -36,6 +44,10 @@ function ci_cache_delta(previous_snapshot)
                     if !(cachekey in keys(delta_snapshot))
                         delta_snapshot[cachekey] = GPUCompiler.CodeCache()
                     end
+                    
+                    if haskey(codecache.asm, mi)
+                        delta_snapshot[cachekey].asm[mi] = codecache.asm[mi]
+                    end
                     delta_snapshot[cachekey].dict[mi] = civ
                 end
             end
@@ -43,59 +55,34 @@ function ci_cache_delta(previous_snapshot)
             delta_snapshot[cachekey] = current_snapshot[cachekey]
         end
     end
+
     return delta_snapshot
 end
 
+function print_keys(caches)  # debug helper: print every entry of every cache's `dict` to stdout, framed by separator lines
+    println("************")
+    for (key, cache) in caches  # `key` is not used; only the per-cache dict is printed
+        for (mi, civ) in cache.dict
+            println("$mi -> $(length(civ))")  # dict key -> number of elements stored under it
+        end
+    end
+    println("************")
+end
 function ci_cache_insert(cache)
     if !is_precompiling()
-        #first clean the cache
-        cleaned_cache = IdDict()
-        for (key, c) in cache
-            usedCache = false
-            newCodeCache = GPUCompiler.CodeCache()
-            for (mi, civ) in c.dict
-                new_civ = Vector()
-                for ci in civ
-                    if ci.min_world <= ci.max_world
-                        push!(new_civ, ci)
-                    end
-                end
-                if length(new_civ) > 0
-                    usedCache = true
-                    newCodeCache.dict[mi] = new_civ
-                end
-            end
-            if usedCache
-                cleaned_cache[key] = newCodeCache
-            end
-        end
-
         # need to merge caches at the code instance level
-        for (key, local_cache) in cleaned_cache
+        for (key, local_cache) in cache
             if haskey(GPUCompiler.GLOBAL_CI_CACHES, key)
                 global_cache = GPUCompiler.GLOBAL_CI_CACHES[key]
-                #local_cache = cache[key]
                 for (mi, civ) in (local_cache.dict)
                     # this should be one since there is only one range that is infinite
                     @assert length(civ) == 1
                     # add all code instances to global cache
                     # could move truncating code to set index
-                    ci = civ[1]
-                    if haskey(global_cache.dict, mi)
-                        gciv = global_cache.dict[mi]
-                        # truncation cod3
-                        # sort by min world age, then make sure no age ranges overlap // this part is uneeded
-                        sort(gciv, by=x->x.min_world)
-                        if ci.min_world > gciv[length(gciv)].min_world
-                            invalidate_code_cache(global_cache, mi, ci.min_world - 1)
-                            Core.Compiler.setindex!(global_cache, ci, mi)
-                        else
-                            println("Should not get here?")
-                            @assert false
-                        end
-                    else
-                        # occurs if we kill everything in the parent and then need to store in child
-                        Core.Compiler.setindex!(global_cache, ci, mi)
+                    Core.Compiler.setindex!(global_cache, civ[1], mi)
+                    #@assert haskey(local_cache.asm, mi)
+                    if haskey(local_cache.asm, mi)
+                        global_cache.asm[mi] = local_cache.asm[mi]
                     end
                 end
             else
@@ -118,3 +105,20 @@ function precompile_gpucompiler(job)
         GPUCompiler.ci_cache_populate(interp, cache, mt, job.source, job.world, typemax(Cint))
     end
 end
+
+"""
+Generate a precompile file for the current state of the cache
+"""
+function generate_precompilation_file(snapshot, filename, precompilation_function)
+    method_instances = []  # keys of every cache's `dict` in the snapshot, in iteration order
+    for (cachekey, cache) in snapshot
+        for (mi, civ) in cache.dict
+            push!(method_instances, mi)
+        end
+    end
+    # one newline-separated statement per collected key: "<precompilation_function>(<specTypes parameter 1>, Core.<remaining specTypes parameters>)"
+    precompile_statements = join(["$precompilation_function($(mi.specTypes.parameters[1]), Core.$(mi.specTypes.parameters[2:length(mi.specTypes.parameters)]))" for mi in method_instances], '\n')
+    open(filename, "w") do file  # "w" mode: any existing content of `filename` is replaced
+        write(file, precompile_statements)
+    end
+end