From 7190fac9677e1ccbbed17aa88342755cb6c3eb1e Mon Sep 17 00:00:00 2001 From: Gabriel Baraldi Date: Wed, 4 Jun 2025 20:30:23 -0300 Subject: [PATCH 1/9] Make late gc lower handle insertelement of alloca use. (#58637)

This was found in DAECompiler.jl code by @serenity4. He also suggested that writing up how one might go about fixing a bug like this would be useful, so I'll give a quick writeup (this was a very simple bug, so it might not be too interesting).

The original crash looked something like

> %19 = alloca [10 x i64], align 8
> %155 = insertelement <4 x ptr> poison, ptr %19, i32 0
> Unexpected instruction

> [898844] signal 6 (-6): Aborted
> in expression starting at /home/gbaraldi/DAECompiler.jl/test/reflection.jl:28
> pthread_kill at /lib/x86_64-linux-gnu/libc.so.6 (unknown line)
> gsignal at /lib/x86_64-linux-gnu/libc.so.6 (unknown line)
> abort at /lib/x86_64-linux-gnu/libc.so.6 (unknown line)
> RecursivelyVisit, int, State&, std::map >)::::
> at /home/gbaraldi/julia4/src/llvm-late-gc-lowering.cpp:803
> operator() at /home/gbaraldi/julia4/src/llvm-late-gc-lowering.cpp:2560 [inlined]
> PlaceRootsAndUpdateCalls at /home/gbaraldi/julia4/src/llvm-late-gc-lowering.cpp:2576
> runOnFunction at /home/gbaraldi/julia4/src/llvm-late-gc-lowering.cpp:2638
> run at /home/gbaraldi/julia4/src/llvm-late-gc-lowering.cpp:2675
> run at /home/gbaraldi/julia4/usr/include/llvm/IR/PassManagerInternal.h:91

which means it was crashing inside of late-gc-lowering. So the first thing I did was run julia and the same test with LLVM_ASSERTIONS=1 and FORCE_ASSERTIONS=1 to see if LLVM complained about a malformed module, and both were fine.

The next step was trying to get the failing code out for inspection. The easiest way is to do `export JULIA_LLVM_ARGS="--print-before=LateLowerGCFrame --print-module-scope"` and pipe the output to a file. The file is huge, but since it's a crash in LLVM we know that the last module printed is the one we want, and that gave me the IR I wanted.

To verify the failure outside of julia I did `make -C src install-analysis-deps` to install the LLVM machinery (opt, ...), which gets put in the `tools` directory of a julia build. Then I checked whether this crashed outside of julia by doing `./opt -load-pass-plugin=../lib/libjulia-codegen.dylib --passes=LateLowerGCFrame -S test.ll -o tmp3.ll`. This is run from inside the tools dir, so your paths might vary (the -S is so LLVM doesn't generate bitcode). My code did crash, however the reproducer was over 500 lines of IR, which makes it harder to debug and to write a test.

The next step is to minimize the crash by running [`llvm-reduce`](https://llvm.org/docs/CommandGuide/llvm-reduce.html) over it (it's basically creduce, but optimized for LLVM IR), which gave me a 2-line reproducer (in this case apparently just having the insertelement was enough for the pass to fail). One thing to be wary of is that llvm-reduce will usually produce very weird code, so it might be useful to modify the code slightly so it doesn't look odd (it will have unreachable basic blocks and such). After the cleanup, fixing the bug here wasn't interesting, but that doesn't apply generally. Also, always transform your reduced IR into a test to put in llvmpasses.
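For reference, the same workflow condensed into one place (a sketch only: the dump file name and the interestingness script below are made up for illustration, the failing test path is the one from the trace above, and the opt/llvm-reduce commands are run from the `tools` directory of the build):

```julia
# Capture the module handed to LateLowerGCFrame right before the crash.
# The --print-before output goes to stderr, and the last module printed is the
# one the pass aborted on. ignorestatus() keeps run() from throwing when the
# child process dies with SIGABRT.
withenv("JULIA_LLVM_ARGS" => "--print-before=LateLowerGCFrame --print-module-scope") do
    run(pipeline(ignorestatus(`$(Base.julia_cmd()) test/reflection.jl`), stderr = "dump.ll"))
end

# Then, outside of julia, from the tools/ directory (after `make -C src install-analysis-deps`):
#   ./opt -load-pass-plugin=../lib/libjulia-codegen.dylib --passes=LateLowerGCFrame -S test.ll -o tmp3.ll
#   llvm-reduce --test=./still-crashes.sh test.ll   # still-crashes.sh is an interestingness
#                                                   # script you write (exit 0 while opt still aborts)
```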
(cherry picked from commit 906d3482d318868e7d3eef6b1db97ef5bedd6f82) --- src/llvm-late-gc-lowering.cpp | 2 +- test/llvmpasses/late-lower-gc.ll | 14 ++++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/src/llvm-late-gc-lowering.cpp b/src/llvm-late-gc-lowering.cpp index 46214666c5d36..25ec6959eddf1 100644 --- a/src/llvm-late-gc-lowering.cpp +++ b/src/llvm-late-gc-lowering.cpp @@ -778,7 +778,7 @@ void RecursivelyVisit(callback f, Value *V) { if (isa(TheUser) || isa(TheUser) || isa(TheUser) || isa(TheUser) || // TODO: should these be removed from this list? isa(TheUser) || isa(TheUser) || - isa(TheUser) || // ICmpEQ/ICmpNE can be used with ptr types + isa(TheUser) || isa(TheUser)|| // ICmpEQ/ICmpNE can be used with ptr types isa(TheUser) || isa(TheUser)) continue; if (isa(TheUser) || isa(TheUser) || isa(TheUser)) { diff --git a/test/llvmpasses/late-lower-gc.ll b/test/llvmpasses/late-lower-gc.ll index bdb10e97093b2..6d5c42f50fcde 100644 --- a/test/llvmpasses/late-lower-gc.ll +++ b/test/llvmpasses/late-lower-gc.ll @@ -199,6 +199,20 @@ define void @decayar([2 x {} addrspace(10)* addrspace(11)*] %ar) { ; CHECK: %r = call i32 @callee_root(ptr addrspace(10) %l0, ptr addrspace(10) %l1) ; CHECK: call void @julia.pop_gc_frame(ptr %gcframe) +define swiftcc ptr addrspace(10) @insert_element(ptr swiftself %0) { +; CHECK-LABEL: @insert_element + %2 = alloca [10 x i64], i32 1, align 8 +; CHECK: %gcframe = call ptr @julia.new_gc_frame(i32 10) +; CHECK: [[gc_slot_addr_:%.*]] = call ptr @julia.get_gc_frame_slot(ptr %gcframe, i32 0) +; CHECK: call void @julia.push_gc_frame(ptr %gcframe, i32 10) + call void null(ptr sret([2 x [5 x ptr addrspace(10)]]) %2, ptr null, ptr addrspace(11) null, ptr null) + %4 = insertelement <4 x ptr> zeroinitializer, ptr %2, i32 0 +; CHECK: [[gc_slot_addr_:%.*]] = insertelement <4 x ptr> zeroinitializer, ptr [[gc_slot_addr_:%.*]], i32 0 +; CHECK: call void @julia.pop_gc_frame(ptr %gcframe) + ret ptr addrspace(10) null +} + + !0 = !{i64 0, i64 23} !1 = !{!1} !2 = !{!7} ; scope list From 6846af76eb4da22d7198b737de07895447e9585a Mon Sep 17 00:00:00 2001 From: Jameson Nash Date: Wed, 16 Jul 2025 14:23:38 -0400 Subject: [PATCH 2/9] Fix LLVM TaskDispatcher implementation issues (#58950) Fixes #58229 (LLVM JITLink stack overflow issue) I tried submitting this promise/future implementation upstream (https://github.com/llvm/llvm-project/compare/main...vtjnash:llvm-project:jn/cowait-jit) so that I would not need to duplicate nearly as much code here to fix this bug, but upstream is currently opposed to fixing this bug and instead insists it is preferable for each downstream project to implement this fix themselves adding extra maintenance burden for us for now. Sigh. 
(cherry picked from commit 00351da179b7de6cd8a28f229580c96603387f0b) --- src/Makefile | 2 +- src/jitlayers.cpp | 23 +- src/llvm-julia-task-dispatcher.h | 465 +++++++++++++++++++++++++++++++ 3 files changed, 474 insertions(+), 16 deletions(-) create mode 100644 src/llvm-julia-task-dispatcher.h diff --git a/src/Makefile b/src/Makefile index e859acc765354..6dac1524145ef 100644 --- a/src/Makefile +++ b/src/Makefile @@ -349,7 +349,7 @@ $(BUILDDIR)/gc-alloc-profiler.o $(BUILDDIR)/gc-alloc-profiler.dbg.obj: $(SRCDIR) $(BUILDDIR)/gc-page-profiler.o $(BUILDDIR)/gc-page-profiler.dbg.obj: $(SRCDIR)/gc-page-profiler.h $(BUILDDIR)/init.o $(BUILDDIR)/init.dbg.obj: $(SRCDIR)/builtin_proto.h $(BUILDDIR)/interpreter.o $(BUILDDIR)/interpreter.dbg.obj: $(SRCDIR)/builtin_proto.h -$(BUILDDIR)/jitlayers.o $(BUILDDIR)/jitlayers.dbg.obj: $(SRCDIR)/jitlayers.h $(SRCDIR)/llvm-codegen-shared.h +$(BUILDDIR)/jitlayers.o $(BUILDDIR)/jitlayers.dbg.obj: $(SRCDIR)/jitlayers.h $(SRCDIR)/llvm-codegen-shared.h $(SRCDIR)/llvm-julia-task-dispatcher.h $(BUILDDIR)/jltypes.o $(BUILDDIR)/jltypes.dbg.obj: $(SRCDIR)/builtin_proto.h $(build_shlibdir)/libllvmcalltest.$(SHLIB_EXT): $(SRCDIR)/llvm-codegen-shared.h $(BUILDDIR)/julia_version.h $(BUILDDIR)/llvm-alloc-helpers.o $(BUILDDIR)/llvm-alloc-helpers.dbg.obj: $(SRCDIR)/llvm-codegen-shared.h $(SRCDIR)/llvm-pass-helpers.h $(SRCDIR)/llvm-alloc-helpers.h diff --git a/src/jitlayers.cpp b/src/jitlayers.cpp index 3ea95ea42f596..a44da64941985 100644 --- a/src/jitlayers.cpp +++ b/src/jitlayers.cpp @@ -6,6 +6,7 @@ #include #include "llvm/IR/Mangler.h" +#include #include #include #include @@ -46,6 +47,7 @@ using namespace llvm; #include "jitlayers.h" #include "julia_assert.h" #include "processor.h" +#include "llvm-julia-task-dispatcher.h" #if JL_LLVM_VERSION >= 180000 # include @@ -666,17 +668,8 @@ static void jl_compile_codeinst_now(jl_code_instance_t *codeinst) if (!decls.specFunctionObject.empty()) NewDefs.push_back(decls.specFunctionObject); } - // Split batches to avoid stack overflow in the JIT linker. - // FIXME: Patch ORCJITs InPlaceTaskDispatcher to not recurse on task dispatches but - // push the tasks to a queue to be drained later. This avoids the stackoverflow caused by recursion - // in the linker when compiling a large number of functions at once. 
- SmallVector Addrs; - for (size_t i = 0; i < NewDefs.size(); i += 1000) { - auto end = std::min(i + 1000, NewDefs.size()); - SmallVector batch(NewDefs.begin() + i, NewDefs.begin() + end); - auto AddrsBatch = jl_ExecutionEngine->findSymbols(batch); - Addrs.append(AddrsBatch); - } + auto Addrs = jl_ExecutionEngine->findSymbols(NewDefs); + size_t nextaddr = 0; for (auto &this_code : linkready) { auto it = invokenames.find(this_code); @@ -1841,7 +1834,7 @@ llvm::DataLayout jl_create_datalayout(TargetMachine &TM) { JuliaOJIT::JuliaOJIT() : TM(createTargetMachine()), DL(jl_create_datalayout(*TM)), - ES(cantFail(orc::SelfExecutorProcessControl::Create())), + ES(cantFail(orc::SelfExecutorProcessControl::Create(nullptr, std::make_unique<::JuliaTaskDispatcher>()))), GlobalJD(ES.createBareJITDylib("JuliaGlobals")), JD(ES.createBareJITDylib("JuliaOJIT")), ExternalJD(ES.createBareJITDylib("JuliaExternal")), @@ -2098,7 +2091,7 @@ SmallVector JuliaOJIT::findSymbols(ArrayRef Names) Unmangled[NonOwningSymbolStringPtr(Mangled)] = Unmangled.size(); Exports.add(std::move(Mangled)); } - SymbolMap Syms = cantFail(ES.lookup(orc::makeJITDylibSearchOrder(ArrayRef(&JD)), std::move(Exports))); + SymbolMap Syms = cantFail(::safelookup(ES, orc::makeJITDylibSearchOrder(ArrayRef(&JD)), std::move(Exports))); SmallVector Addrs(Names.size()); for (auto it : Syms) { Addrs[Unmangled.at(orc::NonOwningSymbolStringPtr(it.first))] = it.second.getAddress().getValue(); @@ -2110,7 +2103,7 @@ Expected JuliaOJIT::findSymbol(StringRef Name, bool ExportedS { orc::JITDylib* SearchOrders[3] = {&JD, &GlobalJD, &ExternalJD}; ArrayRef SearchOrder = ArrayRef(&SearchOrders[0], ExportedSymbolsOnly ? 3 : 1); - auto Sym = ES.lookup(SearchOrder, Name); + auto Sym = ::safelookup(ES, SearchOrder, Name); return Sym; } @@ -2123,7 +2116,7 @@ Expected JuliaOJIT::findExternalJDSymbol(StringRef Name, bool { orc::JITDylib* SearchOrders[3] = {&ExternalJD, &GlobalJD, &JD}; ArrayRef SearchOrder = ArrayRef(&SearchOrders[0], ExternalJDOnly ? 1 : 3); - auto Sym = ES.lookup(SearchOrder, getMangledName(Name)); + auto Sym = ::safelookup(ES, SearchOrder, getMangledName(Name)); return Sym; } diff --git a/src/llvm-julia-task-dispatcher.h b/src/llvm-julia-task-dispatcher.h new file mode 100644 index 0000000000000..dd4037378b6b6 --- /dev/null +++ b/src/llvm-julia-task-dispatcher.h @@ -0,0 +1,465 @@ +// This file is a part of Julia. License is MIT: https://julialang.org/license + +namespace { + +using namespace llvm::orc; + +template struct future_value_storage { + // Union disables default construction/destruction semantics, allowing us to + // use placement new/delete for precise control over value lifetime + union { + U value_; + }; + + future_value_storage() {} + ~future_value_storage() {} +}; + +template <> struct future_value_storage { + // No value_ member for void +}; + +struct JuliaTaskDispatcher : public TaskDispatcher { + /// Forward declarations + class future_base; + void dispatch(std::unique_ptr T) override; + void shutdown() override; + void work_until(future_base &F); +private: + /// C++ does not support non-static thread_local variables, so this needs to + /// store both the task and the associated dispatcher queue so that shutdown + /// can wait for the correct tasks to finish. 
+ thread_local static SmallVector, JuliaTaskDispatcher*>> TaskQueue; + std::mutex DispatchMutex; + std::condition_variable WorkFinishedCV; + SmallVector WaitingFutures; + +public: + +/// @name ORC Promise/Future Classes +/// +/// ORC-aware promise/future implementation that integrates with the +/// TaskDispatcher system to allow efficient cooperative multitasking while +/// waiting for results (with certain limitations on what can be awaited). +/// Together they provide building blocks for a full async/await-like runtime +/// for llvm that supports multiple threads. +/// +/// Unlike std::promise/std::future alone, these classes can help dispatch other +/// tasks while waiting, preventing deadlocks and improving overall system +/// throughput. They have a similar API, though with some important differences +/// and some features simply not currently implemented. +/// +/// @{ + +template class promise; +template class future; + +/// Status for future/promise state +enum class FutureStatus : uint8_t { NotReady = 0, Ready = 1 }; + +/// @} + +/// Type-erased base class for futures, generally for scheduler use to avoid +/// needing virtual dispatches +class future_base { +public: + /// Check if the future is now ready with a value (precondition: get_promise() + /// must have been called) + bool ready() const { + if (!valid()) + report_fatal_error("ready() called before get_promise()"); + return state_->status_.load(std::memory_order_acquire) == FutureStatus::Ready; + } + + /// Check if the future is in a valid state (not moved-from and get_promise() called) + bool valid() const { return state_ != nullptr; } + + /// Wait for the future to be ready, helping with task dispatch + void wait(JuliaTaskDispatcher &D) { + // Keep helping with task dispatch until our future is ready + if (!ready()) { + D.work_until(*this); + if (state_->status_.load(std::memory_order_relaxed) != FutureStatus::Ready) + report_fatal_error( + "work_until() returned without this future being ready"); + } + } + +protected: + struct state_base { + std::atomic status_{FutureStatus::NotReady}; + }; + + future_base(state_base *state) : state_(state) {} + future_base() = default; + + /// Only allow deleting the future once it is invalid + ~future_base() { + if (state_) + report_fatal_error("get() must be called before future destruction (ensuring promise::set_value memory is valid)"); + } + + // Move constructor and assignment + future_base(future_base &&other) noexcept : state_(other.state_) { + other.state_ = nullptr; + } + + future_base &operator=(future_base &&other) noexcept { + if (this != &other) { + this->~future_base(); + state_ = other.state_; + other.state_ = nullptr; + } + return *this; + } + + state_base *state_; +}; + +/// TaskDispatcher-aware future class for cooperative await. +/// +/// @tparam T The type of value this future will provide. Use void for futures +/// that +/// signal completion without providing a value. +/// +/// This future implementation is similar to `std::future`, so most code can +/// transition to it easily. However, it differs from `std::future` in a few +/// key ways to be aware of: +/// - No exception support (or the overhead for it). +/// - The future is created before the promise, then the promise is created +/// from the future. +/// - The future is in an invalid state until get_promise() has been called. 
+/// - Waiting operations (get(&D), wait(&D)) help dispatch other tasks while +/// blocked, requiring an additional argument of which TaskDispatcher object +/// of where all associated work will be scheduled. +/// - While `wait` may be called multiple times and on multiple threads, all of +/// them must have returned before calling `get` on exactly one thread. +/// - Must call get() exactly once before destruction (enforced with +/// `report_fatal_error`) after each call to `get_promise`. Internal state is +/// freed when `get` returns, and allocated when `get_promise` is called. +/// +/// Other notable features, in common with `std::future`: +/// - Supports both value types and void specialization through the same +/// interface. +/// - Thread-safe through atomic operations. +/// - Provides acquire-release ordering with `std::promise::set_value()`. +/// - Concurrent access to any method (including to `ready`) on multiple threads +/// is not allowed. +/// - Holding any locks while calling `get()` is likely to lead to deadlock. +/// +/// @warning Users should avoid borrowing references to futures. References may +/// go out of scope and break the uniqueness contract, which may break the +/// soundness of the types. Always use move semantics or pass by value. + +template class future : public future_base { +public: + future() : future_base(nullptr) {} + future(const future &) = delete; + future &operator=(const future &) = delete; + future(future &&) = default; + future &operator=(future &&) = default; + + /// Get the value, helping with task dispatch while waiting. + /// This will destroy the underlying value, so this must be called exactly + /// once, which returns the future to the initial state. + T get(JuliaTaskDispatcher &D) { + if (!valid()) + report_fatal_error("get() must only be called once, after get_promise()"); + wait(D); + auto state_ = static_cast(this->state_); + this->state_ = nullptr; + return take_value(state_); + } + + /// Get the associated promise (must only be called once) + promise get_promise() { + if (valid()) + report_fatal_error("get_promise() can only be called once"); + auto state_ = new state(); + this->state_ = state_; + return promise(state_); + } + +private: + friend class promise; + + // Template the state struct with EBCO so that future has no wasted + // overhead for the value. The declaration of future_value_storage is far + // above here since GCC doesn't implement it properly when nested. + struct state : future_base::state_base, future_value_storage {}; + + template + typename std::enable_if::value, U>::type take_value(state *state_) { + T result = std::move(state_->value_); + state_->value_.~T(); + delete state_; + return result; + } + + template + typename std::enable_if::value, U>::type take_value(state *state_) { + delete state_; + } +}; + +/// TaskDispatcher-aware promise class that provides values to associated +/// futures. +/// +/// @tparam T The type of value this promise will provide. Use void for promises +/// that +/// signal completion without providing a value. +/// +/// This promise implementation provides the value-setting side of the +/// promise/future pair and integrates with the ORC TaskDispatcher system. Key +/// characteristics: +/// - Created from a future via get_promise() rather than creating the future from the promise. +/// - Must call get_future() on the thread that created it (it can be passed to another thread, but do not borrow a reference and use that to mutate it from another thread). 
+/// - Must call set_value() exactly once per `get_promise()` call to provide the result. +/// - Thread-safe from set_value to get. +/// - Move-only semantics to prevent accidental copying. +/// +/// The `promise` can usually be passed to another thread in one of two ways: +/// - With move semantics: +/// * `[P = F.get_promise()] () { P.set_value(); }` +/// * `[P = std::move(P)] () { P.set_value(); }` +/// * Advantages: clearer where `P` is owned, automatic deadlock detection +/// on destruction, +/// easier memory management if the future is returned from the function. +/// - By reference: +/// * `[&P] () { P.set_value(); }` +/// * Advantages: simpler memory management if the future is consumed in the +/// same function. +/// * Disadvantages: more difficult memory management if the future is +/// returned from the function, no deadlock detection. +/// +/// @warning Users should avoid borrowing references to promises. References may +/// go out of scope and break the uniqueness contract, which may break the +/// soundness of the types. Always use move semantics or pass by value. +/// +/// @par Error Handling: +/// The promise/future system uses report_fatal_error() for misuse: +/// - Calling set_value() more than once. +/// - Destroying a future without calling get(). +/// - Calling get() more than once on a future. +/// +/// @par Thread Safety: +/// - Each promise/future must only be accessed by one thread, as concurrent +/// calls to the API functions may result in crashes. +/// - Multiple threads can safely access different promise/future pairs. +/// - set_value() and get() operations are atomic and thread-safe. +/// - Move operations should only be performed by a single thread. +template class promise { + friend class future; + +public: + promise() : state_(nullptr) {} + + ~promise() { + // Assert proper promise lifecycle: ensure set_value was called if promise was valid. + // This can catch deadlocks where a promise is created but set_value() is + // never called, though only if the promise is moved from instead of + // borrowed from the frame with the future. + // Empty promises (state_ == nullptr) are allowed to be destroyed without calling set_value. + } + + promise(const promise &) = delete; + promise &operator=(const promise &) = delete; + + promise(promise &&other) noexcept + : state_(other.state_) { + other.state_ = nullptr; + } + + promise &operator=(promise &&other) noexcept { + if (this != &other) { + this->~promise(); + state_ = other.state_; + other.state_ = nullptr; + } + return *this; + } + + + /// Set the value (must only be called once) + // In C++20, this std::conditional weirdness can probably be replaced just + // with requires. It ensures that we don't try to define a method for `void&`, + // but that if the user calls set_value(v) for any value v that they get a + // member function error, instead of no member named 'value_'. 
+ template + void + set_value(const typename std::conditional::value, + std::nullopt_t, T>::type &value) const { + assert(state_ && "set_value() can only be called once"); + new (&state_->value_) T(value); + state_->status_.store(FutureStatus::Ready, std::memory_order_release); + state_ = nullptr; + } + + template + void set_value(typename std::conditional::value, + std::nullopt_t, T>::type &&value) const { + assert(state_ && "set_value() can only be called once"); + new (&state_->value_) T(std::move(value)); + state_->status_.store(FutureStatus::Ready, std::memory_order_release); + state_ = nullptr; + } + + template + typename std::enable_if::value, void>::type + set_value(const std::nullopt_t &value) = delete; + + template + typename std::enable_if::value, void>::type + set_value(std::nullopt_t &&value) = delete; + + template + typename std::enable_if::value, void>::type set_value() const { + assert(state_ && "set_value() can only be called once"); + state_->status_.store(FutureStatus::Ready, std::memory_order_release); + state_ = nullptr; + } + + /// Swap with another promise + void swap(promise &other) noexcept { + using std::swap; + swap(state_, other.state_); + } + +private: + explicit promise(typename future::state *state) + : state_(state) {} + + mutable typename future::state *state_; +}; + +}; // class JuliaTaskDispatcher + +thread_local SmallVector, JuliaTaskDispatcher *>> JuliaTaskDispatcher::TaskQueue; + +void JuliaTaskDispatcher::dispatch(std::unique_ptr T) { + TaskQueue.push_back(std::pair(std::move(T), this)); +} + +void JuliaTaskDispatcher::shutdown() { + // Keep processing until no tasks belonging to this dispatcher remain + while (true) { + // Check if any task belongs to this dispatcher + auto it = std::find_if( + TaskQueue.begin(), TaskQueue.end(), + [this](const auto &TaskPair) { return TaskPair.second == this; }); + + // If no tasks belonging to this dispatcher, we're done + if (it == TaskQueue.end()) + return; + + // Create a future/promise pair to wait for completion of this task + future taskFuture; + // Replace the task with a GenericNamedTask that wraps the original task + // with a notification of completion that this thread can work_until. 
+ auto originalTask = std::move(it->first); + it->first = makeGenericNamedTask( + [originalTask = std::move(originalTask), + taskPromise = taskFuture.get_promise()]() { + originalTask->run(); + taskPromise.set_value(); + }, + "Shutdown task marker"); + + // Wait for the task to complete + taskFuture.get(*this); + } +} + +void JuliaTaskDispatcher::work_until(future_base &F) { + while (!F.ready()) { + // First, process any tasks in our local queue + // Process in LIFO order (most recently added first) to avoid deadlocks + // when tasks have dependencies on each other + while (!TaskQueue.empty()) { + { + auto TaskPair = std::move(TaskQueue.back()); + TaskQueue.pop_back(); + TaskPair.first->run(); + } + + // Notify any threads that might be waiting for work to complete + { + std::lock_guard Lock(DispatchMutex); + bool ShouldNotify = llvm::any_of( + WaitingFutures, [](future_base *F) { return F->ready(); }); + if (ShouldNotify) { + WaitingFutures.clear(); + WorkFinishedCV.notify_all(); + } + } + + // Check if our future is now ready + if (F.ready()) + return; + } + + // If we get here, our queue is empty but the future isn't ready + // We need to wait for other threads to finish work that should complete our + // future + { + std::unique_lock Lock(DispatchMutex); + WaitingFutures.push_back(&F); + WorkFinishedCV.wait(Lock, [&F]() { return F.ready(); }); + } + } +} + +} // End namespace + +namespace std { +template +void swap(::JuliaTaskDispatcher::promise &lhs, ::JuliaTaskDispatcher::promise &rhs) noexcept { + lhs.swap(rhs); +} +} // End namespace std + +// n.b. this actually is sometimes a safepoint +Expected +safelookup(ExecutionSession &ES, + const JITDylibSearchOrder &SearchOrder, + SymbolLookupSet Symbols, LookupKind K = LookupKind::Static, + SymbolState RequiredState = SymbolState::Ready, + RegisterDependenciesFunction RegisterDependencies = NoDependenciesToRegister) JL_NOTSAFEPOINT { + JuliaTaskDispatcher::future> PromisedFuture; + auto NotifyComplete = [PromisedResult = PromisedFuture.get_promise()](Expected R) { + PromisedResult.set_value(std::move(R)); + }; + ES.lookup(K, SearchOrder, std::move(Symbols), RequiredState, + std::move(NotifyComplete), RegisterDependencies); + return PromisedFuture.get(static_cast(ES.getExecutorProcessControl().getDispatcher())); +} + +Expected +safelookup(ExecutionSession &ES, + const JITDylibSearchOrder &SearchOrder, + SymbolStringPtr Name, + SymbolState RequiredState = SymbolState::Ready) JL_NOTSAFEPOINT { + SymbolLookupSet Names({Name}); + + if (auto ResultMap = safelookup(ES, SearchOrder, std::move(Names), LookupKind::Static, + RequiredState, NoDependenciesToRegister)) { + assert(ResultMap->size() == 1 && "Unexpected number of results"); + assert(ResultMap->count(Name) && "Missing result for symbol"); + return std::move(ResultMap->begin()->second); + } else + return ResultMap.takeError(); +} + +Expected +safelookup(ExecutionSession &ES, + ArrayRef SearchOrder, SymbolStringPtr Name, + SymbolState RequiredState = SymbolState::Ready) JL_NOTSAFEPOINT { + return safelookup(ES, makeJITDylibSearchOrder(SearchOrder), Name, RequiredState); +} + +Expected +safelookup(ExecutionSession &ES, + ArrayRef SearchOrder, StringRef Name, + SymbolState RequiredState = SymbolState::Ready) JL_NOTSAFEPOINT { + return safelookup(ES, SearchOrder, ES.intern(Name), RequiredState); +} From c9bfbbc005231b8e7a4bc32c99b15d3a5060fb3a Mon Sep 17 00:00:00 2001 From: Keno Fischer Date: Thu, 17 Jul 2025 13:19:33 -0400 Subject: [PATCH 3/9] Fix use-after-free in FileWatching (#59017) We 
observe an abort on Windows on Revise master CI, where a free'd handle is passed to jl_close_uv. The root cause is that uv_fseventscb_file called uvfinalize earlier, but did not set the handle to NULL, so when the actual finalizer ran later, it would see corrupted state. (cherry picked from commit b45b429b93966e992c25a101c896e72738799ca4) --- stdlib/FileWatching/src/FileWatching.jl | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/stdlib/FileWatching/src/FileWatching.jl b/stdlib/FileWatching/src/FileWatching.jl index ebfdd9c8fea6b..ddaf36dfd33a4 100644 --- a/stdlib/FileWatching/src/FileWatching.jl +++ b/stdlib/FileWatching/src/FileWatching.jl @@ -516,17 +516,19 @@ end function uvfinalize(uv::Union{FileMonitor, FolderMonitor}) iolock_begin() - if uv.handle != C_NULL - disassociate_julia_struct(uv) # close (and free) without notify - ccall(:jl_close_uv, Cvoid, (Ptr{Cvoid},), uv.handle) + handle = @atomicswap :monotonic uv.handle = C_NULL + if handle != C_NULL + disassociate_julia_struct(handle) # close (and free) without notify + ccall(:jl_close_uv, Cvoid, (Ptr{Cvoid},), handle) end iolock_end() end function close(t::Union{FileMonitor, FolderMonitor}) iolock_begin() - if t.handle != C_NULL - ccall(:jl_close_uv, Cvoid, (Ptr{Cvoid},), t.handle) + handle = t.handle + if handle != C_NULL + ccall(:jl_close_uv, Cvoid, (Ptr{Cvoid},), handle) end iolock_end() end From 2bfc0f2cac802b4fa8fcd2379bfd2a582108100c Mon Sep 17 00:00:00 2001 From: JonasIsensee Date: Fri, 18 Jul 2025 22:28:54 +0200 Subject: [PATCH 4/9] Bugfix: Use Base.aligned_sizeof instead of sizeof in Mmap.mmap (#58998) fix #58982 (cherry picked from commit c30199fe17aa24f77530bd64f7762546c15e2941) --- stdlib/Mmap/src/Mmap.jl | 8 ++++---- stdlib/Mmap/test/runtests.jl | 15 ++++++++++++++- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/stdlib/Mmap/src/Mmap.jl b/stdlib/Mmap/src/Mmap.jl index 7d57bf053940d..c527123d51bb8 100644 --- a/stdlib/Mmap/src/Mmap.jl +++ b/stdlib/Mmap/src/Mmap.jl @@ -187,16 +187,16 @@ like HDF5 (which can be used with memory-mapping). """ function mmap(io::IO, ::Type{Array{T,N}}=Vector{UInt8}, - dims::NTuple{N,Integer}=(div(filesize(io)-position(io),sizeof(T)),), + dims::NTuple{N,Integer}=(div(filesize(io)-position(io),Base.aligned_sizeof(T)),), offset::Integer=position(io); grow::Bool=true, shared::Bool=true) where {T,N} # check inputs isopen(io) || throw(ArgumentError("$io must be open to mmap")) isbitstype(T) || throw(ArgumentError("unable to mmap $T; must satisfy isbitstype(T) == true")) - len = sizeof(T) + len = Base.aligned_sizeof(T) for l in dims len, overflow = Base.Checked.mul_with_overflow(promote(len, l)...) - overflow && throw(ArgumentError("requested size prod($((sizeof(T), dims...))) too large, would overflow typeof(size(T)) == $(typeof(len))")) + overflow && throw(ArgumentError("requested size prod($((len, dims...))) too large, would overflow typeof(size(T)) == $(typeof(len))")) end len >= 0 || throw(ArgumentError("requested size must be ≥ 0, got $len")) len == 0 && return Array{T}(undef, ntuple(x->0,Val(N))) @@ -267,7 +267,7 @@ end mmap(file::AbstractString, ::Type{T}=Vector{UInt8}, - dims::NTuple{N,Integer}=(div(filesize(file),sizeof(eltype(T))),), + dims::NTuple{N,Integer}=(div(filesize(file),Base.aligned_sizeof(eltype(T))),), offset::Integer=Int64(0); grow::Bool=true, shared::Bool=true) where {T<:Array,N} = open(io->mmap(io, T, dims, offset; grow=grow, shared=shared), file, isfile(file) ? 
"r" : "w+")::Array{eltype(T),N} diff --git a/stdlib/Mmap/test/runtests.jl b/stdlib/Mmap/test/runtests.jl index 03e4b48d95f7a..224664e5faaa2 100644 --- a/stdlib/Mmap/test/runtests.jl +++ b/stdlib/Mmap/test/runtests.jl @@ -44,7 +44,7 @@ s = open(file) @test length(@inferred mmap(s, Vector{Int8}, 12, 0; grow=false)) == 12 @test length(@inferred mmap(s, Vector{Int8}, 12, 0; shared=false)) == 12 close(s) -@test_throws ErrorException mmap(file, Vector{Ref}) # must be bit-type +@test_throws ArgumentError mmap(file, Vector{Ref}) # must be bit-type GC.gc(); GC.gc() s = open(f->f,file,"w") @@ -341,6 +341,19 @@ end GC.gc() rm(file) +@testset "test for #58982 - mmap with primitive types" begin + file = tempname() + primitive type PrimType9Bytes 9*8 end + arr = Vector{PrimType9Bytes}(undef, 2) + write(file, arr) + m = mmap(file, Vector{PrimType9Bytes}) + @test length(m) == 2 + @test m[1] == arr[1] + @test m[2] == arr[2] + finalize(m); m = nothing; GC.gc() + rm(file) +end + @testset "Docstrings" begin @test isempty(Docs.undocumented_names(Mmap)) end From ab66f187e5d6c16742f7edd1dc856d8db79464bf Mon Sep 17 00:00:00 2001 From: Jameson Nash Date: Tue, 22 Jul 2025 16:08:27 -0400 Subject: [PATCH 5/9] stored method interference graph (#58948) Store full method interference relationship graph in interferences field of Method to avoid expensive morespecific calls during dispatch. This provides significant performance improvements: - Replace method comparisons with precomputed interference lookup. - Optimize ml_matches minmax computation using interference lookups. - Optimize sort_mlmatches for large return sets by iterating over interferences instead of all matching methods. - Add method_morespecific_via_interferences in both C and Julia. This representation may exclude some edges that are implied by transitivity since sort_mlmatches will ensure the correct result by following strong edges. Ambiguous edges are guaranteed to be checkable without recursion. Also fix a variety of bugs along the way: - Builtins signature would cause them to try to discard all other methods during `sort_mlmatches`. - Some ambiguities were over-estimated, which now are improved upon. - Setting lim==-1 now gives the same limited list of methods as lim>0, since that is actually faster now than attempting to give the unsorted list. This provides a better fix to #53814 than #57837 and fixes #58766. - Reverts recent METHOD_SIG_LATEST_HAS_NOTMORESPECIFIC attempt (though not the whole commit), since I found a significant problem with any usage of that bit during testing: it only tracks methods that intersect with a target, but new methods do not necessarily intersect with any existing target. This provides a decent performance improvement to `methods` calls, which implies a decent speed up to package loading also (e.g. ModelingToolkit loads in about 4 seconds instead of 5 seconds). 
(cherry picked from commit 59a7bb3184feee32227aa15d5c39aa59270ad8e5) --- base/Base.jl | 2 + base/errorshow.jl | 8 +- base/methodshow.jl | 8 +- base/runtime_internals.jl | 10 +- base/staticdata.jl | 215 +++++++--- src/gf.c | 856 ++++++++++++++++++-------------------- src/jltypes.c | 14 +- src/julia.h | 1 + src/julia_internal.h | 3 +- src/method.c | 1 + src/staticdata.c | 2 - src/staticdata_utils.c | 4 + stdlib/Test/src/Test.jl | 1 - test/ambiguous.jl | 40 +- test/core.jl | 2 +- 15 files changed, 638 insertions(+), 529 deletions(-) diff --git a/base/Base.jl b/base/Base.jl index 6d798d69014ea..f8905bf49ce79 100644 --- a/base/Base.jl +++ b/base/Base.jl @@ -264,7 +264,9 @@ include("uuid.jl") include("pkgid.jl") include("toml_parser.jl") include("linking.jl") +module StaticData include("staticdata.jl") +end include("loading.jl") # misc useful functions & macros diff --git a/base/errorshow.jl b/base/errorshow.jl index 9df363a8c5e9f..e771597d7b1e0 100644 --- a/base/errorshow.jl +++ b/base/errorshow.jl @@ -441,6 +441,7 @@ function show_method_candidates(io::IO, ex::MethodError, kwargs=[]) line_score = Int[] # These functions are special cased to only show if first argument is matched. special = f === convert || f === getindex || f === setindex! + f isa Core.Builtin && return # `methods` isn't very useful for a builtin funcs = Tuple{Any,Vector{Any}}[(f, arg_types_param)] # An incorrect call method produces a MethodError for convert. @@ -448,7 +449,7 @@ function show_method_candidates(io::IO, ex::MethodError, kwargs=[]) # pool MethodErrors for these two functions. if f === convert && !isempty(arg_types_param) at1 = arg_types_param[1] - if isType(at1) && !has_free_typevars(at1) + if isType(at1) && !has_free_typevars(at1) && at1.parameters[1] isa Type push!(funcs, (at1.parameters[1], arg_types_param[2:end])) end end @@ -470,8 +471,8 @@ function show_method_candidates(io::IO, ex::MethodError, kwargs=[]) end sig0 = sig0::DataType s1 = sig0.parameters[1] - if sig0 === Tuple || !isa(func, rewrap_unionall(s1, method.sig)) - # function itself doesn't match or is a builtin + if !isa(func, rewrap_unionall(s1, method.sig)) + # function itself doesn't match continue else print(iob, " ") @@ -616,6 +617,7 @@ function show_method_candidates(io::IO, ex::MethodError, kwargs=[]) println(io) # extra newline for spacing to stacktrace end end + nothing end # In case the line numbers in the source code have changed since the code was compiled, diff --git a/base/methodshow.jl b/base/methodshow.jl index dd391a46d170a..cae0e9d809225 100644 --- a/base/methodshow.jl +++ b/base/methodshow.jl @@ -78,7 +78,7 @@ end # NOTE: second argument is deprecated and is no longer used function kwarg_decl(m::Method, kwtype = nothing) - if m.sig !== Tuple # OpaqueClosure or Builtin + if !(m.sig === Tuple || m.sig <: Tuple{Core.Builtin, Vararg}) # OpaqueClosure or Builtin kwtype = typeof(Core.kwcall) sig = rewrap_unionall(Tuple{kwtype, NamedTuple, (unwrap_unionall(m.sig)::DataType).parameters...}, m.sig) kwli = ccall(:jl_methtable_lookup, Any, (Any, UInt), sig, get_world_counter()) @@ -216,8 +216,7 @@ show(io::IO, ::MIME"text/plain", m::Method; kwargs...) 
= show_method(io, m; kwar function show_method(io::IO, m::Method; modulecolor = :light_black, digit_align_width = 1) tv, decls, file, line = arg_decl_parts(m) - sig = unwrap_unionall(m.sig) - if sig === Tuple + if m.sig <: Tuple{Core.Builtin, Vararg} # Builtin print(io, m.name, "(...)") file = "none" @@ -420,8 +419,7 @@ end function show(io::IO, ::MIME"text/html", m::Method) tv, decls, file, line = arg_decl_parts(m, true) sig = unwrap_unionall(m.sig) - if sig === Tuple - # Builtin + if sig <: Tuple{Core.Builtin, Vararg} print(io, m.name, "(...) in ", parentmodule(m)) return end diff --git a/base/runtime_internals.jl b/base/runtime_internals.jl index f48b4734a7b8b..7a06a8fcbdb75 100644 --- a/base/runtime_internals.jl +++ b/base/runtime_internals.jl @@ -1330,15 +1330,7 @@ end function matches_to_methods(ms::Array{Any,1}, tn::Core.TypeName, mod) # Lack of specialization => a comprehension triggers too many invalidations via _collect, so collect the methods manually ms = Method[(ms[i]::Core.MethodMatch).method for i in 1:length(ms)] - # Remove shadowed methods with identical type signatures - prev = nothing - filter!(ms) do m - l = prev - repeated = (l isa Method && m.sig == l.sig) - prev = m - return !repeated - end - # Remove methods not part of module (after removing shadowed methods) + # Remove methods not part of module mod === nothing || filter!(ms) do m return parentmodule(m) ∈ mod end diff --git a/base/staticdata.jl b/base/staticdata.jl index 9f7ef67743906..5a8ea39c8a4c9 100644 --- a/base/staticdata.jl +++ b/base/staticdata.jl @@ -1,9 +1,7 @@ # This file is a part of Julia. License is MIT: https://julialang.org/license -module StaticData - using .Core: CodeInstance, MethodInstance -using .Base: JLOptions, Compiler, get_world_counter, _methods_by_ftype, get_methodtable +using .Base: JLOptions, Compiler, get_world_counter, _methods_by_ftype, get_methodtable, morespecific const WORLD_AGE_REVALIDATION_SENTINEL::UInt = 1 const _jl_debug_method_invalidation = Ref{Union{Nothing,Vector{Any}}}(nothing) @@ -167,6 +165,7 @@ function verify_method(codeinst::CodeInstance, stack::Vector{CodeInstance}, visi elseif maxworld == Base.get_require_world() # if no new worlds were allocated since serializing the base module, then no new validation is worth doing right now either else + matches = [] j = 1 while j ≤ length(callees) local min_valid2::UInt, max_valid2::UInt @@ -177,14 +176,14 @@ function verify_method(codeinst::CodeInstance, stack::Vector{CodeInstance}, visi end if edge isa MethodInstance sig = edge.specTypes - min_valid2, max_valid2, matches = verify_call(sig, callees, j, 1, world, true) + min_valid2, max_valid2 = verify_call(sig, callees, j, 1, world, true, matches) j += 1 elseif edge isa Int sig = callees[j+1] # Handle negative counts (fully_covers=false) nmatches = abs(edge) fully_covers = edge > 0 - min_valid2, max_valid2, matches = verify_call(sig, callees, j+2, nmatches, world, fully_covers) + min_valid2, max_valid2 = verify_call(sig, callees, j+2, nmatches, world, fully_covers, matches) j += 2 + nmatches edge = sig elseif edge isa Core.Binding @@ -201,7 +200,6 @@ function verify_method(codeinst::CodeInstance, stack::Vector{CodeInstance}, visi min_valid2 = 1 max_valid2 = 0 end - matches = nothing else callee = callees[j+1] if callee isa Core.MethodTable # skip the legacy edge (missing backedge) @@ -216,7 +214,7 @@ function verify_method(codeinst::CodeInstance, stack::Vector{CodeInstance}, visi else meth = callee::Method end - min_valid2, max_valid2, matches = verify_invokesig(edge, meth, 
world) + min_valid2, max_valid2 = verify_invokesig(edge, meth, world, matches) j += 2 end if minworld < min_valid2 @@ -227,7 +225,7 @@ function verify_method(codeinst::CodeInstance, stack::Vector{CodeInstance}, visi end invalidations = _jl_debug_method_invalidation[] if max_valid2 ≠ typemax(UInt) && invalidations !== nothing - push!(invalidations, edge, "insert_backedges_callee", codeinst, matches) + push!(invalidations, edge, "insert_backedges_callee", codeinst, copy(matches)) end if max_valid2 == 0 && invalidations === nothing break @@ -291,44 +289,172 @@ function verify_method(codeinst::CodeInstance, stack::Vector{CodeInstance}, visi return 0, minworld, maxworld end -function verify_call(@nospecialize(sig), expecteds::Core.SimpleVector, i::Int, n::Int, world::UInt, fully_covers::Bool) +function get_method_from_edge(@nospecialize t) + if t isa Method + return t + else + if t isa CodeInstance + t = get_ci_mi(t)::MethodInstance + else + t = t::MethodInstance + end + return t.def::Method + end +end + +# Check if method2 is in method1's interferences set +# Returns true if method2 is found (meaning !morespecific(method1, method2)) +function method_in_interferences(method2::Method, method1::Method) + interferences = method1.interferences + for k = 1:length(interferences) + isassigned(interferences, k) || break + interference_method = interferences[k]::Method + if interference_method === method2 + return true + end + end + return false +end + +# Check if method1 is more specific than method2 via the interference graph +function method_morespecific_via_interferences(method1::Method, method2::Method) + if method1 === method2 + return false + end + ms = method_in_interferences_recursive(method1, method2, IdSet{Method}()) + # slow check: @assert ms === morespecific(method1, method2) || typeintersect(method1.sig, method2.sig) === Union{} || typeintersect(method2.sig, method1.sig) === Union{} + return ms +end + +# Returns true if method1 is in method2's interferences (meaning !morespecific(method2, method1)) +function method_in_interferences_recursive(method1::Method, method2::Method, visited::IdSet{Method}) + if method_in_interferences(method2, method1) + return false + end + if method_in_interferences(method1, method2) + return true + end + + # Recursively check through interference graph + method2 in visited && return false + push!(visited, method2) + interferences = method2.interferences + for k = 1:length(interferences) + isassigned(interferences, k) || break + method3 = interferences[k]::Method + if method_in_interferences(method2, method3) + continue # only follow edges to morespecific methods in search of the morespecific target (skip ambiguities) + end + if method_in_interferences_recursive(method1, method3, visited) + return true # found method1 in the interference graph + end + end + + return false +end + +function verify_call(@nospecialize(sig), expecteds::Core.SimpleVector, i::Int, n::Int, world::UInt, fully_covers::Bool, matches::Vector{Any}) # verify that these edges intersect with the same methods as before mi = nothing - if n == 1 + expected_deleted = false + for j = 1:n + t = expecteds[i+j-1] + meth = get_method_from_edge(t) + if iszero(meth.dispatch_status & METHOD_SIG_LATEST_WHICH) + expected_deleted = true + break + end + end + if expected_deleted + if _jl_debug_method_invalidation[] === nothing && world == get_world_counter() + return UInt(1), UInt(0) + end + elseif n == 1 # first, fast-path a check if the expected method simply dominates its sig anyways # so the result of 
ml_matches is already simply known - let t = expecteds[i], meth, minworld, maxworld, result - if t isa Method - meth = t - else + let t = expecteds[i], meth, minworld, maxworld + meth = get_method_from_edge(t) + if !(t isa Method) if t isa CodeInstance mi = get_ci_mi(t)::MethodInstance else mi = t::MethodInstance end - meth = mi.def::Method - # Fast path is legal when fully_covers=true OR when METHOD_SIG_LATEST_HAS_NOTMORESPECIFIC is unset - if (fully_covers || iszero(meth.dispatch_status & METHOD_SIG_LATEST_HAS_NOTMORESPECIFIC)) && - !iszero(mi.dispatch_status & METHOD_SIG_LATEST_ONLY) + # Fast path is legal when fully_covers=true + if fully_covers && !iszero(mi.dispatch_status & METHOD_SIG_LATEST_ONLY) minworld = meth.primary_world @assert minworld ≤ world maxworld = typemax(UInt) - result = Any[] # result is unused - return minworld, maxworld, result + return minworld, maxworld end end - # Fast path is legal when fully_covers=true OR when METHOD_SIG_LATEST_HAS_NOTMORESPECIFIC is unset - if fully_covers || iszero(meth.dispatch_status & METHOD_SIG_LATEST_HAS_NOTMORESPECIFIC) - if !iszero(meth.dispatch_status & METHOD_SIG_LATEST_ONLY) - minworld = meth.primary_world - @assert minworld ≤ world - maxworld = typemax(UInt) - result = Any[] # result is unused - return minworld, maxworld, result + # Fast path is legal when fully_covers=true + if fully_covers && !iszero(meth.dispatch_status & METHOD_SIG_LATEST_ONLY) + minworld = meth.primary_world + @assert minworld ≤ world + maxworld = typemax(UInt) + return minworld, maxworld + end + end + elseif n > 1 + # Try the interference set fast path: check if all interference sets are covered by expecteds + interference_fast_path_success = fully_covers + # If it didn't fail yet, then check that all interference methods are either expected, or not applicable. 
+ if interference_fast_path_success + local interference_minworld::UInt = 1 + for j = 1:n + meth = get_method_from_edge(expecteds[i+j-1]) + if interference_minworld < meth.primary_world + interference_minworld = meth.primary_world + end + interferences = meth.interferences + for k = 1:length(interferences) + isassigned(interferences, k) || break # no more entries + interference_method = interferences[k]::Method + if iszero(interference_method.dispatch_status & METHOD_SIG_LATEST_WHICH) + # detected a deleted interference_method, so need the full lookup to compute minworld + interference_fast_path_success = false + break + end + world < interference_method.primary_world && break # this and later entries are for a future world + local found_in_expecteds = false + for j = 1:n + if interference_method === get_method_from_edge(expecteds[i+j-1]) + found_in_expecteds = true + break + end + end + if !found_in_expecteds + ti = typeintersect(sig, interference_method.sig) + if !(ti === Union{}) + # try looking for a different expected method that fully covers this interference_method anyways over their intersection + for j = 1:n + meth2 = get_method_from_edge(expecteds[i+j-1]) + if method_morespecific_via_interferences(meth2, interference_method) && ti <: meth2.sig + found_in_expecteds = true + break + end + end + if !found_in_expecteds + meth2 = get_method_from_edge(expecteds[i]) + interference_fast_path_success = false + break + end + end + end + end + if !interference_fast_path_success + break end end + if interference_fast_path_success + # All interference sets are covered by expecteds, can return success + @assert interference_minworld ≤ world + maxworld = typemax(UInt) + return interference_minworld, maxworld + end end - end + end # next, compare the current result of ml_matches to the old result lim = _jl_debug_method_invalidation[] !== nothing ? 
Int(typemax(Int32)) : n minworld = Ref{UInt}(1) @@ -336,6 +462,7 @@ function verify_call(@nospecialize(sig), expecteds::Core.SimpleVector, i::Int, n has_ambig = Ref{Int32}(0) result = _methods_by_ftype(sig, nothing, lim, world, #=ambig=#false, minworld, maxworld, has_ambig) if result === nothing + empty!(matches) maxworld[] = 0 else # setdiff!(result, expected) @@ -348,17 +475,7 @@ function verify_call(@nospecialize(sig), expecteds::Core.SimpleVector, i::Int, n local found = false for j = 1:n t = expecteds[i+j-1] - if t isa Method - meth = t - else - if t isa CodeInstance - t = get_ci_mi(t)::MethodInstance - else - t = t::MethodInstance - end - meth = t.def::Method - end - if match.method == meth + if match.method == get_method_from_edge(t) found = true break end @@ -377,12 +494,13 @@ function verify_call(@nospecialize(sig), expecteds::Core.SimpleVector, i::Int, n end if maxworld[] ≠ typemax(UInt) && _jl_debug_method_invalidation[] !== nothing resize!(result, ins) + copy!(matches, result) end end if maxworld[] == typemax(UInt) && mi isa MethodInstance ccall(:jl_promote_mi_to_current, Cvoid, (Any, UInt, UInt), mi, minworld[], world) end - return minworld[], maxworld[], result + return minworld[], maxworld[] end # fast-path dispatch_status bit definitions (false indicates unknown) @@ -390,13 +508,11 @@ end const METHOD_SIG_LATEST_WHICH = 0x1 # true indicates this method would be returned as the only result from `methods` when calling `method.sig` in the current latest world const METHOD_SIG_LATEST_ONLY = 0x2 -# true indicates there exists some other method that is not more specific than this one in the current latest world (which might be more fully covering) -const METHOD_SIG_LATEST_HAS_NOTMORESPECIFIC = 0x8 -function verify_invokesig(@nospecialize(invokesig), expected::Method, world::UInt) +function verify_invokesig(@nospecialize(invokesig), expected::Method, world::UInt, matches::Vector{Any}) @assert invokesig isa Type local minworld::UInt, maxworld::UInt - matched = nothing + empty!(matches) if invokesig === expected.sig && !iszero(expected.dispatch_status & METHOD_SIG_LATEST_WHICH) # the invoke match is `expected` for `expected->sig`, unless `expected` is replaced minworld = expected.primary_world @@ -413,14 +529,13 @@ function verify_invokesig(@nospecialize(invokesig), expected::Method, world::UIn if matched === nothing maxworld = 0 else - matched = Any[matched.method] - if matched[] !== expected + matched = matched.method + push!(matches, matched) + if matched !== expected maxworld = 0 end end end end - return minworld, maxworld, matched + return minworld, maxworld end - -end # module StaticData diff --git a/src/gf.c b/src/gf.c index b0c7a65ea8f99..0757614b390db 100644 --- a/src/gf.c +++ b/src/gf.c @@ -302,27 +302,27 @@ JL_DLLEXPORT jl_value_t *jl_methtable_lookup(jl_value_t *type, size_t world) jl_method_t *jl_mk_builtin_func(jl_datatype_t *dt, jl_sym_t *sname, jl_fptr_args_t fptr) JL_GC_DISABLED { - jl_method_t *m = jl_new_method_uninit(jl_core_module); + jl_value_t *params[2]; + params[0] = dt->name->wrapper; + params[1] = jl_tparam0(jl_anytuple_type); + jl_datatype_t *tuptyp = (jl_datatype_t*)jl_apply_tuple_type_v(params, 2); + + jl_typemap_entry_t *newentry = NULL; + jl_method_t *m = NULL; + JL_GC_PUSH3(&m, &newentry, &tuptyp); + + m = jl_new_method_uninit(jl_core_module); m->name = sname; m->module = jl_core_module; m->isva = 1; m->nargs = 2; jl_atomic_store_relaxed(&m->primary_world, 1); jl_atomic_store_relaxed(&m->dispatch_status, METHOD_SIG_LATEST_ONLY | 
METHOD_SIG_LATEST_WHICH); - m->sig = (jl_value_t*)jl_anytuple_type; + m->sig = (jl_value_t*)tuptyp; m->slot_syms = jl_an_empty_string; m->nospecialize = 0; m->nospecialize = ~m->nospecialize; - jl_typemap_entry_t *newentry = NULL; - jl_datatype_t *tuptyp = NULL; - JL_GC_PUSH3(&m, &newentry, &tuptyp); - - jl_value_t *params[2]; - params[0] = dt->name->wrapper; - params[1] = jl_tparam0(jl_anytuple_type); - tuptyp = (jl_datatype_t*)jl_apply_tuple_type_v(params, 2); - jl_method_instance_t *mi = jl_get_specialized(m, (jl_value_t*)tuptyp, jl_emptysvec); jl_atomic_store_relaxed(&m->unspecialized, mi); jl_gc_wb(m, mi); @@ -1881,11 +1881,12 @@ struct matches_env { static int get_intersect_visitor(jl_typemap_entry_t *oldentry, struct typemap_intersection_env *closure0) { struct matches_env *closure = container_of(closure0, struct matches_env, match); + jl_method_t *oldmethod = oldentry->func.method; assert(oldentry != closure->newentry && "entry already added"); assert(jl_atomic_load_relaxed(&oldentry->min_world) <= jl_atomic_load_relaxed(&closure->newentry->min_world) && "old method cannot be newer than new method"); - assert(jl_atomic_load_relaxed(&oldentry->max_world) != jl_atomic_load_relaxed(&closure->newentry->min_world) && "method cannot be added at the same time as method deleted"); + //assert(jl_atomic_load_relaxed(&oldentry->max_world) != jl_atomic_load_relaxed(&closure->newentry->min_world) && "method cannot be added at the same time as method deleted"); + assert((jl_atomic_load_relaxed(&oldentry->max_world) == ~(size_t)0)); // don't need to consider other similar methods if this oldentry will always fully intersect with them and dominates all of them - jl_method_t *oldmethod = oldentry->func.method; if (closure->match.issubty // e.g. jl_subtype(closure->newentry.sig, oldentry->sig) && jl_subtype(oldmethod->sig, (jl_value_t*)closure->newentry->sig)) { // e.g. jl_type_equal(closure->newentry->sig, oldentry->sig) if (closure->replaced == NULL || jl_atomic_load_relaxed(&closure->replaced->min_world) < jl_atomic_load_relaxed(&oldentry->min_world)) @@ -1893,7 +1894,10 @@ static int get_intersect_visitor(jl_typemap_entry_t *oldentry, struct typemap_in } if (closure->shadowed == NULL) closure->shadowed = (jl_value_t*)jl_alloc_vec_any(0); - if (closure->match.issubty) { // this should be rarely true (in fact, get_intersect_visitor should be rarely true), but might as well skip the rest of the scan fast anyways since we can + // This should be rarely true (in fact, get_intersect_visitor should be + // rarely true), but might as well skip the rest of the scan fast anyways + // since we can. 
+ if (closure->match.issubty) { int only = jl_atomic_load_relaxed(&oldmethod->dispatch_status) & METHOD_SIG_LATEST_ONLY; if (only) { size_t len = jl_array_nrows(closure->shadowed); @@ -2397,7 +2401,6 @@ struct invalidate_mt_env { jl_typemap_entry_t *newentry; jl_array_t *shadowed; size_t max_world; - int invalidated; }; static int invalidate_mt_cache(jl_typemap_entry_t *oldentry, void *closure0) { @@ -2439,7 +2442,6 @@ static int invalidate_mt_cache(jl_typemap_entry_t *oldentry, void *closure0) JL_GC_POP(); } jl_atomic_store_relaxed(&oldentry->max_world, env->max_world); - env->invalidated = 1; } } return 1; @@ -2584,7 +2586,7 @@ JL_DLLEXPORT void jl_method_table_disable(jl_method_t *method) jl_error("Method changes have been disabled via a call to disable_new_worlds."); int enabled = jl_atomic_load_relaxed(&methodentry->max_world) == ~(size_t)0; if (enabled) { - // Narrow the world age on the method to make it uncallable + // Narrow the world age on the method to make it uncallable size_t world = jl_atomic_load_relaxed(&jl_world_counter); assert(method == methodentry->func.method); jl_atomic_store_relaxed(&method->dispatch_status, 0); @@ -2592,7 +2594,7 @@ JL_DLLEXPORT void jl_method_table_disable(jl_method_t *method) jl_atomic_store_relaxed(&methodentry->max_world, world); jl_method_table_invalidate(method, world); jl_atomic_store_release(&jl_world_counter, world + 1); - } + } JL_UNLOCK(&world_counter_lock); if (!enabled) jl_errorf("Method of %s already disabled", jl_symbol_name(method->name)); @@ -2621,6 +2623,132 @@ jl_typemap_entry_t *jl_method_table_add(jl_methtable_t *mt, jl_method_t *method, return newentry; } +static int has_key(jl_genericmemory_t *keys, jl_value_t *key) +{ + for (size_t l = keys->length, i = 0; i < l; i++) { + jl_value_t *k = jl_genericmemory_ptr_ref(keys, i); + if (k == NULL) + return 0; + if (jl_genericmemory_ptr_ref(keys, i) == key) + return 1; + } + return 0; +} + +// Check if m2 is in m1's interferences set, which means !morespecific(m1, m2) +static int method_in_interferences(jl_method_t *m2, jl_method_t *m1) +{ + return has_key(jl_atomic_load_relaxed(&m1->interferences), (jl_value_t*)m2); +} + +// Find the index of a method in the method match array +static int find_method_in_matches(jl_array_t *t, jl_method_t *method) +{ + size_t len = jl_array_nrows(t); + for (size_t i = 0; i < len; i++) { + jl_method_match_t *matc = (jl_method_match_t*)jl_array_ptr_ref(t, i); + if (matc->method == method) + return i; + } + return -1; +} + +// Recursively check if any method in interferences covers the given type signature +static int check_interferences_covers(jl_method_t *m, jl_value_t *ti, jl_array_t *t, arraylist_t *visited, arraylist_t *recursion_stack) +{ + // Check if we're already visiting this method (cycle detection and memoization) + for (size_t i = 0; i < recursion_stack->len; i++) + if (recursion_stack->items[i] == (void*)m) + return 0; + + // Add this method to the recursion stack + arraylist_push(recursion_stack, (void*)m); + + jl_genericmemory_t *interferences = jl_atomic_load_relaxed(&m->interferences); + for (size_t i = 0; i < interferences->length; i++) { + jl_method_t *m2 = (jl_method_t*)jl_genericmemory_ptr_ref(interferences, i); + if (m2 == NULL) + continue; + int idx = find_method_in_matches(t, m2); + if (idx < 0) + continue; + if (method_in_interferences(m, m2)) + continue; // ambiguous + assert(visited->items[idx] != (void*)0); + if (visited->items[idx] != (void*)1) + continue; // part of the same SCC cycle (handled by ambiguity later) + if 
(jl_subtype(ti, m2->sig)) + return 1; + // Recursively check m2's interferences since m2 is more specific + if (check_interferences_covers(m2, ti, t, visited, recursion_stack)) + return 1; + } + return 0; +} + +static int check_fully_ambiguous(jl_method_t *m, jl_value_t *ti, jl_array_t *t, int include_ambiguous, int *has_ambiguity) +{ + jl_genericmemory_t *interferences = jl_atomic_load_relaxed(&m->interferences); + for (size_t i = 0; i < interferences->length; i++) { + jl_method_t *m2 = (jl_method_t*)jl_genericmemory_ptr_ref(interferences, i); + if (m2 == NULL) + continue; + int idx = find_method_in_matches(t, m2); + if (idx < 0) + continue; + if (!method_in_interferences(m, m2)) + continue; + *has_ambiguity = 1; + if (!include_ambiguous && jl_subtype(ti, m2->sig)) + return 1; + } + return 0; +} + +// Recursively check if target_method is in the interferences of (morespecific than) start_method, but not the reverse +static int method_in_interferences_recursive(jl_method_t *target_method, jl_method_t *start_method, arraylist_t *seen) +{ + // Check direct interferences first + if (method_in_interferences(start_method, target_method)) + return 0; + if (method_in_interferences(target_method, start_method)) + return 1; + + // Check if we're already visiting this method (cycle prevention and memoization) + for (size_t i = 0; i < seen->len; i++) { + if (seen->items[i] == (void*)start_method) + return 0; + } + arraylist_push(seen, (void*)start_method); + + // Recursively check interferences + jl_genericmemory_t *interferences = jl_atomic_load_relaxed(&start_method->interferences); + for (size_t i = 0; i < interferences->length; i++) { + jl_method_t *interference_method = (jl_method_t*)jl_genericmemory_ptr_ref(interferences, i); + if (interference_method == NULL) + continue; + if (method_in_interferences(start_method, interference_method)) + continue; // only follow edges to morespecific methods in search of morespecific target (skip ambiguities) + if (method_in_interferences_recursive(target_method, interference_method, seen)) + return 1; + } + + return 0; +} + +static int method_morespecific_via_interferences(jl_method_t *target_method, jl_method_t *start_method) +{ + if (target_method == start_method) + return 0; + arraylist_t seen; + arraylist_new(&seen, 0); + int result = method_in_interferences_recursive(target_method, start_method, &seen); + arraylist_free(&seen); + //assert(result == jl_method_morespecific(target_method, start_method) || jl_has_empty_intersection(target_method->sig, start_method->sig) || jl_has_empty_intersection(start_method->sig, target_method->sig)); + return result; +} + + void jl_method_table_activate(jl_typemap_entry_t *newentry) { JL_TIMING(ADD_METHOD, ADD_METHOD); @@ -2644,48 +2772,100 @@ void jl_method_table_activate(jl_typemap_entry_t *newentry) jl_value_t *loctag = NULL; // debug info for invalidation jl_value_t *isect = NULL; jl_value_t *isect2 = NULL; - jl_value_t *isect3 = NULL; - JL_GC_PUSH6(&oldvalue, &oldmi, &loctag, &isect, &isect2, &isect3); + jl_genericmemory_t *interferences = NULL; + JL_GC_PUSH6(&oldvalue, &oldmi, &loctag, &isect, &isect2, &interferences); jl_typemap_entry_t *replaced = NULL; - // then check what entries we replaced + // Check what entries this intersects with in the prior world. 
oldvalue = get_intersect_matches(jl_atomic_load_relaxed(&mt->defs), newentry, &replaced, max_world); + jl_method_t *const *d; + size_t j, n; + if (oldvalue == NULL) { + d = NULL; + n = 0; + } + else { + assert(jl_is_array(oldvalue)); + d = (jl_method_t**)jl_array_ptr_data(oldvalue); + n = jl_array_nrows(oldvalue); + oldmi = jl_alloc_vec_any(0); + } + // These get updated from their state stored in the cache files, since content in cache files gets added "all at once". int invalidated = 0; int dispatch_bits = METHOD_SIG_LATEST_WHICH; // Always set LATEST_WHICH // Check precompiled dispatch status bits int precompiled_status = jl_atomic_load_relaxed(&method->dispatch_status); if (!(precompiled_status & METHOD_SIG_PRECOMPILE_MANY)) + // This will store whether this method is currently the only result that would be returned from `ml_matches` given `sig`. dispatch_bits |= METHOD_SIG_LATEST_ONLY; // Tentatively set, will be cleared if not applicable - if (precompiled_status & METHOD_SIG_PRECOMPILE_HAS_NOTMORESPECIFIC) - dispatch_bits |= METHOD_SIG_LATEST_HAS_NOTMORESPECIFIC; - if (replaced) { - oldvalue = (jl_value_t*)replaced; - jl_method_t *m = replaced->func.method; - invalidated = 1; - method_overwrite(newentry, m); - // This is an optimized version of below, given we know the type-intersection is exact - jl_method_table_invalidate(m, max_world); - int m_dispatch = jl_atomic_load_relaxed(&m->dispatch_status); - // Clear METHOD_SIG_LATEST_ONLY and METHOD_SIG_LATEST_WHICH bits, only keeping NOTMORESPECIFIC - jl_atomic_store_relaxed(&m->dispatch_status, m_dispatch & METHOD_SIG_LATEST_HAS_NOTMORESPECIFIC); - // Edge case: don't set dispatch_bits |= METHOD_SIG_LATEST_HAS_NOTMORESPECIFIC unconditionally since `m` is not an visible method for invalidations - dispatch_bits |= (m_dispatch & METHOD_SIG_LATEST_HAS_NOTMORESPECIFIC); - if (!(m_dispatch & METHOD_SIG_LATEST_ONLY)) - dispatch_bits &= ~METHOD_SIG_LATEST_ONLY; - } - else { - jl_method_t *const *d; - size_t j, n; - if (oldvalue == NULL) { + // Holds the set of all intersecting methods not more specific than this one. + // Note: this set may be incomplete (may exclude methods whose intersection + // is covered by another method that is morespecific than both, causing them + // to have no relevant type intersection for sorting).
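// For intuition about the `interferences` set described above: in the simplest case it
// links mutually ambiguous definitions, where neither method is more specific than the
// other, so each method ends up recorded in the other's set. A minimal Julia sketch of
// such a pair (hypothetical names, not taken from the patch's tests):
//
//     g_demo(x::Integer, y) = 1
//     g_demo(x, y::Integer) = 2
//
// Both signatures intersect over (Integer, Integer) and neither is more specific than
// the other, so after the second definition each method holds the other in
// `interferences`, and a call such as g_demo(1, 2) is reported as ambiguous.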
+ interferences = (jl_genericmemory_t*)jl_atomic_load_relaxed(&method->interferences); + if (oldvalue) { + assert(n > 0); + if (replaced) { + oldvalue = (jl_value_t*)replaced; + jl_method_t *m = replaced->func.method; + invalidated = 1; + method_overwrite(newentry, m); + // This is an optimized version of below, given we know the type-intersection is exact + jl_method_table_invalidate(m, max_world); + int m_dispatch = jl_atomic_load_relaxed(&m->dispatch_status); + // Clear METHOD_SIG_LATEST_ONLY and METHOD_SIG_LATEST_WHICH bits + jl_atomic_store_relaxed(&m->dispatch_status, 0); + if (!(m_dispatch & METHOD_SIG_LATEST_ONLY)) + dispatch_bits &= ~METHOD_SIG_LATEST_ONLY; + // Take over the interference list from the replaced method + jl_genericmemory_t *m_interferences = jl_atomic_load_relaxed(&m->interferences); + if (interferences->length == 0) { + interferences = jl_genericmemory_copy(m_interferences); + } + else { + for (size_t i = 0; i < m_interferences->length; i++) { + jl_value_t *k = jl_genericmemory_ptr_ref(m_interferences, i); + if (k && !has_key(interferences, (jl_value_t*)k)) { + ssize_t idx; + interferences = jl_idset_put_key(interferences, (jl_value_t*)k, &idx); + } + } + } + ssize_t idx; + m_interferences = jl_idset_put_key(m_interferences, (jl_value_t*)method, &idx); + jl_atomic_store_release(&m->interferences, m_interferences); + jl_gc_wb(m, m_interferences); + for (j = 0; j < n; j++) { + jl_method_t *m2 = d[j]; + if (m2 && method_in_interferences(m, m2)) { + jl_genericmemory_t *m2_interferences = jl_atomic_load_relaxed(&m2->interferences); + ssize_t idx; + m2_interferences = jl_idset_put_key(m2_interferences, (jl_value_t*)method, &idx); + jl_atomic_store_release(&m2->interferences, m2_interferences); + jl_gc_wb(m2, m2_interferences); + } + } + loctag = jl_atomic_load_relaxed(&m->specializations); // use loctag for a gcroot + _Atomic(jl_method_instance_t*) *data; + size_t l; + if (jl_is_svec(loctag)) { + data = (_Atomic(jl_method_instance_t*)*)jl_svec_data(loctag); + l = jl_svec_len(loctag); + } + else { + data = (_Atomic(jl_method_instance_t*)*) &loctag; + l = 1; + } + for (size_t i = 0; i < l; i++) { + jl_method_instance_t *mi = jl_atomic_load_relaxed(&data[i]); + if ((jl_value_t*)mi == jl_nothing) + continue; + jl_array_ptr_1d_push(oldmi, (jl_value_t*)mi); + } d = NULL; n = 0; } else { - assert(jl_is_array(oldvalue)); - d = (jl_method_t**)jl_array_ptr_data(oldvalue); - n = jl_array_nrows(oldvalue); - - oldmi = jl_alloc_vec_any(0); char *morespec = (char*)alloca(n); // Compute all morespec values upfront for (j = 0; j < n; j++) @@ -2699,16 +2879,27 @@ void jl_method_table_activate(jl_typemap_entry_t *newentry) if (morespec[j] || ambig) { // !morespecific(new, old) dispatch_bits &= ~METHOD_SIG_LATEST_ONLY; - m_dispatch |= METHOD_SIG_LATEST_HAS_NOTMORESPECIFIC; + // Add the old method to this interference set + ssize_t idx; + if (!has_key(interferences, (jl_value_t*)m)) + interferences = jl_idset_put_key(interferences, (jl_value_t*)m, &idx); } if (!morespec[j]) { // !morespecific(old, new) - dispatch_bits |= METHOD_SIG_LATEST_HAS_NOTMORESPECIFIC; m_dispatch &= ~METHOD_SIG_LATEST_ONLY; + // Add the new method to its interference set + jl_genericmemory_t *m_interferences = jl_atomic_load_relaxed(&m->interferences); + ssize_t idx; + m_interferences = jl_idset_put_key(m_interferences, (jl_value_t*)method, &idx); + jl_atomic_store_release(&m->interferences, m_interferences); + jl_gc_wb(m, m_interferences); } + // Add methods that intersect but are not more specific to interference 
list jl_atomic_store_relaxed(&m->dispatch_status, m_dispatch); if (morespec[j]) continue; + + // Now examine if this caused any invalidations. loctag = jl_atomic_load_relaxed(&m->specializations); // use loctag for a gcroot _Atomic(jl_method_instance_t*) *data; size_t l; @@ -2724,12 +2915,11 @@ void jl_method_table_activate(jl_typemap_entry_t *newentry) jl_method_instance_t *mi = jl_atomic_load_relaxed(&data[i]); if ((jl_value_t*)mi == jl_nothing) continue; - isect3 = jl_type_intersection(m->sig, (jl_value_t*)mi->specTypes); - if (jl_type_intersection2(type, isect3, &isect, &isect2)) { + if (jl_type_intersection2(type, mi->specTypes, &isect, &isect2)) { // Replacing a method--see if this really was the selected method previously - // over the intersection (not ambiguous) and the new method will be selected now (morespec_is). + // over the intersection (not ambiguous) and the new method will be selected now (morespec). // TODO: this only checks pair-wise for ambiguities, but the ambiguities could arise from the interaction of multiple methods - // and thus might miss a case where we introduce an ambiguity between two existing methods + // and thus might miss a case where we introduce an ambiguity between two existing methods // We could instead work to sort this into 3 groups `morespecific .. ambiguous .. lesspecific`, with `type` in ambiguous, // such that everything in `morespecific` dominates everything in `ambiguous`, and everything in `ambiguous` dominates everything in `lessspecific` // And then compute where each isect falls, and whether it changed group--necessitating invalidation--or not. @@ -2738,9 +2928,10 @@ void jl_method_table_activate(jl_typemap_entry_t *newentry) // call invalidate_backedges(mi, max_world, "jl_method_table_insert"); // but ignore invoke-type edges int invalidatedmi = _invalidate_dispatch_backedges(mi, type, m, d, n, replaced_dispatch, ambig, max_world, morespec); - if (replaced_dispatch) + if (replaced_dispatch) { jl_atomic_store_relaxed(&mi->dispatch_status, 0); - jl_array_ptr_1d_push(oldmi, (jl_value_t*)mi); + jl_array_ptr_1d_push(oldmi, (jl_value_t*)mi); + } if (_jl_debug_method_invalidation && invalidatedmi) { jl_array_ptr_1d_push(_jl_debug_method_invalidation, (jl_value_t*)mi); loctag = jl_cstr_to_string("jl_method_table_insert"); @@ -2748,49 +2939,64 @@ void jl_method_table_activate(jl_typemap_entry_t *newentry) } invalidated |= invalidatedmi; } + // TODO: do we have any interesting cases left where isect3 is useful? + //jl_value_t *isect3 = NULL; + //jl_value_t *isect4 = NULL; + //jl_value_t *isect5 = NULL; + //JL_GC_PUSH3(&isect3, &isect4, &isect5); + //isect3 = jl_type_intersection(m->sig, (jl_value_t*)mi->specTypes); + //jl_type_intersection2(type, isect3, &isect4, &isect5); + //if (!jl_types_equal(isect, isect4) && (!isect2 || !jl_types_equal(isect2, isect4)) && + // (!isect5 || (!jl_types_equal(isect, isect5) && (!isect2 || !jl_types_equal(isect2, isect5))))) { + // jl_(type); + // jl_(mi->specTypes); + // jl_(m->sig); + //} + //JL_GC_POP(); + isect = NULL; + isect2 = NULL; } } } + } - jl_methcache_t *mc = jl_method_table->cache; - JL_LOCK(&mc->writelock); - struct _typename_invalidate_backedge typename_env = {type, &isect, &isect2, d, n, max_world, invalidated}; - if (!jl_foreach_top_typename_for(_typename_invalidate_backedges, type, 1, &typename_env)) { - // if the new method cannot be split into exact backedges, scan the whole table for anything that might be affected - jl_genericmemory_t *allbackedges = jl_method_table->backedges; - for
(size_t i = 0, n = allbackedges->length; i < n; i += 2) { - jl_value_t *tn = jl_genericmemory_ptr_ref(allbackedges, i); - jl_value_t *backedges = jl_genericmemory_ptr_ref(allbackedges, i+1); - if (tn && tn != jl_nothing && backedges) - _typename_invalidate_backedges((jl_typename_t*)tn, 0, &typename_env); - } - } - invalidated |= typename_env.invalidated; - if (oldmi && jl_array_nrows(oldmi)) { - // search mc->cache and leafcache and drop anything that might overlap with the new method - // this is very cheap, so we don't mind being fairly conservative at over-approximating this - struct invalidate_mt_env mt_cache_env; - mt_cache_env.max_world = max_world; - mt_cache_env.shadowed = oldmi; - mt_cache_env.newentry = newentry; - mt_cache_env.invalidated = 0; - - jl_typemap_visitor(jl_atomic_load_relaxed(&mc->cache), invalidate_mt_cache, (void*)&mt_cache_env); - jl_genericmemory_t *leafcache = jl_atomic_load_relaxed(&mc->leafcache); - size_t i, l = leafcache->length; - for (i = 1; i < l; i += 2) { - jl_value_t *entry = jl_genericmemory_ptr_ref(leafcache, i); - if (entry) { - while (entry != jl_nothing) { - invalidate_mt_cache((jl_typemap_entry_t*)entry, (void*)&mt_cache_env); - entry = (jl_value_t*)jl_atomic_load_relaxed(&((jl_typemap_entry_t*)entry)->next); - } + jl_methcache_t *mc = jl_method_table->cache; + JL_LOCK(&mc->writelock); + struct _typename_invalidate_backedge typename_env = {type, &isect, &isect2, d, n, max_world, invalidated}; + if (!jl_foreach_top_typename_for(_typename_invalidate_backedges, type, 1, &typename_env)) { + // if the new method cannot be split into exact backedges, scan the whole table for anything that might be affected + jl_genericmemory_t *allbackedges = jl_method_table->backedges; + for (size_t i = 0, n = allbackedges->length; i < n; i += 2) { + jl_value_t *tn = jl_genericmemory_ptr_ref(allbackedges, i); + jl_value_t *backedges = jl_genericmemory_ptr_ref(allbackedges, i+1); + if (tn && tn != jl_nothing && backedges) + _typename_invalidate_backedges((jl_typename_t*)tn, 0, &typename_env); + } + } + invalidated |= typename_env.invalidated; + if (oldmi && jl_array_nrows(oldmi)) { + // drop leafcache and search mc->cache and drop anything that might overlap with the new method + // this is very cheap, so we don't mind being very conservative at over-approximating this + struct invalidate_mt_env mt_cache_env; + mt_cache_env.max_world = max_world; + mt_cache_env.shadowed = oldmi; + mt_cache_env.newentry = newentry; + + jl_typemap_visitor(jl_atomic_load_relaxed(&mc->cache), invalidate_mt_cache, (void*)&mt_cache_env); + jl_genericmemory_t *leafcache = jl_atomic_load_relaxed(&mc->leafcache); + size_t i, l = leafcache->length; + for (i = 1; i < l; i += 2) { + jl_value_t *entry = jl_genericmemory_ptr_ref(leafcache, i); + if (entry) { + while (entry != jl_nothing) { + jl_atomic_store_relaxed(&((jl_typemap_entry_t*)entry)->max_world, max_world); + entry = (jl_value_t*)jl_atomic_load_relaxed(&((jl_typemap_entry_t*)entry)->next); } } - invalidated |= mt_cache_env.invalidated; } - JL_UNLOCK(&mc->writelock); + jl_atomic_store_relaxed(&mc->leafcache, (jl_genericmemory_t*)jl_an_empty_memory_any); } + JL_UNLOCK(&mc->writelock); if (invalidated && _jl_debug_method_invalidation) { jl_array_ptr_1d_push(_jl_debug_method_invalidation, (jl_value_t*)method); loctag = jl_cstr_to_string("jl_method_table_insert"); @@ -2798,6 +3004,8 @@ void jl_method_table_activate(jl_typemap_entry_t *newentry) } jl_atomic_store_relaxed(&newentry->max_world, ~(size_t)0); 
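// The block above drops the leafcache wholesale and narrows max_world on any typemap
// cache entries that might overlap the new method; this leans on the world-age
// mechanism, since inserting a method advances the global world counter and entries
// capped at the old world become invisible to later callers. A rough Julia-level
// sketch of that effect (f_demo is a hypothetical function; Base.get_world_counter is
// the real query):
//
//     f_demo(x) = 1
//     w0 = Base.get_world_counter()
//     f_demo(x::Int) = 2          # method insertion bumps the world counter
//     @assert Base.get_world_counter() > w0
//
// Any cache entry whose max_world was narrowed to w0 is skipped by lookups running in
// the newer world.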
jl_atomic_store_relaxed(&method->dispatch_status, dispatch_bits); // TODO: this should be sequenced fully after the world counter store + jl_atomic_store_release(&method->interferences, interferences); + jl_gc_wb(method, interferences); JL_GC_POP(); } @@ -4238,16 +4446,9 @@ static int ml_matches_visitor(jl_typemap_entry_t *ml, struct typemap_intersectio return 1; } -static int ml_mtable_visitor(jl_methtable_t *mt, void *closure0) -{ - struct typemap_intersection_env* env = (struct typemap_intersection_env*)closure0; - return jl_typemap_intersection_visitor(jl_atomic_load_relaxed(&mt->defs), 0, env); -} - // Visit the candidate methods, starting from t[idx], to determine a possible valid sort ordering, // where every morespecific method appears before any method which it has a common -// intersection with but is not partly ambiguous with (ambiguity is transitive, particularly -// if lim==-1, although morespecific is not transitive). +// intersection with but is not partly ambiguous with (ambiguity is not transitive, since morespecific is not transitive). // Implements Tarjan's SCC (strongly connected components) algorithm, simplified to remove the count variable // Inputs: // * `t`: the array of vertexes (method matches) @@ -4255,260 +4456,118 @@ static int ml_mtable_visitor(jl_methtable_t *mt, void *closure0) // * `visited`: the state of the algorithm for each vertex in `t`: either 1 if we visited it already or 1+depth if we are visiting it now // * `stack`: the state of the algorithm for the current vertex (up to length equal to `t`): the list of all vertexes currently in the depth-first path or in the current SCC // * `result`: the output of the algorithm, a sorted list of vertexes (up to length `lim`) -// * `allambig`: a list of all vertexes with an ambiguity (up to length equal to `t`), discovered while running the rest of the algorithm +// * `recursion_stack`: an array for temporary use // * `lim`: either -1 for unlimited matches, or the maximum length for `result` before returning failure (return -1). -// If specified as -1, this will return extra matches that would have been elided from the list because they were already covered by an earlier match. -// This gives a sort of maximal set of matching methods (up to the first minmax method). -// If specified as -1, the sorting will also include all "weak" edges (every ambiguous pair) which will create much larger ambiguity cycles, -// resulting in a less accurate sort order and much less accurate `*has_ambiguity` result. 
// * `include_ambiguous`: whether to filter out fully ambiguous matches from `result` // * `*has_ambiguity`: whether the algorithm does not need to compute if there is an unresolved ambiguity // * `*found_minmax`: whether there is a minmax method already found, so future fully_covers matches should be ignored // Outputs: -// * `*has_ambiguity`: whether the caller should check if there remains an unresolved ambiguity (in `allambig`) +// * `*has_ambiguity`: whether there are any ambiguities that mean the sort order is not exact // Returns: // * -1: too many matches for lim, other outputs are undefined // * 0: the child(ren) have been added to the output // * 1+: the children are part of this SCC (up to this depth) // TODO: convert this function into an iterative call, rather than recursive -static int sort_mlmatches(jl_array_t *t, size_t idx, arraylist_t *visited, arraylist_t *stack, arraylist_t *result, arraylist_t *allambig, int lim, int include_ambiguous, int *has_ambiguity, int *found_minmax) +static int sort_mlmatches(jl_array_t *t, size_t idx, arraylist_t *visited, arraylist_t *stack, arraylist_t *result, arraylist_t *recursion_stack, int lim, int include_ambiguous, int *has_ambiguity, int *found_minmax) { size_t cycle = (size_t)visited->items[idx]; if (cycle != 0) return cycle - 1; // depth remaining - jl_method_match_t *matc = (jl_method_match_t*)jl_array_ptr_ref(t, idx); - jl_method_t *m = matc->method; - jl_value_t *ti = (jl_value_t*)matc->spec_types; - int subt = matc->fully_covers != NOT_FULLY_COVERS; // jl_subtype((jl_value_t*)type, (jl_value_t*)m->sig) - // first check if this new method is actually already fully covered by an - // existing match and we can just ignore this entry quickly - size_t result_len = 0; - if (subt) { - if (*found_minmax == 2) - visited->items[idx] = (void*)1; - } - else if (lim != -1) { - for (; result_len < result->len; result_len++) { - size_t idx2 = (size_t)result->items[result_len]; - jl_method_match_t *matc2 = (jl_method_match_t*)jl_array_ptr_ref(t, idx2); - jl_method_t *m2 = matc2->method; - if (jl_subtype(ti, m2->sig)) { - if (include_ambiguous) { - if (!jl_method_morespecific(m2, m)) - continue; - } - visited->items[idx] = (void*)1; - break; - } - } - } - if ((size_t)visited->items[idx] == 1) - return 0; arraylist_push(stack, (void*)idx); size_t depth = stack->len; - visited->items[idx] = (void*)(1 + depth); - cycle = depth; - int addambig = 0; - int mayexclude = 0; - // First visit all "strong" edges where the child is definitely better. - // This likely won't hit any cycles, but might (because morespecific is not transitive). - // Along the way, record if we hit any ambiguities-we may need to track those later. 
- for (size_t childidx = 0; childidx < jl_array_nrows(t); childidx++) { - if (childidx == idx) - continue; - int child_cycle = (size_t)visited->items[childidx]; - if (child_cycle == 1) - continue; // already handled - if (child_cycle != 0 && child_cycle - 1 >= cycle) - continue; // already part of this cycle - jl_method_match_t *matc2 = (jl_method_match_t*)jl_array_ptr_ref(t, childidx); - jl_method_t *m2 = matc2->method; - int subt2 = matc2->fully_covers == FULLY_COVERS; // jl_subtype((jl_value_t*)type, (jl_value_t*)m2->sig) - // TODO: we could change this to jl_has_empty_intersection(ti, (jl_value_t*)matc2->spec_types); - // since we only care about sorting of the intersections the user asked us about - if (!subt2 && jl_has_empty_intersection(m2->sig, m->sig)) - continue; - int msp = jl_method_morespecific(m, m2); - int msp2 = !msp && jl_method_morespecific(m2, m); - if (!msp) { - if (subt || !include_ambiguous || (lim != -1 && msp2)) { - if (subt2 || ((lim != -1 || (!include_ambiguous && !msp2)) && jl_subtype((jl_value_t*)ti, m2->sig))) { - // this may be filtered out as fully intersected, if applicable later - mayexclude = 1; - } - } - if (!msp2) { - addambig = 1; // record there is a least one previously-undetected ambiguity that may need to be investigated later (between m and m2) - } - } - if (lim == -1 ? msp : !msp2) // include only strong or also weak edges, depending on whether the result size is limited - continue; - // m2 is (lim!=-1 ? better : not-worse), so attempt to visit it first - // if limited, then we want to visit only better edges, because that results in finding k best matches quickest - // if not limited, then we want to visit all edges, since that results in finding the largest SCC cycles, which requires doing the fewest intersections - child_cycle = sort_mlmatches(t, childidx, visited, stack, result, allambig, lim, include_ambiguous, has_ambiguity, found_minmax); - if (child_cycle == -1) - return -1; - if (child_cycle && child_cycle < cycle) { + { // scope block + visited->items[idx] = (void*)(1 + depth); + jl_method_match_t *matc = (jl_method_match_t*)jl_array_ptr_ref(t, idx); + jl_method_t *m = matc->method; + jl_value_t *ti = (jl_value_t*)matc->spec_types; + int subt = matc->fully_covers != NOT_FULLY_COVERS; // jl_subtype((jl_value_t*)type, (jl_value_t*)m->sig) + jl_genericmemory_t *interferences = jl_atomic_load_relaxed(&m->interferences); + cycle = depth; + // Iterate over the interferences set to get the morespecific methods + for (size_t i = 0; i < interferences->length; i++) { + jl_method_t *m2 = (jl_method_t*)jl_genericmemory_ptr_ref(interferences, i); + if (m2 == NULL) + continue; + int childidx = find_method_in_matches(t, m2); + if (childidx < 0 || (size_t)childidx == idx) + continue; + int child_cycle = (size_t)visited->items[childidx]; + if (child_cycle == 1) + continue; // already handled + if (child_cycle != 0 && child_cycle - 1 >= cycle) + continue; // already part of this cycle + if (method_in_interferences(m, m2)) + continue; + // m2 is morespecific, so attempt to visit it first + child_cycle = sort_mlmatches(t, childidx, visited, stack, result, recursion_stack, lim, include_ambiguous, has_ambiguity, found_minmax); + if (child_cycle == -1) + return -1; // record the cycle will resolve at depth "cycle" - cycle = child_cycle; - } - if (stack->len == depth) { - // if this child resolved without hitting a cycle, then there is - // some probability that this method is already fully covered now - // (same check as before), and we can delete this vertex 
now without - // anyone noticing (too much) - if (subt) { - if (*found_minmax == 2) - visited->items[idx] = (void*)1; - } - else if (lim != -1) { - for (; result_len < result->len; result_len++) { - size_t idx2 = (size_t)result->items[result_len]; - jl_method_match_t *matc2 = (jl_method_match_t*)jl_array_ptr_ref(t, idx2); - jl_method_t *m2 = matc2->method; - if (jl_subtype(ti, m2->sig)) { - if (include_ambiguous) { - if (!jl_method_morespecific(m2, m)) - continue; - } - visited->items[idx] = (void*)1; - break; - } - } - } - if ((size_t)visited->items[idx] == 1) { - // n.b. cycle might be < depth, if we had a cycle with a child - // idx, but since we are on the top of the stack, nobody - // observed that and so we are content to ignore this - size_t childidx = (size_t)arraylist_pop(stack); - assert(childidx == idx); (void)childidx; - assert(!subt || *found_minmax == 2); - return 0; - } + if (child_cycle && child_cycle < cycle) + cycle = child_cycle; } - } - if (matc->fully_covers == NOT_FULLY_COVERS && addambig) - arraylist_push(allambig, (void*)idx); - if (cycle != depth) - return cycle; - result_len = result->len; - if (stack->len == depth) { - // Found one "best" method to add right now. But we might exclude it if - // we determined earlier that we had that option. - if (mayexclude) { - if (!subt || *found_minmax == 2) + // There is some probability that this method is already fully covered + // now, and we can delete this vertex now without anyone noticing. + if (subt && *found_minmax) { + if (*found_minmax == 2) visited->items[idx] = (void*)1; } + else if (check_interferences_covers(m, ti, t, visited, recursion_stack)) { + visited->items[idx] = (void*)1; + } + else if (check_fully_ambiguous(m, ti, t, include_ambiguous, has_ambiguity)) { + visited->items[idx] = (void*)1; + } + + // If there were no cycles hit either, then we can potentially delete all of its edges too. + if ((size_t)visited->items[idx] == 1 && stack->len == depth) { + // n.b. cycle might be < depth, if we had a cycle with a child + // idx, but since we are on the top of the stack, nobody + // observed that and so we are content to ignore this + size_t childidx = (size_t)arraylist_pop(stack); + assert(childidx == idx); (void)childidx; + return 0; + } + if (cycle != depth) + return cycle; } - else { - // We have a set of ambiguous methods. Record that. - // This is greatly over-approximated for lim==-1 - *has_ambiguity = 1; - // If we followed weak edges above, then this also fully closed the ambiguity cycle - if (lim == -1) - addambig = 0; - // If we're only returning possible matches, now filter out this method - // if its intersection is fully ambiguous in this SCC group. 
- // This is a repeat of the "first check", now that we have completed the cycle analysis + // If this is in an SCC group, do some additional checks before returning or setting has_ambiguity + if (depth != stack->len) { + int scc_count = 0; for (size_t i = depth - 1; i < stack->len; i++) { size_t childidx = (size_t)stack->items[i]; - jl_method_match_t *matc = (jl_method_match_t*)jl_array_ptr_ref(t, childidx); - jl_value_t *ti = (jl_value_t*)matc->spec_types; - int subt = matc->fully_covers != NOT_FULLY_COVERS; // jl_subtype((jl_value_t*)type, (jl_value_t*)m->sig) - if ((size_t)visited->items[childidx] == 1) { - assert(subt); - continue; - } - assert(visited->items[childidx] == (void*)(2 + i)); - // if we only followed strong edges before above - // check also if this set has an unresolved ambiguity missing from it - if (lim != -1 && !addambig) { - for (size_t j = 0; j < allambig->len; j++) { - if ((size_t)allambig->items[j] == childidx) { - addambig = 1; - break; - } - } - } - // always remove fully_covers matches after the first minmax ambiguity group is handled - if (subt) { - if (*found_minmax) - visited->items[childidx] = (void*)1; + if (visited->items[childidx] == (void*)1) continue; - } - else if (lim != -1) { - // when limited, don't include this match if it was covered by an earlier one - for (size_t result_len = 0; result_len < result->len; result_len++) { - size_t idx2 = (size_t)result->items[result_len]; - jl_method_match_t *matc2 = (jl_method_match_t*)jl_array_ptr_ref(t, idx2); - jl_method_t *m2 = matc2->method; - if (jl_subtype(ti, m2->sig)) { - if (include_ambiguous) { - if (!jl_method_morespecific(m2, m)) - continue; - } - visited->items[childidx] = (void*)1; - break; - } - } - } - } - if (!include_ambiguous && lim == -1) { - for (size_t i = depth - 1; i < stack->len; i++) { - size_t childidx = (size_t)stack->items[i]; - if ((size_t)visited->items[childidx] == 1) - continue; - jl_method_match_t *matc = (jl_method_match_t*)jl_array_ptr_ref(t, childidx); - jl_method_t *m = matc->method; - jl_value_t *ti = (jl_value_t*)matc->spec_types; - for (size_t j = depth - 1; j < stack->len; j++) { - if (i == j) - continue; - size_t idx2 = (size_t)stack->items[j]; - jl_method_match_t *matc2 = (jl_method_match_t*)jl_array_ptr_ref(t, idx2); - jl_method_t *m2 = matc2->method; - int subt2 = matc2->fully_covers == FULLY_COVERS; // jl_subtype((jl_value_t*)type, (jl_value_t*)m2->sig) - // if their intersection contributes to the ambiguity cycle - // and the contribution of m is fully ambiguous with the portion of the cycle from m2 - if (subt2 || jl_subtype((jl_value_t*)ti, m2->sig)) { - // but they aren't themselves simply ordered (here - // we don't consider that a third method might be - // disrupting that ordering and just consider them - // pairwise to keep this simple). 
- if (!jl_method_morespecific(m, m2) && !jl_method_morespecific(m2, m)) { - visited->items[childidx] = (void*)-1; - break; - } - } - } - } + scc_count++; } + if (scc_count > 1) + *has_ambiguity = 1; } // copy this cycle into the results for (size_t i = depth - 1; i < stack->len; i++) { size_t childidx = (size_t)stack->items[i]; + jl_method_match_t *matc = (jl_method_match_t*)jl_array_ptr_ref(t, childidx); + int subt = matc->fully_covers != NOT_FULLY_COVERS; + if (subt && *found_minmax) + visited->items[childidx] = (void*)1; if ((size_t)visited->items[childidx] == 1) continue; - if ((size_t)visited->items[childidx] != -1) { - assert(visited->items[childidx] == (void*)(2 + i)); - visited->items[childidx] = (void*)-1; - if (lim == -1 || result->len < lim) - arraylist_push(result, (void*)childidx); - else - return -1; - } + assert(visited->items[childidx] == (void*)(2 + i)); + visited->items[childidx] = (void*)1; + if (lim == -1 || result->len < lim) + arraylist_push(result, (void*)childidx); + else + return -1; } // now finally cleanup the stack while (stack->len >= depth) { size_t childidx = (size_t)arraylist_pop(stack); // always remove fully_covers matches after the first minmax ambiguity group is handled - //jl_method_match_t *matc = (jl_method_match_t*)jl_array_ptr_ref(t, childidx); - if (matc->fully_covers != NOT_FULLY_COVERS && !addambig) + jl_method_match_t *matc = (jl_method_match_t*)jl_array_ptr_ref(t, childidx); + int subt = matc->fully_covers == FULLY_COVERS; + if (subt && *found_minmax == 1) *found_minmax = 2; - if (visited->items[childidx] != (void*)-1) - continue; - visited->items[childidx] = (void*)1; + assert(visited->items[childidx] == (void*)1); } return 0; } @@ -4620,7 +4679,7 @@ static jl_value_t *ml_matches(jl_methtable_t *mt, jl_methcache_t *mc, } } // then scan everything - if (!ml_mtable_visitor(mt, &env.match) && env.t == jl_an_empty_vec_any) { + if (!jl_typemap_intersection_visitor(jl_atomic_load_relaxed(&mt->defs), 0, &env.match) && env.t == jl_an_empty_vec_any) { JL_GC_POP(); // if we return early without returning methods, set only the min/max valid collected from matching *min_valid = env.match.min_valid; @@ -4634,43 +4693,31 @@ static jl_value_t *ml_matches(jl_methtable_t *mt, jl_methcache_t *mc, env.match.ti = NULL; env.matc = NULL; env.match.env = NULL; search.env = NULL; size_t i, j, len = jl_array_nrows(env.t); jl_method_match_t *minmax = NULL; - int minmax_ambig = 0; - int all_subtypes = 1; + int any_subtypes = 0; if (len > 1) { // first try to pre-process the results to find the most specific - // result that fully covers the input, since we can do this in linear - // time, and the rest is O(n^2) + // result that fully covers the input, since we can do this in O(n^2) + // time, and the rest is O(n^3) // - first find a candidate for the best of these method results for (i = 0; i < len; i++) { jl_method_match_t *matc = (jl_method_match_t*)jl_array_ptr_ref(env.t, i); if (matc->fully_covers == FULLY_COVERS) { + any_subtypes = 1; jl_method_t *m = matc->method; - if (minmax != NULL) { - jl_method_t *minmaxm = minmax->method; - if (jl_method_morespecific(minmaxm, m)) + for (j = 0; j < len; j++) { + if (i == j) continue; + jl_method_match_t *matc2 = (jl_method_match_t*)jl_array_ptr_ref(env.t, j); + if (matc2->fully_covers == FULLY_COVERS) { + jl_method_t *m2 = matc2->method; + if (!method_morespecific_via_interferences(m, m2)) + break; + } } - minmax = matc; - } - else { - all_subtypes = 0; - } - } - // - then see if it dominated all of the other choices - if 
(minmax != NULL) { - for (i = 0; i < len; i++) { - jl_method_match_t *matc = (jl_method_match_t*)jl_array_ptr_ref(env.t, i); - if (matc == minmax) + if (j == len) { + // Found the minmax method + minmax = matc; break; - if (matc->fully_covers == FULLY_COVERS) { - jl_method_t *m = matc->method; - jl_method_t *minmaxm = minmax->method; - if (!jl_method_morespecific(minmaxm, m)) { - minmax_ambig = 1; - minmax = NULL; - has_ambiguity = 1; - break; - } } } } @@ -4683,24 +4730,31 @@ static jl_value_t *ml_matches(jl_methtable_t *mt, jl_methcache_t *mc, // cost much extra and is less likely to help us hit a fast path // (we will look for this later, when we compute ambig_groupid, for // correctness) - if (!all_subtypes && minmax != NULL) { - jl_method_t *minmaxm = minmax->method; - all_subtypes = 1; + int all_subtypes = any_subtypes; + if (any_subtypes) { + jl_method_t *minmaxm = NULL; + if (minmax != NULL) + minmaxm = minmax->method; for (i = 0; i < len; i++) { jl_method_match_t *matc = (jl_method_match_t*)jl_array_ptr_ref(env.t, i); if (matc->fully_covers != FULLY_COVERS) { jl_method_t *m = matc->method; - if (jl_method_morespecific(minmaxm, m)) - matc->fully_covers = SENTINEL; // put a sentinel value here for sorting - else - all_subtypes = 0; + if (minmaxm) { + if (method_morespecific_via_interferences(minmaxm, m)) { + matc->fully_covers = SENTINEL; // put a sentinel value here for sorting + continue; + } + if (method_in_interferences(minmaxm, m)) // !morespecific(m, minmaxm) + has_ambiguity = 1; + } + all_subtypes = 0; } } } // - now we might have a fast-return here, if we see that // we've already processed all of the possible outputs if (all_subtypes) { - if (minmax_ambig) { + if (minmax == NULL) { if (!include_ambiguous) { len = 0; env.t = jl_an_empty_vec_any; @@ -4711,7 +4765,6 @@ static jl_value_t *ml_matches(jl_methtable_t *mt, jl_methcache_t *mc, } } else { - assert(minmax != NULL); jl_array_ptr_set(env.t, 0, minmax); jl_array_del_end((jl_array_t*)env.t, len - 1); len = 1; @@ -4724,20 +4777,23 @@ static jl_value_t *ml_matches(jl_methtable_t *mt, jl_methcache_t *mc, } } if (len > 1) { - arraylist_t stack, visited, result, allambig; + arraylist_t stack, visited, result, recursion_stack; arraylist_new(&result, lim != -1 && lim < len ? lim : len); arraylist_new(&stack, 0); arraylist_new(&visited, len); - arraylist_new(&allambig, len); + arraylist_new(&recursion_stack, len); arraylist_grow(&visited, len); memset(visited.items, 0, len * sizeof(size_t)); // if we had a minmax method (any subtypes), now may now be able to // quickly cleanup some of methods int found_minmax = 0; - if (minmax != NULL) + if (has_ambiguity) + found_minmax = 1; + else if (minmax != NULL) found_minmax = 2; - else if (minmax_ambig && !include_ambiguous) + else if (any_subtypes && !include_ambiguous) found_minmax = 1; + has_ambiguity = 0; if (ambig == NULL) // if we don't care about the result, set it now so we won't bother attempting to compute it accurately later has_ambiguity = 1; for (i = 0; i < len; i++) { @@ -4748,9 +4804,9 @@ static jl_value_t *ml_matches(jl_methtable_t *mt, jl_methcache_t *mc, // by visiting it and it might be a bit costly continue; } - int child_cycle = sort_mlmatches((jl_array_t*)env.t, i, &visited, &stack, &result, &allambig, lim == -1 || minmax == NULL ? lim : lim - 1, include_ambiguous, &has_ambiguity, &found_minmax); + int child_cycle = sort_mlmatches((jl_array_t*)env.t, i, &visited, &stack, &result, &recursion_stack, lim == -1 || minmax == NULL ? 
lim : lim - 1, include_ambiguous, &has_ambiguity, &found_minmax); if (child_cycle == -1) { - arraylist_free(&allambig); + arraylist_free(&recursion_stack); arraylist_free(&visited); arraylist_free(&stack); arraylist_free(&result); @@ -4761,89 +4817,7 @@ static jl_value_t *ml_matches(jl_methtable_t *mt, jl_methcache_t *mc, assert(stack.len == 0); assert(visited.items[i] == (void*)1); } - // now compute whether there were ambiguities left in this cycle - if (has_ambiguity == 0 && allambig.len > 0) { - if (lim == -1) { - // lim is over-approximated, so has_ambiguities is too - has_ambiguity = 1; - } - else { - // go back and find the additional ambiguous methods and temporary add them to the stack - // (potentially duplicating them from lower on the stack to here) - jl_value_t *ti = NULL; - jl_value_t *isect2 = NULL; - JL_GC_PUSH2(&ti, &isect2); - for (size_t i = 0; i < allambig.len; i++) { - size_t idx = (size_t)allambig.items[i]; - jl_method_match_t *matc = (jl_method_match_t*)jl_array_ptr_ref(env.t, idx); - jl_method_t *m = matc->method; - int subt = matc->fully_covers == FULLY_COVERS; // jl_subtype((jl_value_t*)type, (jl_value_t*)m->sig) - for (size_t idx2 = 0; idx2 < jl_array_nrows(env.t); idx2++) { - if (idx2 == idx) - continue; - // laborious test, checking for existence and coverage of another method (m3) - // outside of the ambiguity group that dominates any ambiguous methods, - // and means we can ignore this for has_ambiguity - // (has_ambiguity is overestimated for lim==-1, since we don't compute skipped matches either) - // n.b. even if we skipped them earlier, they still might - // contribute to the ambiguities (due to lock of transitivity of - // morespecific over subtyping) - // TODO: we could improve this result by checking if the removal of some - // edge earlier means that this subgraph is now well-ordered and then be - // allowed to ignore these vertexes entirely here - jl_method_match_t *matc2 = (jl_method_match_t*)jl_array_ptr_ref(env.t, idx2); - jl_method_t *m2 = matc2->method; - int subt2 = matc2->fully_covers == FULLY_COVERS; // jl_subtype((jl_value_t*)type, (jl_value_t*)m2->sig) - if (subt) { - ti = (jl_value_t*)matc2->spec_types; - isect2 = NULL; - } - else if (subt2) { - ti = (jl_value_t*)matc->spec_types; - isect2 = NULL; - } - else { - jl_type_intersection2((jl_value_t*)matc->spec_types, (jl_value_t*)matc2->spec_types, &ti, &isect2); - } - // if their intersection contributes to the ambiguity cycle - if (ti == jl_bottom_type) - continue; - // and they aren't themselves simply ordered - if (jl_method_morespecific(m, m2) || jl_method_morespecific(m2, m)) - continue; - // now look for a third method m3 that dominated these and that fully covered this intersection already - size_t k; - for (k = 0; k < result.len; k++) { - size_t idx3 = (size_t)result.items[k]; - if (idx3 == idx || idx3 == idx2) { - has_ambiguity = 1; - break; - } - jl_method_match_t *matc3 = (jl_method_match_t*)jl_array_ptr_ref(env.t, idx3); - jl_method_t *m3 = matc3->method; - if ((jl_subtype(ti, m3->sig) || (isect2 && jl_subtype(isect2, m3->sig))) - && jl_method_morespecific(m3, m) && jl_method_morespecific(m3, m2)) { - //if (jl_subtype(matc->spec_types, ti) || jl_subtype(matc->spec_types, matc3->m3->sig)) - // // check if it covered not only this intersection, but all intersections with matc - // // if so, we do not need to check all of them separately - // j = len; - break; - } - } - if (k == result.len) - has_ambiguity = 1; - isect2 = NULL; - ti = NULL; - if (has_ambiguity) - break; - } - if 
(has_ambiguity) - break; - } - JL_GC_POP(); - } - } - arraylist_free(&allambig); + arraylist_free(&recursion_stack); arraylist_free(&visited); arraylist_free(&stack); for (j = 0; j < result.len; j++) { diff --git a/src/jltypes.c b/src/jltypes.c index d30adbd4b72fa..28a8afc10216e 100644 --- a/src/jltypes.c +++ b/src/jltypes.c @@ -3537,12 +3537,13 @@ void jl_init_types(void) JL_GC_DISABLED jl_method_type = jl_new_datatype(jl_symbol("Method"), core, jl_any_type, jl_emptysvec, - jl_perm_symsvec(32, + jl_perm_symsvec(33, "name", "module", "file", "line", "dispatch_status", // atomic + "interferences", // atomic "primary_world", // atomic "sig", "specializations", // !const @@ -3570,12 +3571,13 @@ void jl_init_types(void) JL_GC_DISABLED "constprop", "max_varargs", "purity"), - jl_svec(32, + jl_svec(33, jl_symbol_type, jl_module_type, jl_symbol_type, jl_int32_type, jl_uint8_type, + jl_memory_any_type, jl_ulong_type, jl_type_type, jl_any_type, // union(jl_simplevector_type, jl_method_instance_type), @@ -3605,9 +3607,9 @@ void jl_init_types(void) JL_GC_DISABLED jl_uint16_type), jl_emptysvec, 0, 1, 10); - //const static uint32_t method_constfields[1] = { 0b0 }; // (1<<0)|(1<<1)|(1<<2)|(1<<3)|(1<<6)|(1<<9)|(1<<10)|(1<<17)|(1<<21)|(1<<22)|(1<<23)|(1<<24)|(1<<25)|(1<<26)|(1<<27)|(1<<28)|(1<<29)|(1<<30); + //const static uint32_t method_constfields[] = { 0b0, 0b0 }; // (1<<0)|(1<<1)|(1<<2)|(1<<3)|(1<<6)|(1<<9)|(1<<10)|(1<<17)|(1<<21)|(1<<22)|(1<<23)|(1<<24)|(1<<25)|(1<<26)|(1<<27)|(1<<28)|(1<<29)|(1<<30); //jl_method_type->name->constfields = method_constfields; - const static uint32_t method_atomicfields[1] = { 0x10000030 }; // (1<<4)|(1<<5)||(1<<28) + const static uint32_t method_atomicfields[] = { 0x20000070, 0x0 }; // (1<<4)|(1<<5)|(1<<6)|(1<<29) jl_method_type->name->atomicfields = method_atomicfields; jl_method_instance_type = @@ -3634,7 +3636,7 @@ void jl_init_types(void) JL_GC_DISABLED jl_emptysvec, 0, 1, 3); // These fields should be constant, but Serialization wants to mutate them in initialization - //const static uint32_t method_instance_constfields[1] = { 0b0000111 }; // fields 1, 2, 3 + //const static uint32_t method_instance_constfields[1] = { 0b00000111 }; // fields 1, 2, 3 const static uint32_t method_instance_atomicfields[1] = { 0b11010000 }; // fields 5, 7, 8 //Fields 4 and 5 must be protected by method->write_lock, and thus all operations on jl_method_instance_t are threadsafe. 
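// Cross-checking the method_atomicfields mask above: the atomic fields of Method are
// now the 5th, 6th, and 7th symbols ("dispatch_status", "interferences",
// "primary_world"), i.e. bits 4-6, plus the field that previously sat at bit 28 and
// shifts to bit 29 because "interferences" was inserted before it. That gives
// (1<<4)|(1<<5)|(1<<6) = 0x70 and (1<<29) = 0x20000000, so the first word is
// 0x20000070; the second word (0x0) is needed because Method now has 33 fields, one
// more than a single 32-bit mask can describe.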
//jl_method_instance_type->name->constfields = method_instance_constfields; @@ -3854,7 +3856,7 @@ void jl_init_types(void) JL_GC_DISABLED jl_svecset(jl_methcache_type->types, 2, jl_long_type); // voidpointer jl_svecset(jl_methcache_type->types, 3, jl_long_type); // uint32_t plus alignment jl_svecset(jl_methtable_type->types, 3, jl_module_type); - jl_svecset(jl_method_type->types, 13, jl_method_instance_type); + jl_svecset(jl_method_type->types, 14, jl_method_instance_type); //jl_svecset(jl_debuginfo_type->types, 0, jl_method_instance_type); // union(jl_method_instance_type, jl_method_type, jl_symbol_type) jl_svecset(jl_method_instance_type->types, 4, jl_code_instance_type); jl_svecset(jl_code_instance_type->types, 19, jl_voidpointer_type); diff --git a/src/julia.h b/src/julia.h index 2d8eecbf358f2..094a518f9f86d 100644 --- a/src/julia.h +++ b/src/julia.h @@ -333,6 +333,7 @@ typedef struct _jl_method_t { jl_sym_t *file; int32_t line; _Atomic(uint8_t) dispatch_status; // bits defined in staticdata.jl + _Atomic(jl_genericmemory_t*) interferences; // set of intersecting methods not more specific _Atomic(size_t) primary_world; // method's type signature. redundant with TypeMapEntry->specTypes diff --git a/src/julia_internal.h b/src/julia_internal.h index 42f77c6a59f72..e9fc24dda22ca 100644 --- a/src/julia_internal.h +++ b/src/julia_internal.h @@ -677,8 +677,6 @@ typedef union { #define METHOD_SIG_LATEST_WHICH 0b0001 #define METHOD_SIG_LATEST_ONLY 0b0010 #define METHOD_SIG_PRECOMPILE_MANY 0b0100 -#define METHOD_SIG_LATEST_HAS_NOTMORESPECIFIC 0b1000 // indicates there exists some other method that is not more specific than this one -#define METHOD_SIG_PRECOMPILE_HAS_NOTMORESPECIFIC 0b10000 // precompiled version of METHOD_SIG_LATEST_HAS_NOTMORESPECIFIC JL_DLLEXPORT jl_code_instance_t *jl_engine_reserve(jl_method_instance_t *m, jl_value_t *owner); JL_DLLEXPORT void jl_engine_fulfill(jl_code_instance_t *ci, jl_code_info_t *src); @@ -1748,6 +1746,7 @@ JL_DLLEXPORT jl_value_t *jl_have_fma(jl_value_t *a); JL_DLLEXPORT int jl_stored_inline(jl_value_t *el_type); JL_DLLEXPORT jl_value_t *(jl_array_data_owner)(jl_array_t *a); JL_DLLEXPORT jl_array_t *jl_array_copy(jl_array_t *ary); +JL_DLLEXPORT jl_genericmemory_t *jl_genericmemory_copy(jl_genericmemory_t *mem); JL_DLLEXPORT uintptr_t jl_object_id_(uintptr_t tv, jl_value_t *v) JL_NOTSAFEPOINT; JL_DLLEXPORT void jl_set_next_task(jl_task_t *task) JL_NOTSAFEPOINT; diff --git a/src/method.c b/src/method.c index a04035086df90..8220178964333 100644 --- a/src/method.c +++ b/src/method.c @@ -1008,6 +1008,7 @@ JL_DLLEXPORT jl_method_t *jl_new_method_uninit(jl_module_t *module) m->nargs = 0; jl_atomic_store_relaxed(&m->primary_world, ~(size_t)0); jl_atomic_store_relaxed(&m->dispatch_status, 0); + jl_atomic_store_relaxed(&m->interferences, (jl_genericmemory_t*)jl_an_empty_memory_any); m->is_for_opaque_closure = 0; m->nospecializeinfer = 0; jl_atomic_store_relaxed(&m->did_scan_source, 0); diff --git a/src/staticdata.c b/src/staticdata.c index 88304c1eb6f7f..5d575188ab5bf 100644 --- a/src/staticdata.c +++ b/src/staticdata.c @@ -1837,8 +1837,6 @@ static void jl_write_values(jl_serializer_state *s) JL_GC_DISABLED int new_dispatch_status = 0; if (!(dispatch_status & METHOD_SIG_LATEST_ONLY)) new_dispatch_status |= METHOD_SIG_PRECOMPILE_MANY; - if (dispatch_status & METHOD_SIG_LATEST_HAS_NOTMORESPECIFIC) - new_dispatch_status |= METHOD_SIG_PRECOMPILE_HAS_NOTMORESPECIFIC; jl_atomic_store_relaxed(&newm->dispatch_status, new_dispatch_status); arraylist_push(&s->fixup_objs, 
(void*)reloc_offset); } diff --git a/src/staticdata_utils.c b/src/staticdata_utils.c index 0b8cfc1cf4ebd..e676eabbddad3 100644 --- a/src/staticdata_utils.c +++ b/src/staticdata_utils.c @@ -726,7 +726,11 @@ static void jl_activate_methods(jl_array_t *external, jl_array_t *internal, size } for (i = 0; i < l; i++) { jl_typemap_entry_t *entry = (jl_typemap_entry_t*)jl_array_ptr_ref(external, i); + //uint64_t t0 = uv_hrtime(); jl_method_table_activate(entry); + //jl_printf(JL_STDERR, "%f ", (double)(uv_hrtime() - t0) / 1e6); + //jl_static_show(JL_STDERR, entry->func.value); + //jl_printf(JL_STDERR, "\n"); } } } diff --git a/stdlib/Test/src/Test.jl b/stdlib/Test/src/Test.jl index 20a1df8e773bd..b50107cec3a29 100644 --- a/stdlib/Test/src/Test.jl +++ b/stdlib/Test/src/Test.jl @@ -2143,7 +2143,6 @@ function detect_ambiguities(mods::Module...; end function examine(mt::Core.MethodTable) for m in Base.MethodList(mt) - m.sig == Tuple && continue # ignore Builtins is_in_mods(parentmodule(m), recursive, mods) || continue world = Base.get_world_counter() ambig = Ref{Int32}(0) diff --git a/test/ambiguous.jl b/test/ambiguous.jl index 0f29817e74dd5..df3ba89e3eb2a 100644 --- a/test/ambiguous.jl +++ b/test/ambiguous.jl @@ -19,7 +19,7 @@ include("testenv.jl") @test length(methods(ambig, (Int, Int))) == 1 @test length(methods(ambig, (UInt8, Int))) == 0 -@test length(Base.methods_including_ambiguous(ambig, (UInt8, Int))) == 3 +@test length(Base.methods_including_ambiguous(ambig, (UInt8, Int))) == 2 @test ambig("hi", "there") == 1 @test ambig(3.1, 3.2) == 5 @@ -42,7 +42,6 @@ let err = try errstr = String(take!(io)) @test occursin(" ambig(x, y::Integer)\n @ $curmod_str", errstr) @test occursin(" ambig(x::Integer, y)\n @ $curmod_str", errstr) - @test occursin(" ambig(x::Number, y)\n @ $curmod_str", errstr) @test occursin("Possible fix, define\n ambig(::Integer, ::Integer)", errstr) end @@ -160,7 +159,7 @@ ambig(::Signed, ::Int) = 3 ambig(::Int, ::Signed) = 4 end ambs = detect_ambiguities(Ambig48312) -@test length(ambs) == 4 +@test length(ambs) == 1 # only ambiguous over (Int, Int), which is 3 or 4 module UnboundAmbig55868 module B @@ -287,7 +286,7 @@ end @test isempty(methods(Ambig8.f, (Int,))) @test isempty(methods(Ambig8.g, (Int,))) for f in (Ambig8.f, Ambig8.g) - @test length(methods(f, (Integer,))) == 2 # 1 is also acceptable + @test length(methods(f, (Integer,))) == 2 # 3 is also acceptable @test length(methods(f, (Signed,))) == 1 # 2 is also acceptable @test length(Base.methods_including_ambiguous(f, (Signed,))) == 2 @test f(0x00) == 1 @@ -413,7 +412,7 @@ let has_ambig = Ref(Int32(0)) ms = Base._methods_by_ftype(Tuple{typeof(fnoambig), Any, Any}, nothing, 4, Base.get_world_counter(), false, Ref(typemin(UInt)), Ref(typemax(UInt)), has_ambig) @test ms isa Vector @test length(ms) == 4 - @test has_ambig[] == 0 + @test has_ambig[] == 1 # 0 is better, but expensive and probably unnecessary to compute end # issue #11407 @@ -459,15 +458,38 @@ struct U55231{P} end struct V55231{P} end U55231(::V55231) = nothing (::Type{T})(::V55231) where {T<:U55231} = nothing -@test length(methods(U55231)) == 2 +@test length(methods(U55231)) == 1 U55231(a, b) = nothing -@test length(methods(U55231)) == 3 +@test length(methods(U55231)) == 2 struct S55231{P} end struct T55231{P} end (::Type{T})(::T55231) where {T<:S55231} = nothing S55231(::T55231) = nothing -@test length(methods(S55231)) == 2 +@test length(methods(S55231)) == 1 S55231(a, b) = nothing -@test length(methods(S55231)) == 3 +@test length(methods(S55231)) == 2 + +ambig10() = 
1 +ambig10(a::Vararg{Any}) = 2 +ambig10(a::Vararg{Union{Int32,Int64}}) = 6 +ambig10(a::Vararg{Matrix}) = 4 +ambig10(a::Vararg{Number}) = 7 +ambig10(a::Vararg{N}) where {N<:Number} = 5 +let ambig = Ref{Int32}(0) + ms = Base._methods_by_ftype(Tuple{typeof(ambig10), Vararg}, nothing, -1, Base.get_world_counter(), false, Ref{UInt}(typemin(UInt)), Ref{UInt}(typemax(UInt)), ambig) + @test ms isa Vector + @test length(ms) == 6 + @test_broken ambig[] == 0 +end +let ambig = Ref{Int32}(0) + ms = Base._methods_by_ftype(Tuple{typeof(ambig10), Vararg{Number}}, nothing, -1, Base.get_world_counter(), false, Ref{UInt}(typemin(UInt)), Ref{UInt}(typemax(UInt)), ambig) + @test ms isa Vector + @test length(ms) == 4 + @test_broken ambig[] == 0 + @test ms[1].method === which(ambig10, ()) + @test ms[2].method === which(ambig10, (Vararg{Union{Int32, Int64}},)) + @test ms[3].method === which(ambig10, Tuple{Vararg{N}} where N<:Number,) + @test ms[4].method === which(ambig10, (Vararg{Number},)) +end nothing diff --git a/test/core.jl b/test/core.jl index 4fdd3540b3ef7..28c496cedee3a 100644 --- a/test/core.jl +++ b/test/core.jl @@ -36,7 +36,7 @@ end for (T, c) in ( (Core.CodeInfo, []), (Core.CodeInstance, [:next, :min_world, :max_world, :inferred, :edges, :debuginfo, :ipo_purity_bits, :invoke, :specptr, :specsigflags, :precompile, :time_compile]), - (Core.Method, [:primary_world, :did_scan_source, :dispatch_status]), + (Core.Method, [:primary_world, :did_scan_source, :dispatch_status, :interferences]), (Core.MethodInstance, [:cache, :flags, :dispatch_status]), (Core.MethodTable, [:defs]), (Core.MethodCache, [:leafcache, :cache, :var""]), From b92bdc2fc47a106f40f266735d490fdd2c0aad9e Mon Sep 17 00:00:00 2001 From: Kristoffer Carlsson Date: Fri, 25 Jul 2025 14:58:30 -0400 Subject: [PATCH 6/9] Revert "Fix LLVM TaskDispatcher implementation issues (#58950)" This reverts commit 6846af76eb4da22d7198b737de07895447e9585a. 
--- src/Makefile | 2 +- src/jitlayers.cpp | 23 +- src/llvm-julia-task-dispatcher.h | 465 ------------------------------- 3 files changed, 16 insertions(+), 474 deletions(-) delete mode 100644 src/llvm-julia-task-dispatcher.h diff --git a/src/Makefile b/src/Makefile index 6dac1524145ef..e859acc765354 100644 --- a/src/Makefile +++ b/src/Makefile @@ -349,7 +349,7 @@ $(BUILDDIR)/gc-alloc-profiler.o $(BUILDDIR)/gc-alloc-profiler.dbg.obj: $(SRCDIR) $(BUILDDIR)/gc-page-profiler.o $(BUILDDIR)/gc-page-profiler.dbg.obj: $(SRCDIR)/gc-page-profiler.h $(BUILDDIR)/init.o $(BUILDDIR)/init.dbg.obj: $(SRCDIR)/builtin_proto.h $(BUILDDIR)/interpreter.o $(BUILDDIR)/interpreter.dbg.obj: $(SRCDIR)/builtin_proto.h -$(BUILDDIR)/jitlayers.o $(BUILDDIR)/jitlayers.dbg.obj: $(SRCDIR)/jitlayers.h $(SRCDIR)/llvm-codegen-shared.h $(SRCDIR)/llvm-julia-task-dispatcher.h +$(BUILDDIR)/jitlayers.o $(BUILDDIR)/jitlayers.dbg.obj: $(SRCDIR)/jitlayers.h $(SRCDIR)/llvm-codegen-shared.h $(BUILDDIR)/jltypes.o $(BUILDDIR)/jltypes.dbg.obj: $(SRCDIR)/builtin_proto.h $(build_shlibdir)/libllvmcalltest.$(SHLIB_EXT): $(SRCDIR)/llvm-codegen-shared.h $(BUILDDIR)/julia_version.h $(BUILDDIR)/llvm-alloc-helpers.o $(BUILDDIR)/llvm-alloc-helpers.dbg.obj: $(SRCDIR)/llvm-codegen-shared.h $(SRCDIR)/llvm-pass-helpers.h $(SRCDIR)/llvm-alloc-helpers.h diff --git a/src/jitlayers.cpp b/src/jitlayers.cpp index a44da64941985..3ea95ea42f596 100644 --- a/src/jitlayers.cpp +++ b/src/jitlayers.cpp @@ -6,7 +6,6 @@ #include #include "llvm/IR/Mangler.h" -#include #include #include #include @@ -47,7 +46,6 @@ using namespace llvm; #include "jitlayers.h" #include "julia_assert.h" #include "processor.h" -#include "llvm-julia-task-dispatcher.h" #if JL_LLVM_VERSION >= 180000 # include @@ -668,8 +666,17 @@ static void jl_compile_codeinst_now(jl_code_instance_t *codeinst) if (!decls.specFunctionObject.empty()) NewDefs.push_back(decls.specFunctionObject); } - auto Addrs = jl_ExecutionEngine->findSymbols(NewDefs); - + // Split batches to avoid stack overflow in the JIT linker. + // FIXME: Patch ORCJITs InPlaceTaskDispatcher to not recurse on task dispatches but + // push the tasks to a queue to be drained later. This avoids the stackoverflow caused by recursion + // in the linker when compiling a large number of functions at once. 
+ SmallVector Addrs; + for (size_t i = 0; i < NewDefs.size(); i += 1000) { + auto end = std::min(i + 1000, NewDefs.size()); + SmallVector batch(NewDefs.begin() + i, NewDefs.begin() + end); + auto AddrsBatch = jl_ExecutionEngine->findSymbols(batch); + Addrs.append(AddrsBatch); + } size_t nextaddr = 0; for (auto &this_code : linkready) { auto it = invokenames.find(this_code); @@ -1834,7 +1841,7 @@ llvm::DataLayout jl_create_datalayout(TargetMachine &TM) { JuliaOJIT::JuliaOJIT() : TM(createTargetMachine()), DL(jl_create_datalayout(*TM)), - ES(cantFail(orc::SelfExecutorProcessControl::Create(nullptr, std::make_unique<::JuliaTaskDispatcher>()))), + ES(cantFail(orc::SelfExecutorProcessControl::Create())), GlobalJD(ES.createBareJITDylib("JuliaGlobals")), JD(ES.createBareJITDylib("JuliaOJIT")), ExternalJD(ES.createBareJITDylib("JuliaExternal")), @@ -2091,7 +2098,7 @@ SmallVector JuliaOJIT::findSymbols(ArrayRef Names) Unmangled[NonOwningSymbolStringPtr(Mangled)] = Unmangled.size(); Exports.add(std::move(Mangled)); } - SymbolMap Syms = cantFail(::safelookup(ES, orc::makeJITDylibSearchOrder(ArrayRef(&JD)), std::move(Exports))); + SymbolMap Syms = cantFail(ES.lookup(orc::makeJITDylibSearchOrder(ArrayRef(&JD)), std::move(Exports))); SmallVector Addrs(Names.size()); for (auto it : Syms) { Addrs[Unmangled.at(orc::NonOwningSymbolStringPtr(it.first))] = it.second.getAddress().getValue(); @@ -2103,7 +2110,7 @@ Expected JuliaOJIT::findSymbol(StringRef Name, bool ExportedS { orc::JITDylib* SearchOrders[3] = {&JD, &GlobalJD, &ExternalJD}; ArrayRef SearchOrder = ArrayRef(&SearchOrders[0], ExportedSymbolsOnly ? 3 : 1); - auto Sym = ::safelookup(ES, SearchOrder, Name); + auto Sym = ES.lookup(SearchOrder, Name); return Sym; } @@ -2116,7 +2123,7 @@ Expected JuliaOJIT::findExternalJDSymbol(StringRef Name, bool { orc::JITDylib* SearchOrders[3] = {&ExternalJD, &GlobalJD, &JD}; ArrayRef SearchOrder = ArrayRef(&SearchOrders[0], ExternalJDOnly ? 1 : 3); - auto Sym = ::safelookup(ES, SearchOrder, getMangledName(Name)); + auto Sym = ES.lookup(SearchOrder, getMangledName(Name)); return Sym; } diff --git a/src/llvm-julia-task-dispatcher.h b/src/llvm-julia-task-dispatcher.h deleted file mode 100644 index dd4037378b6b6..0000000000000 --- a/src/llvm-julia-task-dispatcher.h +++ /dev/null @@ -1,465 +0,0 @@ -// This file is a part of Julia. License is MIT: https://julialang.org/license - -namespace { - -using namespace llvm::orc; - -template struct future_value_storage { - // Union disables default construction/destruction semantics, allowing us to - // use placement new/delete for precise control over value lifetime - union { - U value_; - }; - - future_value_storage() {} - ~future_value_storage() {} -}; - -template <> struct future_value_storage { - // No value_ member for void -}; - -struct JuliaTaskDispatcher : public TaskDispatcher { - /// Forward declarations - class future_base; - void dispatch(std::unique_ptr T) override; - void shutdown() override; - void work_until(future_base &F); -private: - /// C++ does not support non-static thread_local variables, so this needs to - /// store both the task and the associated dispatcher queue so that shutdown - /// can wait for the correct tasks to finish. 
- thread_local static SmallVector, JuliaTaskDispatcher*>> TaskQueue; - std::mutex DispatchMutex; - std::condition_variable WorkFinishedCV; - SmallVector WaitingFutures; - -public: - -/// @name ORC Promise/Future Classes -/// -/// ORC-aware promise/future implementation that integrates with the -/// TaskDispatcher system to allow efficient cooperative multitasking while -/// waiting for results (with certain limitations on what can be awaited). -/// Together they provide building blocks for a full async/await-like runtime -/// for llvm that supports multiple threads. -/// -/// Unlike std::promise/std::future alone, these classes can help dispatch other -/// tasks while waiting, preventing deadlocks and improving overall system -/// throughput. They have a similar API, though with some important differences -/// and some features simply not currently implemented. -/// -/// @{ - -template class promise; -template class future; - -/// Status for future/promise state -enum class FutureStatus : uint8_t { NotReady = 0, Ready = 1 }; - -/// @} - -/// Type-erased base class for futures, generally for scheduler use to avoid -/// needing virtual dispatches -class future_base { -public: - /// Check if the future is now ready with a value (precondition: get_promise() - /// must have been called) - bool ready() const { - if (!valid()) - report_fatal_error("ready() called before get_promise()"); - return state_->status_.load(std::memory_order_acquire) == FutureStatus::Ready; - } - - /// Check if the future is in a valid state (not moved-from and get_promise() called) - bool valid() const { return state_ != nullptr; } - - /// Wait for the future to be ready, helping with task dispatch - void wait(JuliaTaskDispatcher &D) { - // Keep helping with task dispatch until our future is ready - if (!ready()) { - D.work_until(*this); - if (state_->status_.load(std::memory_order_relaxed) != FutureStatus::Ready) - report_fatal_error( - "work_until() returned without this future being ready"); - } - } - -protected: - struct state_base { - std::atomic status_{FutureStatus::NotReady}; - }; - - future_base(state_base *state) : state_(state) {} - future_base() = default; - - /// Only allow deleting the future once it is invalid - ~future_base() { - if (state_) - report_fatal_error("get() must be called before future destruction (ensuring promise::set_value memory is valid)"); - } - - // Move constructor and assignment - future_base(future_base &&other) noexcept : state_(other.state_) { - other.state_ = nullptr; - } - - future_base &operator=(future_base &&other) noexcept { - if (this != &other) { - this->~future_base(); - state_ = other.state_; - other.state_ = nullptr; - } - return *this; - } - - state_base *state_; -}; - -/// TaskDispatcher-aware future class for cooperative await. -/// -/// @tparam T The type of value this future will provide. Use void for futures -/// that -/// signal completion without providing a value. -/// -/// This future implementation is similar to `std::future`, so most code can -/// transition to it easily. However, it differs from `std::future` in a few -/// key ways to be aware of: -/// - No exception support (or the overhead for it). -/// - The future is created before the promise, then the promise is created -/// from the future. -/// - The future is in an invalid state until get_promise() has been called. 
-/// - Waiting operations (get(&D), wait(&D)) help dispatch other tasks while -/// blocked, requiring an additional argument of which TaskDispatcher object -/// of where all associated work will be scheduled. -/// - While `wait` may be called multiple times and on multiple threads, all of -/// them must have returned before calling `get` on exactly one thread. -/// - Must call get() exactly once before destruction (enforced with -/// `report_fatal_error`) after each call to `get_promise`. Internal state is -/// freed when `get` returns, and allocated when `get_promise` is called. -/// -/// Other notable features, in common with `std::future`: -/// - Supports both value types and void specialization through the same -/// interface. -/// - Thread-safe through atomic operations. -/// - Provides acquire-release ordering with `std::promise::set_value()`. -/// - Concurrent access to any method (including to `ready`) on multiple threads -/// is not allowed. -/// - Holding any locks while calling `get()` is likely to lead to deadlock. -/// -/// @warning Users should avoid borrowing references to futures. References may -/// go out of scope and break the uniqueness contract, which may break the -/// soundness of the types. Always use move semantics or pass by value. - -template class future : public future_base { -public: - future() : future_base(nullptr) {} - future(const future &) = delete; - future &operator=(const future &) = delete; - future(future &&) = default; - future &operator=(future &&) = default; - - /// Get the value, helping with task dispatch while waiting. - /// This will destroy the underlying value, so this must be called exactly - /// once, which returns the future to the initial state. - T get(JuliaTaskDispatcher &D) { - if (!valid()) - report_fatal_error("get() must only be called once, after get_promise()"); - wait(D); - auto state_ = static_cast(this->state_); - this->state_ = nullptr; - return take_value(state_); - } - - /// Get the associated promise (must only be called once) - promise get_promise() { - if (valid()) - report_fatal_error("get_promise() can only be called once"); - auto state_ = new state(); - this->state_ = state_; - return promise(state_); - } - -private: - friend class promise; - - // Template the state struct with EBCO so that future has no wasted - // overhead for the value. The declaration of future_value_storage is far - // above here since GCC doesn't implement it properly when nested. - struct state : future_base::state_base, future_value_storage {}; - - template - typename std::enable_if::value, U>::type take_value(state *state_) { - T result = std::move(state_->value_); - state_->value_.~T(); - delete state_; - return result; - } - - template - typename std::enable_if::value, U>::type take_value(state *state_) { - delete state_; - } -}; - -/// TaskDispatcher-aware promise class that provides values to associated -/// futures. -/// -/// @tparam T The type of value this promise will provide. Use void for promises -/// that -/// signal completion without providing a value. -/// -/// This promise implementation provides the value-setting side of the -/// promise/future pair and integrates with the ORC TaskDispatcher system. Key -/// characteristics: -/// - Created from a future via get_promise() rather than creating the future from the promise. -/// - Must call get_future() on the thread that created it (it can be passed to another thread, but do not borrow a reference and use that to mutate it from another thread). 
-/// - Must call set_value() exactly once per `get_promise()` call to provide the result. -/// - Thread-safe from set_value to get. -/// - Move-only semantics to prevent accidental copying. -/// -/// The `promise` can usually be passed to another thread in one of two ways: -/// - With move semantics: -/// * `[P = F.get_promise()] () { P.set_value(); }` -/// * `[P = std::move(P)] () { P.set_value(); }` -/// * Advantages: clearer where `P` is owned, automatic deadlock detection -/// on destruction, -/// easier memory management if the future is returned from the function. -/// - By reference: -/// * `[&P] () { P.set_value(); }` -/// * Advantages: simpler memory management if the future is consumed in the -/// same function. -/// * Disadvantages: more difficult memory management if the future is -/// returned from the function, no deadlock detection. -/// -/// @warning Users should avoid borrowing references to promises. References may -/// go out of scope and break the uniqueness contract, which may break the -/// soundness of the types. Always use move semantics or pass by value. -/// -/// @par Error Handling: -/// The promise/future system uses report_fatal_error() for misuse: -/// - Calling set_value() more than once. -/// - Destroying a future without calling get(). -/// - Calling get() more than once on a future. -/// -/// @par Thread Safety: -/// - Each promise/future must only be accessed by one thread, as concurrent -/// calls to the API functions may result in crashes. -/// - Multiple threads can safely access different promise/future pairs. -/// - set_value() and get() operations are atomic and thread-safe. -/// - Move operations should only be performed by a single thread. -template class promise { - friend class future; - -public: - promise() : state_(nullptr) {} - - ~promise() { - // Assert proper promise lifecycle: ensure set_value was called if promise was valid. - // This can catch deadlocks where a promise is created but set_value() is - // never called, though only if the promise is moved from instead of - // borrowed from the frame with the future. - // Empty promises (state_ == nullptr) are allowed to be destroyed without calling set_value. - } - - promise(const promise &) = delete; - promise &operator=(const promise &) = delete; - - promise(promise &&other) noexcept - : state_(other.state_) { - other.state_ = nullptr; - } - - promise &operator=(promise &&other) noexcept { - if (this != &other) { - this->~promise(); - state_ = other.state_; - other.state_ = nullptr; - } - return *this; - } - - - /// Set the value (must only be called once) - // In C++20, this std::conditional weirdness can probably be replaced just - // with requires. It ensures that we don't try to define a method for `void&`, - // but that if the user calls set_value(v) for any value v that they get a - // member function error, instead of no member named 'value_'. 
- template - void - set_value(const typename std::conditional::value, - std::nullopt_t, T>::type &value) const { - assert(state_ && "set_value() can only be called once"); - new (&state_->value_) T(value); - state_->status_.store(FutureStatus::Ready, std::memory_order_release); - state_ = nullptr; - } - - template - void set_value(typename std::conditional::value, - std::nullopt_t, T>::type &&value) const { - assert(state_ && "set_value() can only be called once"); - new (&state_->value_) T(std::move(value)); - state_->status_.store(FutureStatus::Ready, std::memory_order_release); - state_ = nullptr; - } - - template - typename std::enable_if::value, void>::type - set_value(const std::nullopt_t &value) = delete; - - template - typename std::enable_if::value, void>::type - set_value(std::nullopt_t &&value) = delete; - - template - typename std::enable_if::value, void>::type set_value() const { - assert(state_ && "set_value() can only be called once"); - state_->status_.store(FutureStatus::Ready, std::memory_order_release); - state_ = nullptr; - } - - /// Swap with another promise - void swap(promise &other) noexcept { - using std::swap; - swap(state_, other.state_); - } - -private: - explicit promise(typename future::state *state) - : state_(state) {} - - mutable typename future::state *state_; -}; - -}; // class JuliaTaskDispatcher - -thread_local SmallVector, JuliaTaskDispatcher *>> JuliaTaskDispatcher::TaskQueue; - -void JuliaTaskDispatcher::dispatch(std::unique_ptr T) { - TaskQueue.push_back(std::pair(std::move(T), this)); -} - -void JuliaTaskDispatcher::shutdown() { - // Keep processing until no tasks belonging to this dispatcher remain - while (true) { - // Check if any task belongs to this dispatcher - auto it = std::find_if( - TaskQueue.begin(), TaskQueue.end(), - [this](const auto &TaskPair) { return TaskPair.second == this; }); - - // If no tasks belonging to this dispatcher, we're done - if (it == TaskQueue.end()) - return; - - // Create a future/promise pair to wait for completion of this task - future taskFuture; - // Replace the task with a GenericNamedTask that wraps the original task - // with a notification of completion that this thread can work_until. 
- auto originalTask = std::move(it->first); - it->first = makeGenericNamedTask( - [originalTask = std::move(originalTask), - taskPromise = taskFuture.get_promise()]() { - originalTask->run(); - taskPromise.set_value(); - }, - "Shutdown task marker"); - - // Wait for the task to complete - taskFuture.get(*this); - } -} - -void JuliaTaskDispatcher::work_until(future_base &F) { - while (!F.ready()) { - // First, process any tasks in our local queue - // Process in LIFO order (most recently added first) to avoid deadlocks - // when tasks have dependencies on each other - while (!TaskQueue.empty()) { - { - auto TaskPair = std::move(TaskQueue.back()); - TaskQueue.pop_back(); - TaskPair.first->run(); - } - - // Notify any threads that might be waiting for work to complete - { - std::lock_guard Lock(DispatchMutex); - bool ShouldNotify = llvm::any_of( - WaitingFutures, [](future_base *F) { return F->ready(); }); - if (ShouldNotify) { - WaitingFutures.clear(); - WorkFinishedCV.notify_all(); - } - } - - // Check if our future is now ready - if (F.ready()) - return; - } - - // If we get here, our queue is empty but the future isn't ready - // We need to wait for other threads to finish work that should complete our - // future - { - std::unique_lock Lock(DispatchMutex); - WaitingFutures.push_back(&F); - WorkFinishedCV.wait(Lock, [&F]() { return F.ready(); }); - } - } -} - -} // End namespace - -namespace std { -template -void swap(::JuliaTaskDispatcher::promise &lhs, ::JuliaTaskDispatcher::promise &rhs) noexcept { - lhs.swap(rhs); -} -} // End namespace std - -// n.b. this actually is sometimes a safepoint -Expected -safelookup(ExecutionSession &ES, - const JITDylibSearchOrder &SearchOrder, - SymbolLookupSet Symbols, LookupKind K = LookupKind::Static, - SymbolState RequiredState = SymbolState::Ready, - RegisterDependenciesFunction RegisterDependencies = NoDependenciesToRegister) JL_NOTSAFEPOINT { - JuliaTaskDispatcher::future> PromisedFuture; - auto NotifyComplete = [PromisedResult = PromisedFuture.get_promise()](Expected R) { - PromisedResult.set_value(std::move(R)); - }; - ES.lookup(K, SearchOrder, std::move(Symbols), RequiredState, - std::move(NotifyComplete), RegisterDependencies); - return PromisedFuture.get(static_cast(ES.getExecutorProcessControl().getDispatcher())); -} - -Expected -safelookup(ExecutionSession &ES, - const JITDylibSearchOrder &SearchOrder, - SymbolStringPtr Name, - SymbolState RequiredState = SymbolState::Ready) JL_NOTSAFEPOINT { - SymbolLookupSet Names({Name}); - - if (auto ResultMap = safelookup(ES, SearchOrder, std::move(Names), LookupKind::Static, - RequiredState, NoDependenciesToRegister)) { - assert(ResultMap->size() == 1 && "Unexpected number of results"); - assert(ResultMap->count(Name) && "Missing result for symbol"); - return std::move(ResultMap->begin()->second); - } else - return ResultMap.takeError(); -} - -Expected -safelookup(ExecutionSession &ES, - ArrayRef SearchOrder, SymbolStringPtr Name, - SymbolState RequiredState = SymbolState::Ready) JL_NOTSAFEPOINT { - return safelookup(ES, makeJITDylibSearchOrder(SearchOrder), Name, RequiredState); -} - -Expected -safelookup(ExecutionSession &ES, - ArrayRef SearchOrder, StringRef Name, - SymbolState RequiredState = SymbolState::Ready) JL_NOTSAFEPOINT { - return safelookup(ES, SearchOrder, ES.intern(Name), RequiredState); -} From 0dfb7c9fc72a4cee0789e33eb2951237793ba93f Mon Sep 17 00:00:00 2001 From: Gabriel Baraldi Date: Fri, 25 Jul 2025 15:43:48 -0400 Subject: [PATCH 7/9] Fix llvmpasses --- 
test/llvmpasses/late-lower-gc.ll | 1 + 1 file changed, 1 insertion(+) diff --git a/test/llvmpasses/late-lower-gc.ll b/test/llvmpasses/late-lower-gc.ll index 6d5c42f50fcde..0073d4be09289 100644 --- a/test/llvmpasses/late-lower-gc.ll +++ b/test/llvmpasses/late-lower-gc.ll @@ -201,6 +201,7 @@ define void @decayar([2 x {} addrspace(10)* addrspace(11)*] %ar) { define swiftcc ptr addrspace(10) @insert_element(ptr swiftself %0) { ; CHECK-LABEL: @insert_element + %v2 = call ptr @julia.get_pgcstack() %2 = alloca [10 x i64], i32 1, align 8 ; CHECK: %gcframe = call ptr @julia.new_gc_frame(i32 10) ; CHECK: [[gc_slot_addr_:%.*]] = call ptr @julia.get_gc_frame_slot(ptr %gcframe, i32 0) From 1adfbc3523bc58199daac1e7a9b992fb13917591 Mon Sep 17 00:00:00 2001 From: Kristoffer Carlsson Date: Wed, 23 Jul 2025 06:44:34 -0400 Subject: [PATCH 8/9] remove a testset from MMAP that might cause CI to now fail on Windows (#59062) (cherry picked from commit 4718f43b5afe34ae7aa12d60755a817f0e4f6808) --- stdlib/Mmap/test/runtests.jl | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/stdlib/Mmap/test/runtests.jl b/stdlib/Mmap/test/runtests.jl index 224664e5faaa2..0c34c49d400a7 100644 --- a/stdlib/Mmap/test/runtests.jl +++ b/stdlib/Mmap/test/runtests.jl @@ -341,18 +341,18 @@ end GC.gc() rm(file) -@testset "test for #58982 - mmap with primitive types" begin - file = tempname() - primitive type PrimType9Bytes 9*8 end - arr = Vector{PrimType9Bytes}(undef, 2) - write(file, arr) - m = mmap(file, Vector{PrimType9Bytes}) - @test length(m) == 2 - @test m[1] == arr[1] - @test m[2] == arr[2] - finalize(m); m = nothing; GC.gc() - rm(file) -end +# test for #58982 - mmap with primitive types +file = tempname() +primitive type PrimType9Bytes 9*8 end +arr = Vector{PrimType9Bytes}(undef, 2) +write(file, arr) +m = mmap(file, Vector{PrimType9Bytes}) +@test length(m) == 2 +@test m[1] == arr[1] +@test m[2] == arr[2] +finalize(m); m = nothing; GC.gc() +rm(file) + @testset "Docstrings" begin @test isempty(Docs.undocumented_names(Mmap)) From 227ff673657b598d196119920047cbbe5a311644 Mon Sep 17 00:00:00 2001 From: Keno Fischer Date: Thu, 24 Jul 2025 11:54:35 -0400 Subject: [PATCH 9/9] build: Also pass -fno-strict-aliasing for C++ (#59066) As diagnosed by Andrew Pinski (https://github.com/JuliaLang/julia/issues/58466#issuecomment-3105141193), we are not respecting strict aliasing currently. We turn this off for C, but the flag appears to be missing for C++. Looks like it's been that way ever since that flag was first added to our build system (#484). We should probably consider running TypeSanitizer over our code base to see if we can make our code correct under strict aliasing as compilers are increasingly taking advantage of it. Fixes #58466 (cherry picked from commit 095e75304306484cb189249c3d21cf376afb9f6f) --- Make.inc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Make.inc b/Make.inc index 466e97cc43c4f..32c3cd4595a20 100644 --- a/Make.inc +++ b/Make.inc @@ -555,7 +555,7 @@ JCPPFLAGS_COMMON := -fasynchronous-unwind-tables JCPPFLAGS_CLANG := $(JCPPFLAGS_COMMON) -mllvm -enable-tail-merge=0 JCPPFLAGS_GCC := $(JCPPFLAGS_COMMON) -fno-tree-tail-merge -JCXXFLAGS_COMMON := -pipe $(fPIC) -fno-rtti -std=c++17 -Wformat -Wformat-security +JCXXFLAGS_COMMON := -pipe $(fPIC) -fno-rtti -std=c++17 -Wformat -Wformat-security -fno-strict-aliasing JCXXFLAGS_CLANG := $(JCXXFLAGS_COMMON) -pedantic JCXXFLAGS_GCC := $(JCXXFLAGS_COMMON) -fno-gnu-unique
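To make the rationale above concrete: under strict aliasing the optimizer may assume that pointers to unrelated types never refer to the same object, so the C-style type punning common in runtime code can be miscompiled at higher optimization levels unless -fno-strict-aliasing is passed. A minimal illustrative sketch, not taken from the Julia sources:

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    // UB under strict aliasing: a float object is read through an unrelated
    // uint32_t lvalue, so an optimizer that assumes the two pointers cannot
    // alias is free to reorder or drop the load. -fno-strict-aliasing makes
    // GCC/Clang tolerate this pattern.
    uint32_t bits_punned(float f) {
        return *reinterpret_cast<uint32_t *>(&f);
    }

    // Well-defined alternative: copy the object representation with memcpy.
    uint32_t bits_memcpy(float f) {
        uint32_t u;
        std::memcpy(&u, &f, sizeof u);
        return u;
    }

    int main() {
        std::printf("%08x %08x\n", (unsigned)bits_punned(1.0f), (unsigned)bits_memcpy(1.0f));
        return 0;
    }

On mainstream compilers both functions typically lower to the same single move, which is why migrating code to the memcpy form (for example with TypeSanitizer's help, as suggested above) is the longer-term fix, with -fno-strict-aliasing as the safety net in the meantime.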