diff --git a/docs/pages.jl b/docs/pages.jl index b8173eb07..c0efb5d62 100644 --- a/docs/pages.jl +++ b/docs/pages.jl @@ -21,9 +21,9 @@ pages = ["index.md", "examples/sde/SDE_control.md"], "Delay Differential Equations (DDEs)" => Any["examples/dde/delay_diffeq.md"], "Partial Differential Equations (PDEs)" => Any[ - "examples/pde/pde_constrained.md", - "examples/pde/brusselator.md" - ], + "examples/pde/pde_constrained.md", + "examples/pde/gray_scott.md" + ], "Hybrid and Jump Equations" => Any["examples/hybrid_jump/hybrid_diffeq.md", "examples/hybrid_jump/bouncing_ball.md"], "Bayesian Estimation" => Any["examples/bayesian/turing_bayesian.md"], diff --git a/docs/src/examples/pde/brusselator.md b/docs/src/examples/pde/brusselator.md deleted file mode 100644 index fea9aaa14..000000000 --- a/docs/src/examples/pde/brusselator.md +++ /dev/null @@ -1,273 +0,0 @@ -# Learning Nonlinear Reaction Dynamics in the 2D Brusselator PDE Using Universal Differential Equations - -## Introduction - -The Brusselator is a mathematical model used to describe oscillating chemical reactions and spatial pattern formation, capturing how concentrations of chemical species evolve over time and space. In this documentation, we simulate the two-dimensional Brusselator partial differential equation (PDE) on a periodic square domain, generate time-resolved data using a finite difference discretization, and use this data to train a **Universal Differential Equation (UDE)**. Specifically, we replace the known nonlinear reaction term with a neural network, enabling us to learn complex dynamics directly from the generated data while preserving the known physical structure of the system. - -## The Brusselator PDE - -The Brusselator PDE is defined on a unit square periodic domain as follows: - -$$\frac{\partial U}{\partial t} = B + U^2V - (A+1)U + \alpha \nabla^2 U + f(x, y, t)$$ - -$$\frac{\partial V}{\partial t} = AU - U^2V + \alpha \nabla^2 V$$ - -where $A=3.4, B=1$ and the forcing term is: - -$$f(x, y, t) = -\begin{cases} -5 & \text{if } (x - 0.3)^2 + (y - 0.6)^2 \leq 0.1^2 \text{ and } t \geq 1.1 \\ -0 & \text{otherwise} -\end{cases}$$ - -and the Laplacian operator is: - -$$\nabla^2 = \frac{\partial^2}{\partial x^2} + \frac{\partial^2}{\partial y^2}$$ - -These equations are solved over the time interval: - -$$t \in [0, 11.5]$$ - -with the initial conditions: - -$$U(x, y, 0) = 22 \cdot \left( y(1 - y) \right)^{3/2}$$ - -$$V(x, y, 0) = 27 \cdot \left( x(1 - x) \right)^{3/2}$$ - -and the periodic boundary conditions: - -$$U(x + 1, y, t) = U(x, y, t)$$ - -$$V(x, y + 1, t) = V(x, y, t)$$ - -## Numerical Discretization - -To numerically solve this PDE, we discretize the unit square domain using $N$ grid points along each spatial dimension. The variables $U[i,j]$ and $V[i,j]$ then denote the concentrations at the grid point $(i, j)$ at a given time $t$. - -We represent the spatially discretized fields as: - -$$U[i,j] = U(i \cdot \Delta x, j \cdot \Delta y), \quad V[i,j] = V(i \cdot \Delta x, j \cdot \Delta y),$$ - -where $\Delta x = \Delta y = \frac{1}{N}$ for a grid of size $N \times N$. To organize the simulation state efficiently, we store both $ U $ and $ V $ in a single 3D array: - -$$u[i,j,1] = U[i,j], \quad u[i,j,2] = V[i,j],$$ - -giving us a field tensor of shape $(N, N, 2)$. This structure is flexible and extends naturally to systems with additional field variables. - -## Finite Difference Laplacian and Forcing - -For spatial derivatives, we apply a second-order central difference scheme using a three-point stencil. 
The Laplacian is discretized as: - -$$[\ 1,\ -2,\ 1\ ]$$ - -in both the $ x $ and $ y $ directions, forming a tridiagonal structure in both the x and y directions; applying this 1D stencil (scaled appropriately by $\frac{1}{Δx^2}$ or $\frac{1}{Δy^2}$) along each axis and summing the contributions yields the standard 5-point stencil computation for the 2D Laplacian. Periodic boundary conditions are incorporated by wrapping the stencil at the domain edges, effectively connecting the boundaries. The nonlinear interaction terms are computed directly at each grid point, making the implementation straightforward and local in nature. - -## Generating Training Data - -This provides us with an `ODEProblem` that can be solved to obtain training data. - -```@example bruss -import ComponentArrays as CA, Random, Plots, OrdinaryDiffEq as ODE -import SciMLBase - -N_GRID = 16 -XYD = range(0.0f0, stop = 1.0f0, length = N_GRID) -dx = step(XYD) -T_FINAL = 11.5f0 -SAVE_AT = 0.5f0 -tspan = (0.0f0, T_FINAL) -t_points = range(tspan[1], stop = tspan[2], step = SAVE_AT) -A, B, alpha = 3.4f0, 1.0f0, 10.0f0 - -brusselator_f(x, y, t) = (((x - 0.3f0)^2 + (y - 0.6f0)^2) <= 0.01f0) * (t >= 1.1f0) * 5.0f0 -limit(a, N) = a == 0 ? N : a == N+1 ? 1 : a - -function init_brusselator(xyd) - println("[Init] Creating initial condition array...") - u0 = zeros(Float32, N_GRID, N_GRID, 2) - for I in CartesianIndices((N_GRID, N_GRID)) - x, y = xyd[I[1]], xyd[I[2]] - u0[I, 1] = 22.0f0 * (y * (1.0f0 - y))^(3.0f0/2.0f0) - u0[I, 2] = 27.0f0 * (x * (1.0f0 - x))^(3.0f0/2.0f0) - end - println("[Init] Done.") - return u0 -end -u0 = init_brusselator(XYD) - -function pde_truth!(du, u, p, t) - A, B, alpha, dx = p - αdx = alpha / dx^2 - for I in CartesianIndices((N_GRID, N_GRID)) - i, j = Tuple(I) - x, y = XYD[i], XYD[j] - ip1, im1 = limit(i+1, N_GRID), limit(i-1, N_GRID) - jp1, jm1 = limit(j+1, N_GRID), limit(j-1, N_GRID) - U, V = u[i, j, 1], u[i, j, 2] - ΔU = u[im1, j, 1] + u[ip1, j, 1] + u[i, jp1, 1] + u[i, jm1, 1] - 4.0f0 * U - ΔV = u[im1, j, 2] + u[ip1, j, 2] + u[i, jp1, 2] + u[i, jm1, 2] - 4.0f0 * V - du[i, j, 1] = αdx*ΔU + B + U^2 * V - (A+1.0f0)*U + brusselator_f(x, y, t) - du[i, j, 2] = αdx*ΔV + A*U - U^2 * V - end -end - -p_tuple = (A, B, alpha, dx) -@time sol_truth = ODE.solve(ODE.ODEProblem(pde_truth!, u0, tspan, p_tuple), ODE.FBDF(), saveat = t_points) -u_true = Array(sol_truth) -``` - -## Visualizing Mean Concentration Over Time - -We can now use this code for training our UDE, and generating time-series plots of the concentrations of species of U and V using the code: - -```@example bruss -import Plots, Statistics - -# Compute average concentration at each timestep -avg_U = [Statistics.mean(snapshot[:, :, 1]) for snapshot in sol_truth.u] -avg_V = [Statistics.mean(snapshot[:, :, 2]) for snapshot in sol_truth.u] - -# Plot average concentrations over time -Plots.plot( - sol_truth.t, avg_U, label = "Mean U", lw = 2, xlabel = "Time", ylabel = "Concentration", - title = "Mean Concentration of U and V Over Time") -Plots.plot!(sol_truth.t, avg_V, label = "Mean V", lw = 2, linestyle = :dash) -``` - -With the ground truth data generated and visualized, we are now ready to construct a Universal Differential Equation (UDE) by replacing the nonlinear term $U^2V$ with a neural network. The next section outlines how we define this hybrid model and train it to recover the reaction dynamics from data. 
- -## Universal Differential Equation (UDE) Formulation - -In the original Brusselator model, the nonlinear reaction term \( U^2V \) governs key dynamic behavior. In our UDE approach, we replace this known term with a trainable neural network \( \mathcal{N}_\theta(U, V) \), where \( \theta \) are the learnable parameters. - -The resulting system becomes: - -$$ -\frac{\partial U}{\partial t} = 1 + \mathcal{N}_\theta(U, V) - 4.4U + \alpha \nabla^2 U + f(x, y, t) -$$ - -$$ -\frac{\partial V}{\partial t} = 3.4U - \mathcal{N}_\theta(U, V) + \alpha \nabla^2 V -$$ - -Here, $\mathcal{N}_\theta(U, V)$ is trained to approximate the true interaction term $U^2V$ using simulation data. This hybrid formulation allows us to recover unknown or partially known physical processes while preserving the known structural components of the PDE. - -First, we have to define and configure the neural network that has to be used for the training. The implementation for that is as follows: - -```@example bruss -import Lux, Random, Optimization as OPT, OptimizationOptimJL as OOJ, - SciMLSensitivity as SMS, Zygote - -model = Lux.Chain(Lux.Dense(2 => 16, tanh), Lux.Dense(16 => 1)) -rng = Random.default_rng() -ps_init, st = Lux.setup(rng, model) -ps_init = CA.ComponentArray(ps_init) -``` - -We use a simple fully connected neural network with one hidden layer of 16 tanh-activated units to approximate the nonlinear interaction term. - -To ensure consistency between the ground truth simulation and the learned Universal Differential Equation (UDE) model, we preserve the same spatial discretization scheme used in the original ODEProblem. This includes: - - - the finite difference Laplacian, - - periodic boundary conditions, and - - the external forcing function. - -The only change lies in the replacement of the known nonlinear term $U^2V$ with a neural network approximation -$\mathcal{N}_\theta(U, V)$. This design enables the UDE to learn complex or unknown dynamics from data while maintaining the underlying physical structure of the system. - -The function below implements this hybrid formulation: - -```@example bruss -function pde_ude!(du, u, ps_nn, t) - αdx = alpha / dx^2 - for I in CartesianIndices((N_GRID, N_GRID)) - i, j = Tuple(I) - x, y = XYD[i], XYD[j] - ip1, im1 = limit(i+1, N_GRID), limit(i-1, N_GRID) - jp1, jm1 = limit(j+1, N_GRID), limit(j-1, N_GRID) - U, V = u[i, j, 1], u[i, j, 2] - ΔU = u[im1, j, 1] + u[ip1, j, 1] + u[i, jp1, 1] + u[i, jm1, 1] - 4.0f0 * U - ΔV = u[im1, j, 2] + u[ip1, j, 2] + u[i, jp1, 2] + u[i, jm1, 2] - 4.0f0 * V - nn_val, _ = model([U, V], ps_nn, st) - val = nn_val[1] - du[i, j, 1] = αdx*ΔU + B + val - (A+1.0f0)*U + brusselator_f(x, y, t) - du[i, j, 2] = αdx*ΔV + A*U - val - end -end -prob_ude_template = ODE.ODEProblem(pde_ude!, u0, tspan, ps_init) -``` - -## Loss Function and Optimization - -To train the neural network -$\mathcal{N}_\theta(U, V)$ embedded in the UDE, we define a loss function that measures how closely the solution of the UDE matches the ground truth data generated earlier. - -The loss is computed as the sum of squared errors between the predicted solution from the UDE and the true solution at each saved time point. If the solver fails (e.g., due to numerical instability or incorrect parameters), we return an infinite loss to discard that configuration during optimization. We use `FBDF()` as the solver due to the stiff nature of the brusselators euqation. Other solvers like `KenCarp47()` could also be used. 
- -To efficiently compute gradients of the loss with respect to the neural network parameters, we use an adjoint sensitivity method (`GaussAdjoint`), which performs high-accuracy quadrature-based integration of the adjoint equations. This approach enables scalable and memory-efficient training for stiff PDEs by avoiding full trajectory storage while maintaining accurate gradient estimates. - -The loss function and initial evaluation are implemented as follows: - -```@example bruss -println("[Loss] Defining loss function...") -function loss_fn(ps, _) - prob = ODE.remake(prob_ude_template, p = ps) - sol = ODE.solve(prob, ODE.FBDF(), saveat = t_points) - # Failed solve - if !SciMLBase.successful_retcode(sol) - return Inf32 - end - pred = Array(sol) - lval = sum(abs2, pred .- u_true) / length(u_true) - return lval -end -``` - -Once the loss function is defined, we use the ADAM optimizer to train the neural network. The optimization problem is defined using SciML's `Optimization.jl` tools, and gradients are computed via automatic differentiation using `AutoZygote()` from `SciMLSensitivity`: - -```@example bruss -println("[Training] Starting optimization...") -import OptimizationOptimisers as OPO -optf = OPT.OptimizationFunction(loss_fn, SMS.AutoZygote()) -optprob = OPT.OptimizationProblem(optf, ps_init) -loss_history = Float32[] - -callback = (ps, l) -> begin - push!(loss_history, l) - println("Epoch $(length(loss_history)): Loss = $l") - false -end -``` - -Finally to run everything: - -```@example bruss -res = OPT.solve(optprob, OPO.Optimisers.Adam(0.01), callback = callback, maxiters = 100) -``` - -```@example bruss -res.objective -``` - -```@example bruss -println("[Plot] Final U/V comparison plots...") -center = N_GRID ÷ 2 -sol_final = ODE.solve(ODE.remake(prob_ude_template, p = res.u), ODE.FBDF(), saveat = t_points) -pred = Array(sol_final) - -p1 = Plots.plot(t_points, u_true[center, center, 1, :], lw = 2, label = "U True") -Plots.plot!(p1, t_points, pred[center, center, 1, :], lw = 2, ls = :dash, label = "U Pred") -Plots.title!(p1, "Center U Concentration Over Time") - -p2 = Plots.plot(t_points, u_true[center, center, 2, :], lw = 2, label = "V True") -Plots.plot!(p2, t_points, pred[center, center, 2, :], lw = 2, ls = :dash, label = "V Pred") -Plots.title!(p2, "Center V Concentration Over Time") - -Plots.plot(p1, p2, layout = (1, 2), size = (900, 400)) -``` - -## Results and Conclusion - -After training the Universal Differential Equation (UDE), we compared the predicted dynamics to the ground truth for both chemical species. - -The low training loss shows us that the neural network in the UDE was able to understand the underlying dynamics, and it was able to learn the $U^2V$ term in the partial differential equation. diff --git a/docs/src/examples/pde/gray_scott.md b/docs/src/examples/pde/gray_scott.md new file mode 100644 index 000000000..dddc6c055 --- /dev/null +++ b/docs/src/examples/pde/gray_scott.md @@ -0,0 +1,435 @@ +# Learning Non-Linear Reaction Dynamics for the Gray–Scott Reaction–Diffusion Model using Universal Differential Equations + +## Introduction +The Gray–Scott model is a prototypical reaction–diffusion system known for generating a wide variety of spatial patterns — from spots and stripes to labyrinthine structures — driven purely by simple chemical kinetics and diffusion. 
In this tutorial, we’ll employ a Universal Differential Equation (UDE) framework: embedding a small neural network within the PDE’s reaction term to learn unknown dynamics from data, while retaining the known diffusion physics.

!!! note

    The data generation and training process deliberately avoids the chaotic regime of this equation, sidestepping the fitting issues that arise in that case. See
    [How chaotic is chaos? How some AI for Science / SciML papers are overstating accuracy claims](https://www.stochasticlifestyle.com/how-chaotic-is-chaos-how-some-ai-for-science-sciml-papers-are-overstating-accuracy-claims/) or
    [Scientific Machine Learning of Chaotic Systems Discovers Governing Equations for Neural Populations](https://arxiv.org/abs/2507.03631) for more details.

## Equations of the Gray-Scott Model

!!! note

    For more information on the semi-discretization of the Gray-Scott model for performance, see
    [the DifferentialEquations.jl tutorial](https://docs.sciml.ai/DiffEqDocs/stable/tutorials/faster_ode_example/#Example-Accelerating-Linear-Algebra-PDE-Semi-Discretization).

The system is governed by the coupled PDEs:

```math
\frac{\partial u}{\partial t} = D_1\,\nabla^2 u + \frac{a\,u^2}{v} + \bar{u} - \alpha\,u
```

```math
\frac{\partial v}{\partial t} = D_2\,\nabla^2 v + a\,u^2 - \beta\,v
```

where $u$ and $v$ are the two chemical concentrations, $D_1$ and $D_2$ are diffusion coefficients, and $a$, $\bar{u}$, $\alpha$, $\beta$ are reaction parameters.

In its spatially discretized form (using Neumann boundary conditions and the tridiagonal stencil $[1, -2, 1]$), the Gray–Scott PDE reduces to:

```math
du = D_1 (A_y u + u A_x) + \frac{a u^2}{v} + \bar{u} - \alpha u
```

```math
dv = D_2 (A_y v + v A_x) + a u^2 - \beta v
```

Here $A_x$ and $A_y$ are the 1D Laplacian matrices for the x- and y-directions, respectively.

Now we will dive into the implementation of the UDE.

## Ground-Truth Data Generation

```@example gray_scott
using LinearAlgebra, DifferentialEquations
using Plots

const N = 16
a = 1.0
α = 1.0
ubar = 1.0
β = 10.0
D1 = 0.001
D2 = 100.0

# 1D Laplacian stencil matrices; the modified edge entries implement the
# Neumann (no-flux) boundary conditions.
Ax = Array(Tridiagonal([1.0 for i in 1:(N - 1)], [-2.0 for i in 1:N],
    [1.0 for i in 1:(N - 1)]))
Ay = copy(Ax)
Ax[2, 1] = 2.0
Ax[end - 1, end] = 2.0
Ay[1, 2] = 2.0
Ay[end, end - 1] = 2.0

# Homogeneous steady state, used to build the initial condition
# (a small random perturbation is added to u).
uss = (ubar + β) / α
vss = (a / β) * uss^2
r0 = zeros(N, N, 2)
r0[:, :, 1] .= uss .+ 0.1 .* rand.()
r0[:, :, 2] .= vss
```

Having set up the grid, parameters, and initial condition, we now generate “ground truth” data by integrating the pure-physics Gray–Scott model. This dataset will serve as the target that our UDE aims to learn.
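Before writing out the loop-based right-hand side, it can be reassuring to confirm that the matrix form $A_y u + u A_x$ used in the equations above really does reproduce the five-point Laplacian stencil. The short check below is purely illustrative and not part of the original tutorial pipeline; the names `U`, `lap_matrix`, and `lap_stencil` are introduced only for this demonstration.

```@example gray_scott
# Illustrative check: Ay*U + U*Ax matches the five-point stencil at an interior point.
U = r0[:, :, 1]
lap_matrix = Ay * U + U * Ax
lap_stencil = U[1, 2] + U[3, 2] + U[2, 1] + U[2, 3] - 4U[2, 2]  # stencil at grid point (2, 2)
isapprox(lap_matrix[2, 2], lap_stencil)
```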
+ +```@example gray_scott +function fast_gm!(du, u, p, t) + p = nothing + @inbounds for j in 2:(N - 1), i in 2:(N - 1) + du[i, j, 1] = D1 * + (u[i - 1, j, 1] + u[i + 1, j, 1] + u[i, j + 1, 1] + u[i, j - 1, 1] - + 4u[i, j, 1]) + + a * u[i, j, 1]^2 / u[i, j, 2] + ubar - α * u[i, j, 1] + end + + @inbounds for j in 2:(N - 1), i in 2:(N - 1) + du[i, j, 2] = D2 * + (u[i - 1, j, 2] + u[i + 1, j, 2] + u[i, j + 1, 2] + u[i, j - 1, 2] - + 4u[i, j, 2]) + + a * u[i, j, 1]^2 - β * u[i, j, 2] + end + + @inbounds for j in 2:(N - 1) + i = 1 + du[1, j, 1] = D1 * + (2u[i + 1, j, 1] + u[i, j + 1, 1] + u[i, j - 1, 1] - 4u[i, j, 1]) + + a * u[i, j, 1]^2 / u[i, j, 2] + ubar - α * u[i, j, 1] + end + @inbounds for j in 2:(N - 1) + i = 1 + du[1, j, 2] = D2 * + (2u[i + 1, j, 2] + u[i, j + 1, 2] + u[i, j - 1, 2] - 4u[i, j, 2]) + + a * u[i, j, 1]^2 - β * u[i, j, 2] + end + @inbounds for j in 2:(N - 1) + i = N + du[end, j, 1] = D1 * + (2u[i - 1, j, 1] + u[i, j + 1, 1] + u[i, j - 1, 1] - 4u[i, j, 1]) + + a * u[i, j, 1]^2 / u[i, j, 2] + ubar - α * u[i, j, 1] + end + @inbounds for j in 2:(N - 1) + i = N + du[end, j, 2] = D2 * + (2u[i - 1, j, 2] + u[i, j + 1, 2] + u[i, j - 1, 2] - 4u[i, j, 2]) + + a * u[i, j, 1]^2 - β * u[i, j, 2] + end + + @inbounds for i in 2:(N - 1) + j = 1 + du[i, 1, 1] = D1 * + (u[i - 1, j, 1] + u[i + 1, j, 1] + 2u[i, j + 1, 1] - 4u[i, j, 1]) + + a * u[i, j, 1]^2 / u[i, j, 2] + ubar - α * u[i, j, 1] + end + @inbounds for i in 2:(N - 1) + j = 1 + du[i, 1, 2] = D2 * + (u[i - 1, j, 2] + u[i + 1, j, 2] + 2u[i, j + 1, 2] - 4u[i, j, 2]) + + a * u[i, j, 1]^2 - β * u[i, j, 2] + end + @inbounds for i in 2:(N - 1) + j = N + du[i, end, 1] = D1 * + (u[i - 1, j, 1] + u[i + 1, j, 1] + 2u[i, j - 1, 1] - 4u[i, j, 1]) + + a * u[i, j, 1]^2 / u[i, j, 2] + ubar - α * u[i, j, 1] + end + @inbounds for i in 2:(N - 1) + j = N + du[i, end, 2] = D2 * + (u[i - 1, j, 2] + u[i + 1, j, 2] + 2u[i, j - 1, 2] - 4u[i, j, 2]) + + a * u[i, j, 1]^2 - β * u[i, j, 2] + end + + @inbounds begin + i = 1 + j = 1 + du[1, 1, 1] = D1 * (2u[i + 1, j, 1] + 2u[i, j + 1, 1] - 4u[i, j, 1]) + + a * u[i, j, 1]^2 / u[i, j, 2] + ubar - α * u[i, j, 1] + du[1, 1, 2] = D2 * (2u[i + 1, j, 2] + 2u[i, j + 1, 2] - 4u[i, j, 2]) + + a * u[i, j, 1]^2 - β * u[i, j, 2] + + i = 1 + j = N + du[1, N, 1] = D1 * (2u[i + 1, j, 1] + 2u[i, j - 1, 1] - 4u[i, j, 1]) + + a * u[i, j, 1]^2 / u[i, j, 2] + ubar - α * u[i, j, 1] + du[1, N, 2] = D2 * (2u[i + 1, j, 2] + 2u[i, j - 1, 2] - 4u[i, j, 2]) + + a * u[i, j, 1]^2 - β * u[i, j, 2] + + i = N + j = 1 + du[N, 1, 1] = D1 * (2u[i - 1, j, 1] + 2u[i, j + 1, 1] - 4u[i, j, 1]) + + a * u[i, j, 1]^2 / u[i, j, 2] + ubar - α * u[i, j, 1] + du[N, 1, 2] = D2 * (2u[i - 1, j, 2] + 2u[i, j + 1, 2] - 4u[i, j, 2]) + + a * u[i, j, 1]^2 - β * u[i, j, 2] + + i = N + j = N + du[end, end, 1] = D1 * (2u[i - 1, j, 1] + 2u[i, j - 1, 1] - 4u[i, j, 1]) + + a * u[i, j, 1]^2 / u[i, j, 2] + ubar - α * u[i, j, 1] + du[end, end, 2] = D2 * (2u[i - 1, j, 2] + 2u[i, j - 1, 2] - 4u[i, j, 2]) + + a * u[i, j, 1]^2 - β * u[i, j, 2] + end +end + +prob = ODEProblem(fast_gm!, r0, (0.0, 10), p = nothing) +solution_true = solve(prob, FBDF(), saveat=0.5, abstol=1e-6, reltol=1e-6); +``` + +With the ground-truth solutions computed, we can now proceed to visualize the spatiotemporal evolution of $u$ and $v$ on the grid. 
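Besides the center time series plotted below, a spatial snapshot helps to see the pattern itself. The heatmap below is an optional, illustrative addition (it assumes the `solution_true` object computed above):

```@example gray_scott
# Illustrative spatial view: heatmap of the u field at the final saved time.
U_final = solution_true.u[end][:, :, 1]
heatmap(U_final, title = "u(x, y) at t = 10", xlabel = "x index", ylabel = "y index")
```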
```@example gray_scott
tsteps = 0:0.5:10
println("Plotting ground truth concentrations at the grid center...")
c = N ÷ 2
p1 = plot(tsteps, solution_true[c, c, 1, :], lw = 2, label = "U True", color = :blue,
    title = "Ground Truth: Center U")
xlabel!("Time"); ylabel!("Concentration")
p2 = plot(tsteps, solution_true[c, c, 2, :], lw = 2, label = "V True", color = :red,
    title = "Ground Truth: Center V")
xlabel!("Time")
p_center_combined = plot(p1, p2, layout = (1, 2), size = (900, 350))
```

## Defining the UDE

Now that we have an understanding of the data and its visualization, we can define the neural network and the UDE structure. We replace the $\frac{a u^2}{v} + \bar{u} - \alpha u$ reaction term with a neural network, giving the resulting ODEs:

```math
du = D_1 (A_y u + u A_x) + \mathcal{N}_\theta(u, v)
```

```math
dv = D_2 (A_y v + v A_x) + a u^2 - \beta v
```

The first step is to define the neural network structure.

```@example gray_scott
using Lux
using Random
using ComponentArrays
neural_network = Lux.Chain(Dense(2, 16, tanh), Dense(16, 16, tanh), Dense(16, 1, tanh))
ps, st = Lux.setup(Random.default_rng(), neural_network)
ps = ComponentArray(ps)
global global_st = st
```

The following code describes the UDE formulation with the neural network predictions embedded, as well as a function wrapper to make sure the arrays work with `Optimization.jl`.

```@example gray_scott
function ude!(du, u, p, t)
    # Interior points: diffusion plus the learned reaction term for u,
    # and the known kinetics for v.
    @inbounds for j in 2:(N - 1), i in 2:(N - 1)
        u_val = u[i, j, 1]
        v_val = u[i, j, 2]
        input = [u_val, v_val]
        nn_out, _ = Lux.apply(neural_network, input, p, global_st)
        du[i, j, 1] = D1 *
                      (u[i - 1, j, 1] + u[i + 1, j, 1] + u[i, j + 1, 1] + u[i, j - 1, 1] -
                       4u[i, j, 1]) +
                      nn_out[1]
    end

    @inbounds for j in 2:(N - 1), i in 2:(N - 1)
        du[i, j, 2] = D2 *
                      (u[i - 1, j, 2] + u[i + 1, j, 2] + u[i, j + 1, 2] + u[i, j - 1, 2] -
                       4u[i, j, 2]) +
                      a * u[i, j, 1]^2 - β * u[i, j, 2]
    end

    # Boundary rows i = 1 and i = N (Neumann/no-flux).
    @inbounds for j in 2:(N - 1)
        i = 1
        u_val = u[i, j, 1]
        v_val = u[i, j, 2]
        input = [u_val, v_val]
        nn_out, _ = Lux.apply(neural_network, input, p, global_st)
        du[1, j, 1] = D1 *
                      (2u[i + 1, j, 1] + u[i, j + 1, 1] + u[i, j - 1, 1] - 4u[i, j, 1]) +
                      nn_out[1]
    end
    @inbounds for j in 2:(N - 1)
        i = 1
        du[1, j, 2] = D2 *
                      (2u[i + 1, j, 2] + u[i, j + 1, 2] + u[i, j - 1, 2] - 4u[i, j, 2]) +
                      a * u[i, j, 1]^2 - β * u[i, j, 2]
    end
    @inbounds for j in 2:(N - 1)
        i = N
        u_val = u[i, j, 1]
        v_val = u[i, j, 2]
        input = [u_val, v_val]
        nn_out, _ = Lux.apply(neural_network, input, p, global_st)
        du[end, j, 1] = D1 *
                        (2u[i - 1, j, 1] + u[i, j + 1, 1] + u[i, j - 1, 1] - 4u[i, j, 1]) +
                        nn_out[1]
    end
    @inbounds for j in 2:(N - 1)
        i = N
        du[end, j, 2] = D2 *
                        (2u[i - 1, j, 2] + u[i, j + 1, 2] + u[i, j - 1, 2] - 4u[i, j, 2]) +
                        a * u[i, j, 1]^2 - β * u[i, j, 2]
    end

    # Boundary columns j = 1 and j = N (Neumann/no-flux).
    @inbounds for i in 2:(N - 1)
        j = 1
        u_val = u[i, j, 1]
        v_val = u[i, j, 2]
        input = [u_val, v_val]
        nn_out, _ = Lux.apply(neural_network, input, p, global_st)
        du[i, 1, 1] = D1 *
                      (u[i - 1, j, 1] + u[i + 1, j, 1] + 2u[i, j + 1, 1] - 4u[i, j, 1]) +
                      nn_out[1]
    end
    @inbounds for i in 2:(N - 1)
        j = 1
        du[i, 1, 2] = D2 *
                      (u[i - 1, j, 2] + u[i + 1, j, 2] + 2u[i, j + 1, 2] - 4u[i, j, 2]) +
                      a * u[i, j, 1]^2 - β * u[i, j, 2]
    end
    @inbounds for i in 2:(N - 1)
        j = N
        u_val = u[i, j, 1]
        v_val = u[i, j, 2]
        input = [u_val, v_val]
        nn_out, _ = Lux.apply(neural_network, input, p, global_st)
        du[i, end, 1] = D1 *
                        (u[i - 1, j, 1] + u[i + 1, j, 1] + 2u[i, j - 1, 1]
                         - 4u[i, j, 1]) +
                        nn_out[1]
    end
    @inbounds for i in 2:(N - 1)
        j = N
        du[i, end, 2] = D2 *
                        (u[i - 1, j, 2] + u[i + 1, j, 2] + 2u[i, j - 1, 2] - 4u[i, j, 2]) +
                        a * u[i, j, 1]^2 - β * u[i, j, 2]
    end

    # The four corner points.
    @inbounds begin
        i = 1
        j = 1
        u_val = u[i, j, 1]
        v_val = u[i, j, 2]
        input = [u_val, v_val]
        nn_out, _ = Lux.apply(neural_network, input, p, global_st)
        du[1, 1, 1] = D1 * (2u[i + 1, j, 1] + 2u[i, j + 1, 1] - 4u[i, j, 1]) +
                      nn_out[1]
        du[1, 1, 2] = D2 * (2u[i + 1, j, 2] + 2u[i, j + 1, 2] - 4u[i, j, 2]) +
                      a * u[i, j, 1]^2 - β * u[i, j, 2]

        i = 1
        j = N
        u_val = u[i, j, 1]
        v_val = u[i, j, 2]
        input = [u_val, v_val]
        nn_out, _ = Lux.apply(neural_network, input, p, global_st)
        du[1, N, 1] = D1 * (2u[i + 1, j, 1] + 2u[i, j - 1, 1] - 4u[i, j, 1]) +
                      nn_out[1]
        du[1, N, 2] = D2 * (2u[i + 1, j, 2] + 2u[i, j - 1, 2] - 4u[i, j, 2]) +
                      a * u[i, j, 1]^2 - β * u[i, j, 2]

        i = N
        j = 1
        u_val = u[i, j, 1]
        v_val = u[i, j, 2]
        input = [u_val, v_val]
        nn_out, _ = Lux.apply(neural_network, input, p, global_st)
        du[N, 1, 1] = D1 * (2u[i - 1, j, 1] + 2u[i, j + 1, 1] - 4u[i, j, 1]) +
                      nn_out[1]
        du[N, 1, 2] = D2 * (2u[i - 1, j, 2] + 2u[i, j + 1, 2] - 4u[i, j, 2]) +
                      a * u[i, j, 1]^2 - β * u[i, j, 2]

        i = N
        j = N
        u_val = u[i, j, 1]
        v_val = u[i, j, 2]
        input = [u_val, v_val]
        nn_out, _ = Lux.apply(neural_network, input, p, global_st)
        du[end, end, 1] = D1 * (2u[i - 1, j, 1] + 2u[i, j - 1, 1] - 4u[i, j, 1]) +
                          nn_out[1]
        du[end, end, 2] = D2 * (2u[i - 1, j, 2] + 2u[i, j - 1, 2] - 4u[i, j, 2]) +
                          a * u[i, j, 1]^2 - β * u[i, j, 2]
    end
end

# Wrapper so the ODE state can be a flat vector, which is what the
# Optimization.jl / sensitivity machinery expects.
const nstate = N * N * 2
function ude_wrapper!(du, u, p, t)
    # View the flat state and reshape it into the (N, N, 2) layout used by `ude!`.
    du_state = @view du[1:nstate]
    u_state = @view u[1:nstate]
    du3 = reshape(du_state, N, N, 2)
    u3 = reshape(u_state, N, N, 2)

    # Call the in-place UDE right-hand side.
    ude!(du3, u3, p, t)

    return nothing
end
```

## Loss Function and Optimization

The loss is the sum of squared differences between the spatial mean of $u$ predicted by the UDE and the corresponding ground-truth means at each saved time point. If the solver fails, an infinite loss is returned so that the offending parameter set is rejected.

```@example gray_scott
function loss_fn(θ)
    sol = solve(prob_ude, FBDF(), saveat = tsteps, p = θ, abstol = 1e-6, reltol = 1e-6)
    if sol.retcode != :Success
        println("Solver failed at some iteration.")
        return Inf
    end
    pred_u = [mean(reshape(s, N, N, 2)[:, :, 1]) for s in sol.u]
    loss = sum(abs2, pred_u .- data_u)
    return loss
end
```

The callback records the loss at every iteration in `loss_hist` and prints the progress:

```@example gray_scott
loss_hist = Float64[]

function training_callback(state, loss)
    push!(loss_hist, loss)
    epoch = length(loss_hist)
    println("Callback @ Epoch = $(epoch), Loss = $(round(loss, digits=5))")
    return false # don't stop training
end
```

We reshape the initial condition tensor `r0` into a 1D vector to define the initial state of the ODE system. Then, we construct the `ODEProblem` using the UDE dynamics and extract the time series of average `U` and `V` concentrations from the ground-truth solution to serve as training targets.

```@example gray_scott
u0_vec = reshape(r0, :)
ode_fn = ODEFunction(ude_wrapper!, jac = nothing)
prob_ude = ODEProblem(ode_fn, u0_vec, (0.0, 10.0), ps)

using Statistics
using SciMLSensitivity
data_u = [mean(s[:, :, 1]) for s in solution_true.u]
data_v = [mean(s[:, :, 2]) for s in solution_true.u]
```

Finally, we create the optimization problem and solve the UDE.
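Before launching the full optimization, it can be useful to evaluate the loss once with the untrained parameters. This is an optional, illustrative sanity check that the forward solve and the loss pipeline run end to end:

```@example gray_scott
# One forward evaluation of the loss with the untrained network parameters.
loss_fn(ps)
```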
```@example gray_scott
using Optimization, OptimizationOptimisers, Zygote

# Define the OptimizationFunction; gradients are computed with Zygote (AutoZygote)
optf = OptimizationFunction((θ, p) -> loss_fn(θ), Optimization.AutoZygote())

# Create the optimization problem with the current parameters `ps`
optprob = OptimizationProblem(optf, ps)

# Solve using Adam
res = solve(optprob, OptimizationOptimisers.Adam(0.003); maxiters = 10000,
    callback = training_callback)
res.objective
```

## Results and Conclusion

We can visualize the final results of the UDE against the ground-truth data with the following code.

```@example gray_scott
# Final model prediction and plot
sol_pred = solve(prob_ude, FBDF(), saveat = tsteps, p = res.u, abstol = 1e-6, reltol = 1e-6)
pred_u = [mean(reshape(s, N, N, 2)[:, :, 1]) for s in sol_pred.u]

plot(tsteps, data_u; lw = 2, label = "True ⟨U⟩", color = :blue)
plot!(tsteps, pred_u; lw = 2, label = "UDE ⟨U⟩", color = :red, ls = :dash)
xlabel!("Time"); ylabel!("Mean U"); title!("Training Fit")
```

The UDE's predicted mean concentration of $u$ tracks the ground-truth data closely, indicating that the neural network has successfully learned the nonlinear reaction term it replaced.
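As a final, purely illustrative diagnostic (not part of the original tutorial), we can compare the learned reaction term $\mathcal{N}_\theta(u, v)$ against the true kinetics $a u^2 / v + \bar{u} - \alpha u$ that it replaced, sweeping $u$ around the steady state with $v$ held at `vss`:

```@example gray_scott
# Compare the trained network with the reaction term it replaced (illustrative).
u_range = range(0.9 * uss, stop = 1.1 * uss, length = 50)
true_term = [a * u^2 / vss + ubar - α * u for u in u_range]
learned_term = [first(Lux.apply(neural_network, [u, vss], res.u, global_st)[1]) for u in u_range]

plot(u_range, true_term; lw = 2, label = "True a*u^2/v + ubar - α*u")
plot!(u_range, learned_term; lw = 2, ls = :dash, label = "Learned N_θ(u, v)")
xlabel!("u (with v = vss)"); ylabel!("Reaction term")
```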