diff --git a/justfile b/justfile
index 200f3114..0de109f8 100644
--- a/justfile
+++ b/justfile
@@ -138,7 +138,7 @@ benchmark-local:
 # Show recent local benchmark results.
 [group('benchmarking')]
 show-benchmark-results:
-    @uv run --group=benchmark-local pytest-benchmark compare --group-by=fullname --sort=name --columns=mean
+    @uv run pytest-benchmark compare --group-by=fullname --sort=name --columns=mean
 
 # Run benchmarks using Codspeed. This only works in CI.
 [group('benchmarking')]
diff --git a/tests/benchmarking/adaptors.py b/tests/benchmarking/adaptors.py
deleted file mode 100644
index 93bb9134..00000000
--- a/tests/benchmarking/adaptors.py
+++ /dev/null
@@ -1,20 +0,0 @@
-from grimp.adaptors.caching import Cache, CacheMiss
-from typing import Set
-
-from grimp.application.ports.modulefinder import ModuleFile
-from grimp.domain.valueobjects import DirectImport
-
-
-class PrefixMissingCache(Cache):
-    """
-    Test double of the real cache that will miss caching any module that begins with
-    a special prefix.
-    """
-
-    MISSING_PREFIX = "miss_marker_8772f06d64b6_"  # Arbitrary prefix.
-
-    def read_imports(self, module_file: ModuleFile) -> Set[DirectImport]:
-        leaf_name = module_file.module.name.split(".")[-1]
-        if leaf_name.startswith(self.MISSING_PREFIX):
-            raise CacheMiss
-        return super().read_imports(module_file)
diff --git a/tests/benchmarking/test_benchmarking.py b/tests/benchmarking/test_benchmarking.py
index f1a58bc6..5e5b4517 100644
--- a/tests/benchmarking/test_benchmarking.py
+++ b/tests/benchmarking/test_benchmarking.py
@@ -1,18 +1,14 @@
+import uuid
+import random
 import pytest
 import json
 import importlib
 from pathlib import Path
-from tests.config import override_settings
 from grimp.application.graph import ImportGraph
 from grimp import PackageDependency, Route
 import grimp
 from copy import deepcopy
 
-from .adaptors import PrefixMissingCache
-
-
-def _run_benchmark(benchmark, fn, *args, **kwargs):
-    return benchmark(fn, *args, **kwargs)
 
 
 @pytest.fixture(scope="module")
@@ -59,14 +55,16 @@ def large_graph():
                         middle=(),
                         tails=frozenset(
                             {
-                                "mypackage.application.7537183614.6928774480.5676105139.3275676604"  # noqa:E501
+                                "mypackage.application.7537183614.6928774480.5676105139.3275676604"
+                                # noqa:E501
                             }
                         ),
                     ),
                     Route(
                         heads=frozenset(
                             {
-                                "mypackage.domain.6928774480.5676105139.1330171288.7588443317.4661445087"  # noqa:E501
+                                "mypackage.domain.6928774480.5676105139.1330171288.7588443317.4661445087"
+                                # noqa:E501
                             }
                         ),
                         middle=(),
@@ -87,7 +85,8 @@
                     Route(
                         heads=frozenset(
                             {
-                                "mypackage.domain.6928774480.1028759677.7960519247.2888779155.7486857426"  # noqa:E501
+                                "mypackage.domain.6928774480.1028759677.7960519247.2888779155.7486857426"
+                                # noqa:E501
                             }
                         ),
                         middle=(),
@@ -139,15 +138,19 @@
                     Route(
                         heads=frozenset(
                             {
-                                "mypackage.application.7537183614.2538372545.1153384736.6297289996",  # noqa:E501
-                                "mypackage.application.7537183614.2538372545.1153384736.6404547812.6297289996",  # noqa:E501
+                                "mypackage.application.7537183614.2538372545.1153384736.6297289996",
+                                # noqa:E501
+                                "mypackage.application.7537183614.2538372545.1153384736.6404547812.6297289996",
+                                # noqa:E501
                             }
                         ),
                         middle=("mypackage.6398020133.9075581450.6529869526.6297289996",),
                         tails=frozenset(
                             {
-                                "mypackage.plugins.5634303718.6180716911.7582995238.1039461003.2943193489",  # noqa:E501
-                                "mypackage.plugins.5634303718.6180716911.7582995238.1039461003.6322703811",  # noqa:E501
+                                "mypackage.plugins.5634303718.6180716911.7582995238.1039461003.2943193489",
+                                # noqa:E501
"mypackage.plugins.5634303718.6180716911.7582995238.1039461003.6322703811", + # noqa:E501 } ), ) @@ -300,7 +303,7 @@ def test_build_django_uncached(benchmark): In this benchmark, the cache is turned off. """ - _run_benchmark(benchmark, grimp.build_graph, "django", cache_dir=None) + benchmark(grimp.build_graph, "django", cache_dir=None) def test_build_django_from_cache_no_misses(benchmark): @@ -312,7 +315,7 @@ def test_build_django_from_cache_no_misses(benchmark): # Populate the cache first, before beginning the benchmark. grimp.build_graph("django") - _run_benchmark(benchmark, grimp.build_graph, "django") + benchmark(grimp.build_graph, "django") @pytest.mark.parametrize( @@ -323,36 +326,65 @@ def test_build_django_from_cache_no_misses(benchmark): 350, # Around half the Django codebase. ), ) -def test_build_django_from_cache_a_few_misses(benchmark, number_of_misses): +def test_build_django_from_cache_a_few_misses(benchmark, number_of_misses: int): """ Benchmarks building a graph of real package - in this case Django. This benchmark utilizes the cache except for a few modules, which we add. """ - # We must use a special cache class, otherwise the cache will be populated - # by the first iteration. It would be better to do this using a setup function, - # which is supported by pytest-benchmark's pedantic mode, but not codspeed. - # This won't give us a truly accurate picture, but it's better than nothing. + # We need to take care in benchmarking partially populated caches, because + # the benchmark may run multiple times, depending on the context in which it's run. + # If we're not careful, the cache will be populated the first time and not reset + # in subsequent runs. + # + # The benchmark fixture available here is either from pytest-benchmark (used locally) + # or pytest-codspeed (used in CI). Here's how they both behave: + # + # - pytest-benchmark: By default, dynamically decides how many times to run the benchmark. + # It does this to improve benchmarking of very fast single runs: it will run + # the code many times and average the total time. That's not so important + # for code that takes orders of magnitude longer than the timer resolution. + # + # We can override this by using benchmark.pedantic, where we specify the + # number of rounds and iterations. Each iteration contains multiple rounds. + # + # It's also possible to provide a setup function that runs + # between each iteration. A teardown function will be in the next release + # but isn't in pytest-benchmark<=5.1.0. + # + # - pytest-codspeed: This can run in two modes, CPU instrumentation and wall time. Currently + # we use CPU instrumentation as wall time is only available to + # Github organizations. CPU mode will always run the benchmark once, + # regardless of what rounds and iterations are specified in pedantic mode. + # This mode measures the speed of the benchmark by simulating the CPU, + # rather than timing how long it actually takes, so there is no point in + # running it multiple times and taking an average. + # + # So - in this case, because we are benchmarking a relatively slow piece of code, we explicitly + # turn off multiple runs, which could potentially be misleading when running locally. - # Add some specially-named modules which will be treated as not in the cache. - django_path = Path(importlib.util.find_spec("django").origin).parent - extra_modules = [ - django_path / f"{PrefixMissingCache.MISSING_PREFIX}{i}.py" for i in range(number_of_misses) - ] - # Use some real python, which will take time to parse. 
+    # Populate the cache first, before beginning the benchmark.
+    grimp.build_graph("django")
+    # Add some modules which won't be in the cache.
+    # (Use some real python, which will take time to parse.)
+    django_path = Path(importlib.util.find_spec("django").origin).parent  # type: ignore
     module_to_copy = django_path / "forms" / "forms.py"
     module_contents = module_to_copy.read_text()
-    for extra_module in extra_modules:
-        extra_module.write_text(module_contents)
-
-    with override_settings(CACHE_CLASS=PrefixMissingCache):
-        # Populate the cache.
-        grimp.build_graph("django")
+    extra_modules = [
+        django_path / f"module_{i}_{random.randint(100000, 999999)}.py"
+        for i in range(number_of_misses)
+    ]
+    for new_module in extra_modules:
+        # Make sure the module contents aren't identical. Depending on how the cache is implemented,
+        # perhaps this could make a difference.
+        hash_buster = f"\n# Hash busting comment: {uuid.uuid4()}"
+        new_module.write_text(module_contents + hash_buster)
 
-        _run_benchmark(benchmark, grimp.build_graph, "django")
+    benchmark.pedantic(grimp.build_graph, ["django"], rounds=1, iterations=1)
 
-    # Clean up.
-    [module.unlink() for module in extra_modules]
+    # Delete the modules we just created.
+    for module in extra_modules:
+        module.unlink()
 
 
 class TestFindIllegalDependenciesForLayers:
@@ -371,8 +403,7 @@ def _remove_package_dependencies(graph, package_dependencies):
         return graph
 
     def test_top_level_large_graph_violated(self, large_graph, benchmark):
-        result = _run_benchmark(
-            benchmark,
+        result = benchmark(
             large_graph.find_illegal_dependencies_for_layers,
             layers=TOP_LEVEL_LAYERS,
             containers=("mypackage",),
@@ -383,8 +414,7 @@ def test_top_level_large_graph_kept(self, large_graph, benchmark):
         large_graph = self._remove_package_dependencies(
             large_graph, TOP_LEVEL_PACKAGE_DEPENDENCIES
         )
-        result = _run_benchmark(
-            benchmark,
+        result = benchmark(
             large_graph.find_illegal_dependencies_for_layers,
             layers=TOP_LEVEL_LAYERS,
             containers=("mypackage",),
@@ -392,50 +422,39 @@
         assert result == set()
 
     def test_deep_layers_large_graph_violated(self, large_graph, benchmark):
-        result = _run_benchmark(
-            benchmark, large_graph.find_illegal_dependencies_for_layers, layers=DEEP_LAYERS
-        )
+        result = benchmark(large_graph.find_illegal_dependencies_for_layers, layers=DEEP_LAYERS)
         assert result == DEEP_LAYER_PACKAGE_DEPENDENCIES
 
     def test_deep_layers_large_graph_kept(self, large_graph, benchmark):
         large_graph = self._remove_package_dependencies(
             large_graph, DEEP_LAYER_PACKAGE_DEPENDENCIES
         )
-        result = _run_benchmark(
-            benchmark, large_graph.find_illegal_dependencies_for_layers, layers=DEEP_LAYERS
-        )
+        result = benchmark(large_graph.find_illegal_dependencies_for_layers, layers=DEEP_LAYERS)
         assert result == set()
 
 
 def test_find_descendants(large_graph, benchmark):
-    result = _run_benchmark(benchmark, large_graph.find_descendants, "mypackage")
+    result = benchmark(large_graph.find_descendants, "mypackage")
     assert len(result) == 28222
 
 
 def test_find_downstream_modules(large_graph, benchmark):
-    result = _run_benchmark(
-        benchmark, large_graph.find_downstream_modules, DEEP_LAYERS[0], as_package=True
-    )
+    result = benchmark(large_graph.find_downstream_modules, DEEP_LAYERS[0], as_package=True)
     assert len(result) == 80
 
 
 def test_find_upstream_modules(large_graph, benchmark):
-    result = _run_benchmark(
-        benchmark, large_graph.find_upstream_modules, DEEP_LAYERS[0], as_package=True
-    )
+    result = benchmark(large_graph.find_upstream_modules, DEEP_LAYERS[0], as_package=True)
     assert len(result) == 2159
 
 
 class TestFindShortestChain:
     def test_chain_found(self, large_graph, benchmark):
-        result = _run_benchmark(
-            benchmark, large_graph.find_shortest_chain, DEEP_LAYERS[0], DEEP_LAYERS[1]
-        )
+        result = benchmark(large_graph.find_shortest_chain, DEEP_LAYERS[0], DEEP_LAYERS[1])
         assert result is not None
 
     def test_no_chain(self, large_graph, benchmark):
-        result = _run_benchmark(
-            benchmark,
+        result = benchmark(
             large_graph.find_shortest_chain,
             DEEP_LAYERS[0],
             "mypackage.data.vendors.4053192739.6373932949",
@@ -445,8 +464,7 @@
 
 class TestFindShortestChains:
     def test_chains_found(self, large_graph, benchmark):
-        result = _run_benchmark(
-            benchmark,
+        result = benchmark(
             large_graph.find_shortest_chains,
             DEEP_LAYERS[0],
             DEEP_LAYERS[1],
@@ -455,8 +473,7 @@
         )
         assert len(result) > 0
     def test_no_chains(self, large_graph, benchmark):
-        result = _run_benchmark(
-            benchmark,
+        result = benchmark(
             large_graph.find_shortest_chains,
             DEEP_LAYERS[0],
             "mypackage.data.vendors.4053192739.6373932949",
@@ -489,8 +506,7 @@ def test_chains_found_sparse_imports(self, benchmark):
             graph.add_import(importer=f"a.m{i}", imported=f"b.m{i}")
             graph.add_import(importer=f"b.m{i}", imported=f"c.m{i}")
 
-        result = _run_benchmark(
-            benchmark,
+        result = benchmark(
             graph.find_shortest_chains,
             "a",
             "c",
@@ -500,7 +516,7 @@
 
 
 def test_copy_graph(large_graph, benchmark):
-    _run_benchmark(benchmark, lambda: deepcopy(large_graph))
+    benchmark(lambda: deepcopy(large_graph))
 
 
 def test_modules_property_first_access(large_graph, benchmark):
@@ -512,7 +528,7 @@ def f():
         # Accessing the modules property is what we're benchmarking.
         _ = large_graph.modules
 
-    _run_benchmark(benchmark, f)
+    benchmark(f)
 
 
 def test_modules_property_many_accesses(large_graph, benchmark):
@@ -525,7 +541,7 @@ def f():
         for i in range(1000):
             _ = large_graph.modules
 
-    _run_benchmark(benchmark, f)
+    benchmark(f)
 
 
 def test_get_import_details(benchmark):
@@ -540,19 +556,16 @@ def f():
         for i in range(iterations):
             graph.get_import_details(importer=f"blue_{i}", imported=f"green_{i}")
 
-    _run_benchmark(benchmark, f)
+    benchmark(f)
 
 
 def test_find_matching_modules(benchmark, large_graph):
-    matching_modules = _run_benchmark(
-        benchmark, lambda: large_graph.find_matching_modules("mypackage.domain.**")
-    )
+    matching_modules = benchmark(lambda: large_graph.find_matching_modules("mypackage.domain.**"))
     assert len(matching_modules) == 2519
 
 
 def test_find_matching_direct_imports(benchmark, large_graph):
-    matching_imports = _run_benchmark(
-        benchmark,
+    matching_imports = benchmark(
         lambda: large_graph.find_matching_direct_imports(
             "mypackage.domain.** -> mypackage.data.**"
         ),
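
Note (not part of the patch): the new comment block above mentions that pytest-benchmark's pedantic mode also accepts a setup callable that runs before each round. As a rough, local-only sketch of that alternative, the missing modules could be recreated before every round so each round sees the same cache misses; it would not help under codspeed's CPU-instrumentation mode, which ignores the round count. The module name `cache_miss_example.py`, the round count and the test name below are made up for illustration, assuming pytest-benchmark's documented pedantic API.

    import importlib.util
    import uuid
    from pathlib import Path

    import grimp


    def test_build_django_with_a_few_cache_misses_setup_variant(benchmark):
        grimp.build_graph("django")  # Warm the cache once before measuring.

        django_path = Path(importlib.util.find_spec("django").origin).parent  # type: ignore
        extra_module = django_path / "cache_miss_example.py"  # Hypothetical module name.

        def setup():
            # Rewrite the module with fresh contents so every round misses the cache.
            extra_module.write_text(f"# Hash busting comment: {uuid.uuid4()}\n")

        try:
            # setup runs before each round; iterations must stay at 1 when setup is used.
            benchmark.pedantic(
                grimp.build_graph, args=("django",), setup=setup, rounds=3, iterations=1
            )
        finally:
            extra_module.unlink(missing_ok=True)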