Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 15 additions & 1 deletion Lib/importlib/metadata/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -890,6 +890,14 @@ def search(self, prepared: Prepared):
return itertools.chain(infos, eggs)


# Lookup table that Prepared.normalize passes to str.translate.
# Keys and values are code points: every ASCII uppercase letter maps to
# its lowercase counterpart, and both "-" (hyphen) and "." (dot) map to
# "_" (underscore). Characters that are not listed pass through unchanged.
_normalize_table = {
    ord(source): ord(target)
    for source, target in zip(
        "ABCDEFGHIJKLMNOPQRSTUVWXYZ-.",
        "abcdefghijklmnopqrstuvwxyz__",
    )
}


class Prepared:
"""
A prepared search query for metadata on a possibly-named package.
Expand Down Expand Up @@ -925,7 +933,13 @@ def normalize(name):
"""
PEP 503 normalization plus dashes as underscores.
"""
return re.sub(r"[-_.]+", "-", name).lower().replace('-', '_')
# Emulates ``re.sub(r"[-_.]+", "-", name).lower()`` from PEP 503
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@hugovk I did a quick scan of the 8.34M package names, and 3.17M are purely lowercase with no separators. Given that, I tried adding a fast-path check here before we apply the translation table and found strong improvements in the benchmark. I think the most readable version of the fast path would be:

if name.islower() and name.isalnum():
    return name

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think it's worth it. What Hugo suggested is readable enough.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's not an unreasonable position. My reasoning was that a significant portion of packages (roughly 38%) are already alphanumeric and lowercase. This fast path allows skipping the translation and loop overhead for the most common case. I felt the performance gain for those users justified the small increase in complexity, but I'm happy to defer to your preference on the balance between speed and code footprint.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How much performance gain are we speaking about though?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

They're very close. If anything, the "fast path" seems to be a bit slower :)

# main
❯ ./python.exe -m timeit -s "from importlib.metadata import Prepared" "Prepared.normalize('pillow')"
1000000 loops, best of 5: 390 nsec per loop
❯ ./python.exe -m timeit -s "from importlib.metadata import Prepared" "Prepared.normalize('pillow')"
1000000 loops, best of 5: 393 nsec per loop
# PR
❯ ./python.exe -m timeit -s "from importlib.metadata import Prepared" "Prepared.normalize('pillow')"
5000000 loops, best of 5: 95.8 nsec per loop
❯ ./python.exe -m timeit -s "from importlib.metadata import Prepared" "Prepared.normalize('pillow')"
5000000 loops, best of 5: 96 nsec per loop
# fast path
❯ ./python.exe -m timeit -s "from importlib.metadata import Prepared" "Prepared.normalize('pillow')"
5000000 loops, best of 5: 94.3 nsec per loop
❯ ./python.exe -m timeit -s "from importlib.metadata import Prepared" "Prepared.normalize('pillow')"
5000000 loops, best of 5: 97.5 nsec per loop
❯ hyperfine --warmup 1 --runs 3 \
--prepare "git checkout main" "./python.exe benchmark_names_stdlib.py # main" \
--prepare "git checkout 3.15-importlib.metadata-canonicalize_name" "./python.exe benchmark_names_stdlib.py # PR" \
--prepare "git checkout 3.15-importlib.metadata-canonicalize_name-fast-path" "./python.exe benchmark_names_stdlib.py # fast path"
Benchmark 1: ./python.exe benchmark_names_stdlib.py # main
  Time (mean ± σ):      5.633 s ±  0.046 s    [User: 5.491 s, System: 0.101 s]
  Range (min … max):    5.592 s …  5.683 s    3 runs

Benchmark 2: ./python.exe benchmark_names_stdlib.py # PR
  Time (mean ± σ):      1.879 s ±  0.026 s    [User: 1.783 s, System: 0.081 s]
  Range (min … max):    1.858 s …  1.907 s    3 runs

Benchmark 3: ./python.exe benchmark_names_stdlib.py # fast path
  Time (mean ± σ):      1.952 s ±  0.005 s    [User: 1.863 s, System: 0.080 s]
  Range (min … max):    1.947 s …  1.957 s    3 runs

Summary
  ./python.exe benchmark_names_stdlib.py # PR ran
    1.04 ± 0.01 times faster than ./python.exe benchmark_names_stdlib.py # fast path
    3.00 ± 0.05 times faster than ./python.exe benchmark_names_stdlib.py # main

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Running a slightly modified version of the benchmark above (timeit + best of 3) on my MacBook (Apple Silicon), on the main branch with a debug build of CPython:

  • Current PR (Translate + Loop) 5.4756s
  • With Fast Path (isalnum) 4.4691s

So for me, that is about an 18.4% reduction in total time (or a 22% speedup) on the full PyPI benchmark.

Copy link
Contributor

@a12k a12k Jan 10, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, we posted at about the same time. My results differ interestingly from yours, but I would consider yours canonical (especially since I'm on a debug build with the extra overhead), so please disregard my comments then @hugovk :)

# About 3x faster, safe since packages only support alphanumeric characters
value = name.translate(_normalize_table)
# Condense repeats (faster than regex)
while "__" in value:
value = value.replace("__", "_")
return value

@staticmethod
def legacy_normalize(name):
Expand Down
34 changes: 34 additions & 0 deletions Lib/test/test_importlib/metadata/test_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from importlib.metadata import (
Distribution,
PackageNotFoundError,
Prepared,
distribution,
entry_points,
files,
Expand Down Expand Up @@ -313,3 +314,36 @@ class InvalidateCache(unittest.TestCase):
def test_invalidate_cache(self):
# Smoke test: it makes no assertions and passes as long as the call
# below does not raise.
# No externally observable behavior, but ensures test coverage...
importlib.invalidate_caches()


# Table-driven checks for Prepared.normalize: every input name must come
# back lowercased, with each run of "-", "_" and "." collapsed into a
# single "_".
class PreparedTests(unittest.TestCase):
def test_normalize(self):
# (input name, expected normalized name) pairs, grouped by the aspect
# of normalization they exercise.
tests = [
# Simple
("sample", "sample"),
# Mixed case
("Sample", "sample"),
("SAMPLE", "sample"),
("SaMpLe", "sample"),
# Separator conversions
("sample-pkg", "sample_pkg"),
("sample.pkg", "sample_pkg"),
("sample_pkg", "sample_pkg"),
# Multiple separators
("sample---pkg", "sample_pkg"),
("sample___pkg", "sample_pkg"),
("sample...pkg", "sample_pkg"),
# Mixed separators
("sample-._pkg", "sample_pkg"),
("sample_.-pkg", "sample_pkg"),
# Complex
("Sample__Pkg-name.foo", "sample_pkg_name_foo"),
("Sample__Pkg.name__foo", "sample_pkg_name_foo"),
# Uppercase with separators
("SAMPLE-PKG", "sample_pkg"),
("Sample.Pkg", "sample_pkg"),
("SAMPLE_PKG", "sample_pkg"),
]
# Run each pair in its own subTest so a failure is reported together
# with the offending input name and the remaining pairs still run.
for name, expected in tests:
with self.subTest(name=name):
self.assertEqual(Prepared.normalize(name), expected)
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
:mod:`importlib.metadata`: Use :meth:`str.translate` to improve performance of
:meth:`!importlib.metadata.Prepared.normalize`. Patch by Hugo van Kemenade and
Henry Schreiner.
Loading