Skip to content

Commit 1198422

Browse files
refactor: optimize venv building for namespace packages (#3454)
When implicit namespace packages are used, it's common for multiple distributions to install into the same directory, triggering the expensive conflict merging logic. This can be observed wit our doc builds, where `sphinxcontrib` is a namespace package that 7 distributions install into. To fix, treat top-level directories that have an importable name and don't have an `__init__` looking file as implicit namespace packages and mark them as disallowed from being directly linked. The importable name check is to exclude dist-info directories. --------- Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
1 parent 4558ffb commit 1198422

File tree

2 files changed

+108
-0
lines changed

2 files changed

+108
-0
lines changed

python/private/venv_runfiles.bzl

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ load(
1313
"VenvSymlinkEntry",
1414
"VenvSymlinkKind",
1515
)
16+
load(":py_internal.bzl", "py_internal")
1617

1718
def create_venv_app_files(ctx, deps, venv_dir_map):
1819
"""Creates the tree of app-specific files for a venv for a binary.
@@ -231,6 +232,21 @@ def _merge_venv_path_group(ctx, group, keep_map):
231232
if venv_path not in keep_map:
232233
keep_map[venv_path] = file
233234

235+
def _is_importable_name(name):
236+
# Requires Bazel 8+
237+
if hasattr(py_internal, "regex_match"):
238+
# ?U means activates unicode matching (Python allows most unicode
239+
# in module names / identifiers).
240+
# \w matches alphanumeric and underscore.
241+
# NOTE: regex_match has an implicit ^ and $
242+
return py_internal.regex_match(name, "(?U)\\w+")
243+
else:
244+
# Otherwise, use a rough hueristic that should catch most cases.
245+
return (
246+
"." not in name and
247+
"-" not in name
248+
)
249+
234250
def get_venv_symlinks(ctx, files, package, version_str, site_packages_root):
235251
"""Compute the VenvSymlinkEntry objects for a library.
236252
@@ -270,6 +286,9 @@ def get_venv_symlinks(ctx, files, package, version_str, site_packages_root):
270286
# List of (File, str venv_path) tuples
271287
files_left_to_link = []
272288

289+
# dict[str dirname, bool is_namespace_package]
290+
namespace_package_dirs = {}
291+
273292
# We want to minimize the number of files symlinked. Ideally, only the
274293
# top-level directories are symlinked. Unfortunately, shared libraries
275294
# complicate matters: if a shared library's directory is linked, then the
@@ -310,6 +329,29 @@ def get_venv_symlinks(ctx, files, package, version_str, site_packages_root):
310329
else:
311330
files_left_to_link.append((src, venv_path))
312331

332+
top_level_dirname, _, tail = venv_path.partition("/")
333+
if (
334+
# If it's already not directly linkable, nothing to do
335+
not cannot_be_linked_directly.get(top_level_dirname, False) and
336+
# If its already known to be non-implicit namespace, then skip
337+
namespace_package_dirs.get(top_level_dirname, True) and
338+
# It must be an importable name to be an implicit namespace package
339+
_is_importable_name(top_level_dirname)
340+
):
341+
namespace_package_dirs.setdefault(top_level_dirname, True)
342+
343+
# Looking for `__init__.` isn't 100% correct, as it'll match e.g.
344+
# `__init__.pyi`, but it's close enough.
345+
if "/" not in tail and tail.startswith("__init__."):
346+
namespace_package_dirs[top_level_dirname] = False
347+
348+
# We treat namespace packages as a hint that other distributions may
349+
# install into the same directory. As such, we avoid linking them directly
350+
# to avoid conflict merging later.
351+
for dirname, is_namespace_package in namespace_package_dirs.items():
352+
if is_namespace_package:
353+
cannot_be_linked_directly[dirname] = True
354+
313355
# At this point, venv_symlinks has entries for the shared libraries
314356
# and cannot_be_linked_directly has the directories that cannot be
315357
# directly linked. Next, we loop over the remaining files and group

tests/venv_site_packages_libs/app_files_building/app_files_building_tests.bzl

Lines changed: 66 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -219,6 +219,7 @@ def _test_optimized_grouping_single_toplevel(name):
219219
empty_files(
220220
name = name + "_files",
221221
paths = [
222+
"site-packages/pkg2/__init__.py",
222223
"site-packages/pkg2/a.txt",
223224
"site-packages/pkg2/b_mod.so",
224225
],
@@ -248,6 +249,7 @@ def _test_optimized_grouping_single_toplevel_impl(env, target):
248249
"pkg2",
249250
link_to_path = rr + "pkg2",
250251
files = [
252+
"tests/venv_site_packages_libs/app_files_building/site-packages/pkg2/__init__.py",
251253
"tests/venv_site_packages_libs/app_files_building/site-packages/pkg2/a.txt",
252254
"tests/venv_site_packages_libs/app_files_building/site-packages/pkg2/b_mod.so",
253255
],
@@ -264,6 +266,70 @@ def _test_optimized_grouping_single_toplevel_impl(env, target):
264266
# The point of the optimization is to avoid having to merge conflicts.
265267
env.expect.that_collection(conflicts).contains_exactly([])
266268

269+
def _test_optimized_grouping_implicit_namespace_packages(name):
270+
empty_files(
271+
name = name + "_files",
272+
paths = [
273+
# NOTE: An alphanumeric name with underscores is used to verify
274+
# name matching is correct.
275+
"site-packages/name_space9/part1/foo.py",
276+
"site-packages/name_space9/part2/bar.py",
277+
"site-packages/name_space9-1.0.dist-info/METADATA",
278+
],
279+
)
280+
analysis_test(
281+
name = name,
282+
impl = _test_optimized_grouping_implicit_namespace_packages_impl,
283+
target = name + "_files",
284+
)
285+
286+
_tests.append(_test_optimized_grouping_implicit_namespace_packages)
287+
288+
def _test_optimized_grouping_implicit_namespace_packages_impl(env, target):
289+
test_ctx = _ctx(workspace_name = env.ctx.workspace_name)
290+
entries = get_venv_symlinks(
291+
test_ctx,
292+
target.files.to_list(),
293+
package = "pkg3",
294+
version_str = "1.0",
295+
site_packages_root = env.ctx.label.package + "/site-packages",
296+
)
297+
actual = _venv_symlinks_from_entries(entries)
298+
299+
rr = "{}/{}/site-packages/".format(test_ctx.workspace_name, env.ctx.label.package)
300+
expected = [
301+
_venv_symlink(
302+
"name_space9/part1",
303+
link_to_path = rr + "name_space9/part1",
304+
files = [
305+
"tests/venv_site_packages_libs/app_files_building/site-packages/name_space9/part1/foo.py",
306+
],
307+
),
308+
_venv_symlink(
309+
"name_space9/part2",
310+
link_to_path = rr + "name_space9/part2",
311+
files = [
312+
"tests/venv_site_packages_libs/app_files_building/site-packages/name_space9/part2/bar.py",
313+
],
314+
),
315+
_venv_symlink(
316+
"name_space9-1.0.dist-info",
317+
link_to_path = rr + "name_space9-1.0.dist-info",
318+
files = [
319+
"tests/venv_site_packages_libs/app_files_building/site-packages/name_space9-1.0.dist-info/METADATA",
320+
],
321+
),
322+
]
323+
expected = sorted(expected, key = lambda e: (e.link_to_path, e.venv_path))
324+
env.expect.that_collection(
325+
actual,
326+
).contains_exactly(expected)
327+
328+
_, conflicts = build_link_map(test_ctx, entries, return_conflicts = True)
329+
330+
# The point of the optimization is to avoid having to merge conflicts.
331+
env.expect.that_collection(conflicts).contains_exactly([])
332+
267333
def _test_package_version_filtering(name):
268334
analysis_test(
269335
name = name,

0 commit comments

Comments
 (0)