From 80e4da24cb41d3f2a5433ca5fa10b933115c623c Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Wed, 29 Oct 2025 16:37:28 +0100 Subject: [PATCH 01/15] improve hffs cache for multiprocessing fork --- src/huggingface_hub/hf_file_system.py | 51 ++++++++++++++++++++++++++- 1 file changed, 50 insertions(+), 1 deletion(-) diff --git a/src/huggingface_hub/hf_file_system.py b/src/huggingface_hub/hf_file_system.py index 57fde3bbb2..4282ab1660 100644 --- a/src/huggingface_hub/hf_file_system.py +++ b/src/huggingface_hub/hf_file_system.py @@ -1,10 +1,12 @@ import os import re import tempfile +import threading from collections import deque from contextlib import ExitStack from dataclasses import dataclass, field from datetime import datetime +from hashlib import md5 from itertools import chain from pathlib import Path from typing import Any, Iterator, NoReturn, Optional, Union @@ -56,7 +58,39 @@ def unresolve(self) -> str: return f"{repo_path}/{self.path_in_repo}".rstrip("/") -class HfFileSystem(fsspec.AbstractFileSystem): +class _Cached(type(fsspec.AbstractFileSystem)): + """ + Metaclass for caching HfFileSystem instances according to the args. + + This creates an additional reference to the filesystem, which prevents the + filesystem from being garbage collected when all *user* references go away. + A call to the :meth:`AbstractFileSystem.clear_instance_cache` must *also* + be made for a filesystem instance to be garbage collected. + + This is a slightly modified version of `fsspec.spec._Cache` to improve it. + In particular in `_tokenize` the pid isn't taken into account for the + `fs_token` used to identify cache instances. The `fs_token` logic is also + robust to defaults values and the order of the args. + """ + + def __call__(cls, *args, **kwargs): + skip = kwargs.pop("skip_instance_cache", False) + fs_token = cls._tokenize(cls, *args, **kwargs) + if not skip and cls.cachable and fs_token in cls._cache: + cls._latest = fs_token + return cls._cache[fs_token] + else: + obj = type.__call__(cls, *args, **kwargs) + obj._fs_token_ = fs_token + obj.storage_args = args + obj.storage_options = kwargs + if cls.cachable and not skip: + cls._latest = fs_token + cls._cache[fs_token] = obj + return obj + + +class HfFileSystem(fsspec.AbstractFileSystem, metaclass=_Cached): """ Access a remote Hugging Face Hub repository as if were a local file system. @@ -119,6 +153,21 @@ def __init__( # Maps parent directory path to path infos self.dircache: dict[str, list[dict[str, Any]]] = {} + @classmethod + def _tokenize(cls, *args, **kwargs) -> str: + """Deterministic token for caching""" + # make fs_token robust to default values and to kwargs order + kwargs["endpoint"] = kwargs.get("endpoint") or constants.ENDPOINT + kwargs = {key: kwargs[key] for key in sorted(kwargs)} + # contrary to fsspec, we don't include pid here + tokenize_args = (cls, threading.get_ident(), args, kwargs) + try: + h = md5(str(tokenize_args).encode()) + except ValueError: + # FIPS systems: https://github.com/fsspec/filesystem_spec/issues/380 + h = md5(str(tokenize_args).encode(), usedforsecurity=False) + return h.hexdigest() + def _repo_and_revision_exist( self, repo_type: str, repo_id: str, revision: Optional[str] ) -> tuple[bool, Optional[Exception]]: From 8f20a2d73ff126c23fa9a4a0b21945cb763342fc Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Wed, 29 Oct 2025 16:46:33 +0100 Subject: [PATCH 02/15] minor --- src/huggingface_hub/hf_file_system.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/huggingface_hub/hf_file_system.py b/src/huggingface_hub/hf_file_system.py index 4282ab1660..4c8fe1e20d 100644 --- a/src/huggingface_hub/hf_file_system.py +++ b/src/huggingface_hub/hf_file_system.py @@ -73,6 +73,13 @@ class _Cached(type(fsspec.AbstractFileSystem)): robust to defaults values and the order of the args. """ + def __init__(cls, *args, **kwargs): + super().__init__(*args, **kwargs) + # Note: we intentionally create a reference here, to avoid garbage + # collecting instances when all other references are gone. To really + # delete a FileSystem, the cache must be cleared. + cls._cache = {} + def __call__(cls, *args, **kwargs): skip = kwargs.pop("skip_instance_cache", False) fs_token = cls._tokenize(cls, *args, **kwargs) From 696f4436595a313f29ff74a5485236f43d4493b9 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Wed, 29 Oct 2025 16:56:44 +0100 Subject: [PATCH 03/15] mypy --- src/huggingface_hub/hf_file_system.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/huggingface_hub/hf_file_system.py b/src/huggingface_hub/hf_file_system.py index 4c8fe1e20d..431faffe74 100644 --- a/src/huggingface_hub/hf_file_system.py +++ b/src/huggingface_hub/hf_file_system.py @@ -58,7 +58,11 @@ def unresolve(self) -> str: return f"{repo_path}/{self.path_in_repo}".rstrip("/") -class _Cached(type(fsspec.AbstractFileSystem)): +# We need to improve fsspec.spec._Cached which is AbstractFileSystem's metaclass +_cached_base: Any = type(fsspec.AbstractFileSystem) + + +class _Cached(_cached_base): """ Metaclass for caching HfFileSystem instances according to the args. @@ -67,7 +71,7 @@ class _Cached(type(fsspec.AbstractFileSystem)): A call to the :meth:`AbstractFileSystem.clear_instance_cache` must *also* be made for a filesystem instance to be garbage collected. - This is a slightly modified version of `fsspec.spec._Cache` to improve it. + This is a slightly modified version of `fsspec.spec._Cached` to improve it. In particular in `_tokenize` the pid isn't taken into account for the `fs_token` used to identify cache instances. The `fs_token` logic is also robust to defaults values and the order of the args. From ad206897cc1b559bfde89040991bb16aff0cd8c1 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Wed, 29 Oct 2025 18:01:50 +0100 Subject: [PATCH 04/15] fix for token --- src/huggingface_hub/hf_file_system.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/huggingface_hub/hf_file_system.py b/src/huggingface_hub/hf_file_system.py index 431faffe74..38b2139d7a 100644 --- a/src/huggingface_hub/hf_file_system.py +++ b/src/huggingface_hub/hf_file_system.py @@ -169,6 +169,7 @@ def _tokenize(cls, *args, **kwargs) -> str: """Deterministic token for caching""" # make fs_token robust to default values and to kwargs order kwargs["endpoint"] = kwargs.get("endpoint") or constants.ENDPOINT + kwargs["token"] = kwargs.get("token") kwargs = {key: kwargs[key] for key in sorted(kwargs)} # contrary to fsspec, we don't include pid here tokenize_args = (cls, threading.get_ident(), args, kwargs) From 5abbfeb0fab9b4e40c8115c7a3d9c4ad0d76df4a Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Wed, 29 Oct 2025 18:01:55 +0100 Subject: [PATCH 05/15] add test --- tests/test_hf_file_system.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/tests/test_hf_file_system.py b/tests/test_hf_file_system.py index 65a189f9d4..5c122c10f9 100644 --- a/tests/test_hf_file_system.py +++ b/tests/test_hf_file_system.py @@ -1,6 +1,7 @@ import copy import datetime import io +import multiprocessing import os import pickle import tempfile @@ -644,6 +645,31 @@ def test_exists_after_repo_deletion(): assert not hffs.exists(repo_id, refresh=True) +def _get_fs_token_and_dircache(fs): + fs = HfFileSystem(endpoint=fs.endpoint, token=fs.token) + return fs._fs_token, fs.dircache + + +def test_cache(): + fs = HfFileSystem() + fs.dircache = {"dummy": []} + + assert HfFileSystem() is fs + assert HfFileSystem(endpoint=constants.ENDPOINT) is fs + assert HfFileSystem(token=None, endpoint=constants.ENDPOINT) is fs + assert HfFileSystem(endpoint="something-else") is not fs + + with multiprocessing.get_context("spawn").Pool() as pool: + fs_token, dircache = pool.apply(_get_fs_token_and_dircache, (fs,)) + assert fs_token == fs._fs_token + assert dircache == fs.dircache + + with multiprocessing.get_context("fork").Pool() as pool: + fs_token, dircache = pool.apply(_get_fs_token_and_dircache, (fs,)) + assert fs_token == fs._fs_token + assert dircache == fs.dircache + + @with_production_testing def test_hf_file_system_file_can_handle_gzipped_file(): """Test that HfFileSystemStreamFile.read() can handle gzipped files.""" From ded5d850de117ddb0f1fa759246fa55df8cc6f23 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Wed, 29 Oct 2025 18:48:47 +0100 Subject: [PATCH 06/15] fix for threading too --- src/huggingface_hub/hf_file_system.py | 26 +++++++++++++++++++------- tests/test_hf_file_system.py | 6 ++++++ 2 files changed, 25 insertions(+), 7 deletions(-) diff --git a/src/huggingface_hub/hf_file_system.py b/src/huggingface_hub/hf_file_system.py index 38b2139d7a..f8a5b7de37 100644 --- a/src/huggingface_hub/hf_file_system.py +++ b/src/huggingface_hub/hf_file_system.py @@ -4,6 +4,7 @@ import threading from collections import deque from contextlib import ExitStack +from copy import deepcopy from dataclasses import dataclass, field from datetime import datetime from hashlib import md5 @@ -86,12 +87,20 @@ def __init__(cls, *args, **kwargs): def __call__(cls, *args, **kwargs): skip = kwargs.pop("skip_instance_cache", False) - fs_token = cls._tokenize(cls, *args, **kwargs) + fs_token = cls._tokenize(cls, threading.get_ident(), *args, **kwargs) + fs_token_main_thread = cls._tokenize(cls, threading.main_thread().ident, *args, **kwargs) if not skip and cls.cachable and fs_token in cls._cache: + # reuse cached instance cls._latest = fs_token return cls._cache[fs_token] else: + # create new instance obj = type.__call__(cls, *args, **kwargs) + if not skip and cls.cachable and fs_token_main_thread in cls._cache: + # reuse the cache from the main thread instance in the new instance + instance_cache_attributes_dict = cls._cache[fs_token_main_thread]._get_instance_cache_attributes_dict() + for attr, cached_value in instance_cache_attributes_dict.items(): + setattr(obj, attr, cached_value) obj._fs_token_ = fs_token obj.storage_args = args obj.storage_options = kwargs @@ -165,14 +174,14 @@ def __init__( self.dircache: dict[str, list[dict[str, Any]]] = {} @classmethod - def _tokenize(cls, *args, **kwargs) -> str: + def _tokenize(cls, threading_ident: int, *args, **kwargs) -> str: """Deterministic token for caching""" # make fs_token robust to default values and to kwargs order kwargs["endpoint"] = kwargs.get("endpoint") or constants.ENDPOINT kwargs["token"] = kwargs.get("token") kwargs = {key: kwargs[key] for key in sorted(kwargs)} # contrary to fsspec, we don't include pid here - tokenize_args = (cls, threading.get_ident(), args, kwargs) + tokenize_args = (cls, threading_ident, args, kwargs) try: h = md5(str(tokenize_args).encode()) except ValueError: @@ -997,12 +1006,15 @@ def __reduce__(self): type(self), self.storage_args, self.storage_options, - { - "dircache": self.dircache, - "_repo_and_revision_exists_cache": self._repo_and_revision_exists_cache, - }, + self._get_instance_cache_attributes_dict(), ) + def _get_instance_cache_attributes_dict(self): + return { + "dircache": deepcopy(self.dircache), + "_repo_and_revision_exists_cache": deepcopy(self._repo_and_revision_exists_cache), + } + class HfFileSystemFile(fsspec.spec.AbstractBufferedFile): def __init__(self, fs: HfFileSystem, path: str, revision: Optional[str] = None, **kwargs): diff --git a/tests/test_hf_file_system.py b/tests/test_hf_file_system.py index 5c122c10f9..1ba31c7e4b 100644 --- a/tests/test_hf_file_system.py +++ b/tests/test_hf_file_system.py @@ -2,6 +2,7 @@ import datetime import io import multiprocessing +import multiprocessing.pool import os import pickle import tempfile @@ -669,6 +670,11 @@ def test_cache(): assert fs_token == fs._fs_token assert dircache == fs.dircache + with multiprocessing.pool.ThreadPool() as pool: + fs_token, dircache = pool.apply(_get_fs_token_and_dircache, (fs,)) + assert fs_token != fs._fs_token # use a different instance for thread safety + assert dircache == fs.dircache + @with_production_testing def test_hf_file_system_file_can_handle_gzipped_file(): From f72ab1373dcfbbc44b452b3457db0d27b47ada21 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Wed, 29 Oct 2025 18:51:29 +0100 Subject: [PATCH 07/15] comment --- src/huggingface_hub/hf_file_system.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/huggingface_hub/hf_file_system.py b/src/huggingface_hub/hf_file_system.py index f8a5b7de37..8b8adba32b 100644 --- a/src/huggingface_hub/hf_file_system.py +++ b/src/huggingface_hub/hf_file_system.py @@ -75,7 +75,8 @@ class _Cached(_cached_base): This is a slightly modified version of `fsspec.spec._Cached` to improve it. In particular in `_tokenize` the pid isn't taken into account for the `fs_token` used to identify cache instances. The `fs_token` logic is also - robust to defaults values and the order of the args. + robust to defaults values and the order of the args. Finally new instances + reuse the cache from instances in the main thread. """ def __init__(cls, *args, **kwargs): From 95c485ef39e6af040643efabbac63ba703adb672 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Wed, 29 Oct 2025 20:58:23 +0100 Subject: [PATCH 08/15] fix CI: make HfHubHTTPError picklable --- src/huggingface_hub/errors.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/src/huggingface_hub/errors.py b/src/huggingface_hub/errors.py index f429db7cc4..ce731fa45b 100644 --- a/src/huggingface_hub/errors.py +++ b/src/huggingface_hub/errors.py @@ -84,6 +84,16 @@ def append_to_message(self, additional_message: str) -> None: """Append additional information to the `HfHubHTTPError` initial message.""" self.args = (self.args[0] + additional_message,) + self.args[1:] + def __reduce_ex__(self, protocol): + """Fix pickling of Exception subclass with kwargs""" + args = (str(self),) + kwargs = {"response": self.response, "server_message": self.server_message} + return self._from_args, (args, kwargs) + + @classmethod + def _from_args(cls, args, kwargs): + return cls(*args, **kwargs) + # INFERENCE CLIENT ERRORS From 539a89241a647244680fa9b22e40858b74f54d9e Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Wed, 29 Oct 2025 22:32:54 +0100 Subject: [PATCH 09/15] fix tests --- tests/test_hf_file_system.py | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/tests/test_hf_file_system.py b/tests/test_hf_file_system.py index 1ba31c7e4b..4cb6b7b6d9 100644 --- a/tests/test_hf_file_system.py +++ b/tests/test_hf_file_system.py @@ -658,22 +658,26 @@ def test_cache(): assert HfFileSystem() is fs assert HfFileSystem(endpoint=constants.ENDPOINT) is fs assert HfFileSystem(token=None, endpoint=constants.ENDPOINT) is fs - assert HfFileSystem(endpoint="something-else") is not fs + + another_fs = HfFileSystem(endpoint="something-else") + assert another_fs is not fs + assert another_fs.dircache != fs.dircache with multiprocessing.get_context("spawn").Pool() as pool: - fs_token, dircache = pool.apply(_get_fs_token_and_dircache, (fs,)) - assert fs_token == fs._fs_token + (fs_token, dircache), (_, another_dircache) = pool.map(_get_fs_token_and_dircache, [fs, another_fs]) assert dircache == fs.dircache + assert another_dircache != fs.dircache with multiprocessing.get_context("fork").Pool() as pool: - fs_token, dircache = pool.apply(_get_fs_token_and_dircache, (fs,)) - assert fs_token == fs._fs_token + (fs_token, dircache), (_, another_dircache) = pool.map(_get_fs_token_and_dircache, [fs, another_fs]) assert dircache == fs.dircache + assert another_dircache != fs.dircache with multiprocessing.pool.ThreadPool() as pool: - fs_token, dircache = pool.apply(_get_fs_token_and_dircache, (fs,)) - assert fs_token != fs._fs_token # use a different instance for thread safety + (fs_token, dircache), (_, another_dircache) = pool.map(_get_fs_token_and_dircache, [fs, another_fs]) assert dircache == fs.dircache + assert another_dircache != fs.dircache + assert fs_token != fs._fs_token # use a different instance for thread safety @with_production_testing From dffdbbbad46070757b4fbad84e3fbb270b02ffee Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Wed, 29 Oct 2025 22:44:12 +0100 Subject: [PATCH 10/15] better naming --- src/huggingface_hub/hf_file_system.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/src/huggingface_hub/hf_file_system.py b/src/huggingface_hub/hf_file_system.py index 8b8adba32b..12abf307f1 100644 --- a/src/huggingface_hub/hf_file_system.py +++ b/src/huggingface_hub/hf_file_system.py @@ -74,9 +74,9 @@ class _Cached(_cached_base): This is a slightly modified version of `fsspec.spec._Cached` to improve it. In particular in `_tokenize` the pid isn't taken into account for the - `fs_token` used to identify cache instances. The `fs_token` logic is also + `fs_token` used to identify cached instances. The `fs_token` logic is also robust to defaults values and the order of the args. Finally new instances - reuse the cache from instances in the main thread. + reuse the states from sister instances in the main thread. """ def __init__(cls, *args, **kwargs): @@ -99,9 +99,9 @@ def __call__(cls, *args, **kwargs): obj = type.__call__(cls, *args, **kwargs) if not skip and cls.cachable and fs_token_main_thread in cls._cache: # reuse the cache from the main thread instance in the new instance - instance_cache_attributes_dict = cls._cache[fs_token_main_thread]._get_instance_cache_attributes_dict() - for attr, cached_value in instance_cache_attributes_dict.items(): - setattr(obj, attr, cached_value) + instance_state = cls._cache[fs_token_main_thread]._get_instance_state() + for attr, state_value in instance_state.items(): + setattr(obj, attr, state_value) obj._fs_token_ = fs_token obj.storage_args = args obj.storage_options = kwargs @@ -1002,15 +1002,15 @@ def start_transaction(self): raise NotImplementedError("Transactional commits are not supported.") def __reduce__(self): - # re-populate the instance cache at HfFileSystem._cache and re-populate the cache attributes of every instance + # re-populate the instance cache at HfFileSystem._cache and re-populate the state of every instance return make_instance, ( type(self), self.storage_args, self.storage_options, - self._get_instance_cache_attributes_dict(), + self._get_instance_state(), ) - def _get_instance_cache_attributes_dict(self): + def _get_instance_state(self): return { "dircache": deepcopy(self.dircache), "_repo_and_revision_exists_cache": deepcopy(self._repo_and_revision_exists_cache), @@ -1252,8 +1252,8 @@ def _partial_read(response: httpx.Response, length: int = -1) -> bytes: return bytes(buf) # may be < length if response ended -def make_instance(cls, args, kwargs, instance_cache_attributes_dict): +def make_instance(cls, args, kwargs, instance_state): fs = cls(*args, **kwargs) - for attr, cached_value in instance_cache_attributes_dict.items(): - setattr(fs, attr, cached_value) + for attr, state_value in instance_state.items(): + setattr(fs, attr, state_value) return fs From 991443e0a1b055370478a96397414d8120f3d9ec Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Wed, 29 Oct 2025 22:48:50 +0100 Subject: [PATCH 11/15] clear instance cache before testing to ignore remaning Mock objects --- tests/test_hf_file_system.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_hf_file_system.py b/tests/test_hf_file_system.py index 4cb6b7b6d9..74c4f01ad4 100644 --- a/tests/test_hf_file_system.py +++ b/tests/test_hf_file_system.py @@ -652,6 +652,7 @@ def _get_fs_token_and_dircache(fs): def test_cache(): + HfFileSystem.clear_instance_cache() fs = HfFileSystem() fs.dircache = {"dummy": []} From 82a58eeee49ba2033c96a46a40d73df3fdb8a07c Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Thu, 30 Oct 2025 14:03:30 +0100 Subject: [PATCH 12/15] don't test "fork" on windows --- tests/test_hf_file_system.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/test_hf_file_system.py b/tests/test_hf_file_system.py index 74c4f01ad4..1c22ab9d29 100644 --- a/tests/test_hf_file_system.py +++ b/tests/test_hf_file_system.py @@ -669,10 +669,11 @@ def test_cache(): assert dircache == fs.dircache assert another_dircache != fs.dircache - with multiprocessing.get_context("fork").Pool() as pool: - (fs_token, dircache), (_, another_dircache) = pool.map(_get_fs_token_and_dircache, [fs, another_fs]) - assert dircache == fs.dircache - assert another_dircache != fs.dircache + if os.name != "nt": # "fork" is unavailable on windows + with multiprocessing.get_context("fork").Pool() as pool: + (fs_token, dircache), (_, another_dircache) = pool.map(_get_fs_token_and_dircache, [fs, another_fs]) + assert dircache == fs.dircache + assert another_dircache != fs.dircache with multiprocessing.pool.ThreadPool() as pool: (fs_token, dircache), (_, another_dircache) = pool.map(_get_fs_token_and_dircache, [fs, another_fs]) From 156627bcc347702347fe60047c426329b21fd69d Mon Sep 17 00:00:00 2001 From: Quentin Lhoest <42851186+lhoestq@users.noreply.github.com> Date: Mon, 3 Nov 2025 16:26:40 +0100 Subject: [PATCH 13/15] Apply suggestions from code review Co-authored-by: Lucain --- src/huggingface_hub/errors.py | 14 ++++++-------- src/huggingface_hub/hf_file_system.py | 2 ++ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/huggingface_hub/errors.py b/src/huggingface_hub/errors.py index ce731fa45b..a2651c3a1e 100644 --- a/src/huggingface_hub/errors.py +++ b/src/huggingface_hub/errors.py @@ -85,14 +85,12 @@ def append_to_message(self, additional_message: str) -> None: self.args = (self.args[0] + additional_message,) + self.args[1:] def __reduce_ex__(self, protocol): - """Fix pickling of Exception subclass with kwargs""" - args = (str(self),) - kwargs = {"response": self.response, "server_message": self.server_message} - return self._from_args, (args, kwargs) - - @classmethod - def _from_args(cls, args, kwargs): - return cls(*args, **kwargs) + """Fix pickling of Exception subclass with kwargs. We need to override __reduce_ex__ of the parent class""" + return ( + self.__class__, + (str(self),), + {"response": self.response, "server_message": self.server_message} + ) # INFERENCE CLIENT ERRORS diff --git a/src/huggingface_hub/hf_file_system.py b/src/huggingface_hub/hf_file_system.py index 12abf307f1..195c7cbbb1 100644 --- a/src/huggingface_hub/hf_file_system.py +++ b/src/huggingface_hub/hf_file_system.py @@ -80,6 +80,7 @@ class _Cached(_cached_base): """ def __init__(cls, *args, **kwargs): + # Hack: override https://github.com/fsspec/filesystem_spec/blob/dcb167e8f50e6273d4cfdfc4cab8fc5aa4c958bf/fsspec/spec.py#L53 super().__init__(*args, **kwargs) # Note: we intentionally create a reference here, to avoid garbage # collecting instances when all other references are gone. To really @@ -87,6 +88,7 @@ def __init__(cls, *args, **kwargs): cls._cache = {} def __call__(cls, *args, **kwargs): + # Hack: override https://github.com/fsspec/filesystem_spec/blob/dcb167e8f50e6273d4cfdfc4cab8fc5aa4c958bf/fsspec/spec.py#L65 skip = kwargs.pop("skip_instance_cache", False) fs_token = cls._tokenize(cls, threading.get_ident(), *args, **kwargs) fs_token_main_thread = cls._tokenize(cls, threading.main_thread().ident, *args, **kwargs) From a0698ead72011856a4705c7509c3b114b1008742 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Tue, 4 Nov 2025 11:26:39 +0100 Subject: [PATCH 14/15] use insecure_hashlib --- src/huggingface_hub/hf_file_system.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/src/huggingface_hub/hf_file_system.py b/src/huggingface_hub/hf_file_system.py index 195c7cbbb1..614eb6cc15 100644 --- a/src/huggingface_hub/hf_file_system.py +++ b/src/huggingface_hub/hf_file_system.py @@ -7,7 +7,6 @@ from copy import deepcopy from dataclasses import dataclass, field from datetime import datetime -from hashlib import md5 from itertools import chain from pathlib import Path from typing import Any, Iterator, NoReturn, Optional, Union @@ -24,6 +23,7 @@ from .file_download import hf_hub_url, http_get from .hf_api import HfApi, LastCommitInfo, RepoFile from .utils import HFValidationError, hf_raise_for_status, http_backoff, http_stream_backoff +from .utils.insecure_hashlib import md5 # Regex used to match special revisions with "/" in them (see #1710) @@ -185,11 +185,7 @@ def _tokenize(cls, threading_ident: int, *args, **kwargs) -> str: kwargs = {key: kwargs[key] for key in sorted(kwargs)} # contrary to fsspec, we don't include pid here tokenize_args = (cls, threading_ident, args, kwargs) - try: - h = md5(str(tokenize_args).encode()) - except ValueError: - # FIPS systems: https://github.com/fsspec/filesystem_spec/issues/380 - h = md5(str(tokenize_args).encode(), usedforsecurity=False) + h = md5(str(tokenize_args).encode()) return h.hexdigest() def _repo_and_revision_exist( From 75bf5d1e6b4d489f255fa48c6f1f9326f9a9e3b1 Mon Sep 17 00:00:00 2001 From: Quentin Lhoest Date: Tue, 4 Nov 2025 11:26:42 +0100 Subject: [PATCH 15/15] style --- src/huggingface_hub/errors.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/huggingface_hub/errors.py b/src/huggingface_hub/errors.py index a2651c3a1e..1c5e71b569 100644 --- a/src/huggingface_hub/errors.py +++ b/src/huggingface_hub/errors.py @@ -86,11 +86,7 @@ def append_to_message(self, additional_message: str) -> None: def __reduce_ex__(self, protocol): """Fix pickling of Exception subclass with kwargs. We need to override __reduce_ex__ of the parent class""" - return ( - self.__class__, - (str(self),), - {"response": self.response, "server_message": self.server_message} - ) + return (self.__class__, (str(self),), {"response": self.response, "server_message": self.server_message}) # INFERENCE CLIENT ERRORS