From a55a78fe96a127341307489ff5b7273ca4f6be75 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Wed, 7 Jan 2026 10:27:57 +0100 Subject: [PATCH 1/6] chore: Migrate to ty type checker --- .gitignore | 2 +- CONTRIBUTING.md | 2 +- Makefile | 4 +- .../code_examples/google/cloud_run_example.py | 3 +- .../code_examples/google/google_example.py | 3 +- ...default_fingerprint_generator_with_args.py | 2 +- .../running_in_web_server/server.py | 4 +- pyproject.toml | 62 ++------ src/crawlee/_browserforge_workaround.py | 2 +- src/crawlee/_request.py | 4 +- src/crawlee/_types.py | 27 ++-- src/crawlee/_utils/context.py | 2 +- src/crawlee/_utils/file.py | 2 +- src/crawlee/_utils/globs.py | 2 +- src/crawlee/_utils/recurring_task.py | 4 +- src/crawlee/_utils/sitemap.py | 10 +- src/crawlee/_utils/system.py | 38 +++-- src/crawlee/browsers/_browser_pool.py | 2 +- src/crawlee/browsers/_playwright_browser.py | 2 +- .../_abstract_http/_abstract_http_crawler.py | 6 +- .../_adaptive_playwright_crawler.py | 8 +- src/crawlee/crawlers/_basic/_basic_crawler.py | 6 +- .../_playwright/_playwright_crawler.py | 8 +- src/crawlee/events/_event_manager.py | 4 +- src/crawlee/http_clients/_curl_impersonate.py | 14 +- src/crawlee/otel/crawler_instrumentor.py | 4 +- src/crawlee/sessions/_cookies.py | 4 +- src/crawlee/sessions/_models.py | 6 +- src/crawlee/statistics/_models.py | 27 +++- .../storage_clients/_base/_dataset_client.py | 4 +- .../_base/_key_value_store_client.py | 4 +- .../_file_system/_dataset_client.py | 19 +-- .../_file_system/_key_value_store_client.py | 20 +-- .../_file_system/_request_queue_client.py | 16 +- .../storage_clients/_redis/_client_mixin.py | 5 +- .../storage_clients/_redis/_dataset_client.py | 6 +- .../_redis/_key_value_store_client.py | 8 +- .../_redis/_request_queue_client.py | 13 +- .../storage_clients/_redis/_storage_client.py | 21 +-- src/crawlee/storage_clients/_redis/_utils.py | 2 +- .../storage_clients/_sql/_client_mixin.py | 2 +- src/crawlee/storage_clients/models.py | 11 +- tests/unit/_autoscaling/test_snapshotter.py | 5 +- tests/unit/_utils/test_html_to_text.py | 2 +- tests/unit/_utils/test_recurring_task.py | 2 +- tests/unit/_utils/test_timedelta_ms.py | 2 +- .../test_adaptive_playwright_crawler.py | 23 +-- .../crawlers/_basic/test_basic_crawler.py | 28 ++-- .../crawlers/_basic/test_context_pipeline.py | 4 +- .../_playwright/test_playwright_crawler.py | 2 +- .../test_header_generator.py | 4 +- .../test_new_proxy_info.py | 2 +- tests/unit/proxy_configuration/test_tiers.py | 12 +- .../_redis/test_redis_dataset_client.py | 2 +- .../_redis/test_redis_kvs_client.py | 2 +- .../_redis/test_redis_rq_client.py | 2 +- tests/unit/storages/test_dataset.py | 5 +- uv.lock | 148 ++++-------------- 58 files changed, 267 insertions(+), 373 deletions(-) diff --git a/.gitignore b/.gitignore index 74cb9fe9c0..ea73de62b4 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,6 @@ # Cache __pycache__ -.mypy_cache +.uv_cache .pytest_cache .ruff_cache .uv-cache diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 940b6b7ec4..95b1982bdb 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -46,7 +46,7 @@ make format ### Type checking -Type checking is handled by [mypy](https://mypy.readthedocs.io/), verifying code against type annotations. Configuration settings can be found in `pyproject.toml`. +Type checking is handled by [ty](https://docs.astral.sh/ty/), verifying code against type annotations. Configuration settings can be found in `pyproject.toml`. 
To run type checking: diff --git a/Makefile b/Makefile index 390181e21a..7224fcb752 100644 --- a/Makefile +++ b/Makefile @@ -5,7 +5,7 @@ E2E_TESTS_CONCURRENCY = 1 clean: - rm -rf .mypy_cache .pytest_cache .ruff_cache .uv-cache build dist htmlcov .coverage + rm -rf .uv_cache .pytest_cache .ruff_cache .uv-cache build dist htmlcov .coverage install-sync: uv sync --all-extras @@ -27,7 +27,7 @@ lint: uv run ruff check type-check: - uv run mypy + uv run ty check unit-tests: uv run pytest \ diff --git a/docs/deployment/code_examples/google/cloud_run_example.py b/docs/deployment/code_examples/google/cloud_run_example.py index 4176cf60e4..27d23b99eb 100644 --- a/docs/deployment/code_examples/google/cloud_run_example.py +++ b/docs/deployment/code_examples/google/cloud_run_example.py @@ -1,4 +1,3 @@ -# mypy: disable-error-code="misc" import json import os @@ -9,7 +8,7 @@ from crawlee.storage_clients import MemoryStorageClient -@get('/') # type: ignore[untyped-decorator] +@get('/') async def main() -> str: """The crawler entry point that will be called when the HTTP endpoint is accessed.""" # highlight-start diff --git a/docs/deployment/code_examples/google/google_example.py b/docs/deployment/code_examples/google/google_example.py index 474e121b71..68deac804c 100644 --- a/docs/deployment/code_examples/google/google_example.py +++ b/docs/deployment/code_examples/google/google_example.py @@ -1,4 +1,3 @@ -# mypy: disable-error-code="misc" import asyncio import json from datetime import timedelta @@ -48,7 +47,7 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None: # highlight-end -@functions_framework.http # type: ignore[untyped-decorator] +@functions_framework.http def crawlee_run(request: Request) -> Response: # You can pass data to your crawler using `request` function_id = request.headers['Function-Execution-Id'] diff --git a/docs/guides/code_examples/avoid_blocking/default_fingerprint_generator_with_args.py b/docs/guides/code_examples/avoid_blocking/default_fingerprint_generator_with_args.py index a6d2072ad3..4e6ed92aa6 100644 --- a/docs/guides/code_examples/avoid_blocking/default_fingerprint_generator_with_args.py +++ b/docs/guides/code_examples/avoid_blocking/default_fingerprint_generator_with_args.py @@ -9,7 +9,7 @@ async def main() -> None: fingerprint_generator = DefaultFingerprintGenerator( - header_options=HeaderGeneratorOptions(browsers=['chromium']), + header_options=HeaderGeneratorOptions(browsers=['chrome']), screen_options=ScreenOptions(min_width=400), ) diff --git a/docs/guides/code_examples/running_in_web_server/server.py b/docs/guides/code_examples/running_in_web_server/server.py index 09be14e2be..64e192af37 100644 --- a/docs/guides/code_examples/running_in_web_server/server.py +++ b/docs/guides/code_examples/running_in_web_server/server.py @@ -14,7 +14,7 @@ app = FastAPI(lifespan=lifespan, title='Crawler app') -@app.get('/', response_class=HTMLResponse) # type: ignore[untyped-decorator] +@app.get('/', response_class=HTMLResponse) def index() -> str: return """ @@ -32,7 +32,7 @@ def index() -> str: """ -@app.get('/scrape') # type: ignore[untyped-decorator] +@app.get('/scrape') async def scrape_url(request: Request, url: str | None = None) -> dict: if not url: return {'url': 'missing', 'scrape result': 'no results'} diff --git a/pyproject.toml b/pyproject.toml index 1593212972..54fecf8ce4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -102,7 +102,6 @@ dev = [ "build<2.0.0", # For e2e tests. 
"dycw-pytest-only<3.0.0", "fakeredis[probabilistic,json,lua]<3.0.0", - "mypy~=1.19.0", "pre-commit<5.0.0", "proxy-py<3.0.0", "pydoc-markdown<5.0.0", @@ -113,6 +112,7 @@ dev = [ "pytest<9.0.0", "ruff~=0.14.0", "setuptools", # setuptools are used by pytest, but not explicitly required + "ty~=0.0.0", "types-beautifulsoup4<5.0.0", "types-cachetools<7.0.0", "types-colorama<1.0.0", @@ -230,62 +230,24 @@ filterwarnings = [ "ignore:websockets.server.WebSocketServerProtocol is deprecated:DeprecationWarning", ] -[tool.mypy] -python_version = "3.10" -plugins = ["pydantic.mypy"] +[tool.ty.environment] +python-version = "3.10" + +[tool.ty.src] +include = ["src", "tests", "scripts", "docs", "website"] exclude = [ "src/crawlee/project_template", "docs/guides/code_examples/storage_clients/custom_storage_client_example.py", ] -files = ["src", "tests", "docs", "website"] -check_untyped_defs = true -disallow_incomplete_defs = true -disallow_untyped_calls = true -disallow_untyped_decorators = true -disallow_untyped_defs = true -no_implicit_optional = true -warn_redundant_casts = true -warn_return_any = true -warn_unreachable = true -warn_unused_ignores = true - -[[tool.mypy.overrides]] -# Example codes are sometimes showing integration of crawlee with external tool, which is not dependency of crawlee. -module = [ - "apify", # Example code shows integration of apify and crawlee. - "apify_fingerprint_datapoints", # Untyped and stubs not available - "camoufox", # Example code shows integration of camoufox and crawlee. - "fastapi", # Example code shows running in webserver. - "saxonche", # Example code shows HttpCrawler with custom parser. - "scrapling.*", # Example code shows HttpCrawler with custom parser. - "selectolax.*", # Example code shows HttpCrawler with custom parser. - "stagehand.*", # Example code shows integration of Stagehand and crawlee. - "starlette.*", # Example code shows running in webserver. - "flask", # Example code shows deploy on Google Cloud. - "functions_framework", # Example code shows deploy on Google Cloud. - "jaro", # Untyped and stubs not available - "litestar", # Example code shows deploy on Google Cloud Run. - "loguru", # Example code shows integration of loguru and crawlee for JSON logging. - "lxml.*", # Example code shows HttpCrawler with custom parser. - "sklearn.linear_model", # Untyped and stubs not available - "cookiecutter.*", # Untyped and stubs not available - "inquirer.*", # Untyped and stubs not available - "pyquery", # Example code shows HttpCrawler with custom parser. - "warcio.*", # Example code shows WARC files creation. 
- "wrapt" # Untyped and stubs not available -] -ignore_missing_imports = true -[[tool.mypy.overrides]] -module = [ - "running_in_web_server.*", # False positive when fastapi not available +[[tool.ty.overrides]] +include = [ + "docs/**/*.py", + "website/**/*.py", ] -disable_error_code = ["misc"] -[tool.basedpyright] -pythonVersion = "3.10" -typeCheckingMode = "standard" -include = ["src", "tests", "docs", "website"] +[tool.ty.overrides.rules] +unresolved-import = "ignore" [tool.coverage.report] exclude_lines = ["pragma: no cover", "if TYPE_CHECKING:", "assert_never()"] diff --git a/src/crawlee/_browserforge_workaround.py b/src/crawlee/_browserforge_workaround.py index 495d2a8298..8e8dcceca4 100644 --- a/src/crawlee/_browserforge_workaround.py +++ b/src/crawlee/_browserforge_workaround.py @@ -20,7 +20,7 @@ def patch_browserforge() -> None: def DownloadIfNotExists(**flags: bool) -> None: pass - download.DownloadIfNotExists = DownloadIfNotExists + download.DownloadIfNotExists = DownloadIfNotExists # ty: ignore[invalid-assignment] import browserforge.bayesian_network diff --git a/src/crawlee/_request.py b/src/crawlee/_request.py index 9e0fd1dfc6..fd1feef791 100644 --- a/src/crawlee/_request.py +++ b/src/crawlee/_request.py @@ -93,7 +93,7 @@ def __setitem__(self, key: str, value: JsonSerializable) -> None: def __delitem__(self, key: str) -> None: del self.__pydantic_extra__[key] - def __iter__(self) -> Iterator[str]: # type: ignore[override] + def __iter__(self) -> Iterator[str]: # ty: ignore[invalid-method-override] yield from self.__pydantic_extra__ def __len__(self) -> int: @@ -195,7 +195,7 @@ class Request(BaseModel): ] = None """HTTP request payload.""" - # Workaround for pydantic 2.12 and mypy type checking issue for Annotated with default_factory + # Workaround for Pydantic and type checkers when using Annotated with default_factory if TYPE_CHECKING: headers: HttpHeaders = HttpHeaders() """HTTP request headers.""" diff --git a/src/crawlee/_types.py b/src/crawlee/_types.py index a98664d02d..bf10dd6ff3 100644 --- a/src/crawlee/_types.py +++ b/src/crawlee/_types.py @@ -62,14 +62,14 @@ class HttpHeaders(RootModel, Mapping[str, str]): model_config = ConfigDict(validate_by_name=True, validate_by_alias=True) - # Workaround for pydantic 2.12 and mypy type checking issue for Annotated with default_factory + # Workaround for Pydantic and type checkers when using Annotated with default_factory if TYPE_CHECKING: root: dict[str, str] = {} else: root: Annotated[ dict[str, str], PlainValidator(lambda value: _normalize_headers(value)), - Field(default_factory=dict), + Field(default_factory=lambda: dict[str, str]()), ] def __getitem__(self, key: str) -> str: @@ -91,7 +91,7 @@ def __ror__(self, other: HttpHeaders) -> HttpHeaders: combined_headers = {**other, **self.root} return HttpHeaders(combined_headers) - def __iter__(self) -> Iterator[str]: # type: ignore[override] + def __iter__(self) -> Iterator[str]: # ty: ignore[invalid-method-override] yield from self.root def __len__(self) -> int: @@ -671,17 +671,16 @@ def create_modified_copy( get_key_value_store: GetKeyValueStoreFromRequestHandlerFunction | None = None, ) -> Self: """Create a modified copy of the crawling context with specified changes.""" - original_fields = {field.name: getattr(self, field.name) for field in dataclasses.fields(self)} - modified_fields = { - key: value - for key, value in { - 'push_data': push_data, - 'add_requests': add_requests, - 'get_key_value_store': get_key_value_store, - }.items() - if value - } - return 
self.__class__(**{**original_fields, **modified_fields}) + modifications = dict[str, Any]() + + if push_data is not None: + modifications['push_data'] = push_data + if add_requests is not None: + modifications['add_requests'] = add_requests + if get_key_value_store is not None: + modifications['get_key_value_store'] = get_key_value_store + + return dataclasses.replace(self, **modifications) class GetDataKwargs(TypedDict): diff --git a/src/crawlee/_utils/context.py b/src/crawlee/_utils/context.py index fb750cf0e7..6f3a65094b 100644 --- a/src/crawlee/_utils/context.py +++ b/src/crawlee/_utils/context.py @@ -44,4 +44,4 @@ async def async_wrapper(self: Any, *args: Any, **kwargs: Any) -> Any: return await method(self, *args, **kwargs) - return async_wrapper if inspect.iscoroutinefunction(method) else sync_wrapper # type: ignore[return-value] + return async_wrapper if inspect.iscoroutinefunction(method) else sync_wrapper # ty: ignore[invalid-return-type] diff --git a/src/crawlee/_utils/file.py b/src/crawlee/_utils/file.py index f53b6bab0a..1d297fa724 100644 --- a/src/crawlee/_utils/file.py +++ b/src/crawlee/_utils/file.py @@ -170,7 +170,7 @@ async def export_csv_to_stream( if 'lineterminator' not in kwargs: kwargs['lineterminator'] = '\n' - writer = csv.writer(dst, **kwargs) # type: ignore[arg-type] + writer = csv.writer(dst, **kwargs) write_header = True # Iterate over the dataset and write to CSV. diff --git a/src/crawlee/_utils/globs.py b/src/crawlee/_utils/globs.py index f7e1a57927..aed82e1f18 100644 --- a/src/crawlee/_utils/globs.py +++ b/src/crawlee/_utils/globs.py @@ -36,7 +36,7 @@ def _translate( if not seps: seps = (os.path.sep, os.path.altsep) if os.path.altsep else os.path.sep - escaped_seps = ''.join(map(re.escape, seps)) + escaped_seps = ''.join(map(re.escape, seps)) # ty: ignore[invalid-argument-type] any_sep = f'[{escaped_seps}]' if len(seps) > 1 else escaped_seps not_sep = f'[^{escaped_seps}]' if include_hidden: diff --git a/src/crawlee/_utils/recurring_task.py b/src/crawlee/_utils/recurring_task.py index 3a6553b6c0..ba80f8f8b0 100644 --- a/src/crawlee/_utils/recurring_task.py +++ b/src/crawlee/_utils/recurring_task.py @@ -25,7 +25,7 @@ class RecurringTask: """ def __init__(self, func: Callable, delay: timedelta) -> None: - logger.debug(f'Calling RecurringTask.__init__(func={func.__name__}, delay={delay})...') + logger.debug(f'Calling RecurringTask.__init__(func={func.__name__}, delay={delay})...') # ty: ignore[unresolved-attribute] self.func = func self.delay = delay self.task: asyncio.Task | None = None @@ -55,7 +55,7 @@ async def _wrapper(self) -> None: def start(self) -> None: """Start the recurring task execution.""" - self.task = asyncio.create_task(self._wrapper(), name=f'Task-recurring-{self.func.__name__}') + self.task = asyncio.create_task(self._wrapper(), name=f'Task-recurring-{self.func.__name__}') # ty: ignore[possibly-missing-attribute] async def stop(self) -> None: """Stop the recurring task execution.""" diff --git a/src/crawlee/_utils/sitemap.py b/src/crawlee/_utils/sitemap.py index 0d839cd1ed..ba844ca47b 100644 --- a/src/crawlee/_utils/sitemap.py +++ b/src/crawlee/_utils/sitemap.py @@ -430,10 +430,10 @@ async def parse_sitemap( up to the specified maximum depth. 
""" # Set default options - options = options or {} - emit_nested_sitemaps = options.get('emit_nested_sitemaps', False) - max_depth = options.get('max_depth', float('inf')) - sitemap_retries = options.get('sitemap_retries', 3) + options = options or {} # ty: ignore[invalid-assignment] + emit_nested_sitemaps = options.get('emit_nested_sitemaps', False) # ty: ignore[possibly-missing-attribute] + max_depth = options.get('max_depth', float('inf')) # ty: ignore[possibly-missing-attribute] + sitemap_retries = options.get('sitemap_retries', 3) # ty: ignore[possibly-missing-attribute] # Setup working state sources = list(initial_sources) @@ -472,7 +472,7 @@ async def parse_sitemap( sitemap_retries, emit_nested_sitemaps=emit_nested_sitemaps, proxy_info=proxy_info, - timeout=options.get('timeout', timedelta(seconds=30)), + timeout=options.get('timeout', timedelta(seconds=30)), # ty: ignore[possibly-missing-attribute] ): yield result else: diff --git a/src/crawlee/_utils/system.py b/src/crawlee/_utils/system.py index d1f1cd9976..56eeaadf24 100644 --- a/src/crawlee/_utils/system.py +++ b/src/crawlee/_utils/system.py @@ -5,7 +5,7 @@ from contextlib import suppress from datetime import datetime, timezone from logging import getLogger -from typing import Annotated +from typing import TYPE_CHECKING, Annotated import psutil from pydantic import BaseModel, ConfigDict, Field, PlainSerializer, PlainValidator @@ -41,11 +41,19 @@ class CpuInfo(BaseModel): used_ratio: Annotated[float, Field(alias='usedRatio')] """The ratio of CPU currently in use, represented as a float between 0 and 1.""" - created_at: datetime = Field( - alias='createdAt', - default_factory=lambda: datetime.now(timezone.utc), - ) - """The time at which the measurement was taken.""" + # Workaround for Pydantic and type checkers when using Annotated with default_factory + if TYPE_CHECKING: + created_at: datetime = datetime.now(timezone.utc) + """The time at which the measurement was taken.""" + else: + created_at: Annotated[ + datetime, + Field( + alias='createdAt', + default_factory=lambda: datetime.now(timezone.utc), + ), + ] + """The time at which the measurement was taken.""" class MemoryUsageInfo(BaseModel): @@ -61,11 +69,19 @@ class MemoryUsageInfo(BaseModel): ] """Memory usage of the current Python process and its children.""" - created_at: datetime = Field( - alias='createdAt', - default_factory=lambda: datetime.now(timezone.utc), - ) - """The time at which the measurement was taken.""" + # Workaround for Pydantic and type checkers when using Annotated with default_factory + if TYPE_CHECKING: + created_at: datetime = datetime.now(timezone.utc) + """The time at which the measurement was taken.""" + else: + created_at: Annotated[ + datetime, + Field( + alias='createdAt', + default_factory=lambda: datetime.now(timezone.utc), + ), + ] + """The time at which the measurement was taken.""" class MemoryInfo(MemoryUsageInfo): diff --git a/src/crawlee/browsers/_browser_pool.py b/src/crawlee/browsers/_browser_pool.py index 7d3fe0409c..480fb9fac5 100644 --- a/src/crawlee/browsers/_browser_pool.py +++ b/src/crawlee/browsers/_browser_pool.py @@ -142,7 +142,7 @@ def with_default_plugin( plugin_options['browser_new_context_options'] = browser_new_context_options or {} if headless is not None: - plugin_options['browser_launch_options']['headless'] = headless + plugin_options['browser_launch_options']['headless'] = headless # ty: ignore[invalid-assignment] if use_incognito_pages is not None: plugin_options['use_incognito_pages'] = use_incognito_pages diff 
--git a/src/crawlee/browsers/_playwright_browser.py b/src/crawlee/browsers/_playwright_browser.py index aba8e6b7e1..c66dcb21be 100644 --- a/src/crawlee/browsers/_playwright_browser.py +++ b/src/crawlee/browsers/_playwright_browser.py @@ -78,7 +78,7 @@ async def new_context(self, **context_options: Any) -> BrowserContext: async def _delete_temp_dir(self, _: BrowserContext | None) -> None: if self._temp_dir and self._temp_dir.exists(): - await asyncio.to_thread(shutil.rmtree, self._temp_dir, ignore_errors=True) + await asyncio.to_thread(lambda: shutil.rmtree(self._temp_dir, ignore_errors=True)) # ty: ignore[invalid-argument-type] @override async def close(self, **kwargs: Any) -> None: diff --git a/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py b/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py index 6c1fbb63f8..059b3adbe9 100644 --- a/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +++ b/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py @@ -102,7 +102,7 @@ def create_parsed_http_crawler_class( class _ParsedHttpCrawler( AbstractHttpCrawler[ParsedHttpCrawlingContext[TParseResult], TParseResult, TSelectResult] - ): + ): # ty: ignore[invalid-generic-class] def __init__( self, parser: AbstractHttpParser[TParseResult, TSelectResult] = static_parser, @@ -122,9 +122,9 @@ def _create_static_content_crawler_pipeline(self) -> ContextPipeline[ParsedHttpC ContextPipeline() .compose(self._execute_pre_navigation_hooks) .compose(self._make_http_request) - .compose(self._handle_status_code_response) + .compose(self._handle_status_code_response) # ty: ignore[invalid-argument-type] .compose(self._parse_http_response) - .compose(self._handle_blocked_request_by_content) + .compose(self._handle_blocked_request_by_content) # ty: ignore[invalid-argument-type] ) async def _execute_pre_navigation_hooks( diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py index e7d865aae0..18285cbb92 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py @@ -162,7 +162,7 @@ def __init__( super().__init__(statistics=adaptive_statistics, **kwargs) # Sub crawlers related. - playwright_crawler_specific_kwargs = playwright_crawler_specific_kwargs or {} + playwright_crawler_specific_kwargs = playwright_crawler_specific_kwargs or {} # ty: ignore[invalid-assignment] # Each sub crawler will use custom logger . 
static_logger = getLogger('Subcrawler_static') @@ -183,7 +183,7 @@ def __init__( ) playwright_crawler = PlaywrightCrawler( statistics=_NonPersistentStatistics(), - **playwright_crawler_specific_kwargs, + **playwright_crawler_specific_kwargs, # ty: ignore[invalid-argument-type] **basic_crawler_kwargs_for_pw_crawler, ) @@ -337,7 +337,7 @@ async def from_static_pipeline_to_top_router( ) await self.router(adaptive_crawling_context) - return self._static_context_pipeline(context_linked_to_result, from_static_pipeline_to_top_router) + return self._static_context_pipeline(context_linked_to_result, from_static_pipeline_to_top_router) # ty: ignore[invalid-argument-type] if rendering_type == 'client only': @@ -347,7 +347,7 @@ async def from_pw_pipeline_to_top_router(context: PlaywrightCrawlingContext) -> ) await self.router(adaptive_crawling_context) - return self._pw_context_pipeline(context_linked_to_result, from_pw_pipeline_to_top_router) + return self._pw_context_pipeline(context_linked_to_result, from_pw_pipeline_to_top_router) # ty: ignore[invalid-argument-type] raise RuntimeError( f'Not a valid rendering type. Must be one of the following: {", ".join(get_args(RenderingType))}' diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index 5fa1c02927..be489f9c9a 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -410,7 +410,7 @@ def __init__( self._context_result_map = WeakKeyDictionary[BasicCrawlingContext, RequestHandlerRunResult]() # Context pipeline - self._context_pipeline = (_context_pipeline or ContextPipeline()).compose(self._check_url_after_redirects) + self._context_pipeline = (_context_pipeline or ContextPipeline()).compose(self._check_url_after_redirects) # ty: ignore[invalid-argument-type] # Crawl settings self._max_request_retries = max_request_retries @@ -774,7 +774,7 @@ async def _run_crawler(self) -> None: async with AsyncExitStack() as exit_stack: for context in contexts_to_enter: - await exit_stack.enter_async_context(context) # type: ignore[arg-type] + await exit_stack.enter_async_context(context) # ty: ignore[invalid-argument-type] await self._autoscaled_pool.run() @@ -873,7 +873,7 @@ async def export_data( dataset_id: str | None = None, dataset_name: str | None = None, dataset_alias: str | None = None, - **additional_kwargs: Unpack[ExportDataJsonKwargs | ExportDataCsvKwargs], # type: ignore[misc] + **additional_kwargs: Unpack[ExportDataJsonKwargs | ExportDataCsvKwargs], ) -> None: """Export all items from a Dataset to a JSON or CSV file. 
diff --git a/src/crawlee/crawlers/_playwright/_playwright_crawler.py b/src/crawlee/crawlers/_playwright/_playwright_crawler.py index 0beb04a375..c71bb71510 100644 --- a/src/crawlee/crawlers/_playwright/_playwright_crawler.py +++ b/src/crawlee/crawlers/_playwright/_playwright_crawler.py @@ -183,7 +183,7 @@ def __init__( generator_browser_type = [fingerprint_browser_type_from_playwright_browser_type(browser_type)] fingerprint_generator = DefaultFingerprintGenerator( - header_options=HeaderGeneratorOptions(browsers=generator_browser_type) + header_options=HeaderGeneratorOptions(browsers=generator_browser_type) # ty: ignore[invalid-argument-type] ) browser_pool = BrowserPool.with_default_plugin( @@ -202,9 +202,9 @@ def __init__( kwargs['_context_pipeline'] = ( ContextPipeline() .compose(self._open_page) - .compose(self._navigate) + .compose(self._navigate) # ty: ignore[invalid-argument-type] .compose(self._handle_status_code_response) - .compose(self._handle_blocked_request_by_content) + .compose(self._handle_blocked_request_by_content) # ty: ignore[invalid-argument-type] ) kwargs['_additional_context_managers'] = [self._browser_pool] kwargs.setdefault('_logger', logging.getLogger(__name__)) @@ -516,7 +516,7 @@ async def _get_cookies(self, page: Page) -> list[PlaywrightCookieParam]: async def _update_cookies(self, page: Page, cookies: list[PlaywrightCookieParam]) -> None: """Update the cookies in the page context.""" - await page.context.add_cookies([{**cookie} for cookie in cookies]) + await page.context.add_cookies([{**cookie} for cookie in cookies]) # ty: ignore[invalid-argument-type] async def _find_txt_file_for_url(self, url: str) -> RobotsTxtFile: """Find the robots.txt file for a given URL. diff --git a/src/crawlee/events/_event_manager.py b/src/crawlee/events/_event_manager.py index c623b341c1..2183727483 100644 --- a/src/crawlee/events/_event_manager.py +++ b/src/crawlee/events/_event_manager.py @@ -178,7 +178,7 @@ async def listener_wrapper(event_data: EventData) -> None: else asyncio.to_thread(cast('Callable[..., None]', listener), *bound_args.args, **bound_args.kwargs) ) - listener_task = asyncio.create_task(coro, name=f'Task-{event.value}-{listener.__name__}') + listener_task = asyncio.create_task(coro, name=f'Task-{event.value}-{listener.__name__}') # ty: ignore[invalid-argument-type, unresolved-attribute] self._listener_tasks.add(listener_task) try: @@ -189,7 +189,7 @@ async def listener_wrapper(event_data: EventData) -> None: # We need to swallow the exception and just log it here, otherwise it could break the event emitter logger.exception( 'Exception in the event listener', - extra={'event_name': event.value, 'listener_name': listener.__name__}, + extra={'event_name': event.value, 'listener_name': listener.__name__}, # ty: ignore[unresolved-attribute] ) finally: logger.debug('EventManager.on.listener_wrapper(): Removing listener task from the set...') diff --git a/src/crawlee/http_clients/_curl_impersonate.py b/src/crawlee/http_clients/_curl_impersonate.py index b4eff2421b..342a60ef08 100644 --- a/src/crawlee/http_clients/_curl_impersonate.py +++ b/src/crawlee/http_clients/_curl_impersonate.py @@ -93,12 +93,12 @@ async def read(self) -> bytes: return self._response.content async def read_stream(self) -> AsyncGenerator[bytes, None]: - if not self._response.astream_task or self._response.astream_task.done(): # type: ignore[attr-defined] + if not self._response.astream_task or self._response.astream_task.done(): # ty: ignore[possibly-missing-attribute] raise RuntimeError( 
'Cannot read stream: either already consumed or Response not obtained from `stream` method' ) - async for chunk in self._response.aiter_content(): # type: ignore[no-untyped-call] + async for chunk in self._response.aiter_content(): yield chunk @@ -156,7 +156,7 @@ async def crawl( try: response = await client.request( url=request.url, - method=request.method.upper(), # type: ignore[arg-type] # curl-cffi requires uppercase method + method=request.method.upper(), # ty: ignore[invalid-argument-type] headers=request.headers, data=request.payload, cookies=session.cookies.jar if session else None, @@ -203,7 +203,7 @@ async def send_request( try: response = await client.request( url=url, - method=method.upper(), # type: ignore[arg-type] # curl-cffi requires uppercase method + method=method.upper(), # ty: ignore[invalid-argument-type] headers=dict(headers) if headers else None, data=payload, cookies=session.cookies.jar if session else None, @@ -244,7 +244,7 @@ async def stream( try: response = await client.request( url=url, - method=method.upper(), # type: ignore[arg-type] # curl-cffi requires uppercase method + method=method.upper(), # ty: ignore[invalid-argument-type] headers=dict(headers) if headers else None, data=payload, cookies=session.cookies.jar if session else None, @@ -309,8 +309,8 @@ def _is_proxy_error(error: CurlRequestError) -> bool: @staticmethod def _get_cookies(curl: Curl) -> list[Cookie]: cookies: list[Cookie] = [] - for curl_cookie in curl.getinfo(CurlInfo.COOKIELIST): # type: ignore[union-attr] - curl_morsel = CurlMorsel.from_curl_format(curl_cookie) # type: ignore[arg-type] + for curl_cookie in curl.getinfo(CurlInfo.COOKIELIST): # ty: ignore[not-iterable] + curl_morsel = CurlMorsel.from_curl_format(curl_cookie) # ty: ignore[invalid-argument-type] cookie = curl_morsel.to_cookiejar_cookie() cookies.append(cookie) return cookies diff --git a/src/crawlee/otel/crawler_instrumentor.py b/src/crawlee/otel/crawler_instrumentor.py index 09f2fda525..9c12e9e17e 100644 --- a/src/crawlee/otel/crawler_instrumentor.py +++ b/src/crawlee/otel/crawler_instrumentor.py @@ -3,9 +3,7 @@ import inspect from typing import TYPE_CHECKING, Any -from opentelemetry.instrumentation.instrumentor import ( # type:ignore[attr-defined] # Mypy has troubles with OTEL - BaseInstrumentor, -) +from opentelemetry.instrumentation.instrumentor import BaseInstrumentor from opentelemetry.instrumentation.utils import unwrap from opentelemetry.semconv.attributes.code_attributes import CODE_FUNCTION_NAME from opentelemetry.semconv.attributes.http_attributes import HTTP_REQUEST_METHOD diff --git a/src/crawlee/sessions/_cookies.py b/src/crawlee/sessions/_cookies.py index eb5a6a12ea..1089fc37f5 100644 --- a/src/crawlee/sessions/_cookies.py +++ b/src/crawlee/sessions/_cookies.py @@ -68,7 +68,7 @@ def __init__(self, cookies: SessionCookies | CookieJar | dict[str, str] | list[C if isinstance(cookies, dict): for key, value in cookies.items(): - self.set(key, value) + self.set(key, value) # ty: ignore[invalid-argument-type] elif isinstance(cookies, list): for item in cookies: @@ -152,7 +152,7 @@ def _convert_cookie_to_dict(self, cookie: Cookie) -> CookieParam: cookie_dict['expires'] = cookie.expires if (same_site := cookie.get_nonstandard_attr('SameSite')) and same_site in {'Lax', 'None', 'Strict'}: - cookie_dict['same_site'] = same_site # type: ignore[typeddict-item] + cookie_dict['same_site'] = same_site # ty: ignore[invalid-assignment] return cookie_dict diff --git a/src/crawlee/sessions/_models.py 
b/src/crawlee/sessions/_models.py index da709f1cdb..2f5b4a0483 100644 --- a/src/crawlee/sessions/_models.py +++ b/src/crawlee/sessions/_models.py @@ -63,19 +63,19 @@ class SessionPoolModel(BaseModel): ), ] - @computed_field(alias='sessionCount') # type: ignore[prop-decorator] + @computed_field(alias='sessionCount') @property def session_count(self) -> int: """Get the total number of sessions currently maintained in the pool.""" return len(self.sessions) - @computed_field(alias='usableSessionCount') # type: ignore[prop-decorator] + @computed_field(alias='usableSessionCount') @property def usable_session_count(self) -> int: """Get the number of sessions that are currently usable.""" return len([session for _, session in self.sessions.items() if session.is_usable]) - @computed_field(alias='retiredSessionCount') # type: ignore[prop-decorator] + @computed_field(alias='retiredSessionCount') @property def retired_session_count(self) -> int: """Get the number of sessions that are no longer usable.""" diff --git a/src/crawlee/statistics/_models.py b/src/crawlee/statistics/_models.py index 11b4310f3a..b17c618540 100644 --- a/src/crawlee/statistics/_models.py +++ b/src/crawlee/statistics/_models.py @@ -4,7 +4,7 @@ import warnings from dataclasses import asdict, dataclass from datetime import datetime, timedelta, timezone -from typing import Annotated, Any +from typing import TYPE_CHECKING, Annotated, Any from pydantic import BaseModel, ConfigDict, Field, PlainSerializer, PlainValidator, computed_field from typing_extensions import override @@ -77,9 +77,20 @@ class StatisticsState(BaseModel): crawler_started_at: Annotated[datetime | None, Field(alias='crawlerStartedAt')] = None crawler_last_started_at: Annotated[datetime | None, Field(alias='crawlerLastStartTimestamp')] = None crawler_finished_at: Annotated[datetime | None, Field(alias='crawlerFinishedAt')] = None - errors: dict[str, Any] = Field(default_factory=dict) - retry_errors: dict[str, Any] = Field(alias='retryErrors', default_factory=dict) - requests_with_status_code: dict[str, int] = Field(alias='requestsWithStatusCode', default_factory=dict) + + # Workaround for Pydantic and type checkers when using Annotated with default_factory + if TYPE_CHECKING: + errors: dict[str, Any] = {} + retry_errors: dict[str, Any] = {} + requests_with_status_code: dict[str, int] = {} + else: + errors: Annotated[dict[str, Any], Field(default_factory=dict)] + retry_errors: Annotated[dict[str, Any], Field(alias='retryErrors', default_factory=dict)] + requests_with_status_code: Annotated[ + dict[str, int], + Field(alias='requestsWithStatusCode', default_factory=dict), + ] + stats_persisted_at: Annotated[ datetime | None, Field(alias='statsPersistedAt'), PlainSerializer(lambda _: datetime.now(timezone.utc)) ] = None @@ -124,22 +135,22 @@ def crawler_runtime_for_serialization(self) -> timedelta: return self._runtime_offset + finished_at - self.crawler_last_started_at return self._runtime_offset - @computed_field(alias='requestTotalDurationMillis', return_type=timedelta_ms) # type: ignore[prop-decorator] + @computed_field(alias='requestTotalDurationMillis', return_type=timedelta_ms) @property def request_total_duration(self) -> timedelta: return self.request_total_finished_duration + self.request_total_failed_duration - @computed_field(alias='requestAvgFailedDurationMillis', return_type=timedelta_ms | None) # type: ignore[prop-decorator] + @computed_field(alias='requestAvgFailedDurationMillis', return_type=timedelta_ms | None) @property def 
request_avg_failed_duration(self) -> timedelta | None: return (self.request_total_failed_duration / self.requests_failed) if self.requests_failed else None - @computed_field(alias='requestAvgFinishedDurationMillis', return_type=timedelta_ms | None) # type: ignore[prop-decorator] + @computed_field(alias='requestAvgFinishedDurationMillis', return_type=timedelta_ms | None) @property def request_avg_finished_duration(self) -> timedelta | None: return (self.request_total_finished_duration / self.requests_finished) if self.requests_finished else None - @computed_field(alias='requestsTotal') # type: ignore[prop-decorator] + @computed_field(alias='requestsTotal') @property def requests_total(self) -> int: return self.requests_failed + self.requests_finished diff --git a/src/crawlee/storage_clients/_base/_dataset_client.py b/src/crawlee/storage_clients/_base/_dataset_client.py index 87573a3916..d2eeb86665 100644 --- a/src/crawlee/storage_clients/_base/_dataset_client.py +++ b/src/crawlee/storage_clients/_base/_dataset_client.py @@ -87,8 +87,8 @@ async def iterate_items( The backend method for the `Dataset.iterate_items` call. """ - # This syntax is to make mypy properly work with abstract AsyncIterator. + # This syntax is to make the type checker work properly with abstract AsyncIterator. # https://mypy.readthedocs.io/en/stable/more_types.html#asynchronous-iterators raise NotImplementedError - if False: # type: ignore[unreachable] + if False: yield 0 diff --git a/src/crawlee/storage_clients/_base/_key_value_store_client.py b/src/crawlee/storage_clients/_base/_key_value_store_client.py index d23abd7d70..33c36f67bd 100644 --- a/src/crawlee/storage_clients/_base/_key_value_store_client.py +++ b/src/crawlee/storage_clients/_base/_key_value_store_client.py @@ -72,10 +72,10 @@ async def iterate_keys( The backend method for the `KeyValueStore.iterate_keys` call. """ - # This syntax is to make mypy properly work with abstract AsyncIterator. + # This syntax is to make the type checker work properly with abstract AsyncIterator. # https://mypy.readthedocs.io/en/stable/more_types.html#asynchronous-iterators raise NotImplementedError - if False: # type: ignore[unreachable] + if False: yield 0 @abstractmethod diff --git a/src/crawlee/storage_clients/_file_system/_dataset_client.py b/src/crawlee/storage_clients/_file_system/_dataset_client.py index 55130d587f..4a222dc037 100644 --- a/src/crawlee/storage_clients/_file_system/_dataset_client.py +++ b/src/crawlee/storage_clients/_file_system/_dataset_client.py @@ -120,7 +120,7 @@ async def open( dataset_base_path = Path(configuration.storage_dir) / cls._STORAGE_SUBDIR if not dataset_base_path.exists(): - await asyncio.to_thread(dataset_base_path.mkdir, parents=True, exist_ok=True) + await asyncio.to_thread(lambda: dataset_base_path.mkdir(parents=True, exist_ok=True)) # Get a new instance by ID. if id: @@ -134,7 +134,7 @@ async def open( continue try: - file = await asyncio.to_thread(path_to_metadata.open, 'r', encoding='utf-8') + file = await asyncio.to_thread(lambda p=path_to_metadata: p.open(mode='r', encoding='utf-8')) try: file_content = json.load(file) metadata = DatasetMetadata(**file_content) @@ -163,7 +163,7 @@ async def open( # If the dataset directory exists, reconstruct the client from the metadata file.
if path_to_dataset.exists() and path_to_metadata.exists(): - file = await asyncio.to_thread(open, path_to_metadata, 'r', encoding='utf-8') + file = await asyncio.to_thread(lambda: path_to_metadata.open(mode='r', encoding='utf-8')) try: file_content = json.load(file) finally: @@ -211,7 +211,7 @@ async def drop(self) -> None: async def purge(self) -> None: async with self._lock: for file_path in await self._get_sorted_data_files(): - await asyncio.to_thread(file_path.unlink, missing_ok=True) + await asyncio.to_thread(lambda f=file_path: f.unlink(missing_ok=True)) await self._update_metadata( update_accessed_at=True, @@ -435,7 +435,7 @@ async def _update_metadata( self._metadata.item_count = new_item_count # Ensure the parent directory for the metadata file exists. - await asyncio.to_thread(self.path_to_metadata.parent.mkdir, parents=True, exist_ok=True) + await asyncio.to_thread(lambda: self.path_to_metadata.parent.mkdir(parents=True, exist_ok=True)) # Dump the serialized metadata to the file. data = await json_dumps(self._metadata.model_dump()) @@ -456,7 +456,7 @@ async def _push_item(self, item: dict[str, Any], item_id: int) -> None: file_path = self.path_to_dataset / filename # Ensure the dataset directory exists. - await asyncio.to_thread(self.path_to_dataset.mkdir, parents=True, exist_ok=True) + await asyncio.to_thread(lambda: self.path_to_dataset.mkdir(parents=True, exist_ok=True)) # Dump the serialized item to the file. data = await json_dumps(item) @@ -473,9 +473,10 @@ async def _get_sorted_data_files(self) -> list[Path]: """ # Retrieve and sort all JSON files in the dataset directory numerically. files = await asyncio.to_thread( - sorted, - self.path_to_dataset.glob('*.json'), - key=lambda f: int(f.stem) if f.stem.isdigit() else 0, + lambda: sorted( + self.path_to_dataset.glob('*.json'), + key=lambda f: int(f.stem) if f.stem.isdigit() else 0, + ) ) # Remove the metadata file from the list if present. diff --git a/src/crawlee/storage_clients/_file_system/_key_value_store_client.py b/src/crawlee/storage_clients/_file_system/_key_value_store_client.py index 6a3db78fbc..28e724fda8 100644 --- a/src/crawlee/storage_clients/_file_system/_key_value_store_client.py +++ b/src/crawlee/storage_clients/_file_system/_key_value_store_client.py @@ -119,7 +119,7 @@ async def open( kvs_base_path = Path(configuration.storage_dir) / cls._STORAGE_SUBDIR if not kvs_base_path.exists(): - await asyncio.to_thread(kvs_base_path.mkdir, parents=True, exist_ok=True) + await asyncio.to_thread(lambda: kvs_base_path.mkdir(parents=True, exist_ok=True)) # Get a new instance by ID. if id: @@ -133,7 +133,7 @@ async def open( continue try: - file = await asyncio.to_thread(path_to_metadata.open, 'r', encoding='utf-8') + file = await asyncio.to_thread(lambda p=path_to_metadata: p.open(mode='r', encoding='utf-8')) try: file_content = json.load(file) metadata = KeyValueStoreMetadata(**file_content) @@ -162,7 +162,7 @@ async def open( # If the key-value store directory exists, reconstruct the client from the metadata file. 
if path_to_kvs.exists() and path_to_metadata.exists(): - file = await asyncio.to_thread(open, path_to_metadata, 'r', encoding='utf-8') + file = await asyncio.to_thread(lambda: path_to_metadata.open(mode='r', encoding='utf-8')) try: file_content = json.load(file) finally: @@ -212,7 +212,7 @@ async def purge(self) -> None: for file_path in self.path_to_kvs.glob('*'): if file_path.name == METADATA_FILENAME: continue - await asyncio.to_thread(file_path.unlink, missing_ok=True) + await asyncio.to_thread(lambda f=file_path: f.unlink(missing_ok=True)) await self._update_metadata( update_accessed_at=True, @@ -239,7 +239,7 @@ async def get_value(self, *, key: str) -> KeyValueStoreRecord | None: # Read the metadata file async with self._lock: try: - file = await asyncio.to_thread(open, record_metadata_filepath, 'r', encoding='utf-8') + file = await asyncio.to_thread(lambda: record_metadata_filepath.open(mode='r', encoding='utf-8')) except FileNotFoundError: logger.warning(f'Metadata file disappeared for key "{key}", aborting get_value') return None @@ -346,11 +346,11 @@ async def delete_value(self, *, key: str) -> None: async with self._lock: # Delete the value file and its metadata if found if record_path.exists(): - await asyncio.to_thread(record_path.unlink, missing_ok=True) + await asyncio.to_thread(lambda: record_path.unlink(missing_ok=True)) # Delete the metadata file if it exists if metadata_path.exists(): - await asyncio.to_thread(metadata_path.unlink, missing_ok=True) + await asyncio.to_thread(lambda: metadata_path.unlink(missing_ok=True)) else: logger.warning(f'Found value file for key "{key}" but no metadata file when trying to delete it.') @@ -373,7 +373,7 @@ async def iterate_keys( # List and sort all files *inside* a brief lock, then release it immediately: async with self._lock: - files = sorted(await asyncio.to_thread(list, self.path_to_kvs.glob('*'))) + files = sorted(await asyncio.to_thread(lambda: list(self.path_to_kvs.glob('*')))) count = 0 @@ -395,7 +395,7 @@ async def iterate_keys( # Try to read and parse the metadata file try: - metadata_content = await asyncio.to_thread(file_path.read_text, encoding='utf-8') + metadata_content = await asyncio.to_thread(lambda f=file_path: f.read_text(encoding='utf-8')) except FileNotFoundError: logger.warning(f'Metadata file disappeared for key "{key_name}", skipping it.') continue @@ -475,7 +475,7 @@ async def _update_metadata( self._metadata.modified_at = now # Ensure the parent directory for the metadata file exists. - await asyncio.to_thread(self.path_to_metadata.parent.mkdir, parents=True, exist_ok=True) + await asyncio.to_thread(lambda: self.path_to_metadata.parent.mkdir(parents=True, exist_ok=True)) # Dump the serialized metadata to the file. data = await json_dumps(self._metadata.model_dump()) diff --git a/src/crawlee/storage_clients/_file_system/_request_queue_client.py b/src/crawlee/storage_clients/_file_system/_request_queue_client.py index e49771b7c9..1a91ecea9e 100644 --- a/src/crawlee/storage_clients/_file_system/_request_queue_client.py +++ b/src/crawlee/storage_clients/_file_system/_request_queue_client.py @@ -183,7 +183,7 @@ async def open( rq_base_path = Path(configuration.storage_dir) / cls._STORAGE_SUBDIR if not rq_base_path.exists(): - await asyncio.to_thread(rq_base_path.mkdir, parents=True, exist_ok=True) + await asyncio.to_thread(lambda: rq_base_path.mkdir(parents=True, exist_ok=True)) # Open an existing RQ by its ID, raise an error if not found. 
if id: @@ -197,7 +197,7 @@ async def open( continue try: - file = await asyncio.to_thread(path_to_metadata.open, 'r', encoding='utf-8') + file = await asyncio.to_thread(lambda p=path_to_metadata: p.open(mode='r', encoding='utf-8')) try: file_content = json.load(file) metadata = RequestQueueMetadata(**file_content) @@ -232,7 +232,7 @@ async def open( # If the RQ directory exists, reconstruct the client from the metadata file. if path_to_rq.exists() and path_to_metadata.exists(): - file = await asyncio.to_thread(open, path_to_metadata, 'r', encoding='utf-8') + file = await asyncio.to_thread(lambda: path_to_metadata.open(encoding='utf-8')) try: file_content = json.load(file) finally: @@ -300,7 +300,7 @@ async def purge(self) -> None: request_files = await self._get_request_files(self.path_to_rq) for file_path in request_files: - await asyncio.to_thread(file_path.unlink, missing_ok=True) + await asyncio.to_thread(lambda f=file_path: f.unlink(missing_ok=True)) # Clear recoverable state await self._state.reset() @@ -675,7 +675,7 @@ async def _update_metadata( self._metadata.had_multiple_clients = True # Ensure the parent directory for the metadata file exists. - await asyncio.to_thread(self.path_to_metadata.parent.mkdir, parents=True, exist_ok=True) + await asyncio.to_thread(lambda: self.path_to_metadata.parent.mkdir(parents=True, exist_ok=True)) # Dump the serialized metadata to the file. data = await json_dumps(self._metadata.model_dump()) @@ -753,10 +753,10 @@ async def _get_request_files(cls, path_to_rq: Path) -> list[Path]: A list of paths to all request files. """ # Create the requests directory if it doesn't exist. - await asyncio.to_thread(path_to_rq.mkdir, parents=True, exist_ok=True) + await asyncio.to_thread(lambda: path_to_rq.mkdir(parents=True, exist_ok=True)) # List all the json files. - files = await asyncio.to_thread(list, path_to_rq.glob('*.json')) + files = await asyncio.to_thread(lambda: list(path_to_rq.glob('*.json'))) # Filter out metadata file and non-file entries. filtered = filter(lambda request_file: request_file.is_file() and request_file.name != METADATA_FILENAME, files) @@ -775,7 +775,7 @@ async def _parse_request_file(cls, file_path: Path) -> Request | None: """ # Open the request file. 
try: - file = await asyncio.to_thread(open, file_path, 'r', encoding='utf-8') + file = await asyncio.to_thread(lambda f=file_path: f.open(mode='r', encoding='utf-8')) except FileNotFoundError: logger.warning(f'Request file "{file_path}" not found.') return None diff --git a/src/crawlee/storage_clients/_redis/_client_mixin.py b/src/crawlee/storage_clients/_redis/_client_mixin.py index 6c66e5db7b..8a54896577 100644 --- a/src/crawlee/storage_clients/_redis/_client_mixin.py +++ b/src/crawlee/storage_clients/_redis/_client_mixin.py @@ -179,7 +179,7 @@ async def _get_pipeline(self, *, with_execute: bool = True) -> AsyncIterator[Pip """Create a new Redis pipeline.""" async with self._redis.pipeline() as pipe: try: - pipe.multi() # type: ignore[no-untyped-call] + pipe.multi() yield pipe finally: if with_execute: @@ -187,7 +187,6 @@ async def _get_pipeline(self, *, with_execute: bool = True) -> AsyncIterator[Pip async def _create_storage(self, pipeline: Pipeline) -> None: """Create the actual storage structure in Redis.""" - _ = pipeline # To avoid unused variable mypy error async def _create_script(self, script_name: str) -> AsyncScript: """Load a Lua script from a file and return a Script object.""" @@ -262,8 +261,6 @@ async def _specific_update_metadata(self, pipeline: Pipeline, **kwargs: Any) -> pipeline: The Redis pipeline to use for the update. **kwargs: Storage-specific update parameters. """ - _ = pipeline # To avoid unused variable mypy error - _ = kwargs async def _update_metadata( self, diff --git a/src/crawlee/storage_clients/_redis/_dataset_client.py b/src/crawlee/storage_clients/_redis/_dataset_client.py index 44a78bce62..74c9d6c496 100644 --- a/src/crawlee/storage_clients/_redis/_dataset_client.py +++ b/src/crawlee/storage_clients/_redis/_dataset_client.py @@ -179,13 +179,13 @@ async def get_data( case (True, int(), None): json_path += f'[:-{offset}]' case (True, int(), int()): - json_path += f'[-{offset + limit}:-{offset}]' + json_path += f'[-{offset + limit}:-{offset}]' # ty: ignore[unsupported-operator] case (False, 0, int()): json_path += f'[:{limit}]' case (False, int(), None): json_path += f'[{offset}:]' case (False, int(), int()): - json_path += f'[{offset}:{offset + limit}]' + json_path += f'[{offset}:{offset + limit}]' # ty: ignore[unsupported-operator] if json_path == '$': json_path = '$[*]' @@ -210,7 +210,7 @@ async def get_data( limit=limit or (total - offset), total=total, desc=desc, - items=data, + items=data, # ty: ignore[invalid-argument-type] ) @override diff --git a/src/crawlee/storage_clients/_redis/_key_value_store_client.py b/src/crawlee/storage_clients/_redis/_key_value_store_client.py index 99f9665ea7..8aeaa1a01d 100644 --- a/src/crawlee/storage_clients/_redis/_key_value_store_client.py +++ b/src/crawlee/storage_clients/_redis/_key_value_store_client.py @@ -144,7 +144,7 @@ async def set_value(self, *, key: str, value: Any, content_type: str | None = No async with self._get_pipeline() as pipe: # redis-py typing issue - await await_redis_response(pipe.hset(self._items_key, key, value_bytes)) # type: ignore[arg-type] + await await_redis_response(pipe.hset(self._items_key, key, value_bytes)) # ty: ignore[invalid-argument-type] await await_redis_response( pipe.hset( @@ -174,9 +174,7 @@ async def get_value(self, *, key: str) -> KeyValueStoreRecord | None: # Query the record by key # redis-py typing issue - value_bytes: bytes | None = await await_redis_response( - self._redis.hget(self._items_key, key) # type: ignore[arg-type] - ) + value_bytes: bytes | None = await 
await_redis_response(self._redis.hget(self._items_key, key)) # ty: ignore[invalid-assignment] if value_bytes is None: logger.warning(f'Value for key "{key}" is missing.') @@ -225,7 +223,7 @@ async def iterate_keys( raise TypeError('The items data was received in an incorrect format.') # Get all keys, sorted alphabetically - keys = sorted(items_data.keys()) + keys = sorted(items_data.keys()) # ty: ignore[invalid-argument-type] # Apply exclusive_start_key filter if provided if exclusive_start_key is not None: diff --git a/src/crawlee/storage_clients/_redis/_request_queue_client.py b/src/crawlee/storage_clients/_redis/_request_queue_client.py index 90a86ee64f..74f9028bec 100644 --- a/src/crawlee/storage_clients/_redis/_request_queue_client.py +++ b/src/crawlee/storage_clients/_redis/_request_queue_client.py @@ -247,7 +247,6 @@ async def add_batch_of_requests( *, forefront: bool = False, ) -> AddRequestsResponse: - # Mypy workaround if self._add_requests_script is None: raise RuntimeError('Scripts not loaded. Call _ensure_scripts_loaded() before using the client.') @@ -264,8 +263,8 @@ async def add_batch_of_requests( await await_redis_response(pipe.smismember(self._pending_set_key, unique_keys)) await await_redis_response(pipe.smismember(self._handled_set_key, unique_keys)) elif self._dedup_strategy == 'bloom': - await await_redis_response(pipe.bf().mexists(self._added_filter_key, *unique_keys)) # type: ignore[no-untyped-call] - await await_redis_response(pipe.bf().mexists(self._handled_filter_key, *unique_keys)) # type: ignore[no-untyped-call] + await await_redis_response(pipe.bf().mexists(self._added_filter_key, *unique_keys)) + await await_redis_response(pipe.bf().mexists(self._handled_filter_key, *unique_keys)) pipe_results = await pipe.execute() @@ -353,7 +352,6 @@ async def fetch_next_request(self) -> Request | None: if self._pending_fetch_cache: return self._pending_fetch_cache.popleft() - # Mypy workaround if self._fetch_script is None: raise RuntimeError('Scripts not loaded. Call _ensure_scripts_loaded() before using the client.') @@ -399,7 +397,7 @@ async def mark_request_as_handled(self, request: Request) -> ProcessedRequest | await await_redis_response(pipe.sadd(self._handled_set_key, request.unique_key)) await await_redis_response(pipe.srem(self._pending_set_key, request.unique_key)) elif self._dedup_strategy == 'bloom': - await await_redis_response(pipe.bf().add(self._handled_filter_key, request.unique_key)) # type: ignore[no-untyped-call] + await await_redis_response(pipe.bf().add(self._handled_filter_key, request.unique_key)) await await_redis_response(pipe.hdel(self._in_progress_key, request.unique_key)) await await_redis_response(pipe.hdel(self._data_key, request.unique_key)) @@ -499,17 +497,16 @@ async def _create_storage(self, pipeline: Pipeline) -> None: await await_redis_response( pipeline.bf().create( self._added_filter_key, errorRate=self._bloom_error_rate, capacity=100000, expansion=10 - ) # type: ignore[no-untyped-call] + ) ) await await_redis_response( pipeline.bf().create( self._handled_filter_key, errorRate=self._bloom_error_rate, capacity=100000, expansion=10 - ) # type: ignore[no-untyped-call] + ) ) async def _reclaim_stale_requests(self) -> None: """Reclaim requests that have been in progress for too long.""" - # Mypy workaround if self._reclaim_stale_script is None: raise RuntimeError('Scripts not loaded. 
Call _ensure_scripts_loaded() before using the client.') diff --git a/src/crawlee/storage_clients/_redis/_storage_client.py b/src/crawlee/storage_clients/_redis/_storage_client.py index 78e7bed603..a6c39f5def 100644 --- a/src/crawlee/storage_clients/_redis/_storage_client.py +++ b/src/crawlee/storage_clients/_redis/_storage_client.py @@ -57,16 +57,19 @@ def __init__( queue_bloom_error_rate: Desired false positive rate for Bloom filter deduplication. Only relevant if `queue_dedup_strategy` is set to 'bloom'. """ - match (redis, connection_string): - case (None, None): - raise ValueError('Either redis or connection_string must be provided.') - case (Redis(), None): - self._redis = redis - case (None, str()): - self._redis = Redis.from_url(connection_string) - case (Redis(), str()): - raise ValueError('Either redis or connection_string must be provided, not both.') + if redis is None and connection_string is None: + raise ValueError('Either redis or connection_string must be provided.') + if redis is not None and connection_string is not None: + raise ValueError('Either redis or connection_string must be provided, not both.') + + if isinstance(redis, Redis) and connection_string is None: + self._redis = redis + + if isinstance(connection_string, str) and redis is None: + self._redis = Redis.from_url(connection_string) + + self._redis: Redis # to help type checker self._queue_dedup_strategy = queue_dedup_strategy self._queue_bloom_error_rate = queue_bloom_error_rate diff --git a/src/crawlee/storage_clients/_redis/_utils.py b/src/crawlee/storage_clients/_redis/_utils.py index a86f979fa4..27f051d692 100644 --- a/src/crawlee/storage_clients/_redis/_utils.py +++ b/src/crawlee/storage_clients/_redis/_utils.py @@ -19,5 +19,5 @@ async def await_redis_response(response: Awaitable[T] | T) -> T: def read_lua_script(script_name: str) -> str: """Read a Lua script from a file.""" file_path = Path(__file__).parent / 'lua_scripts' / script_name - with file_path.open('r', encoding='utf-8') as file: + with file_path.open(mode='r', encoding='utf-8') as file: return file.read() diff --git a/src/crawlee/storage_clients/_sql/_client_mixin.py b/src/crawlee/storage_clients/_sql/_client_mixin.py index c681e3a220..e7ee2ae8d9 100644 --- a/src/crawlee/storage_clients/_sql/_client_mixin.py +++ b/src/crawlee/storage_clients/_sql/_client_mixin.py @@ -105,7 +105,7 @@ async def _open( else: stmt = select(cls._METADATA_TABLE).where(cls._METADATA_TABLE.internal_name == internal_name) result = await session.execute(stmt) - orm_metadata = result.scalar_one_or_none() # type: ignore[assignment] + orm_metadata = result.scalar_one_or_none() if orm_metadata: client = cls(id=orm_metadata.id, storage_client=storage_client) diff --git a/src/crawlee/storage_clients/models.py b/src/crawlee/storage_clients/models.py index e7b5927d7d..2ebd65914d 100644 --- a/src/crawlee/storage_clients/models.py +++ b/src/crawlee/storage_clients/models.py @@ -1,7 +1,7 @@ from __future__ import annotations from datetime import datetime -from typing import Annotated, Any, Generic +from typing import TYPE_CHECKING, Annotated, Any, Generic from pydantic import BaseModel, BeforeValidator, ConfigDict, Field from typing_extensions import TypeVar @@ -127,8 +127,13 @@ class DatasetItemsListPage(BaseModel): desc: Annotated[bool, Field(default=False)] """Indicates if the returned list is in descending order.""" - items: Annotated[list[dict], Field(default_factory=list)] - """The list of dataset items returned on this page.""" + # Workaround for Pydantic and type 
checkers when using Annotated with default_factory + if TYPE_CHECKING: + items: list[dict] = [] + """The list of dataset items returned on this page.""" + else: + items: Annotated[list[dict], Field(default_factory=list)] + """The list of dataset items returned on this page.""" @docs_group('Storage data') diff --git a/tests/unit/_autoscaling/test_snapshotter.py b/tests/unit/_autoscaling/test_snapshotter.py index 06a5682a32..7b3d50d75d 100644 --- a/tests/unit/_autoscaling/test_snapshotter.py +++ b/tests/unit/_autoscaling/test_snapshotter.py @@ -139,7 +139,10 @@ async def test_get_cpu_sample( events_data = [ EventSystemInfoData( - cpu_info=CpuInfo(used_ratio=0.5, created_at=now - timedelta(hours=delta)), + cpu_info=CpuInfo( + used_ratio=0.5, + created_at=now - timedelta(hours=delta), + ), memory_info=default_memory_info, ) for delta in range(5, 0, -1) diff --git a/tests/unit/_utils/test_html_to_text.py b/tests/unit/_utils/test_html_to_text.py index c802eee248..0a535e58db 100644 --- a/tests/unit/_utils/test_html_to_text.py +++ b/tests/unit/_utils/test_html_to_text.py @@ -191,7 +191,7 @@ def test_html_to_text(source: str, expected_text: str, html_to_text: Callable[[s @pytest.mark.parametrize('html_to_text', [html_to_text_parsel, html_to_text_beautifulsoup]) def test_html_to_text_raises_on_wrong_input_type(html_to_text: Callable[[str], str]) -> None: with pytest.raises(TypeError): - html_to_text(1) # type: ignore[arg-type] # Intentional wrong type test. + html_to_text(1) # ty: ignore[invalid-argument-type] def test_html_to_text_parsel() -> None: diff --git a/tests/unit/_utils/test_recurring_task.py b/tests/unit/_utils/test_recurring_task.py index 78f43601eb..61951ec11e 100644 --- a/tests/unit/_utils/test_recurring_task.py +++ b/tests/unit/_utils/test_recurring_task.py @@ -48,7 +48,7 @@ async def test_execution(function: AsyncMock, delay: timedelta) -> None: await asyncio.sleep(0.1) # Wait enough for the task to execute a few times await task.stop() - assert isinstance(task.func, AsyncMock) # To let MyPy know that the function is a mocked + assert isinstance(task.func, AsyncMock) # To let type checker know that the function is a mock assert task.func.call_count >= 3 await task.stop() diff --git a/tests/unit/_utils/test_timedelta_ms.py b/tests/unit/_utils/test_timedelta_ms.py index dd5fc7a8f0..5f5b0f4f4f 100644 --- a/tests/unit/_utils/test_timedelta_ms.py +++ b/tests/unit/_utils/test_timedelta_ms.py @@ -30,6 +30,6 @@ class _ModelWithTimedeltaMs(BaseModel): def test_model_with_timedelta_ms_input_types( time_delta_input: float | timedelta | Any | None, expected_time_delta: timedelta, expected_model_dump_value: int ) -> None: - model = _ModelWithTimedeltaMs(time_delta=time_delta_input) + model = _ModelWithTimedeltaMs(time_delta=time_delta_input) # ty: ignore[invalid-argument-type] assert model.time_delta == expected_time_delta assert model.model_dump() == {'time_delta': expected_model_dump_value} diff --git a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py index 8dc686f4c8..4a5619c854 100644 --- a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py +++ b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py @@ -116,7 +116,7 @@ class TestInput: TestInput( expected_pw_count=0, expected_static_count=2, - rendering_types=cycle(['static']), + rendering_types=cycle(['static']), # ty: ignore[invalid-argument-type] 
detection_probability_recommendation=cycle([0]), ), id='Static only', @@ -125,7 +125,7 @@ class TestInput: TestInput( expected_pw_count=2, expected_static_count=0, - rendering_types=cycle(['client only']), + rendering_types=cycle(['client only']), # ty: ignore[invalid-argument-type] detection_probability_recommendation=cycle([0]), ), id='Client only', @@ -134,7 +134,7 @@ class TestInput: TestInput( expected_pw_count=1, expected_static_count=1, - rendering_types=cycle(['static', 'client only']), + rendering_types=cycle(['static', 'client only']), # ty: ignore[invalid-argument-type] detection_probability_recommendation=cycle([0]), ), id='Mixed', @@ -143,7 +143,7 @@ class TestInput: TestInput( expected_pw_count=2, expected_static_count=2, - rendering_types=cycle(['static', 'client only']), + rendering_types=cycle(['static', 'client only']), # ty: ignore[invalid-argument-type] detection_probability_recommendation=cycle([1]), ), id='Enforced rendering type detection', @@ -207,7 +207,8 @@ async def pre_nav_hook(context: AdaptivePlaywrightPreNavCrawlingContext) -> None async def test_adaptive_crawling_parsel(test_urls: list[str]) -> None: """Top level test for parsel. Only one argument combination. (The rest of code is tested with bs variant.)""" predictor = _SimpleRenderingTypePredictor( - rendering_types=cycle(['static', 'client only']), detection_probability_recommendation=cycle([0]) + rendering_types=cycle(['static', 'client only']), # ty: ignore[invalid-argument-type] + detection_probability_recommendation=cycle([0]), ) crawler = AdaptivePlaywrightCrawler.with_parsel_static_parser( @@ -692,7 +693,8 @@ async def test_adaptive_context_helpers_on_changed_selector(test_urls: list[str] dynamically changed text instead of the original static text. """ browser_only_predictor_no_detection = _SimpleRenderingTypePredictor( - rendering_types=cycle(['client only']), detection_probability_recommendation=cycle([0]) + rendering_types=cycle(['client only']), # ty: ignore[invalid-argument-type] + detection_probability_recommendation=cycle([0]), ) expected_h3_tag = f'

<h3>{_H3_CHANGED_TEXT}</h3>
' @@ -717,7 +719,8 @@ async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: async def test_adaptive_context_query_non_existing_element(test_urls: list[str]) -> None: """Test that querying non-existing selector returns `None`""" browser_only_predictor_no_detection = _SimpleRenderingTypePredictor( - rendering_types=cycle(['client only']), detection_probability_recommendation=cycle([0]) + rendering_types=cycle(['client only']), # ty: ignore[invalid-argument-type] + detection_probability_recommendation=cycle([0]), ) crawler = AdaptivePlaywrightCrawler.with_parsel_static_parser( @@ -743,7 +746,7 @@ async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: TestInput( expected_pw_count=0, expected_static_count=2, - rendering_types=cycle(['static']), + rendering_types=cycle(['static']), # ty: ignore[invalid-argument-type] detection_probability_recommendation=cycle([0]), ), id='Static only', @@ -752,7 +755,7 @@ async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: TestInput( expected_pw_count=2, expected_static_count=0, - rendering_types=cycle(['client only']), + rendering_types=cycle(['client only']), # ty: ignore[invalid-argument-type] detection_probability_recommendation=cycle([0]), ), id='Client only', @@ -761,7 +764,7 @@ async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: TestInput( expected_pw_count=2, expected_static_count=2, - rendering_types=cycle(['static', 'client only']), + rendering_types=cycle(['static', 'client only']), # ty: ignore[invalid-argument-type] detection_probability_recommendation=cycle([1]), ), id='Enforced rendering type detection', diff --git a/tests/unit/crawlers/_basic/test_basic_crawler.py b/tests/unit/crawlers/_basic/test_basic_crawler.py index bc5618c439..5800e59d4e 100644 --- a/tests/unit/crawlers/_basic/test_basic_crawler.py +++ b/tests/unit/crawlers/_basic/test_basic_crawler.py @@ -2,7 +2,6 @@ from __future__ import annotations import asyncio -import concurrent import json import logging import os @@ -11,6 +10,7 @@ import time from asyncio import Future from collections import Counter +from concurrent.futures import ProcessPoolExecutor from dataclasses import dataclass from datetime import timedelta from itertools import product @@ -307,7 +307,7 @@ async def request_handler(context: BasicCrawlingContext) -> None: raise RuntimeError('Arbitrary crash for testing purposes') # Apply one of the handlers - @getattr(crawler, handler) # type: ignore[untyped-decorator] + @getattr(crawler, handler) async def handler_implementation(context: BasicCrawlingContext, error: Exception) -> None: await context.push_data(test_data) await context.add_requests(requests=[test_request], rq_alias=rq_alias) @@ -1043,16 +1043,16 @@ async def handler(context: BasicCrawlingContext) -> None: assert final_statistics.msg == 'Final request statistics:' # ignore[attr-defined] since `extra` parameters are not defined for `LogRecord` - assert final_statistics.requests_finished == 4 # type: ignore[attr-defined] - assert final_statistics.requests_failed == 33 # type: ignore[attr-defined] - assert final_statistics.retry_histogram == [1, 4, 8] # type: ignore[attr-defined] - assert final_statistics.request_avg_failed_duration == 99.0 # type: ignore[attr-defined] - assert final_statistics.request_avg_finished_duration == 0.483 # type: ignore[attr-defined] - assert final_statistics.requests_finished_per_minute == 0.33 # type: ignore[attr-defined] - assert final_statistics.requests_failed_per_minute == 0.1 # 
type: ignore[attr-defined] - assert final_statistics.request_total_duration == 720.0 # type: ignore[attr-defined] - assert final_statistics.requests_total == 37 # type: ignore[attr-defined] - assert final_statistics.crawler_runtime == 300.0 # type: ignore[attr-defined] + assert final_statistics.requests_finished == 4 + assert final_statistics.requests_failed == 33 + assert final_statistics.retry_histogram == [1, 4, 8] + assert final_statistics.request_avg_failed_duration == 99.0 + assert final_statistics.request_avg_finished_duration == 0.483 + assert final_statistics.requests_finished_per_minute == 0.33 + assert final_statistics.requests_failed_per_minute == 0.1 + assert final_statistics.request_total_duration == 720.0 + assert final_statistics.requests_total == 37 + assert final_statistics.crawler_runtime == 300.0 async def test_crawler_manual_stop() -> None: @@ -1782,7 +1782,7 @@ async def test_crawler_statistics_persistence(tmp_path: Path) -> None: This test simulates starting the crawler process twice, and checks that the statistics include first run.""" - with concurrent.futures.ProcessPoolExecutor() as executor: + with ProcessPoolExecutor() as executor: # Crawl 2 requests in the first run and automatically persist the state. first_run_state = executor.submit( _process_run_crawler, @@ -1792,7 +1792,7 @@ async def test_crawler_statistics_persistence(tmp_path: Path) -> None: assert first_run_state.requests_finished == 2 # Do not reuse the executor to simulate a fresh process to avoid modified class attributes. - with concurrent.futures.ProcessPoolExecutor() as executor: + with ProcessPoolExecutor() as executor: # Crawl 1 additional requests in the second run, but use previously automatically persisted state. second_run_state = executor.submit( _process_run_crawler, requests=['https://c.placeholder.com'], storage_dir=str(tmp_path) diff --git a/tests/unit/crawlers/_basic/test_context_pipeline.py b/tests/unit/crawlers/_basic/test_context_pipeline.py index 35de6c60de..b910322f08 100644 --- a/tests/unit/crawlers/_basic/test_context_pipeline.py +++ b/tests/unit/crawlers/_basic/test_context_pipeline.py @@ -88,7 +88,7 @@ async def middleware_b(context: EnhancedCrawlingContext) -> AsyncGenerator[MoreE ) events.append('middleware_b_out') - pipeline = ContextPipeline[BasicCrawlingContext]().compose(middleware_a).compose(middleware_b) + pipeline = ContextPipeline[BasicCrawlingContext]().compose(middleware_a).compose(middleware_b) # ty: ignore[invalid-argument-type] context = BasicCrawlingContext( request=Request.from_url(url='https://test.io/'), @@ -142,7 +142,7 @@ async def step_1(context: BasicCrawlingContext) -> AsyncGenerator[BasicCrawlingC async def step_2(context: BasicCrawlingContext) -> AsyncGenerator[BasicCrawlingContext, None]: raise RuntimeError('Crash during middleware initialization') - yield context # type: ignore[unreachable] + yield context pipeline = ContextPipeline().compose(step_1).compose(step_2) context = BasicCrawlingContext( diff --git a/tests/unit/crawlers/_playwright/test_playwright_crawler.py b/tests/unit/crawlers/_playwright/test_playwright_crawler.py index 1ecdb8859b..db60867b44 100644 --- a/tests/unit/crawlers/_playwright/test_playwright_crawler.py +++ b/tests/unit/crawlers/_playwright/test_playwright_crawler.py @@ -425,7 +425,7 @@ async def test_save_cookies_after_handler_processing(server_url: URL) -> None: @crawler.router.default_handler async def request_handler(context: PlaywrightCrawlingContext) -> None: # Simulate cookies installed from an external source in the 
browser - await context.page.context.add_cookies([{'name': 'check', 'value': 'test', 'url': str(server_url)}]) + await context.page.context.add_cookies([{'name': 'check', 'value': 'test', 'url': str(server_url)}]) # ty: ignore[invalid-argument-type] if context.session: session_ids.append(context.session.id) diff --git a/tests/unit/fingerprint_suite/test_header_generator.py b/tests/unit/fingerprint_suite/test_header_generator.py index dfcb41e79d..ae9ab71bf0 100644 --- a/tests/unit/fingerprint_suite/test_header_generator.py +++ b/tests/unit/fingerprint_suite/test_header_generator.py @@ -51,7 +51,7 @@ def test_get_user_agent_header_invalid_browser_type() -> None: header_generator = HeaderGenerator() with pytest.raises(ValueError, match=r'Unsupported browser type'): - header_generator.get_user_agent_header(browser_type='invalid_browser') # type: ignore[arg-type] + header_generator.get_user_agent_header(browser_type='invalid_browser') # ty: ignore[invalid-argument-type] def test_get_sec_ch_ua_headers_chromium(header_network: dict) -> None: @@ -77,4 +77,4 @@ def test_get_sec_ch_ua_headers_invalid_browser_type() -> None: header_generator = HeaderGenerator() with pytest.raises(ValueError, match=r'Unsupported browser type'): - header_generator.get_sec_ch_ua_headers(browser_type='invalid_browser') # type: ignore[arg-type] + header_generator.get_sec_ch_ua_headers(browser_type='invalid_browser') # ty: ignore[invalid-argument-type] diff --git a/tests/unit/proxy_configuration/test_new_proxy_info.py b/tests/unit/proxy_configuration/test_new_proxy_info.py index 8c0fa3497d..1a8efe0289 100644 --- a/tests/unit/proxy_configuration/test_new_proxy_info.py +++ b/tests/unit/proxy_configuration/test_new_proxy_info.py @@ -86,7 +86,7 @@ async def test_rotates_proxies() -> None: async def test_rotates_proxies_with_sessions() -> None: proxy_urls: list[str | None] = ['http://proxy:1111', 'http://proxy:2222', 'http://proxy:3333'] - request = Request(url='http://some.domain/abc', unique_key='1', id='1') + request = Request(url='http://some.domain/abc', unique_key='1') sessions = [f'session_{i}' for i in range(6)] config = ProxyConfiguration(proxy_urls=proxy_urls) diff --git a/tests/unit/proxy_configuration/test_tiers.py b/tests/unit/proxy_configuration/test_tiers.py index 641fd6b338..59db9a43d7 100644 --- a/tests/unit/proxy_configuration/test_tiers.py +++ b/tests/unit/proxy_configuration/test_tiers.py @@ -44,7 +44,7 @@ async def test_retrying_request_makes_tier_go_up() -> None: config = ProxyConfiguration(tiered_proxy_urls=tiered_proxy_urls) # Calling `new_proxy_info` with the same request most probably means it's being retried - request_1 = Request(url='http://some.domain/abc', unique_key='1', id='1') + request_1 = Request(url='http://some.domain/abc', unique_key='1') info = await config.new_proxy_info(None, request_1, None) assert info is not None @@ -59,7 +59,7 @@ async def test_retrying_request_makes_tier_go_up() -> None: assert info.url == tiered_proxy_urls[2][0] # Subsequent requests with the same domain should use the same tier - request_2 = Request(url='http://some.domain/xyz', unique_key='2', id='2') + request_2 = Request(url='http://some.domain/xyz', unique_key='2') info = await config.new_proxy_info(None, request_2, None) assert info is not None @@ -76,7 +76,7 @@ async def test_retrying_request_makes_tier_go_up_with_sessions() -> None: config = ProxyConfiguration(tiered_proxy_urls=tiered_proxy_urls) - request = Request(url='http://some.domain/abc', unique_key='1', id='1') + request = 
Request(url='http://some.domain/abc', unique_key='1') # Calling `new_proxy_info` with the same request likely means that it is being retried. # However, a single session should always receive the same proxy @@ -116,7 +116,7 @@ async def test_successful_request_makes_tier_go_down() -> None: config = ProxyConfiguration(tiered_proxy_urls=tiered_proxy_urls) - request_1 = Request(url='http://some.domain/abc', unique_key='1', id='1') + request_1 = Request(url='http://some.domain/abc', unique_key='1') info = None for tier in tiered_proxy_urls: @@ -125,7 +125,7 @@ async def test_successful_request_makes_tier_go_down() -> None: assert info.url == tier[0] for i in range(100): - new_request = Request(url=f'http://some.domain/{i}', unique_key=str(i), id=str(i)) + new_request = Request(url=f'http://some.domain/{i}', unique_key=str(i)) info = await config.new_proxy_info(None, new_request, None) assert info is not None @@ -141,7 +141,7 @@ async def test_none_proxy_retrying_request_makes_tier_go_up() -> None: config = ProxyConfiguration(tiered_proxy_urls=tiered_proxy_urls) # Calling `new_proxy_info` with the same request most probably means it's being retried - request_1 = Request(url='http://some.domain/abc', unique_key='1', id='1') + request_1 = Request(url='http://some.domain/abc', unique_key='1') # No proxy used. info = await config.new_proxy_info(None, request_1, None) diff --git a/tests/unit/storage_clients/_redis/test_redis_dataset_client.py b/tests/unit/storage_clients/_redis/test_redis_dataset_client.py index 3101ac5e0b..a80264e23b 100644 --- a/tests/unit/storage_clients/_redis/test_redis_dataset_client.py +++ b/tests/unit/storage_clients/_redis/test_redis_dataset_client.py @@ -49,7 +49,7 @@ async def test_base_keys_creation(dataset_client: RedisDatasetClient) -> None: metadata_data = await await_redis_response(dataset_client.redis.json().get('datasets:test_dataset:metadata')) assert isinstance(metadata_data, dict) - assert metadata_data['id'] == metadata.id # type: ignore[unreachable] # py-json typing is broken + assert metadata_data['id'] == metadata.id async def test_record_and_content_verification(dataset_client: RedisDatasetClient) -> None: diff --git a/tests/unit/storage_clients/_redis/test_redis_kvs_client.py b/tests/unit/storage_clients/_redis/test_redis_kvs_client.py index 92a9f0d46d..d2413a06c0 100644 --- a/tests/unit/storage_clients/_redis/test_redis_kvs_client.py +++ b/tests/unit/storage_clients/_redis/test_redis_kvs_client.py @@ -46,7 +46,7 @@ async def test_base_keys_creation(kvs_client: RedisKeyValueStoreClient) -> None: metadata_data = await await_redis_response(kvs_client.redis.json().get('key_value_stores:test_kvs:metadata')) assert isinstance(metadata_data, dict) - assert metadata_data['id'] == metadata.id # type: ignore[unreachable] # py-json typing is broken + assert metadata_data['id'] == metadata.id async def test_value_record_creation_and_content(kvs_client: RedisKeyValueStoreClient) -> None: diff --git a/tests/unit/storage_clients/_redis/test_redis_rq_client.py b/tests/unit/storage_clients/_redis/test_redis_rq_client.py index 3f878ea981..26191d8189 100644 --- a/tests/unit/storage_clients/_redis/test_redis_rq_client.py +++ b/tests/unit/storage_clients/_redis/test_redis_rq_client.py @@ -60,7 +60,7 @@ async def test_base_keys_creation(rq_client: RedisRequestQueueClient) -> None: metadata_data = await await_redis_response(rq_client.redis.json().get('request_queues:test_request_queue:metadata')) assert isinstance(metadata_data, dict) - assert metadata_data['id'] == metadata.id 
# type: ignore[unreachable] # py-json typing is broken + assert metadata_data['id'] == metadata.id async def test_request_records_persistence(rq_client: RedisRequestQueueClient) -> None: diff --git a/tests/unit/storages/test_dataset.py b/tests/unit/storages/test_dataset.py index bb3da626b2..a95efff7f6 100644 --- a/tests/unit/storages/test_dataset.py +++ b/tests/unit/storages/test_dataset.py @@ -472,10 +472,7 @@ async def test_export_to_csv( async def test_export_to_invalid_content_type(dataset: Dataset) -> None: """Test exporting dataset with invalid content type raises error.""" with pytest.raises(ValueError, match=r'Unsupported content type'): - await dataset.export_to( - key='invalid_export', - content_type='invalid', # type: ignore[call-overload] # Intentionally invalid content type - ) + await dataset.export_to(key='invalid_export', content_type='invalid') # ty: ignore[no-matching-overload] async def test_export_with_multiple_kwargs(dataset: Dataset, tmp_path: Path) -> None: diff --git a/uv.lock b/uv.lock index 27d674efe6..8f013fd6a3 100644 --- a/uv.lock +++ b/uv.lock @@ -811,7 +811,6 @@ dev = [ { name = "build" }, { name = "dycw-pytest-only" }, { name = "fakeredis", extra = ["json", "lua", "probabilistic"] }, - { name = "mypy" }, { name = "pre-commit" }, { name = "proxy-py" }, { name = "pydoc-markdown" }, @@ -822,6 +821,7 @@ dev = [ { name = "pytest-xdist" }, { name = "ruff" }, { name = "setuptools" }, + { name = "ty" }, { name = "types-beautifulsoup4" }, { name = "types-cachetools" }, { name = "types-colorama" }, @@ -886,7 +886,6 @@ dev = [ { name = "build", specifier = "<2.0.0" }, { name = "dycw-pytest-only", specifier = "<3.0.0" }, { name = "fakeredis", extras = ["probabilistic", "json", "lua"], specifier = "<3.0.0" }, - { name = "mypy", specifier = "~=1.19.0" }, { name = "pre-commit", specifier = "<5.0.0" }, { name = "proxy-py", specifier = "<3.0.0" }, { name = "pydoc-markdown", specifier = "<5.0.0" }, @@ -897,6 +896,7 @@ dev = [ { name = "pytest-xdist", specifier = "<4.0.0" }, { name = "ruff", specifier = "~=0.14.0" }, { name = "setuptools" }, + { name = "ty", specifier = "~=0.0.0" }, { name = "types-beautifulsoup4", specifier = "<5.0.0" }, { name = "types-cachetools", specifier = "<7.0.0" }, { name = "types-colorama", specifier = "<1.0.0" }, @@ -1632,79 +1632,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/35/5a/73ecb3d82f8615f32ccdadeb9356726d6cae3a4bbc840b437ceb95708063/jsonpath_ng-1.7.0-py3-none-any.whl", hash = "sha256:f3d7f9e848cba1b6da28c55b1c26ff915dc9e0b1ba7e752a53d6da8d5cbd00b6", size = 30105, upload-time = "2024-11-20T17:58:30.418Z" }, ] -[[package]] -name = "librt" -version = "0.7.7" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/b7/29/47f29026ca17f35cf299290292d5f8331f5077364974b7675a353179afa2/librt-0.7.7.tar.gz", hash = "sha256:81d957b069fed1890953c3b9c3895c7689960f233eea9a1d9607f71ce7f00b2c", size = 145910, upload-time = "2026-01-01T23:52:22.87Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/c6/84/2cfb1f3b9b60bab52e16a220c931223fc8e963d0d7bb9132bef012aafc3f/librt-0.7.7-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e4836c5645f40fbdc275e5670819bde5ab5f2e882290d304e3c6ddab1576a6d0", size = 54709, upload-time = "2026-01-01T23:50:48.326Z" }, - { url = "https://files.pythonhosted.org/packages/19/a1/3127b277e9d3784a8040a54e8396d9ae5c64d6684dc6db4b4089b0eedcfb/librt-0.7.7-cp310-cp310-macosx_11_0_arm64.whl", hash = 
"sha256:6ae8aec43117a645a31e5f60e9e3a0797492e747823b9bda6972d521b436b4e8", size = 56658, upload-time = "2026-01-01T23:50:49.74Z" }, - { url = "https://files.pythonhosted.org/packages/3a/e9/b91b093a5c42eb218120445f3fef82e0b977fa2225f4d6fc133d25cdf86a/librt-0.7.7-cp310-cp310-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:aea05f701ccd2a76b34f0daf47ca5068176ff553510b614770c90d76ac88df06", size = 161026, upload-time = "2026-01-01T23:50:50.853Z" }, - { url = "https://files.pythonhosted.org/packages/c7/cb/1ded77d5976a79d7057af4a010d577ce4f473ff280984e68f4974a3281e5/librt-0.7.7-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7b16ccaeff0ed4355dfb76fe1ea7a5d6d03b5ad27f295f77ee0557bc20a72495", size = 169529, upload-time = "2026-01-01T23:50:52.24Z" }, - { url = "https://files.pythonhosted.org/packages/da/6e/6ca5bdaa701e15f05000ac1a4c5d1475c422d3484bd3d1ca9e8c2f5be167/librt-0.7.7-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c48c7e150c095d5e3cea7452347ba26094be905d6099d24f9319a8b475fcd3e0", size = 183271, upload-time = "2026-01-01T23:50:55.287Z" }, - { url = "https://files.pythonhosted.org/packages/e7/2d/55c0e38073997b4bbb5ddff25b6d1bbba8c2f76f50afe5bb9c844b702f34/librt-0.7.7-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:4dcee2f921a8632636d1c37f1bbdb8841d15666d119aa61e5399c5268e7ce02e", size = 179039, upload-time = "2026-01-01T23:50:56.807Z" }, - { url = "https://files.pythonhosted.org/packages/33/4e/3662a41ae8bb81b226f3968426293517b271d34d4e9fd4b59fc511f1ae40/librt-0.7.7-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:14ef0f4ac3728ffd85bfc58e2f2f48fb4ef4fa871876f13a73a7381d10a9f77c", size = 173505, upload-time = "2026-01-01T23:50:58.291Z" }, - { url = "https://files.pythonhosted.org/packages/f8/5d/cf768deb8bdcbac5f8c21fcb32dd483d038d88c529fd351bbe50590b945d/librt-0.7.7-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:e4ab69fa37f8090f2d971a5d2bc606c7401170dbdae083c393d6cbf439cb45b8", size = 193570, upload-time = "2026-01-01T23:50:59.546Z" }, - { url = "https://files.pythonhosted.org/packages/a1/ea/ee70effd13f1d651976d83a2812391f6203971740705e3c0900db75d4bce/librt-0.7.7-cp310-cp310-win32.whl", hash = "sha256:4bf3cc46d553693382d2abf5f5bd493d71bb0f50a7c0beab18aa13a5545c8900", size = 42600, upload-time = "2026-01-01T23:51:00.694Z" }, - { url = "https://files.pythonhosted.org/packages/f0/eb/dc098730f281cba76c279b71783f5de2edcba3b880c1ab84a093ef826062/librt-0.7.7-cp310-cp310-win_amd64.whl", hash = "sha256:f0c8fe5aeadd8a0e5b0598f8a6ee3533135ca50fd3f20f130f9d72baf5c6ac58", size = 48977, upload-time = "2026-01-01T23:51:01.726Z" }, - { url = "https://files.pythonhosted.org/packages/f0/56/30b5c342518005546df78841cb0820ae85a17e7d07d521c10ef367306d0d/librt-0.7.7-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:a487b71fbf8a9edb72a8c7a456dda0184642d99cd007bc819c0b7ab93676a8ee", size = 54709, upload-time = "2026-01-01T23:51:02.774Z" }, - { url = "https://files.pythonhosted.org/packages/72/78/9f120e3920b22504d4f3835e28b55acc2cc47c9586d2e1b6ba04c3c1bf01/librt-0.7.7-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f4d4efb218264ecf0f8516196c9e2d1a0679d9fb3bb15df1155a35220062eba8", size = 56663, upload-time = "2026-01-01T23:51:03.838Z" }, - { url = "https://files.pythonhosted.org/packages/1c/ea/7d7a1ee7dfc1151836028eba25629afcf45b56bbc721293e41aa2e9b8934/librt-0.7.7-cp311-cp311-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = 
"sha256:b8bb331aad734b059c4b450cd0a225652f16889e286b2345af5e2c3c625c3d85", size = 161705, upload-time = "2026-01-01T23:51:04.917Z" }, - { url = "https://files.pythonhosted.org/packages/45/a5/952bc840ac8917fbcefd6bc5f51ad02b89721729814f3e2bfcc1337a76d6/librt-0.7.7-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:467dbd7443bda08338fc8ad701ed38cef48194017554f4c798b0a237904b3f99", size = 171029, upload-time = "2026-01-01T23:51:06.09Z" }, - { url = "https://files.pythonhosted.org/packages/fa/bf/c017ff7da82dc9192cf40d5e802a48a25d00e7639b6465cfdcee5893a22c/librt-0.7.7-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:50d1d1ee813d2d1a3baf2873634ba506b263032418d16287c92ec1cc9c1a00cb", size = 184704, upload-time = "2026-01-01T23:51:07.549Z" }, - { url = "https://files.pythonhosted.org/packages/77/ec/72f3dd39d2cdfd6402ab10836dc9cbf854d145226062a185b419c4f1624a/librt-0.7.7-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:c7e5070cf3ec92d98f57574da0224f8c73faf1ddd6d8afa0b8c9f6e86997bc74", size = 180719, upload-time = "2026-01-01T23:51:09.062Z" }, - { url = "https://files.pythonhosted.org/packages/78/86/06e7a1a81b246f3313bf515dd9613a1c81583e6fd7843a9f4d625c4e926d/librt-0.7.7-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:bdb9f3d865b2dafe7f9ad7f30ef563c80d0ddd2fdc8cc9b8e4f242f475e34d75", size = 174537, upload-time = "2026-01-01T23:51:10.611Z" }, - { url = "https://files.pythonhosted.org/packages/83/08/f9fb2edc9c7a76e95b2924ce81d545673f5b034e8c5dd92159d1c7dae0c6/librt-0.7.7-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:8185c8497d45164e256376f9da5aed2bb26ff636c798c9dabe313b90e9f25b28", size = 195238, upload-time = "2026-01-01T23:51:11.762Z" }, - { url = "https://files.pythonhosted.org/packages/ba/56/ea2d2489d3ea1f47b301120e03a099e22de7b32c93df9a211e6ff4f9bf38/librt-0.7.7-cp311-cp311-win32.whl", hash = "sha256:44d63ce643f34a903f09ff7ca355aae019a3730c7afd6a3c037d569beeb5d151", size = 42939, upload-time = "2026-01-01T23:51:13.192Z" }, - { url = "https://files.pythonhosted.org/packages/58/7b/c288f417e42ba2a037f1c0753219e277b33090ed4f72f292fb6fe175db4c/librt-0.7.7-cp311-cp311-win_amd64.whl", hash = "sha256:7d13cc340b3b82134f8038a2bfe7137093693dcad8ba5773da18f95ad6b77a8a", size = 49240, upload-time = "2026-01-01T23:51:14.264Z" }, - { url = "https://files.pythonhosted.org/packages/7c/24/738eb33a6c1516fdb2dfd2a35db6e5300f7616679b573585be0409bc6890/librt-0.7.7-cp311-cp311-win_arm64.whl", hash = "sha256:983de36b5a83fe9222f4f7dcd071f9b1ac6f3f17c0af0238dadfb8229588f890", size = 42613, upload-time = "2026-01-01T23:51:15.268Z" }, - { url = "https://files.pythonhosted.org/packages/56/72/1cd9d752070011641e8aee046c851912d5f196ecd726fffa7aed2070f3e0/librt-0.7.7-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:2a85a1fc4ed11ea0eb0a632459ce004a2d14afc085a50ae3463cd3dfe1ce43fc", size = 55687, upload-time = "2026-01-01T23:51:16.291Z" }, - { url = "https://files.pythonhosted.org/packages/50/aa/d5a1d4221c4fe7e76ae1459d24d6037783cb83c7645164c07d7daf1576ec/librt-0.7.7-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:c87654e29a35938baead1c4559858f346f4a2a7588574a14d784f300ffba0efd", size = 57136, upload-time = "2026-01-01T23:51:17.363Z" }, - { url = "https://files.pythonhosted.org/packages/23/6f/0c86b5cb5e7ef63208c8cc22534df10ecc5278efc0d47fb8815577f3ca2f/librt-0.7.7-cp312-cp312-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = 
"sha256:c9faaebb1c6212c20afd8043cd6ed9de0a47d77f91a6b5b48f4e46ed470703fe", size = 165320, upload-time = "2026-01-01T23:51:18.455Z" }, - { url = "https://files.pythonhosted.org/packages/16/37/df4652690c29f645ffe405b58285a4109e9fe855c5bb56e817e3e75840b3/librt-0.7.7-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:1908c3e5a5ef86b23391448b47759298f87f997c3bd153a770828f58c2bb4630", size = 174216, upload-time = "2026-01-01T23:51:19.599Z" }, - { url = "https://files.pythonhosted.org/packages/9a/d6/d3afe071910a43133ec9c0f3e4ce99ee6df0d4e44e4bddf4b9e1c6ed41cc/librt-0.7.7-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:dbc4900e95a98fc0729523be9d93a8fedebb026f32ed9ffc08acd82e3e181503", size = 189005, upload-time = "2026-01-01T23:51:21.052Z" }, - { url = "https://files.pythonhosted.org/packages/d5/18/74060a870fe2d9fd9f47824eba6717ce7ce03124a0d1e85498e0e7efc1b2/librt-0.7.7-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:a7ea4e1fbd253e5c68ea0fe63d08577f9d288a73f17d82f652ebc61fa48d878d", size = 183961, upload-time = "2026-01-01T23:51:22.493Z" }, - { url = "https://files.pythonhosted.org/packages/7c/5e/918a86c66304af66a3c1d46d54df1b2d0b8894babc42a14fb6f25511497f/librt-0.7.7-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:ef7699b7a5a244b1119f85c5bbc13f152cd38240cbb2baa19b769433bae98e50", size = 177610, upload-time = "2026-01-01T23:51:23.874Z" }, - { url = "https://files.pythonhosted.org/packages/b2/d7/b5e58dc2d570f162e99201b8c0151acf40a03a39c32ab824dd4febf12736/librt-0.7.7-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:955c62571de0b181d9e9e0a0303c8bc90d47670a5eff54cf71bf5da61d1899cf", size = 199272, upload-time = "2026-01-01T23:51:25.341Z" }, - { url = "https://files.pythonhosted.org/packages/18/87/8202c9bd0968bdddc188ec3811985f47f58ed161b3749299f2c0dd0f63fb/librt-0.7.7-cp312-cp312-win32.whl", hash = "sha256:1bcd79be209313b270b0e1a51c67ae1af28adad0e0c7e84c3ad4b5cb57aaa75b", size = 43189, upload-time = "2026-01-01T23:51:26.799Z" }, - { url = "https://files.pythonhosted.org/packages/61/8d/80244b267b585e7aa79ffdac19f66c4861effc3a24598e77909ecdd0850e/librt-0.7.7-cp312-cp312-win_amd64.whl", hash = "sha256:4353ee891a1834567e0302d4bd5e60f531912179578c36f3d0430f8c5e16b456", size = 49462, upload-time = "2026-01-01T23:51:27.813Z" }, - { url = "https://files.pythonhosted.org/packages/2d/1f/75db802d6a4992d95e8a889682601af9b49d5a13bbfa246d414eede1b56c/librt-0.7.7-cp312-cp312-win_arm64.whl", hash = "sha256:a76f1d679beccccdf8c1958e732a1dfcd6e749f8821ee59d7bec009ac308c029", size = 42828, upload-time = "2026-01-01T23:51:28.804Z" }, - { url = "https://files.pythonhosted.org/packages/8d/5e/d979ccb0a81407ec47c14ea68fb217ff4315521730033e1dd9faa4f3e2c1/librt-0.7.7-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:8f4a0b0a3c86ba9193a8e23bb18f100d647bf192390ae195d84dfa0a10fb6244", size = 55746, upload-time = "2026-01-01T23:51:29.828Z" }, - { url = "https://files.pythonhosted.org/packages/f5/2c/3b65861fb32f802c3783d6ac66fc5589564d07452a47a8cf9980d531cad3/librt-0.7.7-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:5335890fea9f9e6c4fdf8683061b9ccdcbe47c6dc03ab8e9b68c10acf78be78d", size = 57174, upload-time = "2026-01-01T23:51:31.226Z" }, - { url = "https://files.pythonhosted.org/packages/50/df/030b50614b29e443607220097ebaf438531ea218c7a9a3e21ea862a919cd/librt-0.7.7-cp313-cp313-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = 
"sha256:9b4346b1225be26def3ccc6c965751c74868f0578cbcba293c8ae9168483d811", size = 165834, upload-time = "2026-01-01T23:51:32.278Z" }, - { url = "https://files.pythonhosted.org/packages/5d/e1/bd8d1eacacb24be26a47f157719553bbd1b3fe812c30dddf121c0436fd0b/librt-0.7.7-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a10b8eebdaca6e9fdbaf88b5aefc0e324b763a5f40b1266532590d5afb268a4c", size = 174819, upload-time = "2026-01-01T23:51:33.461Z" }, - { url = "https://files.pythonhosted.org/packages/46/7d/91d6c3372acf54a019c1ad8da4c9ecf4fc27d039708880bf95f48dbe426a/librt-0.7.7-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:067be973d90d9e319e6eb4ee2a9b9307f0ecd648b8a9002fa237289a4a07a9e7", size = 189607, upload-time = "2026-01-01T23:51:34.604Z" }, - { url = "https://files.pythonhosted.org/packages/fa/ac/44604d6d3886f791fbd1c6ae12d5a782a8f4aca927484731979f5e92c200/librt-0.7.7-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:23d2299ed007812cccc1ecef018db7d922733382561230de1f3954db28433977", size = 184586, upload-time = "2026-01-01T23:51:35.845Z" }, - { url = "https://files.pythonhosted.org/packages/5c/26/d8a6e4c17117b7f9b83301319d9a9de862ae56b133efb4bad8b3aa0808c9/librt-0.7.7-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:6b6f8ea465524aa4c7420c7cc4ca7d46fe00981de8debc67b1cc2e9957bb5b9d", size = 178251, upload-time = "2026-01-01T23:51:37.018Z" }, - { url = "https://files.pythonhosted.org/packages/99/ab/98d857e254376f8e2f668e807daccc1f445e4b4fc2f6f9c1cc08866b0227/librt-0.7.7-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:f8df32a99cc46eb0ee90afd9ada113ae2cafe7e8d673686cf03ec53e49635439", size = 199853, upload-time = "2026-01-01T23:51:38.195Z" }, - { url = "https://files.pythonhosted.org/packages/7c/55/4523210d6ae5134a5da959900be43ad8bab2e4206687b6620befddb5b5fd/librt-0.7.7-cp313-cp313-win32.whl", hash = "sha256:86f86b3b785487c7760247bcdac0b11aa8bf13245a13ed05206286135877564b", size = 43247, upload-time = "2026-01-01T23:51:39.629Z" }, - { url = "https://files.pythonhosted.org/packages/25/40/3ec0fed5e8e9297b1cf1a3836fb589d3de55f9930e3aba988d379e8ef67c/librt-0.7.7-cp313-cp313-win_amd64.whl", hash = "sha256:4862cb2c702b1f905c0503b72d9d4daf65a7fdf5a9e84560e563471e57a56949", size = 49419, upload-time = "2026-01-01T23:51:40.674Z" }, - { url = "https://files.pythonhosted.org/packages/1c/7a/aab5f0fb122822e2acbc776addf8b9abfb4944a9056c00c393e46e543177/librt-0.7.7-cp313-cp313-win_arm64.whl", hash = "sha256:0996c83b1cb43c00e8c87835a284f9057bc647abd42b5871e5f941d30010c832", size = 42828, upload-time = "2026-01-01T23:51:41.731Z" }, - { url = "https://files.pythonhosted.org/packages/69/9c/228a5c1224bd23809a635490a162e9cbdc68d99f0eeb4a696f07886b8206/librt-0.7.7-cp314-cp314-macosx_10_13_x86_64.whl", hash = "sha256:23daa1ab0512bafdd677eb1bfc9611d8ffbe2e328895671e64cb34166bc1b8c8", size = 55188, upload-time = "2026-01-01T23:51:43.14Z" }, - { url = "https://files.pythonhosted.org/packages/ba/c2/0e7c6067e2b32a156308205e5728f4ed6478c501947e9142f525afbc6bd2/librt-0.7.7-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:558a9e5a6f3cc1e20b3168fb1dc802d0d8fa40731f6e9932dcc52bbcfbd37111", size = 56895, upload-time = "2026-01-01T23:51:44.534Z" }, - { url = "https://files.pythonhosted.org/packages/0e/77/de50ff70c80855eb79d1d74035ef06f664dd073fb7fb9d9fb4429651b8eb/librt-0.7.7-cp314-cp314-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = 
"sha256:2567cb48dc03e5b246927ab35cbb343376e24501260a9b5e30b8e255dca0d1d2", size = 163724, upload-time = "2026-01-01T23:51:45.571Z" }, - { url = "https://files.pythonhosted.org/packages/6e/19/f8e4bf537899bdef9e0bb9f0e4b18912c2d0f858ad02091b6019864c9a6d/librt-0.7.7-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:6066c638cdf85ff92fc6f932d2d73c93a0e03492cdfa8778e6d58c489a3d7259", size = 172470, upload-time = "2026-01-01T23:51:46.823Z" }, - { url = "https://files.pythonhosted.org/packages/42/4c/dcc575b69d99076768e8dd6141d9aecd4234cba7f0e09217937f52edb6ed/librt-0.7.7-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a609849aca463074c17de9cda173c276eb8fee9e441053529e7b9e249dc8b8ee", size = 186806, upload-time = "2026-01-01T23:51:48.009Z" }, - { url = "https://files.pythonhosted.org/packages/fe/f8/4094a2b7816c88de81239a83ede6e87f1138477d7ee956c30f136009eb29/librt-0.7.7-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:add4e0a000858fe9bb39ed55f31085506a5c38363e6eb4a1e5943a10c2bfc3d1", size = 181809, upload-time = "2026-01-01T23:51:49.35Z" }, - { url = "https://files.pythonhosted.org/packages/1b/ac/821b7c0ab1b5a6cd9aee7ace8309c91545a2607185101827f79122219a7e/librt-0.7.7-cp314-cp314-musllinux_1_2_i686.whl", hash = "sha256:a3bfe73a32bd0bdb9a87d586b05a23c0a1729205d79df66dee65bb2e40d671ba", size = 175597, upload-time = "2026-01-01T23:51:50.636Z" }, - { url = "https://files.pythonhosted.org/packages/71/f9/27f6bfbcc764805864c04211c6ed636fe1d58f57a7b68d1f4ae5ed74e0e0/librt-0.7.7-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:0ecce0544d3db91a40f8b57ae26928c02130a997b540f908cefd4d279d6c5848", size = 196506, upload-time = "2026-01-01T23:51:52.535Z" }, - { url = "https://files.pythonhosted.org/packages/46/ba/c9b9c6fc931dd7ea856c573174ccaf48714905b1a7499904db2552e3bbaf/librt-0.7.7-cp314-cp314-win32.whl", hash = "sha256:8f7a74cf3a80f0c3b0ec75b0c650b2f0a894a2cec57ef75f6f72c1e82cdac61d", size = 39747, upload-time = "2026-01-01T23:51:53.683Z" }, - { url = "https://files.pythonhosted.org/packages/c5/69/cd1269337c4cde3ee70176ee611ab0058aa42fc8ce5c9dce55f48facfcd8/librt-0.7.7-cp314-cp314-win_amd64.whl", hash = "sha256:3d1fe2e8df3268dd6734dba33ededae72ad5c3a859b9577bc00b715759c5aaab", size = 45971, upload-time = "2026-01-01T23:51:54.697Z" }, - { url = "https://files.pythonhosted.org/packages/79/fd/e0844794423f5583108c5991313c15e2b400995f44f6ec6871f8aaf8243c/librt-0.7.7-cp314-cp314-win_arm64.whl", hash = "sha256:2987cf827011907d3dfd109f1be0d61e173d68b1270107bb0e89f2fca7f2ed6b", size = 39075, upload-time = "2026-01-01T23:51:55.726Z" }, - { url = "https://files.pythonhosted.org/packages/42/02/211fd8f7c381e7b2a11d0fdfcd410f409e89967be2e705983f7c6342209a/librt-0.7.7-cp314-cp314t-macosx_10_13_x86_64.whl", hash = "sha256:8e92c8de62b40bfce91d5e12c6e8b15434da268979b1af1a6589463549d491e6", size = 57368, upload-time = "2026-01-01T23:51:56.706Z" }, - { url = "https://files.pythonhosted.org/packages/4c/b6/aca257affae73ece26041ae76032153266d110453173f67d7603058e708c/librt-0.7.7-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:f683dcd49e2494a7535e30f779aa1ad6e3732a019d80abe1309ea91ccd3230e3", size = 59238, upload-time = "2026-01-01T23:51:58.066Z" }, - { url = "https://files.pythonhosted.org/packages/96/47/7383a507d8e0c11c78ca34c9d36eab9000db5989d446a2f05dc40e76c64f/librt-0.7.7-cp314-cp314t-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = 
"sha256:9b15e5d17812d4d629ff576699954f74e2cc24a02a4fc401882dd94f81daba45", size = 183870, upload-time = "2026-01-01T23:51:59.204Z" }, - { url = "https://files.pythonhosted.org/packages/a4/b8/50f3d8eec8efdaf79443963624175c92cec0ba84827a66b7fcfa78598e51/librt-0.7.7-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c084841b879c4d9b9fa34e5d5263994f21aea7fd9c6add29194dbb41a6210536", size = 194608, upload-time = "2026-01-01T23:52:00.419Z" }, - { url = "https://files.pythonhosted.org/packages/23/d9/1b6520793aadb59d891e3b98ee057a75de7f737e4a8b4b37fdbecb10d60f/librt-0.7.7-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:10c8fb9966f84737115513fecbaf257f9553d067a7dd45a69c2c7e5339e6a8dc", size = 206776, upload-time = "2026-01-01T23:52:01.705Z" }, - { url = "https://files.pythonhosted.org/packages/ff/db/331edc3bba929d2756fa335bfcf736f36eff4efcb4f2600b545a35c2ae58/librt-0.7.7-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:9b5fb1ecb2c35362eab2dbd354fd1efa5a8440d3e73a68be11921042a0edc0ff", size = 203206, upload-time = "2026-01-01T23:52:03.315Z" }, - { url = "https://files.pythonhosted.org/packages/b2/e1/6af79ec77204e85f6f2294fc171a30a91bb0e35d78493532ed680f5d98be/librt-0.7.7-cp314-cp314t-musllinux_1_2_i686.whl", hash = "sha256:d1454899909d63cc9199a89fcc4f81bdd9004aef577d4ffc022e600c412d57f3", size = 196697, upload-time = "2026-01-01T23:52:04.857Z" }, - { url = "https://files.pythonhosted.org/packages/f3/46/de55ecce4b2796d6d243295c221082ca3a944dc2fb3a52dcc8660ce7727d/librt-0.7.7-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:7ef28f2e7a016b29792fe0a2dd04dec75725b32a1264e390c366103f834a9c3a", size = 217193, upload-time = "2026-01-01T23:52:06.159Z" }, - { url = "https://files.pythonhosted.org/packages/41/61/33063e271949787a2f8dd33c5260357e3d512a114fc82ca7890b65a76e2d/librt-0.7.7-cp314-cp314t-win32.whl", hash = "sha256:5e419e0db70991b6ba037b70c1d5bbe92b20ddf82f31ad01d77a347ed9781398", size = 40277, upload-time = "2026-01-01T23:52:07.625Z" }, - { url = "https://files.pythonhosted.org/packages/06/21/1abd972349f83a696ea73159ac964e63e2d14086fdd9bc7ca878c25fced4/librt-0.7.7-cp314-cp314t-win_amd64.whl", hash = "sha256:d6b7d93657332c817b8d674ef6bf1ab7796b4f7ce05e420fd45bd258a72ac804", size = 46765, upload-time = "2026-01-01T23:52:08.647Z" }, - { url = "https://files.pythonhosted.org/packages/51/0e/b756c7708143a63fca65a51ca07990fa647db2cc8fcd65177b9e96680255/librt-0.7.7-cp314-cp314t-win_arm64.whl", hash = "sha256:142c2cd91794b79fd0ce113bd658993b7ede0fe93057668c2f98a45ca00b7e91", size = 39724, upload-time = "2026-01-01T23:52:09.745Z" }, -] - [[package]] name = "lupa" version = "2.6" @@ -2156,52 +2083,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/b7/da/7d22601b625e241d4f23ef1ebff8acfc60da633c9e7e7922e24d10f592b3/multidict-6.7.0-py3-none-any.whl", hash = "sha256:394fc5c42a333c9ffc3e421a4c85e08580d990e08b99f6bf35b4132114c5dcb3", size = 12317, upload-time = "2025-10-06T14:52:29.272Z" }, ] -[[package]] -name = "mypy" -version = "1.19.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "librt", marker = "platform_python_implementation != 'PyPy'" }, - { name = "mypy-extensions" }, - { name = "pathspec" }, - { name = "tomli", marker = "python_full_version < '3.11'" }, - { name = "typing-extensions" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/f5/db/4efed9504bc01309ab9c2da7e352cc223569f05478012b5d9ece38fd44d2/mypy-1.19.1.tar.gz", hash = 
"sha256:19d88bb05303fe63f71dd2c6270daca27cb9401c4ca8255fe50d1d920e0eb9ba", size = 3582404, upload-time = "2025-12-15T05:03:48.42Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/2f/63/e499890d8e39b1ff2df4c0c6ce5d371b6844ee22b8250687a99fd2f657a8/mypy-1.19.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:5f05aa3d375b385734388e844bc01733bd33c644ab48e9684faa54e5389775ec", size = 13101333, upload-time = "2025-12-15T05:03:03.28Z" }, - { url = "https://files.pythonhosted.org/packages/72/4b/095626fc136fba96effc4fd4a82b41d688ab92124f8c4f7564bffe5cf1b0/mypy-1.19.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:022ea7279374af1a5d78dfcab853fe6a536eebfda4b59deab53cd21f6cd9f00b", size = 12164102, upload-time = "2025-12-15T05:02:33.611Z" }, - { url = "https://files.pythonhosted.org/packages/0c/5b/952928dd081bf88a83a5ccd49aaecfcd18fd0d2710c7ff07b8fb6f7032b9/mypy-1.19.1-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:ee4c11e460685c3e0c64a4c5de82ae143622410950d6be863303a1c4ba0e36d6", size = 12765799, upload-time = "2025-12-15T05:03:28.44Z" }, - { url = "https://files.pythonhosted.org/packages/2a/0d/93c2e4a287f74ef11a66fb6d49c7a9f05e47b0a4399040e6719b57f500d2/mypy-1.19.1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:de759aafbae8763283b2ee5869c7255391fbc4de3ff171f8f030b5ec48381b74", size = 13522149, upload-time = "2025-12-15T05:02:36.011Z" }, - { url = "https://files.pythonhosted.org/packages/7b/0e/33a294b56aaad2b338d203e3a1d8b453637ac36cb278b45005e0901cf148/mypy-1.19.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:ab43590f9cd5108f41aacf9fca31841142c786827a74ab7cc8a2eacb634e09a1", size = 13810105, upload-time = "2025-12-15T05:02:40.327Z" }, - { url = "https://files.pythonhosted.org/packages/0e/fd/3e82603a0cb66b67c5e7abababce6bf1a929ddf67bf445e652684af5c5a0/mypy-1.19.1-cp310-cp310-win_amd64.whl", hash = "sha256:2899753e2f61e571b3971747e302d5f420c3fd09650e1951e99f823bc3089dac", size = 10057200, upload-time = "2025-12-15T05:02:51.012Z" }, - { url = "https://files.pythonhosted.org/packages/ef/47/6b3ebabd5474d9cdc170d1342fbf9dddc1b0ec13ec90bf9004ee6f391c31/mypy-1.19.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:d8dfc6ab58ca7dda47d9237349157500468e404b17213d44fc1cb77bce532288", size = 13028539, upload-time = "2025-12-15T05:03:44.129Z" }, - { url = "https://files.pythonhosted.org/packages/5c/a6/ac7c7a88a3c9c54334f53a941b765e6ec6c4ebd65d3fe8cdcfbe0d0fd7db/mypy-1.19.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e3f276d8493c3c97930e354b2595a44a21348b320d859fb4a2b9f66da9ed27ab", size = 12083163, upload-time = "2025-12-15T05:03:37.679Z" }, - { url = "https://files.pythonhosted.org/packages/67/af/3afa9cf880aa4a2c803798ac24f1d11ef72a0c8079689fac5cfd815e2830/mypy-1.19.1-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2abb24cf3f17864770d18d673c85235ba52456b36a06b6afc1e07c1fdcd3d0e6", size = 12687629, upload-time = "2025-12-15T05:02:31.526Z" }, - { url = "https://files.pythonhosted.org/packages/2d/46/20f8a7114a56484ab268b0ab372461cb3a8f7deed31ea96b83a4e4cfcfca/mypy-1.19.1-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:a009ffa5a621762d0c926a078c2d639104becab69e79538a494bcccb62cc0331", size = 13436933, upload-time = "2025-12-15T05:03:15.606Z" }, - { url = 
"https://files.pythonhosted.org/packages/5b/f8/33b291ea85050a21f15da910002460f1f445f8007adb29230f0adea279cb/mypy-1.19.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:f7cee03c9a2e2ee26ec07479f38ea9c884e301d42c6d43a19d20fb014e3ba925", size = 13661754, upload-time = "2025-12-15T05:02:26.731Z" }, - { url = "https://files.pythonhosted.org/packages/fd/a3/47cbd4e85bec4335a9cd80cf67dbc02be21b5d4c9c23ad6b95d6c5196bac/mypy-1.19.1-cp311-cp311-win_amd64.whl", hash = "sha256:4b84a7a18f41e167f7995200a1d07a4a6810e89d29859df936f1c3923d263042", size = 10055772, upload-time = "2025-12-15T05:03:26.179Z" }, - { url = "https://files.pythonhosted.org/packages/06/8a/19bfae96f6615aa8a0604915512e0289b1fad33d5909bf7244f02935d33a/mypy-1.19.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:a8174a03289288c1f6c46d55cef02379b478bfbc8e358e02047487cad44c6ca1", size = 13206053, upload-time = "2025-12-15T05:03:46.622Z" }, - { url = "https://files.pythonhosted.org/packages/a5/34/3e63879ab041602154ba2a9f99817bb0c85c4df19a23a1443c8986e4d565/mypy-1.19.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:ffcebe56eb09ff0c0885e750036a095e23793ba6c2e894e7e63f6d89ad51f22e", size = 12219134, upload-time = "2025-12-15T05:03:24.367Z" }, - { url = "https://files.pythonhosted.org/packages/89/cc/2db6f0e95366b630364e09845672dbee0cbf0bbe753a204b29a944967cd9/mypy-1.19.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b64d987153888790bcdb03a6473d321820597ab8dd9243b27a92153c4fa50fd2", size = 12731616, upload-time = "2025-12-15T05:02:44.725Z" }, - { url = "https://files.pythonhosted.org/packages/00/be/dd56c1fd4807bc1eba1cf18b2a850d0de7bacb55e158755eb79f77c41f8e/mypy-1.19.1-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c35d298c2c4bba75feb2195655dfea8124d855dfd7343bf8b8c055421eaf0cf8", size = 13620847, upload-time = "2025-12-15T05:03:39.633Z" }, - { url = "https://files.pythonhosted.org/packages/6d/42/332951aae42b79329f743bf1da088cd75d8d4d9acc18fbcbd84f26c1af4e/mypy-1.19.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:34c81968774648ab5ac09c29a375fdede03ba253f8f8287847bd480782f73a6a", size = 13834976, upload-time = "2025-12-15T05:03:08.786Z" }, - { url = "https://files.pythonhosted.org/packages/6f/63/e7493e5f90e1e085c562bb06e2eb32cae27c5057b9653348d38b47daaecc/mypy-1.19.1-cp312-cp312-win_amd64.whl", hash = "sha256:b10e7c2cd7870ba4ad9b2d8a6102eb5ffc1f16ca35e3de6bfa390c1113029d13", size = 10118104, upload-time = "2025-12-15T05:03:10.834Z" }, - { url = "https://files.pythonhosted.org/packages/de/9f/a6abae693f7a0c697dbb435aac52e958dc8da44e92e08ba88d2e42326176/mypy-1.19.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:e3157c7594ff2ef1634ee058aafc56a82db665c9438fd41b390f3bde1ab12250", size = 13201927, upload-time = "2025-12-15T05:02:29.138Z" }, - { url = "https://files.pythonhosted.org/packages/9a/a4/45c35ccf6e1c65afc23a069f50e2c66f46bd3798cbe0d680c12d12935caa/mypy-1.19.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:bdb12f69bcc02700c2b47e070238f42cb87f18c0bc1fc4cdb4fb2bc5fd7a3b8b", size = 12206730, upload-time = "2025-12-15T05:03:01.325Z" }, - { url = "https://files.pythonhosted.org/packages/05/bb/cdcf89678e26b187650512620eec8368fded4cfd99cfcb431e4cdfd19dec/mypy-1.19.1-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f859fb09d9583a985be9a493d5cfc5515b56b08f7447759a0c5deaf68d80506e", size = 12724581, upload-time = "2025-12-15T05:03:20.087Z" }, - { url = 
"https://files.pythonhosted.org/packages/d1/32/dd260d52babf67bad8e6770f8e1102021877ce0edea106e72df5626bb0ec/mypy-1.19.1-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:c9a6538e0415310aad77cb94004ca6482330fece18036b5f360b62c45814c4ef", size = 13616252, upload-time = "2025-12-15T05:02:49.036Z" }, - { url = "https://files.pythonhosted.org/packages/71/d0/5e60a9d2e3bd48432ae2b454b7ef2b62a960ab51292b1eda2a95edd78198/mypy-1.19.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:da4869fc5e7f62a88f3fe0b5c919d1d9f7ea3cef92d3689de2823fd27e40aa75", size = 13840848, upload-time = "2025-12-15T05:02:55.95Z" }, - { url = "https://files.pythonhosted.org/packages/98/76/d32051fa65ecf6cc8c6610956473abdc9b4c43301107476ac03559507843/mypy-1.19.1-cp313-cp313-win_amd64.whl", hash = "sha256:016f2246209095e8eda7538944daa1d60e1e8134d98983b9fc1e92c1fc0cb8dd", size = 10135510, upload-time = "2025-12-15T05:02:58.438Z" }, - { url = "https://files.pythonhosted.org/packages/de/eb/b83e75f4c820c4247a58580ef86fcd35165028f191e7e1ba57128c52782d/mypy-1.19.1-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:06e6170bd5836770e8104c8fdd58e5e725cfeb309f0a6c681a811f557e97eac1", size = 13199744, upload-time = "2025-12-15T05:03:30.823Z" }, - { url = "https://files.pythonhosted.org/packages/94/28/52785ab7bfa165f87fcbb61547a93f98bb20e7f82f90f165a1f69bce7b3d/mypy-1.19.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:804bd67b8054a85447c8954215a906d6eff9cabeabe493fb6334b24f4bfff718", size = 12215815, upload-time = "2025-12-15T05:02:42.323Z" }, - { url = "https://files.pythonhosted.org/packages/0a/c6/bdd60774a0dbfb05122e3e925f2e9e846c009e479dcec4821dad881f5b52/mypy-1.19.1-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:21761006a7f497cb0d4de3d8ef4ca70532256688b0523eee02baf9eec895e27b", size = 12740047, upload-time = "2025-12-15T05:03:33.168Z" }, - { url = "https://files.pythonhosted.org/packages/32/2a/66ba933fe6c76bd40d1fe916a83f04fed253152f451a877520b3c4a5e41e/mypy-1.19.1-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:28902ee51f12e0f19e1e16fbe2f8f06b6637f482c459dd393efddd0ec7f82045", size = 13601998, upload-time = "2025-12-15T05:03:13.056Z" }, - { url = "https://files.pythonhosted.org/packages/e3/da/5055c63e377c5c2418760411fd6a63ee2b96cf95397259038756c042574f/mypy-1.19.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:481daf36a4c443332e2ae9c137dfee878fcea781a2e3f895d54bd3002a900957", size = 13807476, upload-time = "2025-12-15T05:03:17.977Z" }, - { url = "https://files.pythonhosted.org/packages/cd/09/4ebd873390a063176f06b0dbf1f7783dd87bd120eae7727fa4ae4179b685/mypy-1.19.1-cp314-cp314-win_amd64.whl", hash = "sha256:8bb5c6f6d043655e055be9b542aa5f3bdd30e4f3589163e85f93f3640060509f", size = 10281872, upload-time = "2025-12-15T05:03:05.549Z" }, - { url = "https://files.pythonhosted.org/packages/8d/f4/4ce9a05ce5ded1de3ec1c1d96cf9f9504a04e54ce0ed55cfa38619a32b8d/mypy-1.19.1-py3-none-any.whl", hash = "sha256:f1235f5ea01b7db5468d53ece6aaddf1ad0b88d9e7462b86ef96fe04995d7247", size = 2471239, upload-time = "2025-12-15T05:03:07.248Z" }, -] - [[package]] name = "mypy-extensions" version = "1.1.0" @@ -3759,6 +3640,31 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/c7/18/c86eb8e0202e32dd3df50d43d7ff9854f8e0603945ff398974c1d91ac1ef/tomli_w-1.2.0-py3-none-any.whl", hash = "sha256:188306098d013b691fcadc011abd66727d3c414c571bb01b1a174ba8c983cf90", size = 6675, upload-time = 
"2025-01-15T12:07:22.074Z" }, ] +[[package]] +name = "ty" +version = "0.0.9" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/19/7b/4f677c622d58563c593c32081f8a8572afd90e43dc15b0dedd27b4305038/ty-0.0.9.tar.gz", hash = "sha256:83f980c46df17586953ab3060542915827b43c4748a59eea04190c59162957fe", size = 4858642, upload-time = "2026-01-05T12:24:56.528Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/3f/3f/c1ee119738b401a8081ff84341781122296b66982e5982e6f162d946a1ff/ty-0.0.9-py3-none-linux_armv6l.whl", hash = "sha256:dd270d4dd6ebeb0abb37aee96cbf9618610723677f500fec1ba58f35bfa8337d", size = 9763596, upload-time = "2026-01-05T12:24:37.43Z" }, + { url = "https://files.pythonhosted.org/packages/63/41/6b0669ef4cd806d4bd5c30263e6b732a362278abac1bc3a363a316cde896/ty-0.0.9-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:debfb2ba418b00e86ffd5403cb666b3f04e16853f070439517dd1eaaeeff9255", size = 9591514, upload-time = "2026-01-05T12:24:26.891Z" }, + { url = "https://files.pythonhosted.org/packages/02/a1/874aa756aee5118e690340a771fb9ded0d0c2168c0b7cc7d9561c2a750b0/ty-0.0.9-py3-none-macosx_11_0_arm64.whl", hash = "sha256:107c76ebb05a13cdb669172956421f7ffd289ad98f36d42a44a465588d434d58", size = 9097773, upload-time = "2026-01-05T12:24:14.442Z" }, + { url = "https://files.pythonhosted.org/packages/32/62/cb9a460cf03baab77b3361d13106b93b40c98e274d07c55f333ce3c716f6/ty-0.0.9-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:6868ca5c87ca0caa1b3cb84603c767356242b0659b88307eda69b2fb0bfa416b", size = 9581824, upload-time = "2026-01-05T12:24:35.074Z" }, + { url = "https://files.pythonhosted.org/packages/5a/97/633ecb348c75c954f09f8913669de8c440b13b43ea7d214503f3f1c4bb60/ty-0.0.9-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:d14a4aa0eb5c1d3591c2adbdda4e44429a6bb5d2e298a704398bb2a7ccdafdfe", size = 9591050, upload-time = "2026-01-05T12:24:08.804Z" }, + { url = "https://files.pythonhosted.org/packages/6f/e6/4b0c6a7a8a234e2113f88c80cc7aaa9af5868de7a693859f3c49da981934/ty-0.0.9-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:01bd4466504cefa36b465c6608e9af4504415fa67f6affc01c7d6ce36663c7f4", size = 10018262, upload-time = "2026-01-05T12:24:53.791Z" }, + { url = "https://files.pythonhosted.org/packages/cb/97/076d72a028f6b31e0b87287aa27c5b71a2f9927ee525260ea9f2f56828b8/ty-0.0.9-py3-none-manylinux_2_17_ppc64.manylinux2014_ppc64.whl", hash = "sha256:76c8253d1b30bc2c3eaa1b1411a1c34423decde0f4de0277aa6a5ceacfea93d9", size = 10911642, upload-time = "2026-01-05T12:24:48.264Z" }, + { url = "https://files.pythonhosted.org/packages/3f/5a/705d6a5ed07ea36b1f23592c3f0dbc8fc7649267bfbb3bf06464cdc9a98a/ty-0.0.9-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8992fa4a9c6a5434eae4159fdd4842ec8726259bfd860e143ab95d078de6f8e3", size = 10632468, upload-time = "2026-01-05T12:24:24.118Z" }, + { url = "https://files.pythonhosted.org/packages/44/78/4339a254537488d62bf392a936b3ec047702c0cc33d6ce3a5d613f275cd0/ty-0.0.9-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8c79d503d151acb4a145a3d98702d07cb641c47292f63e5ffa0151e4020a5d33", size = 10273422, upload-time = "2026-01-05T12:24:45.8Z" }, + { url = "https://files.pythonhosted.org/packages/90/40/e7f386e87c9abd3670dcee8311674d7e551baa23b2e4754e2405976e6c92/ty-0.0.9-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b7a7ebf89ed276b564baa1f0dd9cd708e7b5aa89f19ce1b2f7d7132075abf93e", 
size = 10120289, upload-time = "2026-01-05T12:24:17.424Z" }, + { url = "https://files.pythonhosted.org/packages/f7/46/1027442596e725c50d0d1ab5179e9fa78a398ab412994b3006d0ee0899c7/ty-0.0.9-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:ae3866e50109d2400a886bb11d9ef607f23afc020b226af773615cf82ae61141", size = 9566657, upload-time = "2026-01-05T12:24:51.048Z" }, + { url = "https://files.pythonhosted.org/packages/56/be/df921cf1967226aa01690152002b370a7135c6cced81e86c12b86552cdc4/ty-0.0.9-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:185244a5eacfcd8f5e2d85b95e4276316772f1e586520a6cb24aa072ec1bac26", size = 9610334, upload-time = "2026-01-05T12:24:20.334Z" }, + { url = "https://files.pythonhosted.org/packages/ac/e8/f085268860232cc92ebe95415e5c8640f7f1797ac3a49ddd137c6222924d/ty-0.0.9-py3-none-musllinux_1_2_i686.whl", hash = "sha256:f834ff27d940edb24b2e86bbb3fb45ab9e07cf59ca8c5ac615095b2542786408", size = 9726701, upload-time = "2026-01-05T12:24:29.785Z" }, + { url = "https://files.pythonhosted.org/packages/42/b4/9394210c66041cd221442e38f68a596945103d9446ece505889ffa9b3da9/ty-0.0.9-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:773f4b3ba046de952d7c1ad3a2c09b24f3ed4bc8342ae3cbff62ebc14aa6d48c", size = 10227082, upload-time = "2026-01-05T12:24:40.132Z" }, + { url = "https://files.pythonhosted.org/packages/dc/9f/75951eb573b473d35dd9570546fc1319f7ca2d5b5c50a5825ba6ea6cb33a/ty-0.0.9-py3-none-win32.whl", hash = "sha256:1f20f67e373038ff20f36d5449e787c0430a072b92d5933c5b6e6fc79d3de4c8", size = 9176458, upload-time = "2026-01-05T12:24:32.559Z" }, + { url = "https://files.pythonhosted.org/packages/9b/80/b1cdf71ac874e72678161e25e2326a7d30bc3489cd3699561355a168e54f/ty-0.0.9-py3-none-win_amd64.whl", hash = "sha256:2c415f3bbb730f8de2e6e0b3c42eb3a91f1b5fbbcaaead2e113056c3b361c53c", size = 10040479, upload-time = "2026-01-05T12:24:42.697Z" }, + { url = "https://files.pythonhosted.org/packages/b5/8f/abc75c4bb774b12698629f02d0d12501b0a7dff9c31dc3bd6b6c6467e90a/ty-0.0.9-py3-none-win_arm64.whl", hash = "sha256:48e339d794542afeed710ea4f846ead865cc38cecc335a9c781804d02eaa2722", size = 9543127, upload-time = "2026-01-05T12:24:11.731Z" }, +] + [[package]] name = "typeapi" version = "2.3.0" From f5682da1c563b6176bbc1ea19cacf5ee974622d0 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Wed, 7 Jan 2026 14:37:13 +0100 Subject: [PATCH 2/6] Resolve some type ignores --- .gitignore | 2 +- src/crawlee/_utils/recurring_task.py | 12 +++- src/crawlee/_utils/sitemap.py | 17 +++-- src/crawlee/browsers/_browser_pool.py | 4 +- src/crawlee/browsers/_playwright_browser.py | 3 +- .../_adaptive_playwright_crawler.py | 20 ++---- .../_playwright/_playwright_crawler.py | 15 ++-- src/crawlee/events/_event_manager.py | 10 ++- src/crawlee/http_clients/_curl_impersonate.py | 68 +++++++++++++++---- .../storage_clients/_redis/_dataset_client.py | 2 + tests/unit/_utils/test_html_to_text.py | 1 + .../test_adaptive_playwright_crawler.py | 2 + 12 files changed, 107 insertions(+), 49 deletions(-) diff --git a/.gitignore b/.gitignore index ea73de62b4..1a84032a9c 100644 --- a/.gitignore +++ b/.gitignore @@ -1,8 +1,8 @@ # Cache __pycache__ -.uv_cache .pytest_cache .ruff_cache +.ty_cache .uv-cache # Virtual envs diff --git a/src/crawlee/_utils/recurring_task.py b/src/crawlee/_utils/recurring_task.py index ba80f8f8b0..99f21499cb 100644 --- a/src/crawlee/_utils/recurring_task.py +++ b/src/crawlee/_utils/recurring_task.py @@ -25,7 +25,11 @@ class RecurringTask: """ def __init__(self, func: Callable, delay: timedelta) -> None: - 
logger.debug(f'Calling RecurringTask.__init__(func={func.__name__}, delay={delay})...')  # ty: ignore[unresolved-attribute]
+        logger.debug(
+            'Calling RecurringTask.__init__(func=%s, delay=%s)...',
+            func.__name__ if hasattr(func, '__name__') else func.__class__.__name__,
+            delay,
+        )
         self.func = func
         self.delay = delay
         self.task: asyncio.Task | None = None
@@ -55,7 +59,11 @@ async def _wrapper(self) -> None:

     def start(self) -> None:
         """Start the recurring task execution."""
-        self.task = asyncio.create_task(self._wrapper(), name=f'Task-recurring-{self.func.__name__}')  # ty: ignore[possibly-missing-attribute]
+        name = self.func.__name__ if hasattr(self.func, '__name__') else self.func.__class__.__name__
+        self.task = asyncio.create_task(
+            self._wrapper(),
+            name=f'Task-recurring-{name}',
+        )

     async def stop(self) -> None:
         """Stop the recurring task execution."""
diff --git a/src/crawlee/_utils/sitemap.py b/src/crawlee/_utils/sitemap.py
index ba844ca47b..95d1e26a5f 100644
--- a/src/crawlee/_utils/sitemap.py
+++ b/src/crawlee/_utils/sitemap.py
@@ -430,10 +430,17 @@ async def parse_sitemap(
         up to the specified maximum depth.
     """
     # Set default options
-    options = options or {}  # ty: ignore[invalid-assignment]
-    emit_nested_sitemaps = options.get('emit_nested_sitemaps', False)  # ty: ignore[possibly-missing-attribute]
-    max_depth = options.get('max_depth', float('inf'))  # ty: ignore[possibly-missing-attribute]
-    sitemap_retries = options.get('sitemap_retries', 3)  # ty: ignore[possibly-missing-attribute]
+    default_timeout = timedelta(seconds=30)
+    if options:
+        emit_nested_sitemaps = options['emit_nested_sitemaps']
+        max_depth = options['max_depth']
+        sitemap_retries = options['sitemap_retries']
+        timeout = options.get('timeout', default_timeout)
+    else:
+        emit_nested_sitemaps = False
+        max_depth = float('inf')
+        sitemap_retries = 3
+        timeout = default_timeout

     # Setup working state
     sources = list(initial_sources)
@@ -472,7 +479,7 @@ async def parse_sitemap(
                 sitemap_retries,
                 emit_nested_sitemaps=emit_nested_sitemaps,
                 proxy_info=proxy_info,
-                timeout=options.get('timeout', timedelta(seconds=30)),  # ty: ignore[possibly-missing-attribute]
+                timeout=timeout,
             ):
                 yield result
         else:
diff --git a/src/crawlee/browsers/_browser_pool.py b/src/crawlee/browsers/_browser_pool.py
index 480fb9fac5..4a78709049 100644
--- a/src/crawlee/browsers/_browser_pool.py
+++ b/src/crawlee/browsers/_browser_pool.py
@@ -138,11 +138,11 @@ def with_default_plugin(
             kwargs: Additional arguments for default constructor.
""" plugin_options: dict = defaultdict(dict) - plugin_options['browser_launch_options'] = browser_launch_options or {} + plugin_options['browser_launch_options'] = dict(browser_launch_options) if browser_launch_options else {} plugin_options['browser_new_context_options'] = browser_new_context_options or {} if headless is not None: - plugin_options['browser_launch_options']['headless'] = headless # ty: ignore[invalid-assignment] + plugin_options['browser_launch_options']['headless'] = headless if use_incognito_pages is not None: plugin_options['use_incognito_pages'] = use_incognito_pages diff --git a/src/crawlee/browsers/_playwright_browser.py b/src/crawlee/browsers/_playwright_browser.py index c66dcb21be..1593e3fdbb 100644 --- a/src/crawlee/browsers/_playwright_browser.py +++ b/src/crawlee/browsers/_playwright_browser.py @@ -78,7 +78,8 @@ async def new_context(self, **context_options: Any) -> BrowserContext: async def _delete_temp_dir(self, _: BrowserContext | None) -> None: if self._temp_dir and self._temp_dir.exists(): - await asyncio.to_thread(lambda: shutil.rmtree(self._temp_dir, ignore_errors=True)) # ty: ignore[invalid-argument-type] + temp_dir = self._temp_dir + await asyncio.to_thread(lambda: shutil.rmtree(temp_dir, ignore_errors=True)) @override async def close(self, **kwargs: Any) -> None: diff --git a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py index 18285cbb92..fbbf811f11 100644 --- a/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py +++ b/src/crawlee/crawlers/_adaptive_playwright/_adaptive_playwright_crawler.py @@ -27,23 +27,16 @@ ) from crawlee.crawlers._beautifulsoup._beautifulsoup_parser import BeautifulSoupParser from crawlee.crawlers._parsel._parsel_parser import ParselParser +from crawlee.crawlers._playwright._playwright_crawler import _PlaywrightCrawlerAdditionalOptions from crawlee.statistics import Statistics, StatisticsState -from ._adaptive_playwright_crawler_statistics import ( - AdaptivePlaywrightCrawlerStatisticState, -) +from ._adaptive_playwright_crawler_statistics import AdaptivePlaywrightCrawlerStatisticState from ._adaptive_playwright_crawling_context import ( AdaptivePlaywrightCrawlingContext, AdaptivePlaywrightPreNavCrawlingContext, ) -from ._rendering_type_predictor import ( - DefaultRenderingTypePredictor, - RenderingType, - RenderingTypePredictor, -) -from ._result_comparator import ( - create_default_comparator, -) +from ._rendering_type_predictor import DefaultRenderingTypePredictor, RenderingType, RenderingTypePredictor +from ._result_comparator import create_default_comparator if TYPE_CHECKING: from types import TracebackType @@ -51,7 +44,6 @@ from typing_extensions import Unpack from crawlee.crawlers._basic._basic_crawler import _BasicCrawlerOptions - from crawlee.crawlers._playwright._playwright_crawler import _PlaywrightCrawlerAdditionalOptions TStaticParseResult = TypeVar('TStaticParseResult') @@ -162,7 +154,7 @@ def __init__( super().__init__(statistics=adaptive_statistics, **kwargs) # Sub crawlers related. - playwright_crawler_specific_kwargs = playwright_crawler_specific_kwargs or {} # ty: ignore[invalid-assignment] + playwright_crawler_specific_kwargs = playwright_crawler_specific_kwargs or _PlaywrightCrawlerAdditionalOptions() # Each sub crawler will use custom logger . 
static_logger = getLogger('Subcrawler_static') @@ -183,7 +175,7 @@ def __init__( ) playwright_crawler = PlaywrightCrawler( statistics=_NonPersistentStatistics(), - **playwright_crawler_specific_kwargs, # ty: ignore[invalid-argument-type] + **playwright_crawler_specific_kwargs, **basic_crawler_kwargs_for_pw_crawler, ) diff --git a/src/crawlee/crawlers/_playwright/_playwright_crawler.py b/src/crawlee/crawlers/_playwright/_playwright_crawler.py index c71bb71510..d7a9b5a1ec 100644 --- a/src/crawlee/crawlers/_playwright/_playwright_crawler.py +++ b/src/crawlee/crawlers/_playwright/_playwright_crawler.py @@ -14,10 +14,7 @@ from crawlee import service_locator from crawlee._request import Request, RequestOptions, RequestState -from crawlee._types import ( - BasicCrawlingContext, - ConcurrencySettings, -) +from crawlee._types import BasicCrawlingContext, ConcurrencySettings from crawlee._utils.blocked import RETRY_CSS_SELECTORS from crawlee._utils.docs import docs_group from crawlee._utils.robots import RobotsTxtFile @@ -177,13 +174,12 @@ def __init__( # If browser_pool is not provided, create a new instance of BrowserPool with specified arguments. else: if fingerprint_generator == 'default': - if not browser_type: - generator_browser_type = None - else: - generator_browser_type = [fingerprint_browser_type_from_playwright_browser_type(browser_type)] + generator_browser_type: list[Literal['chrome', 'firefox', 'safari', 'edge']] | None = ( + [fingerprint_browser_type_from_playwright_browser_type(browser_type)] if browser_type else None + ) fingerprint_generator = DefaultFingerprintGenerator( - header_options=HeaderGeneratorOptions(browsers=generator_browser_type) # ty: ignore[invalid-argument-type] + header_options=HeaderGeneratorOptions(browsers=generator_browser_type) ) browser_pool = BrowserPool.with_default_plugin( @@ -516,6 +512,7 @@ async def _get_cookies(self, page: Page) -> list[PlaywrightCookieParam]: async def _update_cookies(self, page: Page, cookies: list[PlaywrightCookieParam]) -> None: """Update the cookies in the page context.""" + # False positive ty error, see https://github.com/astral-sh/ty/issues/1493. 
await page.context.add_cookies([{**cookie} for cookie in cookies]) # ty: ignore[invalid-argument-type] async def _find_txt_file_for_url(self, url: str) -> RobotsTxtFile: diff --git a/src/crawlee/events/_event_manager.py b/src/crawlee/events/_event_manager.py index 2183727483..8b714255fb 100644 --- a/src/crawlee/events/_event_manager.py +++ b/src/crawlee/events/_event_manager.py @@ -178,7 +178,8 @@ async def listener_wrapper(event_data: EventData) -> None: else asyncio.to_thread(cast('Callable[..., None]', listener), *bound_args.args, **bound_args.kwargs) ) - listener_task = asyncio.create_task(coro, name=f'Task-{event.value}-{listener.__name__}') # ty: ignore[invalid-argument-type, unresolved-attribute] + listener_name = listener.__name__ if hasattr(listener, '__name__') else listener.__class__.__name__ + listener_task = asyncio.create_task(coro, name=f'Task-{event.value}-{listener_name}') self._listener_tasks.add(listener_task) try: @@ -189,7 +190,12 @@ async def listener_wrapper(event_data: EventData) -> None: # We need to swallow the exception and just log it here, otherwise it could break the event emitter logger.exception( 'Exception in the event listener', - extra={'event_name': event.value, 'listener_name': listener.__name__}, # ty: ignore[unresolved-attribute] + extra={ + 'event_name': event.value, + 'listener_name': listener.__name__ + if hasattr(listener, '__name__') + else listener.__class__.__name__, + }, ) finally: logger.debug('EventManager.on.listener_wrapper(): Removing listener task from the set...') diff --git a/src/crawlee/http_clients/_curl_impersonate.py b/src/crawlee/http_clients/_curl_impersonate.py index 342a60ef08..b22a807de3 100644 --- a/src/crawlee/http_clients/_curl_impersonate.py +++ b/src/crawlee/http_clients/_curl_impersonate.py @@ -2,7 +2,8 @@ import asyncio from contextlib import asynccontextmanager -from typing import TYPE_CHECKING, Any +from http.cookiejar import Cookie +from typing import TYPE_CHECKING, Any, cast from curl_cffi import CurlInfo from curl_cffi.const import CurlHttpVersion @@ -15,7 +16,7 @@ from curl_cffi.requests.impersonate import DEFAULT_CHROME as CURL_DEFAULT_CHROME from typing_extensions import override -from crawlee._types import HttpHeaders, HttpPayload +from crawlee._types import HttpHeaders, HttpMethod, HttpPayload from crawlee._utils.blocked import ROTATE_PROXY_ERRORS from crawlee._utils.docs import docs_group from crawlee.errors import ProxyError @@ -24,11 +25,11 @@ if TYPE_CHECKING: from collections.abc import AsyncGenerator from datetime import timedelta - from http.cookiejar import Cookie from curl_cffi import Curl from curl_cffi.requests import Request as CurlRequest from curl_cffi.requests import Response + from curl_cffi.requests.session import HttpMethod as CurlHttpMethod from crawlee import Request from crawlee._types import HttpMethod @@ -90,13 +91,15 @@ def headers(self) -> HttpHeaders: async def read(self) -> bytes: if self._response.astream_task: raise RuntimeError('Use `read_stream` to read the body of the Response received from the `stream` method') + return self._response.content async def read_stream(self) -> AsyncGenerator[bytes, None]: - if not self._response.astream_task or self._response.astream_task.done(): # ty: ignore[possibly-missing-attribute] - raise RuntimeError( - 'Cannot read stream: either already consumed or Response not obtained from `stream` method' - ) + if not self._response.astream_task: + raise RuntimeError('Cannot read stream, Response not obtained from `stream` method.') + + if 
isinstance(self._response.astream_task, asyncio.Future) and self._response.astream_task.done(): + raise RuntimeError('Cannot read stream, it was already consumed.') async for chunk in self._response.aiter_content(): yield chunk @@ -156,7 +159,7 @@ async def crawl( try: response = await client.request( url=request.url, - method=request.method.upper(), # ty: ignore[invalid-argument-type] + method=self._convert_method(request.method), headers=request.headers, data=request.payload, cookies=session.cookies.jar if session else None, @@ -203,7 +206,7 @@ async def send_request( try: response = await client.request( url=url, - method=method.upper(), # ty: ignore[invalid-argument-type] + method=self._convert_method(method), headers=dict(headers) if headers else None, data=payload, cookies=session.cookies.jar if session else None, @@ -244,7 +247,7 @@ async def stream( try: response = await client.request( url=url, - method=method.upper(), # ty: ignore[invalid-argument-type] + method=self._convert_method(method), headers=dict(headers) if headers else None, data=payload, cookies=session.cookies.jar if session else None, @@ -291,6 +294,40 @@ def _get_client(self, proxy_url: str | None) -> AsyncSession: return self._client_by_proxy_url[proxy_url] + def _convert_method(self, method: HttpMethod) -> CurlHttpMethod: + """Convert from Crawlee HTTP method to curl-cffi HTTP method. + + Args: + method: Crawlee HTTP method. + + Returns: + Corresponding curl-cffi HTTP method. + + Raises: + ValueError: If the provided HTTP method is not supported. + """ + method_upper = method.upper() # curl-cffi requires uppercase methods + + match method_upper: + case 'GET': + return 'GET' + case 'POST': + return 'POST' + case 'PUT': + return 'PUT' + case 'DELETE': + return 'DELETE' + case 'OPTIONS': + return 'OPTIONS' + case 'HEAD': + return 'HEAD' + case 'TRACE': + return 'TRACE' + case 'PATCH': + return 'PATCH' + case _: + raise ValueError(f'HTTP method {method} is not supported in {self.__class__.__name__}.') + @staticmethod def _is_proxy_error(error: CurlRequestError) -> bool: """Determine whether the given error is related to a proxy issue. @@ -308,11 +345,16 @@ def _is_proxy_error(error: CurlRequestError) -> bool: @staticmethod def _get_cookies(curl: Curl) -> list[Cookie]: - cookies: list[Cookie] = [] - for curl_cookie in curl.getinfo(CurlInfo.COOKIELIST): # ty: ignore[not-iterable] - curl_morsel = CurlMorsel.from_curl_format(curl_cookie) # ty: ignore[invalid-argument-type] + cookies = list[Cookie]() + + # Implementation of getinfo always returns list[bytes] for CurlInfo.COOKIELIST. + cookie_list = cast('list[bytes]', curl.getinfo(CurlInfo.COOKIELIST)) + + for curl_cookie in cookie_list: + curl_morsel = CurlMorsel.from_curl_format(curl_cookie) cookie = curl_morsel.to_cookiejar_cookie() cookies.append(cookie) + return cookies async def cleanup(self) -> None: diff --git a/src/crawlee/storage_clients/_redis/_dataset_client.py b/src/crawlee/storage_clients/_redis/_dataset_client.py index 74c9d6c496..17dfb48215 100644 --- a/src/crawlee/storage_clients/_redis/_dataset_client.py +++ b/src/crawlee/storage_clients/_redis/_dataset_client.py @@ -179,12 +179,14 @@ async def get_data( case (True, int(), None): json_path += f'[:-{offset}]' case (True, int(), int()): + # ty lacks support for advanced pattern matching, see https://github.com/astral-sh/ty/issues/887. 
json_path += f'[-{offset + limit}:-{offset}]' # ty: ignore[unsupported-operator] case (False, 0, int()): json_path += f'[:{limit}]' case (False, int(), None): json_path += f'[{offset}:]' case (False, int(), int()): + # ty lacks support for advanced pattern matching, see https://github.com/astral-sh/ty/issues/887. json_path += f'[{offset}:{offset + limit}]' # ty: ignore[unsupported-operator] if json_path == '$': diff --git a/tests/unit/_utils/test_html_to_text.py b/tests/unit/_utils/test_html_to_text.py index 0a535e58db..f49feb5b9a 100644 --- a/tests/unit/_utils/test_html_to_text.py +++ b/tests/unit/_utils/test_html_to_text.py @@ -191,6 +191,7 @@ def test_html_to_text(source: str, expected_text: str, html_to_text: Callable[[s @pytest.mark.parametrize('html_to_text', [html_to_text_parsel, html_to_text_beautifulsoup]) def test_html_to_text_raises_on_wrong_input_type(html_to_text: Callable[[str], str]) -> None: with pytest.raises(TypeError): + # Intentional wrong type test. html_to_text(1) # ty: ignore[invalid-argument-type] diff --git a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py index 4a5619c854..4aedeff2eb 100644 --- a/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py +++ b/tests/unit/crawlers/_adaptive_playwright/test_adaptive_playwright_crawler.py @@ -116,6 +116,7 @@ class TestInput: TestInput( expected_pw_count=0, expected_static_count=2, + # Lack of ty support, see https://github.com/astral-sh/ty/issues/2348. rendering_types=cycle(['static']), # ty: ignore[invalid-argument-type] detection_probability_recommendation=cycle([0]), ), @@ -746,6 +747,7 @@ async def request_handler(context: AdaptivePlaywrightCrawlingContext) -> None: TestInput( expected_pw_count=0, expected_static_count=2, + # Lack of ty support, see https://github.com/astral-sh/ty/issues/2348. rendering_types=cycle(['static']), # ty: ignore[invalid-argument-type] detection_probability_recommendation=cycle([0]), ), From b906909158f06987f0a9c025d05602db2f5959c2 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Wed, 7 Jan 2026 14:58:39 +0100 Subject: [PATCH 3/6] revert the asyncio to thread lambda changes --- .../http_crawlers/selectolax_parser.py | 2 +- src/crawlee/browsers/_playwright_browser.py | 2 +- .../crawlers/_parsel/_parsel_parser.py | 2 +- .../_file_system/_dataset_client.py | 12 +++++------ .../_file_system/_key_value_store_client.py | 21 +++++++++++-------- .../_file_system/_request_queue_client.py | 15 ++++++------- 6 files changed, 29 insertions(+), 25 deletions(-) diff --git a/docs/guides/code_examples/http_crawlers/selectolax_parser.py b/docs/guides/code_examples/http_crawlers/selectolax_parser.py index 1627a3b220..0c38b1e9bf 100644 --- a/docs/guides/code_examples/http_crawlers/selectolax_parser.py +++ b/docs/guides/code_examples/http_crawlers/selectolax_parser.py @@ -22,7 +22,7 @@ async def parse(self, response: HttpResponse) -> LexborHTMLParser: """Parse HTTP response body into a document object.""" response_body = await response.read() # Run parsing in a thread to avoid blocking the event loop. 
- return await asyncio.to_thread(lambda: LexborHTMLParser(response_body)) + return await asyncio.to_thread(LexborHTMLParser, response_body) @override async def parse_text(self, text: str) -> LexborHTMLParser: diff --git a/src/crawlee/browsers/_playwright_browser.py b/src/crawlee/browsers/_playwright_browser.py index 1593e3fdbb..8ce19bfd26 100644 --- a/src/crawlee/browsers/_playwright_browser.py +++ b/src/crawlee/browsers/_playwright_browser.py @@ -79,7 +79,7 @@ async def new_context(self, **context_options: Any) -> BrowserContext: async def _delete_temp_dir(self, _: BrowserContext | None) -> None: if self._temp_dir and self._temp_dir.exists(): temp_dir = self._temp_dir - await asyncio.to_thread(lambda: shutil.rmtree(temp_dir, ignore_errors=True)) + await asyncio.to_thread(shutil.rmtree, temp_dir, ignore_errors=True) @override async def close(self, **kwargs: Any) -> None: diff --git a/src/crawlee/crawlers/_parsel/_parsel_parser.py b/src/crawlee/crawlers/_parsel/_parsel_parser.py index 640cf80e8e..f9ca19139a 100644 --- a/src/crawlee/crawlers/_parsel/_parsel_parser.py +++ b/src/crawlee/crawlers/_parsel/_parsel_parser.py @@ -22,7 +22,7 @@ class ParselParser(AbstractHttpParser[Selector, Selector]): @override async def parse(self, response: HttpResponse) -> Selector: response_body = await response.read() - return await asyncio.to_thread(lambda: Selector(body=response_body)) + return await asyncio.to_thread(Selector, body=response_body) @override async def parse_text(self, text: str) -> Selector: diff --git a/src/crawlee/storage_clients/_file_system/_dataset_client.py b/src/crawlee/storage_clients/_file_system/_dataset_client.py index 4a222dc037..b970a98928 100644 --- a/src/crawlee/storage_clients/_file_system/_dataset_client.py +++ b/src/crawlee/storage_clients/_file_system/_dataset_client.py @@ -120,7 +120,7 @@ async def open( dataset_base_path = Path(configuration.storage_dir) / cls._STORAGE_SUBDIR if not dataset_base_path.exists(): - await asyncio.to_thread(lambda: dataset_base_path.mkdir(parents=True, exist_ok=True)) + await asyncio.to_thread(dataset_base_path.mkdir, parents=True, exist_ok=True) # Get a new instance by ID. if id: @@ -134,7 +134,7 @@ async def open( continue try: - file = await asyncio.to_thread(lambda p=path_to_metadata: p.open(mode='r', encoding='utf-8')) + file = await asyncio.to_thread(path_to_metadata.open, mode='r', encoding='utf-8') try: file_content = json.load(file) metadata = DatasetMetadata(**file_content) @@ -163,7 +163,7 @@ async def open( # If the dataset directory exists, reconstruct the client from the metadata file. if path_to_dataset.exists() and path_to_metadata.exists(): - file = await asyncio.to_thread(lambda: path_to_metadata.open(mode='r', encoding='utf-8')) + file = await asyncio.to_thread(path_to_metadata.open, mode='r', encoding='utf-8') try: file_content = json.load(file) finally: @@ -211,7 +211,7 @@ async def drop(self) -> None: async def purge(self) -> None: async with self._lock: for file_path in await self._get_sorted_data_files(): - await asyncio.to_thread(lambda f=file_path: f.unlink(missing_ok=True)) + await asyncio.to_thread(file_path.unlink, missing_ok=True) await self._update_metadata( update_accessed_at=True, @@ -435,7 +435,7 @@ async def _update_metadata( self._metadata.item_count = new_item_count # Ensure the parent directory for the metadata file exists. 
- await asyncio.to_thread(lambda: self.path_to_metadata.parent.mkdir(parents=True, exist_ok=True)) + await asyncio.to_thread(self.path_to_metadata.parent.mkdir, parents=True, exist_ok=True) # Dump the serialized metadata to the file. data = await json_dumps(self._metadata.model_dump()) @@ -456,7 +456,7 @@ async def _push_item(self, item: dict[str, Any], item_id: int) -> None: file_path = self.path_to_dataset / filename # Ensure the dataset directory exists. - await asyncio.to_thread(lambda: self.path_to_dataset.mkdir(parents=True, exist_ok=True)) + await asyncio.to_thread(self.path_to_dataset.mkdir, parents=True, exist_ok=True) # Dump the serialized item to the file. data = await json_dumps(item) diff --git a/src/crawlee/storage_clients/_file_system/_key_value_store_client.py b/src/crawlee/storage_clients/_file_system/_key_value_store_client.py index 28e724fda8..3a36a77074 100644 --- a/src/crawlee/storage_clients/_file_system/_key_value_store_client.py +++ b/src/crawlee/storage_clients/_file_system/_key_value_store_client.py @@ -1,6 +1,7 @@ from __future__ import annotations import asyncio +import functools import json import shutil import urllib.parse @@ -119,7 +120,7 @@ async def open( kvs_base_path = Path(configuration.storage_dir) / cls._STORAGE_SUBDIR if not kvs_base_path.exists(): - await asyncio.to_thread(lambda: kvs_base_path.mkdir(parents=True, exist_ok=True)) + await asyncio.to_thread(kvs_base_path.mkdir, parents=True, exist_ok=True) # Get a new instance by ID. if id: @@ -133,7 +134,7 @@ async def open( continue try: - file = await asyncio.to_thread(lambda p=path_to_metadata: p.open(mode='r', encoding='utf-8')) + file = await asyncio.to_thread(path_to_metadata.open, mode='r', encoding='utf-8') try: file_content = json.load(file) metadata = KeyValueStoreMetadata(**file_content) @@ -162,7 +163,7 @@ async def open( # If the key-value store directory exists, reconstruct the client from the metadata file. 
if path_to_kvs.exists() and path_to_metadata.exists(): - file = await asyncio.to_thread(lambda: path_to_metadata.open(mode='r', encoding='utf-8')) + file = await asyncio.to_thread(path_to_metadata.open, mode='r', encoding='utf-8') try: file_content = json.load(file) finally: @@ -212,7 +213,7 @@ async def purge(self) -> None: for file_path in self.path_to_kvs.glob('*'): if file_path.name == METADATA_FILENAME: continue - await asyncio.to_thread(lambda f=file_path: f.unlink(missing_ok=True)) + await asyncio.to_thread(file_path.unlink, missing_ok=True) await self._update_metadata( update_accessed_at=True, @@ -239,7 +240,9 @@ async def get_value(self, *, key: str) -> KeyValueStoreRecord | None: # Read the metadata file async with self._lock: try: - file = await asyncio.to_thread(lambda: record_metadata_filepath.open(mode='r', encoding='utf-8')) + file = await asyncio.to_thread( + functools.partial(record_metadata_filepath.open, mode='r', encoding='utf-8'), + ) except FileNotFoundError: logger.warning(f'Metadata file disappeared for key "{key}", aborting get_value') return None @@ -346,11 +349,11 @@ async def delete_value(self, *, key: str) -> None: async with self._lock: # Delete the value file and its metadata if found if record_path.exists(): - await asyncio.to_thread(lambda: record_path.unlink(missing_ok=True)) + await asyncio.to_thread(record_path.unlink, missing_ok=True) # Delete the metadata file if it exists if metadata_path.exists(): - await asyncio.to_thread(lambda: metadata_path.unlink(missing_ok=True)) + await asyncio.to_thread(metadata_path.unlink, missing_ok=True) else: logger.warning(f'Found value file for key "{key}" but no metadata file when trying to delete it.') @@ -395,7 +398,7 @@ async def iterate_keys( # Try to read and parse the metadata file try: - metadata_content = await asyncio.to_thread(lambda f=file_path: f.read_text(encoding='utf-8')) + metadata_content = await asyncio.to_thread(file_path.read_text, encoding='utf-8') except FileNotFoundError: logger.warning(f'Metadata file disappeared for key "{key_name}", skipping it.') continue @@ -475,7 +478,7 @@ async def _update_metadata( self._metadata.modified_at = now # Ensure the parent directory for the metadata file exists. - await asyncio.to_thread(lambda: self.path_to_metadata.parent.mkdir(parents=True, exist_ok=True)) + await asyncio.to_thread(self.path_to_metadata.parent.mkdir, parents=True, exist_ok=True) # Dump the serialized metadata to the file. data = await json_dumps(self._metadata.model_dump()) diff --git a/src/crawlee/storage_clients/_file_system/_request_queue_client.py b/src/crawlee/storage_clients/_file_system/_request_queue_client.py index 1a91ecea9e..6808c4b88e 100644 --- a/src/crawlee/storage_clients/_file_system/_request_queue_client.py +++ b/src/crawlee/storage_clients/_file_system/_request_queue_client.py @@ -1,6 +1,7 @@ from __future__ import annotations import asyncio +import functools import json import shutil from collections import deque @@ -183,7 +184,7 @@ async def open( rq_base_path = Path(configuration.storage_dir) / cls._STORAGE_SUBDIR if not rq_base_path.exists(): - await asyncio.to_thread(lambda: rq_base_path.mkdir(parents=True, exist_ok=True)) + await asyncio.to_thread(rq_base_path.mkdir, parents=True, exist_ok=True) # Open an existing RQ by its ID, raise an error if not found. 
if id: @@ -197,7 +198,7 @@ async def open( continue try: - file = await asyncio.to_thread(lambda p=path_to_metadata: p.open(mode='r', encoding='utf-8')) + file = await asyncio.to_thread(path_to_metadata.open, mode='r', encoding='utf-8') try: file_content = json.load(file) metadata = RequestQueueMetadata(**file_content) @@ -232,7 +233,7 @@ async def open( # If the RQ directory exists, reconstruct the client from the metadata file. if path_to_rq.exists() and path_to_metadata.exists(): - file = await asyncio.to_thread(lambda: path_to_metadata.open(encoding='utf-8')) + file = await asyncio.to_thread(path_to_metadata.open, encoding='utf-8') try: file_content = json.load(file) finally: @@ -300,7 +301,7 @@ async def purge(self) -> None: request_files = await self._get_request_files(self.path_to_rq) for file_path in request_files: - await asyncio.to_thread(lambda f=file_path: f.unlink(missing_ok=True)) + await asyncio.to_thread(file_path.unlink, missing_ok=True) # Clear recoverable state await self._state.reset() @@ -675,7 +676,7 @@ async def _update_metadata( self._metadata.had_multiple_clients = True # Ensure the parent directory for the metadata file exists. - await asyncio.to_thread(lambda: self.path_to_metadata.parent.mkdir(parents=True, exist_ok=True)) + await asyncio.to_thread(self.path_to_metadata.parent.mkdir, parents=True, exist_ok=True) # Dump the serialized metadata to the file. data = await json_dumps(self._metadata.model_dump()) @@ -753,7 +754,7 @@ async def _get_request_files(cls, path_to_rq: Path) -> list[Path]: A list of paths to all request files. """ # Create the requests directory if it doesn't exist. - await asyncio.to_thread(lambda: path_to_rq.mkdir(parents=True, exist_ok=True)) + await asyncio.to_thread(path_to_rq.mkdir, parents=True, exist_ok=True) # List all the json files. files = await asyncio.to_thread(lambda: list(path_to_rq.glob('*.json'))) @@ -775,7 +776,7 @@ async def _parse_request_file(cls, file_path: Path) -> Request | None: """ # Open the request file. 
try: - file = await asyncio.to_thread(lambda f=file_path: f.open(mode='r', encoding='utf-8')) + file = await asyncio.to_thread(functools.partial(file_path.open, mode='r', encoding='utf-8')) except FileNotFoundError: logger.warning(f'Request file "{file_path}" not found.') return None From 16630a9412ac2ce7d943ad1c848cf89df4bcea02 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Wed, 7 Jan 2026 15:12:26 +0100 Subject: [PATCH 4/6] Fix --- src/crawlee/request_loaders/_sitemap_request_loader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/crawlee/request_loaders/_sitemap_request_loader.py b/src/crawlee/request_loaders/_sitemap_request_loader.py index ee278e4eda..06f2c29111 100644 --- a/src/crawlee/request_loaders/_sitemap_request_loader.py +++ b/src/crawlee/request_loaders/_sitemap_request_loader.py @@ -230,7 +230,7 @@ async def _load_sitemaps(self) -> None: continue state.in_progress_sitemap_url = sitemap_url - parse_options = ParseSitemapOptions(max_depth=0, emit_nested_sitemaps=True) + parse_options = ParseSitemapOptions(max_depth=0, emit_nested_sitemaps=True, sitemap_retries=3) async for item in parse_sitemap( [SitemapSource(type='url', url=sitemap_url)], From 71355ea572c661c01d838dd87642fe08788f712d Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Wed, 7 Jan 2026 15:21:04 +0100 Subject: [PATCH 5/6] Fix --- src/crawlee/storage_clients/_redis/_dataset_client.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/crawlee/storage_clients/_redis/_dataset_client.py b/src/crawlee/storage_clients/_redis/_dataset_client.py index 17dfb48215..4490f23864 100644 --- a/src/crawlee/storage_clients/_redis/_dataset_client.py +++ b/src/crawlee/storage_clients/_redis/_dataset_client.py @@ -197,6 +197,8 @@ async def get_data( if data is None: data = [] + data = [item for item in data if isinstance(item, dict)] + if skip_empty: data = [item for item in data if item] @@ -212,7 +214,7 @@ async def get_data( limit=limit or (total - offset), total=total, desc=desc, - items=data, # ty: ignore[invalid-argument-type] + items=data, ) @override From 5c7abc3598918333d61d7f7614ce16551d605fdc Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Thu, 8 Jan 2026 16:28:29 +0100 Subject: [PATCH 6/6] Fix --- src/crawlee/_utils/globs.py | 8 ++++---- .../_abstract_http/_abstract_http_crawler.py | 4 +--- src/crawlee/sessions/_cookies.py | 13 +++++++------ 3 files changed, 12 insertions(+), 13 deletions(-) diff --git a/src/crawlee/_utils/globs.py b/src/crawlee/_utils/globs.py index aed82e1f18..ab352113a3 100644 --- a/src/crawlee/_utils/globs.py +++ b/src/crawlee/_utils/globs.py @@ -33,12 +33,12 @@ def _translate( HACK: This function is copied from CPython stdlib source. 
It will be released in Python 3.13 as `glob.translate` """ - if not seps: - seps = (os.path.sep, os.path.altsep) if os.path.altsep else os.path.sep + _seps = ((os.path.sep, os.path.altsep) if os.path.altsep else (os.path.sep,)) if seps is None else seps - escaped_seps = ''.join(map(re.escape, seps)) # ty: ignore[invalid-argument-type] - any_sep = f'[{escaped_seps}]' if len(seps) > 1 else escaped_seps + escaped_seps = ''.join(map(re.escape, _seps)) + any_sep = f'[{escaped_seps}]' if len(_seps) > 1 else escaped_seps not_sep = f'[^{escaped_seps}]' + if include_hidden: one_last_segment = f'{not_sep}+' one_segment = f'{one_last_segment}{any_sep}' diff --git a/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py b/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py index 059b3adbe9..a503d27a0a 100644 --- a/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py +++ b/src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py @@ -100,9 +100,7 @@ def create_parsed_http_crawler_class( this method simplifies cases where `TParseResult` is used for both generic parameters. """ - class _ParsedHttpCrawler( - AbstractHttpCrawler[ParsedHttpCrawlingContext[TParseResult], TParseResult, TSelectResult] - ): # ty: ignore[invalid-generic-class] + class _ParsedHttpCrawler(AbstractHttpCrawler): def __init__( self, parser: AbstractHttpParser[TParseResult, TSelectResult] = static_parser, diff --git a/src/crawlee/sessions/_cookies.py b/src/crawlee/sessions/_cookies.py index 1089fc37f5..f4878a7d78 100644 --- a/src/crawlee/sessions/_cookies.py +++ b/src/crawlee/sessions/_cookies.py @@ -66,17 +66,18 @@ def __init__(self, cookies: SessionCookies | CookieJar | dict[str, str] | list[C self._jar = CookieJar() - if isinstance(cookies, dict): - for key, value in cookies.items(): - self.set(key, value) # ty: ignore[invalid-argument-type] - - elif isinstance(cookies, list): + if isinstance(cookies, list): for item in cookies: self.set(**item) elif isinstance(cookies, SessionCookies): for cookie in cookies.jar: - self.jar.set_cookie(cookie) + self._jar.set_cookie(cookie) + + elif isinstance(cookies, dict): + cookies_dict: dict[str, str] = cookies + for key, value in cookies_dict.items(): + self.set(key, value) @property def jar(self) -> CookieJar: