2 changes: 1 addition & 1 deletion .gitignore
@@ -1,8 +1,8 @@
# Cache
__pycache__
.mypy_cache
.pytest_cache
.ruff_cache
.ty_cache
.uv-cache

# Virtual envs
2 changes: 1 addition & 1 deletion CONTRIBUTING.md
@@ -46,7 +46,7 @@ make format

### Type checking

Type checking is handled by [mypy](https://mypy.readthedocs.io/), verifying code against type annotations. Configuration settings can be found in `pyproject.toml`.
Type checking is handled by [ty](https://docs.astral.sh/ty/), verifying code against type annotations. Configuration settings can be found in `pyproject.toml`.

To run type checking:

4 changes: 2 additions & 2 deletions Makefile
@@ -5,7 +5,7 @@
E2E_TESTS_CONCURRENCY = 1

clean:
rm -rf .mypy_cache .pytest_cache .ruff_cache .uv-cache build dist htmlcov .coverage
rm -rf .ty_cache .pytest_cache .ruff_cache .uv-cache build dist htmlcov .coverage

install-sync:
uv sync --all-extras
@@ -27,7 +27,7 @@ lint:
uv run ruff check

type-check:
uv run mypy
uv run ty check

unit-tests:
uv run pytest \
3 changes: 1 addition & 2 deletions docs/deployment/code_examples/google/cloud_run_example.py
@@ -1,4 +1,3 @@
# mypy: disable-error-code="misc"
import json
import os

@@ -9,7 +8,7 @@
from crawlee.storage_clients import MemoryStorageClient


@get('/') # type: ignore[untyped-decorator]
@get('/')
async def main() -> str:
"""The crawler entry point that will be called when the HTTP endpoint is accessed."""
# highlight-start
3 changes: 1 addition & 2 deletions docs/deployment/code_examples/google/google_example.py
@@ -1,4 +1,3 @@
# mypy: disable-error-code="misc"
import asyncio
import json
from datetime import timedelta
@@ -48,7 +47,7 @@ async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
# highlight-end


@functions_framework.http # type: ignore[untyped-decorator]
@functions_framework.http
def crawlee_run(request: Request) -> Response:
# You can pass data to your crawler using `request`
function_id = request.headers['Function-Execution-Id']
@@ -9,7 +9,7 @@

async def main() -> None:
fingerprint_generator = DefaultFingerprintGenerator(
header_options=HeaderGeneratorOptions(browsers=['chromium']),
header_options=HeaderGeneratorOptions(browsers=['chrome']),
screen_options=ScreenOptions(min_width=400),
)

@@ -22,7 +22,7 @@ async def parse(self, response: HttpResponse) -> LexborHTMLParser:
"""Parse HTTP response body into a document object."""
response_body = await response.read()
# Run parsing in a thread to avoid blocking the event loop.
return await asyncio.to_thread(lambda: LexborHTMLParser(response_body))
return await asyncio.to_thread(LexborHTMLParser, response_body)

@override
async def parse_text(self, text: str) -> LexborHTMLParser:
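Passing the callable and its argument straight to `asyncio.to_thread`, rather than wrapping them in a lambda, keeps the parser's signature and return type visible to the type checker. A runnable sketch of the same pattern, using a stand-in parsing function instead of the real LexborHTMLParser:

```python
import asyncio


def parse_document(body: bytes) -> str:
    # Stand-in for a blocking, CPU-bound parser such as LexborHTMLParser.
    return body.decode('utf-8')


async def main() -> None:
    body = b'<html><body>Hello</body></html>'
    # The callable and its arguments go to the worker thread directly, so the
    # awaited result is inferred as `str` without a lambda hiding the signature.
    document = await asyncio.to_thread(parse_document, body)
    print(document)


asyncio.run(main())
```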
4 changes: 2 additions & 2 deletions docs/guides/code_examples/running_in_web_server/server.py
@@ -14,7 +14,7 @@
app = FastAPI(lifespan=lifespan, title='Crawler app')


@app.get('/', response_class=HTMLResponse) # type: ignore[untyped-decorator]
@app.get('/', response_class=HTMLResponse)
def index() -> str:
return """
<!DOCTYPE html>
@@ -32,7 +32,7 @@ def index() -> str:
"""


@app.get('/scrape') # type: ignore[untyped-decorator]
@app.get('/scrape')
async def scrape_url(request: Request, url: str | None = None) -> dict:
if not url:
return {'url': 'missing', 'scrape result': 'no results'}
62 changes: 12 additions & 50 deletions pyproject.toml
@@ -102,7 +102,6 @@ dev = [
"build<2.0.0", # For e2e tests.
"dycw-pytest-only<3.0.0",
"fakeredis[probabilistic,json,lua]<3.0.0",
"mypy~=1.19.0",
"pre-commit<5.0.0",
"proxy-py<3.0.0",
"pydoc-markdown<5.0.0",
@@ -113,6 +112,7 @@ dev = [
"pytest<9.0.0",
"ruff~=0.14.0",
"setuptools", # setuptools are used by pytest, but not explicitly required
"ty~=0.0.0",
"types-beautifulsoup4<5.0.0",
"types-cachetools<7.0.0",
"types-colorama<1.0.0",
@@ -230,62 +230,24 @@ filterwarnings = [
"ignore:websockets.server.WebSocketServerProtocol is deprecated:DeprecationWarning",
]

[tool.mypy]
python_version = "3.10"
plugins = ["pydantic.mypy"]
[tool.ty.environment]
python-version = "3.10"

[tool.ty.src]
include = ["src", "tests", "scripts", "docs", "website"]
exclude = [
"src/crawlee/project_template",
"docs/guides/code_examples/storage_clients/custom_storage_client_example.py",
]
files = ["src", "tests", "docs", "website"]
check_untyped_defs = true
disallow_incomplete_defs = true
disallow_untyped_calls = true
disallow_untyped_decorators = true
disallow_untyped_defs = true
no_implicit_optional = true
warn_redundant_casts = true
warn_return_any = true
warn_unreachable = true
warn_unused_ignores = true

[[tool.mypy.overrides]]
# Example codes are sometimes showing integration of crawlee with external tool, which is not dependency of crawlee.
module = [
"apify", # Example code shows integration of apify and crawlee.
"apify_fingerprint_datapoints", # Untyped and stubs not available
"camoufox", # Example code shows integration of camoufox and crawlee.
"fastapi", # Example code shows running in webserver.
"saxonche", # Example code shows HttpCrawler with custom parser.
"scrapling.*", # Example code shows HttpCrawler with custom parser.
"selectolax.*", # Example code shows HttpCrawler with custom parser.
"stagehand.*", # Example code shows integration of Stagehand and crawlee.
"starlette.*", # Example code shows running in webserver.
"flask", # Example code shows deploy on Google Cloud.
"functions_framework", # Example code shows deploy on Google Cloud.
"jaro", # Untyped and stubs not available
"litestar", # Example code shows deploy on Google Cloud Run.
"loguru", # Example code shows integration of loguru and crawlee for JSON logging.
"lxml.*", # Example code shows HttpCrawler with custom parser.
"sklearn.linear_model", # Untyped and stubs not available
"cookiecutter.*", # Untyped and stubs not available
"inquirer.*", # Untyped and stubs not available
"pyquery", # Example code shows HttpCrawler with custom parser.
"warcio.*", # Example code shows WARC files creation.
"wrapt" # Untyped and stubs not available
]
ignore_missing_imports = true

[[tool.mypy.overrides]]
module = [
"running_in_web_server.*", # False positive when fastapi not available
[[tool.ty.overrides]]
include = [
"docs/**/*.py",
"website/**/*.py",
]
disable_error_code = ["misc"]

[tool.basedpyright]
pythonVersion = "3.10"
typeCheckingMode = "standard"
include = ["src", "tests", "docs", "website"]
[tool.ty.overrides.rules]
unresolved-import = "ignore"

[tool.coverage.report]
exclude_lines = ["pragma: no cover", "if TYPE_CHECKING:", "assert_never()"]
2 changes: 1 addition & 1 deletion src/crawlee/_browserforge_workaround.py
@@ -20,7 +20,7 @@ def patch_browserforge() -> None:
def DownloadIfNotExists(**flags: bool) -> None:
pass

download.DownloadIfNotExists = DownloadIfNotExists
download.DownloadIfNotExists = DownloadIfNotExists # ty: ignore[invalid-assignment]

import browserforge.bayesian_network

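The mypy-style `# type: ignore[...]` comments in this and the following files are replaced with ty's inline suppression syntax, `# ty: ignore[rule]`. A minimal sketch of how such a suppression reads (hypothetical class; assuming ty reports the mismatched assignment under its `invalid-assignment` rule):

```python
class Config:
    retries: int = 3


cfg = Config()
# Assigning a str to an attribute declared as int is a type error; the
# bracketed rule name silences only that diagnostic on this one line.
cfg.retries = 'many'  # ty: ignore[invalid-assignment]
```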
4 changes: 2 additions & 2 deletions src/crawlee/_request.py
@@ -93,7 +93,7 @@ def __setitem__(self, key: str, value: JsonSerializable) -> None:
def __delitem__(self, key: str) -> None:
del self.__pydantic_extra__[key]

def __iter__(self) -> Iterator[str]: # type: ignore[override]
def __iter__(self) -> Iterator[str]: # ty: ignore[invalid-method-override]
yield from self.__pydantic_extra__

def __len__(self) -> int:
@@ -195,7 +195,7 @@ class Request(BaseModel):
] = None
"""HTTP request payload."""

# Workaround for pydantic 2.12 and mypy type checking issue for Annotated with default_factory
# Workaround for Pydantic and type checkers when using Annotated with default_factory
if TYPE_CHECKING:
headers: HttpHeaders = HttpHeaders()
"""HTTP request headers."""
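The comment above refers to the same workaround used in `_types.py`: type checkers struggle with `Annotated` fields whose metadata carries a `default_factory`, so the model exposes a plain attribute under `TYPE_CHECKING` and the real `Annotated` form at runtime. A simplified, self-contained sketch of that shape (hypothetical model, not the actual `Request` class):

```python
from typing import TYPE_CHECKING, Annotated

from pydantic import BaseModel, Field


class Item(BaseModel):
    if TYPE_CHECKING:
        # Only ever seen by the type checker; never evaluated at runtime.
        tags: dict[str, str] = {}
    else:
        # What Pydantic actually sees: the default comes from the factory.
        tags: Annotated[dict[str, str], Field(default_factory=lambda: dict[str, str]())]


print(Item().tags)  # {}
```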
27 changes: 13 additions & 14 deletions src/crawlee/_types.py
@@ -62,14 +62,14 @@ class HttpHeaders(RootModel, Mapping[str, str]):

model_config = ConfigDict(validate_by_name=True, validate_by_alias=True)

# Workaround for pydantic 2.12 and mypy type checking issue for Annotated with default_factory
# Workaround for Pydantic and type checkers when using Annotated with default_factory
if TYPE_CHECKING:
root: dict[str, str] = {}
else:
root: Annotated[
dict[str, str],
PlainValidator(lambda value: _normalize_headers(value)),
Field(default_factory=dict),
Field(default_factory=lambda: dict[str, str]()),
]

def __getitem__(self, key: str) -> str:
@@ -91,7 +91,7 @@ def __ror__(self, other: HttpHeaders) -> HttpHeaders:
combined_headers = {**other, **self.root}
return HttpHeaders(combined_headers)

def __iter__(self) -> Iterator[str]: # type: ignore[override]
def __iter__(self) -> Iterator[str]: # ty: ignore[invalid-method-override]
yield from self.root

def __len__(self) -> int:
@@ -671,17 +671,16 @@ def create_modified_copy(
get_key_value_store: GetKeyValueStoreFromRequestHandlerFunction | None = None,
) -> Self:
"""Create a modified copy of the crawling context with specified changes."""
original_fields = {field.name: getattr(self, field.name) for field in dataclasses.fields(self)}
modified_fields = {
key: value
for key, value in {
'push_data': push_data,
'add_requests': add_requests,
'get_key_value_store': get_key_value_store,
}.items()
if value
}
return self.__class__(**{**original_fields, **modified_fields})
modifications = dict[str, Any]()

if push_data is not None:
modifications['push_data'] = push_data
if add_requests is not None:
modifications['add_requests'] = add_requests
if get_key_value_store is not None:
modifications['get_key_value_store'] = get_key_value_store

return dataclasses.replace(self, **modifications)


class GetDataKwargs(TypedDict):
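`create_modified_copy` now collects only the overridden fields and delegates the copying to `dataclasses.replace`, instead of rebuilding the instance from every field by hand. An illustrative sketch with a simplified dataclass (not the real crawling-context class):

```python
from dataclasses import dataclass, replace


@dataclass(frozen=True)
class Context:
    push_data: str = 'default-push'
    add_requests: str = 'default-add'


def modified_copy(ctx: Context, push_data: str | None = None) -> Context:
    overrides: dict[str, str] = {}
    if push_data is not None:
        overrides['push_data'] = push_data
    # replace() copies the instance, changing only the fields supplied here.
    return replace(ctx, **overrides)


print(modified_copy(Context(), push_data='custom-push'))
```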
2 changes: 1 addition & 1 deletion src/crawlee/_utils/context.py
@@ -44,4 +44,4 @@ async def async_wrapper(self: Any, *args: Any, **kwargs: Any) -> Any:

return await method(self, *args, **kwargs)

return async_wrapper if inspect.iscoroutinefunction(method) else sync_wrapper # type: ignore[return-value]
return async_wrapper if inspect.iscoroutinefunction(method) else sync_wrapper # ty: ignore[invalid-return-type]
2 changes: 1 addition & 1 deletion src/crawlee/_utils/file.py
@@ -170,7 +170,7 @@ async def export_csv_to_stream(
if 'lineterminator' not in kwargs:
kwargs['lineterminator'] = '\n'

writer = csv.writer(dst, **kwargs) # type: ignore[arg-type]
writer = csv.writer(dst, **kwargs)
write_header = True

# Iterate over the dataset and write to CSV.
8 changes: 4 additions & 4 deletions src/crawlee/_utils/globs.py
@@ -33,12 +33,12 @@ def _translate(

HACK: This function is copied from CPython stdlib source. It will be released in Python 3.13 as `glob.translate`
"""
if not seps:
seps = (os.path.sep, os.path.altsep) if os.path.altsep else os.path.sep
_seps = ((os.path.sep, os.path.altsep) if os.path.altsep else (os.path.sep,)) if seps is None else seps

escaped_seps = ''.join(map(re.escape, seps))
any_sep = f'[{escaped_seps}]' if len(seps) > 1 else escaped_seps
escaped_seps = ''.join(map(re.escape, _seps))
any_sep = f'[{escaped_seps}]' if len(_seps) > 1 else escaped_seps
not_sep = f'[^{escaped_seps}]'

if include_hidden:
one_last_segment = f'{not_sep}+'
one_segment = f'{one_last_segment}{any_sep}'
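The `_translate` change binds the normalized separators to a new name and makes both branches produce a tuple, so the variable keeps a single type (`tuple[str, ...]`) instead of sometimes being a bare string. A small sketch of the same normalization (hypothetical helper, not the stdlib-derived function above):

```python
import os
import re


def escaped_separators(seps: tuple[str, ...] | None = None) -> str:
    # Both branches yield a tuple, and the parameter itself is never re-assigned,
    # so `_seps` has one well-defined type for the checker.
    _seps = ((os.path.sep, os.path.altsep) if os.path.altsep else (os.path.sep,)) if seps is None else seps
    return ''.join(map(re.escape, _seps))


print(escaped_separators())
print(escaped_separators(('/', '\\')))
```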
12 changes: 10 additions & 2 deletions src/crawlee/_utils/recurring_task.py
@@ -25,7 +25,11 @@ class RecurringTask:
"""

def __init__(self, func: Callable, delay: timedelta) -> None:
logger.debug(f'Calling RecurringTask.__init__(func={func.__name__}, delay={delay})...')
logger.debug(
'Calling RecurringTask.__init__(func=%s, delay=%s)...',
func.__name__ if hasattr(func, '__name__') else func.__class__.__name__,
delay,
)
self.func = func
self.delay = delay
self.task: asyncio.Task | None = None
@@ -55,7 +59,11 @@ async def _wrapper(self) -> None:

def start(self) -> None:
"""Start the recurring task execution."""
self.task = asyncio.create_task(self._wrapper(), name=f'Task-recurring-{self.func.__name__}')
name = self.func.__name__ if hasattr(self.func, '__name__') else self.func.__class__.__name__
self.task = asyncio.create_task(
self._wrapper(),
name=f'Task-recurring-{name}',
)

async def stop(self) -> None:
"""Stop the recurring task execution."""
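The updated logging defers string formatting to the logging module (%-style arguments) and falls back to the class name when the callable has no `__name__`, presumably to support callables such as `functools.partial` objects. A short sketch of both pieces (hypothetical task function):

```python
import logging
from functools import partial

logger = logging.getLogger(__name__)


def ping(host: str) -> None:
    print(f'pinging {host}')


task_callable = partial(ping, 'example.com')
# partial objects carry no __name__, hence the class-name fallback.
name = task_callable.__name__ if hasattr(task_callable, '__name__') else task_callable.__class__.__name__
# With %-style placeholders, formatting only happens if DEBUG logging is enabled.
logger.debug('Starting recurring task %s', name)
```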
17 changes: 12 additions & 5 deletions src/crawlee/_utils/sitemap.py
@@ -430,10 +430,17 @@ async def parse_sitemap(
up to the specified maximum depth.
"""
# Set default options
options = options or {}
emit_nested_sitemaps = options.get('emit_nested_sitemaps', False)
max_depth = options.get('max_depth', float('inf'))
sitemap_retries = options.get('sitemap_retries', 3)
default_timeout = timedelta(seconds=30)
if options:
emit_nested_sitemaps = options['emit_nested_sitemaps']
max_depth = options['max_depth']
sitemap_retries = options['sitemap_retries']
timeout = options.get('timeout', default_timeout)
else:
emit_nested_sitemaps = False
max_depth = float('inf')
sitemap_retries = 3
timeout = default_timeout

# Setup working state
sources = list(initial_sources)
@@ -472,7 +479,7 @@ async def parse_sitemap(
sitemap_retries,
emit_nested_sitemaps=emit_nested_sitemaps,
proxy_info=proxy_info,
timeout=options.get('timeout', timedelta(seconds=30)),
timeout=timeout,
):
yield result
else:
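`parse_sitemap` now branches once on whether an options mapping was provided and otherwise uses literal defaults, rather than chaining `.get()` calls on a possibly-empty dict. A rough sketch of that shape with a hypothetical, simplified options `TypedDict` (not the real crawlee type):

```python
from datetime import timedelta
from typing import TypedDict


class ParseOptions(TypedDict, total=False):
    max_depth: float
    timeout: timedelta


def resolve_options(options: ParseOptions | None) -> tuple[float, timedelta]:
    default_timeout = timedelta(seconds=30)
    if options:
        # Everything is read from the mapping in this branch...
        return options.get('max_depth', float('inf')), options.get('timeout', default_timeout)
    # ...and plain defaults are used in the other, keeping each value's type concrete.
    return float('inf'), default_timeout


print(resolve_options(None))
print(resolve_options({'max_depth': 2.0}))
```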