Skip to content

Commit 49265e8

Browse files
authored
Improve unique key generation logic (#193)
1 parent 63a7172 commit 49265e8

File tree

7 files changed

+288
-21
lines changed

7 files changed

+288
-21
lines changed

CHANGELOG.md

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,15 @@
11
# Changelog
22

3-
## [1.6.1](../../releases/tag/v1.6.1) - Unreleased
3+
## [1.7.0](../../releases/tag/v1.7.0) - Unreleased
4+
5+
### Added
6+
7+
- Add a new way of generating the `uniqueKey` field of the request, aligning it with Crawlee.
48

59
### Fixed
610

711
- Improve error handling for `to_apify_request` serialization failures
12+
- Fix handling of Scrapy's `Request.dont_filter` so that such requests are not deduplicated in the request queue.
813

914
## [1.6.0](../../releases/tag/v1.6.0) - 2024-02-23
1015

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[project]
22
name = "apify"
3-
version = "1.6.1"
3+
version = "1.7.0"
44
description = "Apify SDK for Python"
55
readme = "README.md"
66
license = { text = "Apache Software License" }

src/apify/_utils.py

Lines changed: 117 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,23 @@
11
from __future__ import annotations
22

33
import asyncio
4-
import base64
54
import builtins
65
import contextlib
76
import functools
8-
import hashlib
97
import inspect
108
import json
119
import mimetypes
1210
import os
1311
import re
1412
import sys
1513
import time
14+
from base64 import b64encode
1615
from collections import OrderedDict
1716
from collections.abc import MutableMapping
1817
from datetime import datetime, timezone
18+
from hashlib import sha256
1919
from importlib import metadata
20+
from logging import getLogger
2021
from typing import (
2122
Any,
2223
Callable,
@@ -30,6 +31,7 @@
3031
overload,
3132
)
3233
from typing import OrderedDict as OrderedDictType
34+
from urllib.parse import parse_qsl, urlencode, urlparse
3335

3436
import aioshutil
3537
import psutil
@@ -59,6 +61,7 @@
5961
from apify.consts import REQUEST_ID_LENGTH, StorageTypes
6062

6163
T = TypeVar('T')
64+
logger = getLogger(__name__)
6265

6366

6467
def get_system_info() -> dict:
@@ -292,9 +295,8 @@ def maybe_parse_body(body: bytes, content_type: str) -> Any:
292295

293296
def unique_key_to_request_id(unique_key: str) -> str:
    """Generate request ID based on unique key in a deterministic way."""
    # Hash the key with SHA-256, base64-encode the digest, and delete the
    # base64 characters ('+', '/', '=') that are unsuitable for an ID.
    encoded_digest = b64encode(sha256(unique_key.encode('utf-8')).digest()).decode('utf-8')
    request_id = encoded_digest.translate(str.maketrans('', '', '+/='))

    # Slicing is a no-op for strings already within the limit.
    return request_id[:REQUEST_ID_LENGTH]
298300

299301

300302
async def force_rename(src_dir: str, dst_dir: str) -> None:
@@ -410,3 +412,113 @@ def validate_single(field_value: Any, expected_type: type, required: bool, name:
410412
PARSE_DATE_FIELDS_MAX_DEPTH = 3
411413
PARSE_DATE_FIELDS_KEY_SUFFIX = 'At'
412414
ListOrDictOrAny = TypeVar('ListOrDictOrAny', list, dict, Any)
415+
416+
417+
def compute_short_hash(data: bytes, *, length: int = 8) -> str:
    """Computes a hexadecimal SHA-256 hash of the provided data and returns a substring (prefix) of it.

    Args:
        data: The binary data to be hashed.
        length: The length of the hash prefix to be returned.

    Returns:
        A substring (prefix) of the hexadecimal hash of the data.
    """
    return sha256(data).hexdigest()[:length]
429+
430+
431+
def normalize_url(url: str, *, keep_url_fragment: bool = False) -> str:
    """Normalizes a URL.

    The URL is stripped of surrounding whitespace, 'utm_' tracking parameters are
    removed, the remaining query parameters are sorted alphabetically (with the last
    occurrence winning for duplicate keys), any trailing slash is removed from the
    path, and the whole resulting URL is lowercased. This ensures that URLs which
    are functionally identical but differ in trivial ways (parameter order, casing)
    are treated as the same.

    Args:
        url: The URL to be normalized.
        keep_url_fragment: Flag to determine whether the fragment part of the URL should be retained.

    Returns:
        A string containing the normalized URL.
    """
    parts = urlparse(url.strip())

    # Collapse the query into a dict (duplicate keys keep their last value)
    # while dropping 'utm_' tracking parameters.
    query_params = {key: value for key, value in parse_qsl(parts.query) if not key.startswith('utm_')}

    # Re-encode the remaining parameters in alphabetical key order.
    canonical_query = urlencode(sorted(query_params.items()))

    # Rebuild the URL with the canonical query and no trailing slash,
    # then lowercase the entire result.
    normalized = parts._replace(query=canonical_query, path=parts.path.rstrip('/')).geturl().lower()

    # Drop the fragment unless the caller asked to keep it.
    if keep_url_fragment:
        return normalized
    return normalized.partition('#')[0]
475+
476+
477+
def compute_unique_key(
    url: str,
    method: str = 'GET',
    payload: bytes | None = None,
    *,
    keep_url_fragment: bool = False,
    use_extended_unique_key: bool = False,
) -> str:
    """Computes a unique key for caching & deduplication of requests.

    The key is based on the normalized URL and the uppercased method. When
    'use_extended_unique_key' is True, the method and a short hash of the payload
    (empty string when there is no payload) are folded into the key as well;
    otherwise the key is just the normalized URL.

    Args:
        url: The request URL.
        method: The HTTP method, defaults to 'GET'.
        payload: The request payload, defaults to None.
        keep_url_fragment: A flag indicating whether to keep the URL fragment, defaults to False.
        use_extended_unique_key: A flag indicating whether to include a hashed payload in the key, defaults to False.

    Returns:
        A string representing the unique key for the request.
    """
    # Fall back to the raw URL if normalization fails for any reason.
    try:
        norm_url = normalize_url(url, keep_url_fragment=keep_url_fragment)
    except Exception as exc:
        logger.warning(f'Failed to normalize URL: {exc}')
        norm_url = url

    norm_method = method.upper()

    # Extended keys additionally encode the method and a payload digest.
    if use_extended_unique_key:
        payload_digest = compute_short_hash(payload) if payload else ''
        return f'{norm_method}({payload_digest}):{norm_url}'

    # Non-GET requests with a payload collapse onto the same plain key, so let
    # the user know the extended option exists.
    if norm_method != 'GET' and payload:
        logger.info(
            f'We have encountered a {norm_method} Request with a payload. This is fine. Just letting you know '
            'that if your requests point to the same URL and differ only in method and payload, you should consider '
            'using the "use_extended_unique_key" option.'
        )

    return norm_url

src/apify/scrapy/requests.py

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
) from exc
1414

1515
from apify._crypto import crypto_random_object_id
16+
from apify._utils import compute_unique_key
1617
from apify.actor import Actor
1718

1819

@@ -45,25 +46,37 @@ def to_apify_request(scrapy_request: Request, spider: Spider) -> dict | None:
4546
apify_request = {
4647
'url': scrapy_request.url,
4748
'method': scrapy_request.method,
49+
'payload': scrapy_request.body,
4850
'userData': scrapy_request.meta.get('userData', {}),
4951
}
5052

53+
# Convert Scrapy's headers to a dictionary and store them in the apify_request
5154
if isinstance(scrapy_request.headers, Headers):
5255
apify_request['headers'] = dict(scrapy_request.headers.to_unicode_dict())
5356
else:
5457
Actor.log.warning(f'Invalid scrapy_request.headers type, not scrapy.http.headers.Headers: {scrapy_request.headers}')
5558

59+
# If the request was produced by the middleware (e.g. retry or redirect), we must compute the unique key here
5660
if _is_request_produced_by_middleware(scrapy_request):
57-
apify_request['uniqueKey'] = scrapy_request.url
61+
apify_request['uniqueKey'] = compute_unique_key(
62+
url=scrapy_request.url,
63+
method=scrapy_request.method,
64+
payload=scrapy_request.body,
65+
use_extended_unique_key=True,
66+
)
67+
# Otherwise, we can use the unique key (also the id) from the meta
5868
else:
59-
# Add 'id' to the apify_request
6069
if scrapy_request.meta.get('apify_request_id'):
6170
apify_request['id'] = scrapy_request.meta['apify_request_id']
6271

63-
# Add 'uniqueKey' to the apify_request
6472
if scrapy_request.meta.get('apify_request_unique_key'):
6573
apify_request['uniqueKey'] = scrapy_request.meta['apify_request_unique_key']
6674

75+
# If the request's dont_filter field is set, we must generate a random `uniqueKey` to avoid deduplication
76+
# of the request in the Request Queue.
77+
if scrapy_request.dont_filter:
78+
apify_request['uniqueKey'] = crypto_random_object_id(8)
79+
6780
# Serialize the Scrapy Request and store it in the apify_request.
6881
# - This process involves converting the Scrapy Request object into a dictionary, encoding it to base64,
6982
# and storing it as 'scrapy_request' within the 'userData' dictionary of the apify_request.

src/apify/scrapy/scheduler.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,12 @@ def enqueue_request(self: ApifyScheduler, request: Request) -> bool:
9595
raise TypeError('self._rq must be an instance of the RequestQueue class')
9696

9797
try:
98-
result = nested_event_loop.run_until_complete(self._rq.add_request(apify_request))
98+
result = nested_event_loop.run_until_complete(
99+
self._rq.add_request(
100+
apify_request,
101+
use_extended_unique_key=True,
102+
)
103+
)
99104
except BaseException:
100105
traceback.print_exc()
101106
raise

src/apify/storages/request_queue.py

Lines changed: 42 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
from apify_shared.utils import ignore_docs
1010

1111
from apify._crypto import crypto_random_object_id
12-
from apify._utils import LRUCache, budget_ow, unique_key_to_request_id
12+
from apify._utils import LRUCache, budget_ow, compute_unique_key, unique_key_to_request_id
1313
from apify.consts import REQUEST_QUEUE_HEAD_MAX_LIMIT
1414
from apify.log import logger
1515
from apify.storages.base_storage import BaseStorage
@@ -140,15 +140,43 @@ def _get_storage_collection_client(
140140
) -> RequestQueueCollectionClientAsync | RequestQueueCollectionClient:
141141
return client.request_queues()
142142

143-
async def add_request(self: RequestQueue, request: dict, *, forefront: bool = False) -> dict:
144-
"""Add a request to the queue.
143+
async def add_request(
144+
self: RequestQueue,
145+
request: dict,
146+
*,
147+
forefront: bool = False,
148+
keep_url_fragment: bool = False,
149+
use_extended_unique_key: bool = False,
150+
) -> dict:
151+
"""Adds a request to the `RequestQueue` while managing deduplication and positioning within the queue.
152+
153+
The deduplication of requests relies on the `uniqueKey` field within the request dictionary. If `uniqueKey`
154+
exists, it remains unchanged; if it does not, it is generated based on the request's `url`, `method`,
155+
and `payload` fields. The generation of `uniqueKey` can be influenced by the `keep_url_fragment` and
156+
`use_extended_unique_key` flags, which dictate whether to include the URL fragment and the request's method
157+
and payload, respectively, in its computation.
158+
159+
The request can be added to the forefront (beginning) or the back of the queue based on the `forefront`
160+
parameter. Information about the request's addition to the queue, including whether it was already present or
161+
handled, is returned in an output dictionary.
145162
146163
Args:
147-
request (dict): The request to add to the queue
148-
forefront (bool, optional): Whether to add the request to the head or the end of the queue
164+
request: The request object to be added to the queue. Must include at least the `url` key.
165+
Optionally it can include the `method`, `payload` and `uniqueKey` keys.
149166
150-
Returns:
151-
dict: Information about the queue operation with keys `requestId`, `uniqueKey`, `wasAlreadyPresent`, `wasAlreadyHandled`.
167+
forefront: If True, adds the request to the forefront of the queue; otherwise, adds it to the end.
168+
169+
keep_url_fragment: Determines whether the URL fragment (the part of the URL after '#') should be retained
170+
in the unique key computation.
171+
172+
use_extended_unique_key: Determines whether to use an extended unique key, incorporating the request's
173+
method and payload into the unique key computation.
174+
175+
Returns: A dictionary containing information about the operation, including:
176+
- `requestId` (str): The ID of the request.
177+
- `uniqueKey` (str): The unique key associated with the request.
178+
- `wasAlreadyPresent` (bool): Indicates whether the request was already in the queue.
179+
- `wasAlreadyHandled` (bool): Indicates whether the request was already processed.
152180
"""
153181
budget_ow(
154182
request,
@@ -159,9 +187,13 @@ async def add_request(self: RequestQueue, request: dict, *, forefront: bool = Fa
159187
self._last_activity = datetime.now(timezone.utc)
160188

161189
if request.get('uniqueKey') is None:
162-
# TODO: Check Request class in crawlee and replicate uniqueKey generation logic...
163-
# https://github.com/apify/apify-sdk-python/issues/141
164-
request['uniqueKey'] = request['url']
190+
request['uniqueKey'] = compute_unique_key(
191+
url=request['url'],
192+
method=request.get('method', 'GET'),
193+
payload=request.get('payload'),
194+
keep_url_fragment=keep_url_fragment,
195+
use_extended_unique_key=use_extended_unique_key,
196+
)
165197

166198
cache_key = unique_key_to_request_id(request['uniqueKey'])
167199
cached_info = self._requests_cache.get(cache_key)

0 commit comments

Comments
 (0)