@@ -1,22 +1,23 @@
 from __future__ import annotations

 import asyncio
-import base64
 import builtins
 import contextlib
 import functools
-import hashlib
 import inspect
 import json
 import mimetypes
 import os
 import re
 import sys
 import time
+from base64 import b64encode
 from collections import OrderedDict
 from collections.abc import MutableMapping
 from datetime import datetime, timezone
+from hashlib import sha256
 from importlib import metadata
+from logging import getLogger
 from typing import (
     Any,
     Callable,
@@ -30,6 +31,7 @@
     overload,
 )
 from typing import OrderedDict as OrderedDictType
+from urllib.parse import parse_qsl, urlencode, urlparse

 import aioshutil
 import psutil
@@ -59,6 +61,7 @@
 from apify.consts import REQUEST_ID_LENGTH, StorageTypes

 T = TypeVar('T')
+logger = getLogger(__name__)


 def get_system_info() -> dict:
@@ -292,9 +295,8 @@ def maybe_parse_body(body: bytes, content_type: str) -> Any:

 def unique_key_to_request_id(unique_key: str) -> str:
     """Generate request ID based on unique key in a deterministic way."""
-    id = re.sub(r'(\+|\/|=)', '', base64.b64encode(hashlib.sha256(unique_key.encode('utf-8')).digest()).decode('utf-8'))  # noqa: A001
-
-    return id[:REQUEST_ID_LENGTH] if len(id) > REQUEST_ID_LENGTH else id
+    request_id = re.sub(r'(\+|\/|=)', '', b64encode(sha256(unique_key.encode('utf-8')).digest()).decode('utf-8'))
+    return request_id[:REQUEST_ID_LENGTH] if len(request_id) > REQUEST_ID_LENGTH else request_id


 async def force_rename(src_dir: str, dst_dir: str) -> None:
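
A quick, illustrative sketch of the renamed helper's contract (not part of the diff; it relies only on the base64/SHA-256 construction above, with unique_key_to_request_id and REQUEST_ID_LENGTH from apify.consts in scope):

# The ID is the base64 of the SHA-256 digest with '+', '/' and '=' stripped,
# then truncated to at most REQUEST_ID_LENGTH characters.
request_id = unique_key_to_request_id('https://example.com')
assert len(request_id) <= REQUEST_ID_LENGTH
assert not set('+/=') & set(request_id)
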
@@ -410,3 +412,113 @@ def validate_single(field_value: Any, expected_type: type, required: bool, name:
 PARSE_DATE_FIELDS_MAX_DEPTH = 3
 PARSE_DATE_FIELDS_KEY_SUFFIX = 'At'
 ListOrDictOrAny = TypeVar('ListOrDictOrAny', list, dict, Any)
+
+
+def compute_short_hash(data: bytes, *, length: int = 8) -> str:
+    """Computes a hexadecimal SHA-256 hash of the provided data and returns a prefix of it.
+
+    Args:
+        data: The binary data to be hashed.
+        length: The length of the hash prefix to be returned.
+
+    Returns:
+        A prefix of the hexadecimal hash of the data.
+    """
+    hash_object = sha256(data)
+    return hash_object.hexdigest()[:length]
+
+
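A quick check of what compute_short_hash produces (illustrative, not part of the diff; the expected values follow from the well-known SHA-256 digest of b'hello world', which starts with b94d27b9):

assert compute_short_hash(b'hello world') == 'b94d27b9'        # default length=8
assert compute_short_hash(b'hello world', length=4) == 'b94d'  # shorter prefix
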
+def normalize_url(url: str, *, keep_url_fragment: bool = False) -> str:
+    """Normalizes a URL.
+
+    This function cleans and standardizes a URL by stripping leading and trailing whitespace,
+    lower-casing the result, removing tracking parameters (specifically those beginning with
+    'utm_'), sorting the remaining query parameters alphabetically, and optionally retaining
+    the URL fragment. The goal is to ensure that URLs which are functionally identical but
+    differ in trivial ways (such as parameter order or casing) are treated as the same.
+
+    Args:
+        url: The URL to be normalized.
+        keep_url_fragment: Flag to determine whether the fragment part of the URL should be retained.
+
+    Returns:
+        A string containing the normalized URL.
+    """
+    # Parse the URL and convert its query string to a dict.
+    parsed_url = urlparse(url.strip())
+    search_params = dict(parse_qsl(parsed_url.query))
+
+    # Remove any 'utm_' tracking parameters.
+    search_params = {k: v for k, v in search_params.items() if not k.startswith('utm_')}
+
+    # Construct the new query string with the remaining parameters sorted alphabetically.
+    sorted_keys = sorted(search_params.keys())
+    sorted_query = urlencode([(k, search_params[k]) for k in sorted_keys])
+
+    # Reassemble the URL with the sorted query and no trailing slash, then lower-case it.
+    new_url = (
+        parsed_url._replace(
+            query=sorted_query,
+            path=parsed_url.path.rstrip('/'),
+        )
+        .geturl()
+        .lower()
+    )
+
+    # Drop the URL fragment unless it should be retained.
+    if not keep_url_fragment:
+        new_url = new_url.split('#')[0]
+
+    return new_url
+
+
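Illustrative input/output for the normalization, traced from the code above (not part of the diff; note that the whole URL is lower-cased and the trailing slash is dropped):

url = 'HTTPS://Example.COM/Path/?utm_source=mail&b=2&a=1#frag'
assert normalize_url(url) == 'https://example.com/path?a=1&b=2'
# With keep_url_fragment=True, the fragment survives (lower-cased like the rest).
assert normalize_url(url, keep_url_fragment=True) == 'https://example.com/path?a=1&b=2#frag'
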
+def compute_unique_key(
+    url: str,
+    method: str = 'GET',
+    payload: bytes | None = None,
+    *,
+    keep_url_fragment: bool = False,
+    use_extended_unique_key: bool = False,
+) -> str:
+    """Computes a unique key for caching and deduplication of requests.
+
+    This function computes a unique key by normalizing the provided URL and method.
+    If 'use_extended_unique_key' is True and a payload is provided, the payload is hashed and
+    included in the key. Otherwise, the unique key is just the normalized URL.
+
+    Args:
+        url: The request URL.
+        method: The HTTP method, defaults to 'GET'.
+        payload: The request payload, defaults to None.
+        keep_url_fragment: A flag indicating whether to keep the URL fragment, defaults to False.
+        use_extended_unique_key: A flag indicating whether to include a hashed payload in the key, defaults to False.
+
+    Returns:
+        A string representing the unique key for the request.
+    """
+    # Normalize the URL; fall back to the raw URL if normalization fails.
+    try:
+        normalized_url = normalize_url(url, keep_url_fragment=keep_url_fragment)
+    except Exception as exc:
+        logger.warning(f'Failed to normalize URL: {exc}')
+        normalized_url = url
+
+    normalized_method = method.upper()
+
+    # Compute and return the extended unique key if required.
+    if use_extended_unique_key:
+        payload_hash = compute_short_hash(payload) if payload else ''
+        return f'{normalized_method}({payload_hash}):{normalized_url}'
+
+    # Hint at the extended unique key when a non-GET request carries a payload,
+    # since such requests would otherwise collide on the plain normalized URL.
+    if normalized_method != 'GET' and payload:
+        logger.info(
+            f'Encountered a {normalized_method} request with a payload. If your requests point to the same URL '
+            'and differ only in method or payload, consider using the "use_extended_unique_key" option.'
+        )
+
+    # Return the normalized URL as the unique key.
+    return normalized_url
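
Tying it together, an illustrative run of compute_unique_key (not part of the diff; the payload hash is whatever compute_short_hash yields for that body, so it is only checked structurally here):

# Plain key: method and payload are ignored; the URL is normalized.
assert compute_unique_key('http://Example.com/?b=2&a=1') == 'http://example.com?a=1&b=2'

# Extended key: the upper-cased method and an 8-char payload hash are baked in,
# e.g. 'POST(<hash8>):http://example.com?a=1&b=2'.
key = compute_unique_key(
    'http://example.com/?b=2&a=1',
    method='post',
    payload=b'{"query": "foo"}',
    use_extended_unique_key=True,
)
assert key.startswith('POST(')
assert key.endswith('):http://example.com?a=1&b=2')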