From 3bdef50f7c532d53b8e02e463e558409d035b42f Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Tue, 21 Oct 2025 12:02:48 +0200 Subject: [PATCH 1/3] Re-export SQL storage client from Crawlee --- website/docusaurus.config.js | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/website/docusaurus.config.js b/website/docusaurus.config.js index 9ec99007..b58767e3 100644 --- a/website/docusaurus.config.js +++ b/website/docusaurus.config.js @@ -239,6 +239,10 @@ module.exports = { url: 'https://crawlee.dev/python/api/class/FileSystemStorageClient', group: 'Storage clients', }, + { + url: 'https://crawlee.dev/python/api/class/SqlStorageClient', + group: 'Storage clients', + }, // Request loaders { url: 'https://crawlee.dev/python/api/class/RequestLoader', From 4d547433f2ae148f51c8eb0ebad06b03cd404b7d Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Tue, 21 Oct 2025 14:50:49 +0200 Subject: [PATCH 2/3] Improve API docs --- .../storage_clients/_apify/_storage_client.py | 47 ++++++++++++++----- .../_smart_apify/_storage_client.py | 24 ++++++---- 2 files changed, 50 insertions(+), 21 deletions(-) diff --git a/src/apify/storage_clients/_apify/_storage_client.py b/src/apify/storage_clients/_apify/_storage_client.py index 2bee6527..6db3b39a 100644 --- a/src/apify/storage_clients/_apify/_storage_client.py +++ b/src/apify/storage_clients/_apify/_storage_client.py @@ -21,23 +21,44 @@ @docs_group('Storage clients') class ApifyStorageClient(StorageClient): - """Apify storage client.""" + """Apify platform implementation of the storage client. + + This storage client provides access to datasets, key-value stores, and request queues that persist data + to the Apify platform. Each storage type is implemented with its own specific Apify client that stores data + in the cloud, making it accessible from anywhere. + + The communication with the Apify platform is handled via the Apify API client for Python, which is an HTTP API + wrapper. 
For maximum efficiency and performance of the storage clients, various caching mechanisms are used to + minimize the number of API calls made to the Apify platform. Data can be inspected and manipulated through + the Apify console web interface or via the Apify API. + + The request queue client supports two access modes controlled by the `request_queue_access` parameter: + + ### Single mode + + The `single` mode is optimized for scenarios with only one consumer. It makes fewer API calls and is therefore + faster and more cost-efficient, but it comes with several constraints. Only one client should consume the request + queue at a time. Multiple producers can add new requests, but forefront requests might not be processed + immediately, since this mode relies on local head estimation rather than frequent forefront fetching. Requests + can be added or marked as handled by other clients, but they must not be deleted or modified, as such changes + would not be reflected in the local cache. + + ### Shared mode + + The `shared` mode is designed for scenarios with multiple concurrent consumers. It ensures proper synchronization + and consistency across clients, at the cost of higher API usage and slightly worse performance. This mode is safe + for concurrent access from multiple processes, including Actors running in parallel on the Apify platform. It + should be used when multiple consumers need to process requests from the same queue simultaneously. + """ def __init__(self, *, request_queue_access: Literal['single', 'shared'] = 'single') -> None: - """Initialize the Apify storage client. + """Initialize a new instance. Args: - request_queue_access: Controls the implementation of the request queue client based on expected scenario: - - 'single' is suitable for single consumer scenarios. It makes less API calls, is cheaper and faster. - - 'shared' is suitable for multiple consumers scenarios at the cost of higher API usage. 
- Detailed constraints for the 'single' access type: - - Only one client is consuming the request queue at the time. - - Multiple producers can put requests to the queue, but their forefront requests are not guaranteed to - be handled so quickly as this client does not aggressively fetch the forefront and relies on local - head estimation. - - Requests are only added to the queue, never deleted by other clients. (Marking as handled is ok.) - - Other producers can add new requests, but not modify existing ones. - (Modifications would not be included in local cache) + request_queue_access: Defines how the request queue client behaves. Use `single` mode for a single + consumer. It has fewer API calls, meaning better performance and lower costs. If you need multiple + concurrent consumers use `shared` mode, but expect worse performance and higher costs due to + the additional overhead. """ self._request_queue_access = request_queue_access diff --git a/src/apify/storage_clients/_smart_apify/_storage_client.py b/src/apify/storage_clients/_smart_apify/_storage_client.py index db1b8b5d..92b3d1e9 100644 --- a/src/apify/storage_clients/_smart_apify/_storage_client.py +++ b/src/apify/storage_clients/_smart_apify/_storage_client.py @@ -19,10 +19,18 @@ @docs_group('Storage clients') class SmartApifyStorageClient(StorageClient): - """SmartApifyStorageClient that delegates to cloud_storage_client or local_storage_client. + """Storage client that automatically selects cloud or local storage client based on the environment. - When running on Apify platform use cloud_storage_client, else use local_storage_client. This storage client is - designed to work specifically in Actor context. + This storage client provides access to datasets, key-value stores, and request queues by intelligently + delegating to either the cloud or local storage client based on the execution environment and configuration. 
+ + When running on the Apify platform (which is detected via environment variables), this client automatically + uses the `cloud_storage_client`, so storage data is kept on the platform. When running locally, it uses the + `local_storage_client` instead. You can also force cloud storage usage from your + local machine by using the `force_cloud` argument. + + This storage client is designed to work specifically in the `Actor` context and provides a seamless development + experience where the same code works both locally and on the Apify platform without any changes. """ def __init__( @@ -31,13 +39,13 @@ def __init__( cloud_storage_client: ApifyStorageClient | None = None, local_storage_client: StorageClient | None = None, ) -> None: - """Initialize the Apify storage client. + """Initialize a new instance. Args: - cloud_storage_client: Client used to communicate with the Apify platform storage. Either through - `force_cloud` argument when opening storages or automatically when running on the Apify platform. - local_storage_client: Client used to communicate with the storage when not running on the Apify - platform and not using `force_cloud` argument when opening storages. + cloud_storage_client: Storage client used when an Actor is running on the Apify platform, or when + explicitly enabled via the `force_cloud` argument. Defaults to `ApifyStorageClient`. + local_storage_client: Storage client used when an Actor is not running on the Apify platform and the + `force_cloud` flag is not set. Defaults to `ApifyFileSystemStorageClient`. 
""" self._cloud_storage_client = cloud_storage_client or ApifyStorageClient(request_queue_access='single') self._local_storage_client = local_storage_client or ApifyFileSystemStorageClient() From 033c495a01505593f2d657f2ef5f80bed17400c1 Mon Sep 17 00:00:00 2001 From: Vlada Dusek Date: Tue, 21 Oct 2025 16:12:40 +0200 Subject: [PATCH 3/3] address feedback --- .../storage_clients/_apify/_storage_client.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/src/apify/storage_clients/_apify/_storage_client.py b/src/apify/storage_clients/_apify/_storage_client.py index 6db3b39a..cab538f8 100644 --- a/src/apify/storage_clients/_apify/_storage_client.py +++ b/src/apify/storage_clients/_apify/_storage_client.py @@ -36,12 +36,18 @@ class ApifyStorageClient(StorageClient): ### Single mode - The `single` mode is optimized for scenarios with only one consumer. It makes fewer API calls and is therefore - faster and more cost-efficient, but it comes with several constraints. Only one client should consume the request - queue at a time. Multiple producers can add new requests, but forefront requests might not be processed - immediately, since this mode relies on local head estimation rather than frequent forefront fetching. Requests - can be added or marked as handled by other clients, but they must not be deleted or modified, as such changes - would not be reflected in the local cache. + The `single` mode is optimized for scenarios with only one consumer. It minimizes API calls, making it faster + and more cost-efficient compared to the `shared` mode. This option is ideal when a single Actor is responsible + for consuming the entire request queue. Using multiple consumers simultaneously may lead to inconsistencies + or unexpected behavior. + + In this mode, multiple producers can safely add new requests, but forefront requests may not be processed + immediately, as the client relies on local head estimation instead of frequent forefront fetching. 
Requests can + also be added or marked as handled by other clients, but they must not be deleted or modified, since such changes + would not be reflected in the local cache. If a request is already fully cached locally, marking it as handled + by another client will be ignored by this client. This does not cause errors but can occasionally result in + reprocessing a request that was already handled elsewhere. If the request was not yet cached locally, marking + it as handled poses no issue. ### Shared mode