Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
47 changes: 34 additions & 13 deletions src/apify/storage_clients/_apify/_storage_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,23 +21,44 @@

@docs_group('Storage clients')
class ApifyStorageClient(StorageClient):
"""Apify storage client."""
"""Apify platform implementation of the storage client.
This storage client provides access to datasets, key-value stores, and request queues that persist data
to the Apify platform. Each storage type is implemented with its own specific Apify client that stores data
in the cloud, making it accessible from anywhere.
The communication with the Apify platform is handled via the Apify API client for Python, which is an HTTP API
wrapper. For maximum efficiency and performance of the storage clients, various caching mechanisms are used to
minimize the number of API calls made to the Apify platform. Data can be inspected and manipulated through
the Apify console web interface or via the Apify API.
The request queue client supports two access modes controlled by the `request_queue_access` parameter:
### Single mode
The `single` mode is optimized for scenarios with only one consumer. It makes fewer API calls and is therefore
faster and more cost-efficient, but it comes with several constraints. Only one client should consume the request
queue at a time. Multiple producers can add new requests, but forefront requests might not be processed
immediately, since this mode relies on local head estimation rather than frequent forefront fetching. Requests
can be added or marked as handled by other clients, but they must not be deleted or modified, as such changes
would not be reflected in the local cache.
### Shared mode
The `shared` mode is designed for scenarios with multiple concurrent consumers. It ensures proper synchronization
and consistency across clients, at the cost of higher API usage and slightly worse performance. This mode is safe
for concurrent access from multiple processes, including Actors running in parallel on the Apify platform. It
should be used when multiple consumers need to process requests from the same queue simultaneously.
"""

def __init__(self, *, request_queue_access: Literal['single', 'shared'] = 'single') -> None:
"""Initialize the Apify storage client.
"""Initialize a new instance.
Args:
request_queue_access: Controls the implementation of the request queue client based on expected scenario:
- 'single' is suitable for single consumer scenarios. It makes less API calls, is cheaper and faster.
- 'shared' is suitable for multiple consumers scenarios at the cost of higher API usage.
Detailed constraints for the 'single' access type:
- Only one client is consuming the request queue at the time.
- Multiple producers can put requests to the queue, but their forefront requests are not guaranteed to
be handled so quickly as this client does not aggressively fetch the forefront and relies on local
head estimation.
- Requests are only added to the queue, never deleted by other clients. (Marking as handled is ok.)
- Other producers can add new requests, but not modify existing ones.
(Modifications would not be included in local cache)
request_queue_access: Defines how the request queue client behaves. Use `single` mode for a single
consumer. It makes fewer API calls, meaning better performance and lower costs. If you need multiple
concurrent consumers, use `shared` mode, but expect worse performance and higher costs due to
the additional overhead.
"""
self._request_queue_access = request_queue_access

Expand Down
24 changes: 16 additions & 8 deletions src/apify/storage_clients/_smart_apify/_storage_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,18 @@

@docs_group('Storage clients')
class SmartApifyStorageClient(StorageClient):
"""SmartApifyStorageClient that delegates to cloud_storage_client or local_storage_client.
"""Storage client that automatically selects cloud or local storage client based on the environment.

When running on Apify platform use cloud_storage_client, else use local_storage_client. This storage client is
designed to work specifically in Actor context.
This storage client provides access to datasets, key-value stores, and request queues by intelligently
delegating to either the cloud or local storage client based on the execution environment and configuration.

When running on the Apify platform (which is detected via environment variables), this client automatically
uses the `cloud_storage_client` to store data on the platform. When running locally, it uses the
`local_storage_client` to store data locally. You can also force cloud storage usage from your
local machine by using the `force_cloud` argument.

This storage client is designed to work specifically in `Actor` context and provides a seamless development
experience where the same code works both locally and on the Apify platform without any changes.
"""

def __init__(
Expand All @@ -31,13 +39,13 @@ def __init__(
cloud_storage_client: ApifyStorageClient | None = None,
local_storage_client: StorageClient | None = None,
) -> None:
"""Initialize the Apify storage client.
"""Initialize a new instance.

Args:
cloud_storage_client: Client used to communicate with the Apify platform storage. Either through
`force_cloud` argument when opening storages or automatically when running on the Apify platform.
local_storage_client: Client used to communicate with the storage when not running on the Apify
platform and not using `force_cloud` argument when opening storages.
cloud_storage_client: Storage client used when an Actor is running on the Apify platform, or when
explicitly enabled via the `force_cloud` argument. Defaults to `ApifyStorageClient`.
local_storage_client: Storage client used when an Actor is not running on the Apify platform and when
`force_cloud` flag is not set. Defaults to `ApifyFileSystemStorageClient`.
"""
self._cloud_storage_client = cloud_storage_client or ApifyStorageClient(request_queue_access='single')
self._local_storage_client = local_storage_client or ApifyFileSystemStorageClient()
Expand Down
4 changes: 4 additions & 0 deletions website/docusaurus.config.js
Original file line number Diff line number Diff line change
Expand Up @@ -239,6 +239,10 @@ module.exports = {
url: 'https://crawlee.dev/python/api/class/FileSystemStorageClient',
group: 'Storage clients',
},
{
url: 'https://crawlee.dev/python/api/class/SqlStorageClient',
group: 'Storage clients',
},
// Request loaders
{
url: 'https://crawlee.dev/python/api/class/RequestLoader',
Expand Down