Skip to content

Commit 7dffcff

Browse files
authored
Fix issue with querying request queue head multiple times in parallel (#113)
When calling `_ensure_head_is_non_empty` on `RequestQueue`, the call to query the RQ head is cached (as an optimization) so that multiple parallel calls to the storage are not made, and in each call of `_ensure_head_is_non_empty` the cached call is then awaited. In Python (unlike JavaScript), you can't await the result of an async function multiple times; awaiting an already-awaited coroutine raises a `RuntimeError`. To get around that, you can wrap the result of the coroutine into an `asyncio.Task`, and that you can await multiple times (`asyncio.Task` behaves sort of like a JS `Promise` in this regard).
1 parent 8d5ba63 commit 7dffcff

File tree

2 files changed

+13
-9
lines changed

2 files changed

+13
-9
lines changed

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,10 @@ Changelog
44
[1.1.4](../../releases/tag/v1.1.4) - Unreleased
55
-----------------------------------------------
66

7+
### Fixes
8+
9+
- resolved issue with querying request queue head multiple times in parallel
10+
711
### Internal changes
812

913
- Fixed integration tests for Actor logger

src/apify/storages/request_queue.py

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import asyncio
22
from collections import OrderedDict
33
from datetime import datetime, timezone
4-
from typing import Coroutine, Dict, Optional
4+
from typing import Dict, Optional
55
from typing import OrderedDict as OrderedDictType
66
from typing import Set, Union
77

@@ -75,7 +75,7 @@ class RequestQueue(BaseStorage):
7575
_request_queue_client: Union[RequestQueueClientAsync, RequestQueueClient]
7676
_client_key = _crypto_random_object_id()
7777
_queue_head_dict: OrderedDictType[str, str]
78-
_query_queue_head_promise: Optional[Coroutine]
78+
_query_queue_head_task: Optional[asyncio.Task]
7979
_in_progress: Set[str]
8080
_last_activity: datetime
8181
_internal_timeout_seconds = 5 * 60
@@ -100,7 +100,7 @@ def __init__(self, id: str, name: Optional[str], client: Union[ApifyClientAsync,
100100

101101
self._request_queue_client = client.request_queue(self._id, client_key=self._client_key)
102102
self._queue_head_dict = OrderedDict()
103-
self._query_queue_head_promise = None
103+
self._query_queue_head_task = None
104104
self._in_progress = set()
105105
self._last_activity = datetime.now(timezone.utc)
106106
self._recently_handled = LRUCache[bool](max_length=RECENTLY_HANDLED_CACHE_SIZE)
@@ -369,7 +369,7 @@ async def is_finished(self) -> bool:
369369

370370
def _reset(self) -> None:
371371
self._queue_head_dict.clear()
372-
self._query_queue_head_promise = None
372+
self._query_queue_head_task = None
373373
self._in_progress.clear()
374374
self._recently_handled.clear()
375375
self._assumed_total_count = 0
@@ -402,7 +402,7 @@ async def _queue_query_head(self, limit: int) -> Dict:
402402
})
403403

404404
# This is needed so that the next call to _ensureHeadIsNonEmpty() will fetch the queue head again.
405-
self._query_queue_head_promise = None
405+
self._query_queue_head_task = None
406406

407407
return {
408408
'wasLimitReached': len(list_head['items']) >= limit,
@@ -420,15 +420,15 @@ async def _ensure_head_is_non_empty(self, ensure_consistency: bool = False, limi
420420
if limit is None:
421421
limit = max(self._in_progress_count() * QUERY_HEAD_BUFFER, QUERY_HEAD_MIN_LENGTH)
422422

423-
if self._query_queue_head_promise is None:
424-
self._query_queue_head_promise = self._queue_query_head(limit)
423+
if self._query_queue_head_task is None:
424+
self._query_queue_head_task = asyncio.Task(self._queue_query_head(limit))
425425

426-
queue_head = await self._query_queue_head_promise
426+
queue_head = await self._query_queue_head_task
427427

428428
# TODO: I feel this code below can be greatly simplified... (comes from TS implementation *wink*)
429429

430430
""" If queue is still empty then one of the following holds:
431-
- the other calls waiting for this promise already consumed all the returned requests
431+
- the other calls waiting for this task already consumed all the returned requests
432432
- the limit was too low and contained only requests in progress
433433
- the writes from other clients were not propagated yet
434434
- the whole queue was processed and we are done

0 commit comments

Comments
 (0)