Skip to content

Commit 8c5e638

Browse files
authored
Improve error handling for to_apify_request serialization failures (#191)
1 parent 5542d3a commit 8c5e638

File tree

4 files changed

+49
-39
lines changed

4 files changed

+49
-39
lines changed

CHANGELOG.md

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,9 @@
22

33
## [1.6.1](../../releases/tag/v1.6.1) - Unreleased
44

5-
...
5+
### Fixed
6+
7+
- Improve error handling for `to_apify_request` serialization failures
68

79
## [1.6.0](../../releases/tag/v1.6.0) - 2024-02-23
810

src/apify/scrapy/requests.py

Lines changed: 36 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -24,57 +24,57 @@ def _is_request_produced_by_middleware(scrapy_request: Request) -> bool:
2424
return bool(scrapy_request.meta.get('redirect_times')) or bool(scrapy_request.meta.get('retry_times'))
2525

2626

27-
def to_apify_request(scrapy_request: Request, spider: Spider) -> dict:
27+
def to_apify_request(scrapy_request: Request, spider: Spider) -> dict | None:
2828
"""Convert a Scrapy request to an Apify request.
2929
3030
Args:
3131
scrapy_request: The Scrapy request to be converted.
3232
spider: The Scrapy spider that the request is associated with.
3333
34-
Raises:
35-
TypeError: If the scrapy_request is not an instance of the scrapy.Request class.
36-
3734
Returns:
38-
The converted Apify request.
35+
The converted Apify request if the conversion was successful, otherwise None.
3936
"""
4037
if not isinstance(scrapy_request, Request):
41-
raise TypeError('scrapy_request must be an instance of the scrapy.Request class')
38+
Actor.log.warning('Failed to convert to Apify request: Scrapy request must be a Request instance.')
39+
return None
4240

4341
call_id = crypto_random_object_id(8)
4442
Actor.log.debug(f'[{call_id}]: to_apify_request was called (scrapy_request={scrapy_request})...')
4543

46-
apify_request = {
47-
'url': scrapy_request.url,
48-
'method': scrapy_request.method,
49-
'userData': scrapy_request.meta.get('userData', {}),
50-
}
44+
try:
45+
apify_request = {
46+
'url': scrapy_request.url,
47+
'method': scrapy_request.method,
48+
'userData': scrapy_request.meta.get('userData', {}),
49+
}
5150

52-
if isinstance(scrapy_request.headers, Headers):
53-
apify_request['headers'] = dict(scrapy_request.headers.to_unicode_dict())
54-
else:
55-
Actor.log.warning(
56-
f'scrapy_request.headers is not an instance of the scrapy.http.headers.Headers class, scrapy_request.headers = {scrapy_request.headers}',
57-
)
51+
if isinstance(scrapy_request.headers, Headers):
52+
apify_request['headers'] = dict(scrapy_request.headers.to_unicode_dict())
53+
else:
54+
Actor.log.warning(f'Invalid scrapy_request.headers type, not scrapy.http.headers.Headers: {scrapy_request.headers}')
5855

59-
if _is_request_produced_by_middleware(scrapy_request):
60-
apify_request['uniqueKey'] = scrapy_request.url
61-
else:
62-
# Add 'id' to the apify_request
63-
if scrapy_request.meta.get('apify_request_id'):
64-
apify_request['id'] = scrapy_request.meta['apify_request_id']
65-
66-
# Add 'uniqueKey' to the apify_request
67-
if scrapy_request.meta.get('apify_request_unique_key'):
68-
apify_request['uniqueKey'] = scrapy_request.meta['apify_request_unique_key']
69-
70-
# Serialize the Scrapy Request and store it in the apify_request.
71-
# - This process involves converting the Scrapy Request object into a dictionary, encoding it to base64,
72-
# and storing it as 'scrapy_request' within the 'userData' dictionary of the apify_request.
73-
# - The serialization process can be referenced at: https://stackoverflow.com/questions/30469575/.
74-
scrapy_request_dict = scrapy_request.to_dict(spider=spider)
75-
scrapy_request_dict_encoded = codecs.encode(pickle.dumps(scrapy_request_dict), 'base64').decode()
76-
77-
apify_request['userData']['scrapy_request'] = scrapy_request_dict_encoded
56+
if _is_request_produced_by_middleware(scrapy_request):
57+
apify_request['uniqueKey'] = scrapy_request.url
58+
else:
59+
# Add 'id' to the apify_request
60+
if scrapy_request.meta.get('apify_request_id'):
61+
apify_request['id'] = scrapy_request.meta['apify_request_id']
62+
63+
# Add 'uniqueKey' to the apify_request
64+
if scrapy_request.meta.get('apify_request_unique_key'):
65+
apify_request['uniqueKey'] = scrapy_request.meta['apify_request_unique_key']
66+
67+
# Serialize the Scrapy Request and store it in the apify_request.
68+
# - This process involves converting the Scrapy Request object into a dictionary, encoding it to base64,
69+
# and storing it as 'scrapy_request' within the 'userData' dictionary of the apify_request.
70+
# - The serialization process can be referenced at: https://stackoverflow.com/questions/30469575/.
71+
scrapy_request_dict = scrapy_request.to_dict(spider=spider)
72+
scrapy_request_dict_encoded = codecs.encode(pickle.dumps(scrapy_request_dict), 'base64').decode()
73+
apify_request['userData']['scrapy_request'] = scrapy_request_dict_encoded
74+
75+
except Exception as exc:
76+
Actor.log.warning(f'Conversion of Scrapy request {scrapy_request} to Apify request failed; {exc}')
77+
return None
7878

7979
Actor.log.debug(f'[{call_id}]: scrapy_request was converted to the apify_request={apify_request}')
8080
return apify_request

src/apify/scrapy/scheduler.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,10 @@ def enqueue_request(self: ApifyScheduler, request: Request) -> bool:
8585
raise TypeError('self.spider must be an instance of the Spider class')
8686

8787
apify_request = to_apify_request(request, spider=self.spider)
88+
if apify_request is None:
89+
Actor.log.error(f'Request {request} was not enqueued because it could not be converted to Apify request.')
90+
return False
91+
8892
Actor.log.debug(f'[{call_id}]: scrapy_request was transformed to apify_request (apify_request={apify_request})')
8993

9094
if not isinstance(self._rq, RequestQueue):

tests/unit/scrapy/requests/test_to_apify_request.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ def test__to_apify_request__simple(spider: Spider) -> None:
2121
scrapy_request = Request(url='https://example.com')
2222

2323
apify_request = to_apify_request(scrapy_request, spider)
24+
assert apify_request is not None
2425
assert apify_request.get('url') == 'https://example.com'
2526

2627
user_data = apify_request.get('userData', {})
@@ -35,6 +36,7 @@ def test__to_apify_request__headers(spider: Spider) -> None:
3536

3637
apify_request = to_apify_request(scrapy_request, spider)
3738

39+
assert apify_request is not None
3840
assert apify_request['headers'] == dict(scrapy_request_headers.to_unicode_dict())
3941

4042

@@ -47,6 +49,7 @@ def test__to_apify_request__without_id_and_unique_key(spider: Spider) -> None:
4749

4850
apify_request = to_apify_request(scrapy_request, spider)
4951

52+
assert apify_request is not None
5053
assert apify_request.get('url') == 'https://example.com'
5154
assert apify_request.get('method') == 'GET'
5255

@@ -70,6 +73,7 @@ def test__to_apify_request__with_id_and_unique_key(spider: Spider) -> None:
7073
)
7174

7275
apify_request = to_apify_request(scrapy_request, spider)
76+
assert apify_request is not None
7377

7478
assert apify_request.get('url') == 'https://example.com'
7579
assert apify_request.get('method') == 'GET'
@@ -87,5 +91,5 @@ def test__to_apify_request__with_id_and_unique_key(spider: Spider) -> None:
8791
def test__to_apify_request__invalid_scrapy_request(spider: Spider) -> None:
8892
scrapy_request = 'invalid_request'
8993

90-
with pytest.raises(TypeError):
91-
to_apify_request(scrapy_request, spider)
94+
apify_request = to_apify_request(scrapy_request, spider)
95+
assert apify_request is None

0 commit comments

Comments
 (0)