
Commit dc90127 (1 parent: 65942c5)

feat: Add transform_request_function parameter for SitemapRequestLoader (#1525)

### Description

This PR is inspired by this [discussion](#1517). It adds support for `transform_request_function` in `SitemapRequestLoader`, which works the same way as in `EnqueueLinksFunction`. This can be useful for setting a `label` for correct routing, or for attaching custom data via `user_data`.
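As a minimal sketch of the new parameter in use (the sitemap URL and the `'docs'` label here are hypothetical, for illustration only):

    from crawlee import RequestOptions, RequestTransformAction
    from crawlee.http_clients import ImpitHttpClient
    from crawlee.request_loaders import SitemapRequestLoader


    def transform_request(request_options: RequestOptions) -> RequestOptions | RequestTransformAction:
        # Route every URL from the sitemap to the handler registered for 'docs'.
        request_options['label'] = 'docs'
        return request_options


    loader = SitemapRequestLoader(
        sitemap_urls=['https://example.com/sitemap.xml'],  # hypothetical URL
        http_client=ImpitHttpClient(),
        transform_request_function=transform_request,
    )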

File tree: 4 files changed (+175, -4 lines)
code_examples/using_sitemap_request_loader.py

Lines changed: 101 additions & 0 deletions

import asyncio
from collections.abc import Callable

from yarl import URL

from crawlee import RequestOptions, RequestTransformAction
from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext
from crawlee.http_clients import ImpitHttpClient
from crawlee.request_loaders import SitemapRequestLoader


# Create a transform_request_function that maps request options based on the host in
# the URL
def create_transform_request(
    data_mapper: dict[str, dict],
) -> Callable[[RequestOptions], RequestOptions | RequestTransformAction]:
    def transform_request(
        request_options: RequestOptions,
    ) -> RequestOptions | RequestTransformAction:
        # According to the Sitemap protocol, all URLs in a Sitemap must be from a single
        # host.
        request_host = URL(request_options['url']).host

        if request_host and (mapping_data := data_mapper.get(request_host)):
            # Set properties from the mapping data
            if 'label' in mapping_data:
                request_options['label'] = mapping_data['label']
            if 'user_data' in mapping_data:
                request_options['user_data'] = mapping_data['user_data']

            return request_options

        return 'unchanged'

    return transform_request


async def main() -> None:
    # Prepare data mapping for hosts
    apify_host = URL('https://apify.com/sitemap.xml').host
    crawlee_host = URL('https://crawlee.dev/sitemap.xml').host

    if not apify_host or not crawlee_host:
        raise ValueError('Unable to extract host from URLs')

    data_map = {
        apify_host: {
            'label': 'apify',
            'user_data': {'source': 'apify'},
        },
        crawlee_host: {
            'label': 'crawlee',
            'user_data': {'source': 'crawlee'},
        },
    }

    # Initialize the SitemapRequestLoader with the transform function
    async with SitemapRequestLoader(
        # Set the sitemap URLs and the HTTP client
        sitemap_urls=['https://crawlee.dev/sitemap.xml', 'https://apify.com/sitemap.xml'],
        http_client=ImpitHttpClient(),
        transform_request_function=create_transform_request(data_map),
    ) as sitemap_loader:
        # Convert the sitemap loader to a request manager
        request_manager = await sitemap_loader.to_tandem()

        # Create and configure the crawler
        crawler = BeautifulSoupCrawler(
            request_manager=request_manager,
            max_requests_per_crawl=10,
        )

        # Create default handler for requests without a specific label
        @crawler.router.default_handler
        async def handler(context: BeautifulSoupCrawlingContext) -> None:
            source = context.request.user_data.get('source', 'unknown')
            context.log.info(
                f'Processing request: {context.request.url} from source: {source}'
            )

        # Create handler for requests labeled 'apify'
        @crawler.router.handler('apify')
        async def apify_handler(context: BeautifulSoupCrawlingContext) -> None:
            source = context.request.user_data.get('source', 'unknown')
            context.log.info(
                f'Apify handler processing: {context.request.url} from source: {source}'
            )

        # Create handler for requests labeled 'crawlee'
        @crawler.router.handler('crawlee')
        async def crawlee_handler(context: BeautifulSoupCrawlingContext) -> None:
            source = context.request.user_data.get('source', 'unknown')
            context.log.info(
                f'Crawlee handler processing: {context.request.url} from source: {source}'
            )

        await crawler.run()


if __name__ == '__main__':
    asyncio.run(main())
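For contrast, the loader can also be consumed directly, without converting it to a request manager via `to_tandem()`. A hedged sketch, modeled on the unit test added in this commit (the sitemap URL is hypothetical):

    import asyncio

    from crawlee.http_clients import ImpitHttpClient
    from crawlee.request_loaders import SitemapRequestLoader


    async def drain_loader() -> None:
        async with SitemapRequestLoader(
            sitemap_urls=['https://crawlee.dev/sitemap.xml'],
            http_client=ImpitHttpClient(),
        ) as loader:
            # Pull requests one at a time until the sitemap is exhausted.
            while not await loader.is_finished():
                request = await loader.fetch_next_request()
                if request is None:
                    break
                print(request.url)
                await loader.mark_request_as_handled(request)


    asyncio.run(drain_loader())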
using_sitemap_request_loader.mdx

Lines changed: 22 additions & 0 deletions

---
id: using-sitemap-request-loader
title: Using sitemap request loader
---

import ApiLink from '@site/src/components/ApiLink';
import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';

import SitemapRequestLoaderExample from '!!raw-loader!roa-loader!./code_examples/using_sitemap_request_loader.py';

This example demonstrates how to use <ApiLink to="class/SitemapRequestLoader">`SitemapRequestLoader`</ApiLink> to crawl websites that provide `sitemap.xml` files following the [Sitemaps protocol](https://www.sitemaps.org/protocol.html). The <ApiLink to="class/SitemapRequestLoader">`SitemapRequestLoader`</ApiLink> processes sitemaps in a streaming fashion without loading them entirely into memory, making it suitable for large sitemaps.

The example shows how to use the `transform_request_function` parameter to configure request options based on URL patterns. This allows you to modify request properties such as labels and user data based on the source URL, enabling different handling logic for different websites or sections.

The following code example implements processing of sitemaps from two different domains (Apify and Crawlee), with different labels assigned to requests based on their host. The `create_transform_request` function maps each host to the corresponding request configuration, while the crawler uses different handlers based on the assigned labels.

<RunnableCodeBlock className="language-python" language="python">
    {SitemapRequestLoaderExample}
</RunnableCodeBlock>

For more information about request loaders, see the [Request loaders guide](../guides/request-loaders).

src/crawlee/request_loaders/_sitemap_request_loader.py

Lines changed: 17 additions & 4 deletions
@@ -9,7 +9,7 @@
 from pydantic import BaseModel, ConfigDict, Field
 from typing_extensions import override
 
-from crawlee import Request
+from crawlee import Request, RequestOptions
 from crawlee._utils.docs import docs_group
 from crawlee._utils.globs import Glob
 from crawlee._utils.recoverable_state import RecoverableState
@@ -18,9 +18,10 @@
 
 if TYPE_CHECKING:
     import re
-    from collections.abc import Sequence
+    from collections.abc import Callable, Sequence
     from types import TracebackType
 
+    from crawlee import RequestTransformAction
     from crawlee.http_clients import HttpClient
     from crawlee.proxy_configuration import ProxyInfo
     from crawlee.storage_clients.models import ProcessedRequest
@@ -112,6 +113,7 @@ def __init__(
         exclude: list[re.Pattern[Any] | Glob] | None = None,
         max_buffer_size: int = 200,
         persist_state_key: str | None = None,
+        transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None,
     ) -> None:
         """Initialize the sitemap request loader.
 
@@ -125,13 +127,17 @@ def __init__(
             persist_state_key: A key for persisting the loader's state in the KeyValueStore.
                 When provided, allows resuming from where it left off after interruption.
                 If None, no state persistence occurs.
+            transform_request_function: An optional function to transform requests
+                generated by the loader. It receives `RequestOptions` with `url` and should return either
+                modified `RequestOptions` or a `RequestTransformAction`.
         """
         self._http_client = http_client
         self._sitemap_urls = sitemap_urls
         self._include = include
         self._exclude = exclude
         self._proxy_info = proxy_info
         self._max_buffer_size = max_buffer_size
+        self._transform_request_function = transform_request_function
 
         # Synchronization for queue operations
         self._queue_has_capacity = asyncio.Event()
@@ -313,8 +319,15 @@ async def fetch_next_request(self) -> Request | None:
 
             async with self._queue_lock:
                 url = state.url_queue.popleft()
-
-                request = Request.from_url(url)
+                request_option = RequestOptions(url=url)
+                if self._transform_request_function:
+                    transform_request_option = self._transform_request_function(request_option)
+                    if transform_request_option == 'skip':
+                        state.total_count -= 1
+                        continue
+                    if transform_request_option != 'unchanged':
+                        request_option = transform_request_option
+                request = Request.from_url(**request_option)
                 state.in_progress.add(request.url)
                 if len(state.url_queue) < self._max_buffer_size:
                     self._queue_has_capacity.set()
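Besides returning modified options or `'unchanged'`, a transform can return `'skip'`, which the new `fetch_next_request` logic handles by dropping the URL and decrementing the loader's total count. A minimal sketch (the image-extension filter and sitemap URL are hypothetical):

    from crawlee import RequestOptions, RequestTransformAction
    from crawlee.http_clients import ImpitHttpClient
    from crawlee.request_loaders import SitemapRequestLoader


    def skip_images(request_options: RequestOptions) -> RequestOptions | RequestTransformAction:
        # Drop image URLs listed in the sitemap; keep everything else as-is.
        if request_options['url'].endswith(('.png', '.jpg')):
            return 'skip'
        return 'unchanged'


    loader = SitemapRequestLoader(
        sitemap_urls=['https://example.com/sitemap.xml'],  # hypothetical URL
        http_client=ImpitHttpClient(),
        transform_request_function=skip_images,
    )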

tests/unit/request_loaders/test_sitemap_request_loader.py

Lines changed: 35 additions & 0 deletions
@@ -4,6 +4,7 @@
 
 from yarl import URL
 
+from crawlee import RequestOptions, RequestTransformAction
 from crawlee.http_clients._base import HttpClient
 from crawlee.request_loaders._sitemap_request_loader import SitemapRequestLoader
 from crawlee.storages import KeyValueStore
@@ -172,3 +173,37 @@ async def test_recovery_data_persistence_for_sitemap_loading(
 
     assert item is not None
     assert item.url == next_item_in_kvs
+
+
+async def test_transform_request_function(server_url: URL, http_client: HttpClient) -> None:
+    sitemap_url = (server_url / 'sitemap.xml').with_query(base64=encode_base64(BASIC_SITEMAP.encode()))
+
+    def transform_request(request_options: RequestOptions) -> RequestOptions | RequestTransformAction:
+        request_options['user_data'] = {'transformed': True}
+        return request_options
+
+    sitemap_loader = SitemapRequestLoader(
+        [str(sitemap_url)],
+        http_client=http_client,
+        transform_request_function=transform_request,
+    )
+
+    extracted_urls = set()
+
+    while not await sitemap_loader.is_finished():
+        request = await sitemap_loader.fetch_next_request()
+        assert request is not None
+        assert request.user_data.get('transformed') is True
+
+        extracted_urls.add(request.url)
+
+        await sitemap_loader.mark_request_as_handled(request)
+
+    assert len(extracted_urls) == 5
+    assert extracted_urls == {
+        'http://not-exists.com/',
+        'http://not-exists.com/catalog?item=12&desc=vacation_hawaii',
+        'http://not-exists.com/catalog?item=73&desc=vacation_new_zealand',
+        'http://not-exists.com/catalog?item=74&desc=vacation_newfoundland',
+        'http://not-exists.com/catalog?item=83&desc=vacation_usa',
+    }
