|
8 | 8 | from crawlee import service_locator |
9 | 9 | from crawlee.configuration import Configuration |
10 | 10 | from crawlee.crawlers import HttpCrawler, HttpCrawlingContext |
| 11 | +from crawlee.statistics import Statistics |
11 | 12 | from crawlee.storage_clients import MemoryStorageClient |
12 | 13 | from crawlee.storage_clients._file_system._storage_client import FileSystemStorageClient |
13 | 14 |
|
@@ -35,26 +36,51 @@ def test_global_configuration_works_reversed() -> None: |
35 | 36 | ) |
36 | 37 |
|
37 | 38 |
|
38 | | -async def test_storage_not_persisted_when_disabled(tmp_path: Path, server_url: URL) -> None: |
| 39 | +async def test_storage_not_persisted_when_non_persistable_storage_used(tmp_path: Path, server_url: URL) -> None: |
| 40 | +    """Make the Crawler use MemoryStorageClient, which can't persist state.""" |
| 41 | + service_locator.set_configuration( |
| 42 | + Configuration( |
| 43 | + crawlee_storage_dir=str(tmp_path), # type: ignore[call-arg] |
| 44 | + ) |
| 45 | + ) |
| 46 | + crawler = HttpCrawler(storage_client=MemoryStorageClient()) |
| 47 | + |
| 48 | + @crawler.router.default_handler |
| 49 | + async def default_handler(context: HttpCrawlingContext) -> None: |
| 50 | + await context.push_data({'url': context.request.url}) |
| 51 | + |
| 52 | + await crawler.run([str(server_url)]) |
| 53 | + |
| 54 | + # Verify that no files were created in the storage directory. |
| 55 | + content = list(tmp_path.iterdir()) |
| 56 | + assert content == [], 'Expected the storage directory to be empty, but it is not.' |
| 57 | + |
| 58 | + |
| 59 | +async def test_storage_persisted_with_explicit_statistics_with_persistable_storage( |
| 60 | + tmp_path: Path, server_url: URL |
| 61 | +) -> None: |
| 62 | +    """Make the Crawler use MemoryStorageClient, which can't persist state, |
| 63 | +    but pass it explicit statistics that use the global FileSystemStorageClient, which can persist state.""" |
| 64 | + |
39 | 65 | configuration = Configuration( |
40 | 66 | crawlee_storage_dir=str(tmp_path), # type: ignore[call-arg] |
41 | 67 | ) |
42 | | - storage_client = MemoryStorageClient() |
43 | | - |
44 | 68 | service_locator.set_configuration(configuration) |
45 | | - service_locator.set_storage_client(storage_client) |
| 69 | + service_locator.set_storage_client(FileSystemStorageClient()) |
46 | 70 |
|
47 | | - crawler = HttpCrawler() |
| 71 | + crawler = HttpCrawler( |
| 72 | + storage_client=MemoryStorageClient(), statistics=Statistics.with_default_state(persistence_enabled=True) |
| 73 | + ) |
48 | 74 |
|
49 | 75 | @crawler.router.default_handler |
50 | 76 | async def default_handler(context: HttpCrawlingContext) -> None: |
51 | 77 | await context.push_data({'url': context.request.url}) |
52 | 78 |
|
53 | 79 | await crawler.run([str(server_url)]) |
54 | 80 |
|
55 | | - # Verify that no files were created in the storage directory. |
| 81 | + # Verify that files were created in the storage directory. |
56 | 82 | content = list(tmp_path.iterdir()) |
57 | | - assert content == [], 'Expected the storage directory to be empty, but it is not.' |
| 83 | + assert content != [], 'Expected the storage directory to contain files, but it does not.' |
58 | 84 |
|
59 | 85 |
|
60 | 86 | async def test_storage_persisted_when_enabled(tmp_path: Path, server_url: URL) -> None: |
|
0 commit comments