From 9d46e2bf93a07a25da8eb77db258d6f5d00870a0 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Wed, 16 Apr 2025 09:17:55 +0200 Subject: [PATCH 1/2] Update `UnprocessedRequest` to match actual data. Add test. --- src/crawlee/storage_clients/models.py | 4 ++-- .../_memory/test_memory_storage_client.py | 19 +++++++++++++++++++ 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/src/crawlee/storage_clients/models.py b/src/crawlee/storage_clients/models.py index 7cceae3546..dc56e7bd90 100644 --- a/src/crawlee/storage_clients/models.py +++ b/src/crawlee/storage_clients/models.py @@ -224,8 +224,8 @@ class UnprocessedRequest(BaseModel): model_config = ConfigDict(populate_by_name=True) - unique_key: Annotated[str, Field(alias='requestUniqueKey')] - url: Annotated[str, BeforeValidator(validate_http_url), Field()] + unique_key: Annotated[str, Field(alias='uniqueKey')] + url: Annotated[str | None, BeforeValidator(validate_http_url), Field()] = None method: Annotated[HttpMethod | None, Field()] = None diff --git a/tests/unit/storage_clients/_memory/test_memory_storage_client.py b/tests/unit/storage_clients/_memory/test_memory_storage_client.py index 4b6ba2b00e..1860d014ca 100644 --- a/tests/unit/storage_clients/_memory/test_memory_storage_client.py +++ b/tests/unit/storage_clients/_memory/test_memory_storage_client.py @@ -11,6 +11,7 @@ from crawlee._consts import METADATA_FILENAME from crawlee.configuration import Configuration from crawlee.storage_clients import MemoryStorageClient +from crawlee.storage_clients.models import BatchRequestsOperationResponse async def test_write_metadata(tmp_path: Path) -> None: @@ -267,3 +268,21 @@ async def test_parametrized_storage_path_overrides_env_var() -> None: Configuration(crawlee_storage_dir='./parametrized_storage_dir'), # type: ignore[call-arg] ) assert ms.storage_dir == './parametrized_storage_dir' + + +async def test_batch_requests_operation_response() -> None: + """Test `BatchRequestsOperationResponse` creation 
from example responses.""" + process_request = { + 'requestId': 'EAaArVRs5qV39C9', + 'uniqueKey': 'https://example.com', + 'wasAlreadyHandled': False, + 'wasAlreadyPresent': True, + } + unprocess_request_full = {'uniqueKey': 'https://example2.com', 'method': 'GET', 'url': 'https://example2.com'} + unprocess_request_minimal = {'uniqueKey': 'https://example3.com'} + BatchRequestsOperationResponse.model_validate( + { + 'processedRequests': [process_request], + 'unprocessedRequests': [unprocess_request_full, unprocess_request_minimal], + } + ) From 6ebc9ffbb98e77320183b316209972391d92a5c4 Mon Sep 17 00:00:00 2001 From: Josef Prochazka Date: Tue, 22 Apr 2025 11:55:33 +0200 Subject: [PATCH 2/2] Make url mandatory again --- src/crawlee/storage_clients/models.py | 2 +- .../unit/storage_clients/_memory/test_memory_storage_client.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/crawlee/storage_clients/models.py b/src/crawlee/storage_clients/models.py index dc56e7bd90..f016e24730 100644 --- a/src/crawlee/storage_clients/models.py +++ b/src/crawlee/storage_clients/models.py @@ -225,7 +225,7 @@ class UnprocessedRequest(BaseModel): model_config = ConfigDict(populate_by_name=True) unique_key: Annotated[str, Field(alias='uniqueKey')] - url: Annotated[str | None, BeforeValidator(validate_http_url), Field()] = None + url: Annotated[str, BeforeValidator(validate_http_url), Field()] method: Annotated[HttpMethod | None, Field()] = None diff --git a/tests/unit/storage_clients/_memory/test_memory_storage_client.py b/tests/unit/storage_clients/_memory/test_memory_storage_client.py index 1860d014ca..0d043322ae 100644 --- a/tests/unit/storage_clients/_memory/test_memory_storage_client.py +++ b/tests/unit/storage_clients/_memory/test_memory_storage_client.py @@ -279,7 +279,7 @@ async def test_batch_requests_operation_response() -> None: 'wasAlreadyPresent': True, } unprocess_request_full = {'uniqueKey': 'https://example2.com', 'method': 'GET', 'url': 
'https://example2.com'} - unprocess_request_minimal = {'uniqueKey': 'https://example3.com'} + unprocess_request_minimal = {'uniqueKey': 'https://example3.com', 'url': 'https://example3.com'} BatchRequestsOperationResponse.model_validate( { 'processedRequests': [process_request],