Skip to content

Commit dbea7d9

Browse files
authored
refactor!: Make default Apify storages use alias mechanism (#606)
### Description
- Using unnamed Apify-based storage locally, outside of the Apify platform, will use the alias mechanism.

### Issues
Closes: #599

### Testing
Added unit tests.
1 parent 5489a1f commit dbea7d9

File tree

7 files changed

+96
-15
lines changed

7 files changed

+96
-15
lines changed

docs/04_upgrading/upgrading_to_v3.md

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,3 +101,21 @@ async def main():
101101
storage_client=custom_storage_client,
102102
)
103103
```
104+
105+
## Removed Actor.config property
106+
- `Actor.config` property has been removed. Use `Actor.configuration` instead.
107+
108+
## Default storage ids in configuration changed to None
109+
- `Configuration.default_key_value_store_id` changed from `'default'` to `None`.
110+
- `Configuration.default_dataset_id` changed from `'default'` to `None`.
111+
- `Configuration.default_request_queue_id` changed from `'default'` to `None`.
112+
113+
Previously, using the default storage without specifying its `id` in `Configuration` resulted in the specific storage with the id `'default'` being used. Now a newly created unnamed storage is used instead, with its `id` assigned by the Apify platform. Consecutive calls to get the default storage return the same storage.
114+
115+
## Storages
116+
117+
<!-- TODO -->
118+
119+
## Storage clients
120+
121+
<!-- TODO -->

src/apify/_configuration.py

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -142,37 +142,37 @@ class Configuration(CrawleeConfiguration):
142142
] = None
143143

144144
default_dataset_id: Annotated[
145-
str,
145+
str | None,
146146
Field(
147147
validation_alias=AliasChoices(
148148
'actor_default_dataset_id',
149149
'apify_default_dataset_id',
150150
),
151151
description='Default dataset ID used by the Apify storage client when no ID or name is provided.',
152152
),
153-
] = 'default'
153+
] = None
154154

155155
default_key_value_store_id: Annotated[
156-
str,
156+
str | None,
157157
Field(
158158
validation_alias=AliasChoices(
159159
'actor_default_key_value_store_id',
160160
'apify_default_key_value_store_id',
161161
),
162162
description='Default key-value store ID for the Apify storage client when no ID or name is provided.',
163163
),
164-
] = 'default'
164+
] = None
165165

166166
default_request_queue_id: Annotated[
167-
str,
167+
str | None,
168168
Field(
169169
validation_alias=AliasChoices(
170170
'actor_default_request_queue_id',
171171
'apify_default_request_queue_id',
172172
),
173173
description='Default request queue ID for the Apify storage client when no ID or name is provided.',
174174
),
175-
] = 'default'
175+
] = None
176176

177177
disable_outdated_warning: Annotated[
178178
bool,

src/apify/storage_clients/_apify/_dataset_client.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -124,8 +124,10 @@ async def open(
124124
)
125125
apify_datasets_client = apify_client_async.datasets()
126126

127-
# Normalize 'default' alias to None
128-
alias = None if alias == 'default' else alias
127+
# If no alias, name or id is given and `configuration.default_dataset_id` is unset, use the unnamed storage
128+
# aliased as `__default__`.
129+
if not any([alias, name, id, configuration.default_dataset_id]):
130+
alias = '__default__'
129131

130132
if alias:
131133
# Check if there is pre-existing alias mapping in the default KVS.
@@ -150,6 +152,11 @@ async def open(
150152
# If none are provided, try to get the default storage ID from environment variables.
151153
elif id is None:
152154
id = configuration.default_dataset_id
155+
if not id:
156+
raise ValueError(
157+
'Dataset "id", "name", or "alias" must be specified, '
158+
'or a default dataset ID must be set in the configuration.'
159+
)
153160

154161
# Now create the client for the determined ID
155162
apify_dataset_client = apify_client_async.dataset(dataset_id=id)

src/apify/storage_clients/_apify/_key_value_store_client.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -115,8 +115,10 @@ async def open(
115115
)
116116
apify_kvss_client = apify_client_async.key_value_stores()
117117

118-
# Normalize 'default' alias to None
119-
alias = None if alias == 'default' else alias
118+
# If no alias, name or id is given and `configuration.default_key_value_store_id` is unset, use the
119+
# unnamed storage aliased as `__default__`.
120+
if not any([alias, name, id, configuration.default_key_value_store_id]):
121+
alias = '__default__'
120122

121123
if alias:
122124
# Check if there is pre-existing alias mapping in the default KVS.
@@ -142,6 +144,11 @@ async def open(
142144
# If none are provided, try to get the default storage ID from environment variables.
143145
elif id is None:
144146
id = configuration.default_key_value_store_id
147+
if not id:
148+
raise ValueError(
149+
'KeyValueStore "id", "name", or "alias" must be specified, '
150+
'or a default KeyValueStore ID must be set in the configuration.'
151+
)
145152

146153
# Now create the client for the determined ID
147154
apify_kvs_client = apify_client_async.key_value_store(key_value_store_id=id)

src/apify/storage_clients/_apify/_request_queue_client.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -200,8 +200,10 @@ async def open(
200200
)
201201
apify_rqs_client = apify_client_async.request_queues()
202202

203-
# Normalize 'default' alias to None
204-
alias = None if alias == 'default' else alias
203+
# If no alias, name or id is given and `configuration.default_request_queue_id` is unset, use the
204+
# unnamed storage aliased as `__default__`.
205+
if not any([alias, name, id, configuration.default_request_queue_id]):
206+
alias = '__default__'
205207

206208
if alias:
207209
# Check if there is pre-existing alias mapping in the default KVS.
@@ -226,6 +228,11 @@ async def open(
226228
# If none are provided, try to get the default storage ID from environment variables.
227229
elif id is None:
228230
id = configuration.default_request_queue_id
231+
if not id:
232+
raise ValueError(
233+
'RequestQueue "id", "name", or "alias" must be specified, '
234+
'or a default default_request_queue_id ID must be set in the configuration.'
235+
)
229236

230237
# Use suitable client_key to make `hadMultipleClients` response of Apify API useful.
231238
# It should persist across migrated or resurrected Actor runs on the Apify platform.

src/apify/storage_clients/_apify/_utils.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -76,7 +76,7 @@ async def _get_alias_map(cls) -> dict[str, str]:
7676
Returns:
7777
Map of aliases and storage ids.
7878
"""
79-
if not cls._alias_map:
79+
if not cls._alias_map and Configuration.get_global_configuration().is_at_home:
8080
default_kvs_client = await _get_default_kvs_client()
8181

8282
record = await default_kvs_client.get_record(cls._ALIAS_MAPPING_KEY)
@@ -156,7 +156,8 @@ async def _get_default_kvs_client() -> KeyValueStoreClientAsync:
156156
min_delay_between_retries_millis=500,
157157
timeout_secs=360,
158158
)
159-
159+
if not configuration.default_key_value_store_id:
160+
raise ValueError("'Configuration.default_key_value_store_id' must be set.")
160161
return apify_client_async.key_value_store(key_value_store_id=configuration.default_key_value_store_id)
161162

162163

tests/integration/test_apify_storages.py

Lines changed: 42 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
from crawlee import service_locator
66
from crawlee.storages import Dataset, KeyValueStore, RequestQueue
77

8-
from apify import Configuration
8+
from apify import Actor, Configuration
99
from apify.storage_clients import ApifyStorageClient
1010

1111

@@ -32,3 +32,44 @@ async def test_alias_concurrent_creation_local(
3232
except AssertionError:
3333
for storage in storages:
3434
await storage.drop()
35+
36+
37+
@pytest.mark.parametrize(
38+
'storage_type',
39+
[Dataset, KeyValueStore, RequestQueue],
40+
)
41+
async def test_unnamed_default_without_config(
42+
storage_type: Dataset | KeyValueStore | RequestQueue, apify_token: str
43+
) -> None:
44+
"""Test that default Apify storage used locally is unnamed storage."""
45+
service_locator.set_configuration(Configuration(token=apify_token))
46+
service_locator.set_storage_client(ApifyStorageClient())
47+
48+
# Open storage and make sure it has no name and it has id
49+
storage = await storage_type.open()
50+
assert storage.name is None
51+
assert storage.id
52+
53+
# Make sure the same instance is returned when opened again without name or alias
54+
storage_again = await storage_type.open()
55+
assert storage is storage_again
56+
57+
await storage.drop()
58+
59+
60+
@pytest.mark.parametrize(
61+
'storage_type',
62+
[Dataset, KeyValueStore, RequestQueue],
63+
)
64+
async def test_aliases_not_stored_on_platform_when_local(
65+
storage_type: Dataset | KeyValueStore | RequestQueue, apify_token: str
66+
) -> None:
67+
"""Test that default Apify storage used locally is not persisting aliases to Apify based default KVS."""
68+
service_locator.set_configuration(Configuration(token=apify_token))
69+
service_locator.set_storage_client(ApifyStorageClient())
70+
async with Actor(configure_logging=False):
71+
await storage_type.open(alias='test')
72+
default_kvs = await Actor.open_key_value_store(force_cloud=True)
73+
74+
# The default KVS should be empty
75+
assert len(await default_kvs.list_keys()) == 0

0 commit comments

Comments
 (0)