Skip to content

Commit 07c75a0

Browse files
MantisusCopilotjanbucharvdusekrenovate[bot]
authored
feat: Add SqlStorageClient based on sqlalchemy v2+ (#1339)
### Description - Add `SqlStorageClient`, which accepts a database connection string or a pre-configured `AsyncEngine`, or creates a default `crawlee.db` database in `Configuration.storage_dir`. ### Issues - Closes: #307 --------- Co-authored-by: Copilot <[email protected]> Co-authored-by: Jan Buchar <[email protected]> Co-authored-by: Vlada Dusek <[email protected]> Co-authored-by: renovate[bot] <29139614+renovate[bot]@users.noreply.github.com> Co-authored-by: Jan Buchar <[email protected]> Co-authored-by: Apify Release Bot <[email protected]> Co-authored-by: Josef Procházka <[email protected]>
1 parent 58e6a86 commit 07c75a0

20 files changed

+3450
-17
lines changed
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
from crawlee.crawlers import ParselCrawler
2+
from crawlee.storage_clients import SqlStorageClient
3+
4+
5+
async def main() -> None:
6+
# Create a new instance of storage client.
7+
# This will create an SQLite database file crawlee.db or create tables in your
8+
# database if you pass `connection_string` or `engine`
9+
# Use the context manager to ensure that connections are properly cleaned up.
10+
async with SqlStorageClient() as storage_client:
11+
# And pass it to the crawler.
12+
crawler = ParselCrawler(storage_client=storage_client)
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
from sqlalchemy.ext.asyncio import create_async_engine
2+
3+
from crawlee.configuration import Configuration
4+
from crawlee.crawlers import ParselCrawler
5+
from crawlee.storage_clients import SqlStorageClient
6+
7+
8+
async def main() -> None:
9+
# Create a new instance of storage client.
10+
# On first run, also creates tables in your PostgreSQL database.
11+
# Use the context manager to ensure that connections are properly cleaned up.
12+
async with SqlStorageClient(
13+
# Create an `engine` with the desired configuration
14+
engine=create_async_engine(
15+
'postgresql+asyncpg://myuser:mypassword@localhost:5432/postgres',
16+
future=True,
17+
pool_size=5,
18+
max_overflow=10,
19+
pool_recycle=3600,
20+
pool_pre_ping=True,
21+
echo=False,
22+
)
23+
) as storage_client:
24+
# Create a configuration with custom settings.
25+
configuration = Configuration(
26+
purge_on_start=False,
27+
)
28+
29+
# And pass them to the crawler.
30+
crawler = ParselCrawler(
31+
storage_client=storage_client,
32+
configuration=configuration,
33+
)

docs/guides/storage_clients.mdx

Lines changed: 188 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,12 +8,15 @@ import ApiLink from '@site/src/components/ApiLink';
88
import Tabs from '@theme/Tabs';
99
import TabItem from '@theme/TabItem';
1010
import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';
11+
import CodeBlock from '@theme/CodeBlock';
1112

1213
import MemoryStorageClientBasicExample from '!!raw-loader!roa-loader!./code_examples/storage_clients/memory_storage_client_basic_example.py';
1314
import FileSystemStorageClientBasicExample from '!!raw-loader!roa-loader!./code_examples/storage_clients/file_system_storage_client_basic_example.py';
1415
import FileSystemStorageClientConfigurationExample from '!!raw-loader!roa-loader!./code_examples/storage_clients/file_system_storage_client_configuration_example.py';
1516
import CustomStorageClientExample from '!!raw-loader!roa-loader!./code_examples/storage_clients/custom_storage_client_example.py';
1617
import RegisteringStorageClientsExample from '!!raw-loader!roa-loader!./code_examples/storage_clients/registering_storage_clients_example.py';
18+
import SQLStorageClientBasicExample from '!!raw-loader!roa-loader!./code_examples/storage_clients/sql_storage_client_basic_example.py';
19+
import SQLStorageClientConfigurationExample from '!!raw-loader!./code_examples/storage_clients/sql_storage_client_configuration_example.py';
1720

1821
Storage clients provide a unified interface for interacting with <ApiLink to="class/Dataset">`Dataset`</ApiLink>, <ApiLink to="class/KeyValueStore">`KeyValueStore`</ApiLink>, and <ApiLink to="class/RequestQueue">`RequestQueue`</ApiLink>, regardless of the underlying implementation. They handle operations like creating, reading, updating, and deleting storage instances, as well as managing data persistence and cleanup. This abstraction makes it easy to switch between different environments, such as local development and cloud production setups.
1922

@@ -23,6 +26,7 @@ Crawlee provides three main storage client implementations:
2326

2427
- <ApiLink to="class/FileSystemStorageClient">`FileSystemStorageClient`</ApiLink> - Provides persistent file system storage with in-memory caching.
2528
- <ApiLink to="class/MemoryStorageClient">`MemoryStorageClient`</ApiLink> - Stores data in memory with no persistence.
29+
- <ApiLink to="class/SqlStorageClient">`SqlStorageClient`</ApiLink> - Provides persistent storage using a SQL database ([SQLite](https://sqlite.org/) or [PostgreSQL](https://www.postgresql.org/)). Requires installing the extra dependency: `crawlee[sql_sqlite]` for SQLite or `crawlee[sql_postgres]` for PostgreSQL.
2630
- [`ApifyStorageClient`](https://docs.apify.com/sdk/python/reference/class/ApifyStorageClient) - Manages storage on the [Apify platform](https://apify.com), implemented in the [Apify SDK](https://github.com/apify/apify-sdk-python).
2731

2832
```mermaid
@@ -50,6 +54,8 @@ class FileSystemStorageClient
5054
5155
class MemoryStorageClient
5256
57+
class SqlStorageClient
58+
5359
class ApifyStorageClient
5460
5561
%% ========================
@@ -58,6 +64,7 @@ class ApifyStorageClient
5864
5965
StorageClient --|> FileSystemStorageClient
6066
StorageClient --|> MemoryStorageClient
67+
StorageClient --|> SqlStorageClient
6168
StorageClient --|> ApifyStorageClient
6269
```
6370

@@ -125,6 +132,187 @@ The `MemoryStorageClient` does not persist data between runs. All data is lost w
125132
{MemoryStorageClientBasicExample}
126133
</RunnableCodeBlock>
127134

135+
### SQL storage client
136+
137+
:::warning Experimental feature
138+
The `SqlStorageClient` is experimental. Its API and behavior may change in future releases.
139+
:::
140+
141+
The <ApiLink to="class/SqlStorageClient">`SqlStorageClient`</ApiLink> provides persistent storage using a SQL database (SQLite by default, or PostgreSQL). It supports all Crawlee storage types and enables concurrent access from multiple independent clients or processes.
142+
143+
:::note Dependencies
144+
The <ApiLink to="class/SqlStorageClient">`SqlStorageClient`</ApiLink> is not included in the core Crawlee package.
145+
To use it, you need to install Crawlee with the appropriate extra dependency:
146+
147+
- For SQLite support, run:
148+
<code>pip install 'crawlee[sql_sqlite]'</code>
149+
- For PostgreSQL support, run:
150+
<code>pip install 'crawlee[sql_postgres]'</code>
151+
:::
152+
153+
By default, <ApiLink to="class/SqlStorageClient">`SqlStorageClient`</ApiLink> uses SQLite.
154+
To use PostgreSQL instead, just provide a PostgreSQL connection string via the `connection_string` parameter. No other code changes are needed—the same client works for both databases.
155+
156+
<RunnableCodeBlock className="language-python" language="python">
157+
{SQLStorageClientBasicExample}
158+
</RunnableCodeBlock>
159+
160+
Data is organized in relational tables. Below are the main tables and columns used for each storage type:
161+
162+
```mermaid
163+
---
164+
config:
165+
class:
166+
hideEmptyMembersBox: true
167+
---
168+
169+
classDiagram
170+
171+
%% ========================
172+
%% Storage Clients
173+
%% ========================
174+
175+
class SqlDatasetClient {
176+
<<Dataset>>
177+
}
178+
179+
class SqlKeyValueStoreClient {
180+
<<Key-value store>>
181+
}
182+
183+
%% ========================
184+
%% Dataset Tables
185+
%% ========================
186+
187+
class datasets {
188+
<<table>>
189+
+ dataset_id (PK)
190+
+ internal_name
191+
+ name
192+
+ accessed_at
193+
+ created_at
194+
+ modified_at
195+
+ item_count
196+
}
197+
198+
class dataset_records {
199+
<<table>>
200+
+ item_id (PK)
201+
+ dataset_id (FK)
202+
+ data
203+
}
204+
205+
%% ========================
206+
%% Key-Value Store Tables
207+
%% ========================
208+
209+
class key_value_stores {
210+
<<table>>
211+
+ key_value_store_id (PK)
212+
+ internal_name
213+
+ name
214+
+ accessed_at
215+
+ created_at
216+
+ modified_at
217+
}
218+
219+
class key_value_store_records {
220+
<<table>>
221+
+ key_value_store_id (FK, PK)
222+
+ key (PK)
223+
+ value
224+
+ content_type
225+
+ size
226+
}
227+
228+
%% ========================
229+
%% Client to Table arrows
230+
%% ========================
231+
232+
SqlDatasetClient --> datasets
233+
SqlDatasetClient --> dataset_records
234+
235+
SqlKeyValueStoreClient --> key_value_stores
236+
SqlKeyValueStoreClient --> key_value_store_records
237+
```
238+
```mermaid
239+
---
240+
config:
241+
class:
242+
hideEmptyMembersBox: true
243+
---
244+
245+
classDiagram
246+
247+
%% ========================
248+
%% Storage Clients
249+
%% ========================
250+
251+
class SqlRequestQueueClient {
252+
<<Request queue>>
253+
}
254+
255+
%% ========================
256+
%% Request Queue Tables
257+
%% ========================
258+
259+
class request_queues {
260+
<<table>>
261+
+ request_queue_id (PK)
262+
+ internal_name
263+
+ name
264+
+ accessed_at
265+
+ created_at
266+
+ modified_at
267+
+ had_multiple_clients
268+
+ handled_request_count
269+
+ pending_request_count
270+
+ total_request_count
271+
}
272+
273+
class request_queue_records {
274+
<<table>>
275+
+ request_id (PK)
276+
+ request_queue_id (FK, PK)
277+
+ data
278+
+ sequence_number
279+
+ is_handled
280+
+ time_blocked_until
281+
+ client_key
282+
}
283+
284+
class request_queue_state {
285+
<<table>>
286+
+ request_queue_id (FK, PK)
287+
+ sequence_counter
288+
+ forefront_sequence_counter
289+
}
290+
291+
%% ========================
292+
%% Client to Table arrows
293+
%% ========================
294+
295+
SqlRequestQueueClient --> request_queues
296+
SqlRequestQueueClient --> request_queue_records
297+
SqlRequestQueueClient --> request_queue_state
298+
```
299+
300+
Configuration options for the <ApiLink to="class/SqlStorageClient">`SqlStorageClient`</ApiLink> can be set through environment variables or the <ApiLink to="class/Configuration">`Configuration`</ApiLink> class:
301+
302+
- **`storage_dir`** (env: `CRAWLEE_STORAGE_DIR`, default: `'./storage'`) - The root directory where the default SQLite database will be created if no connection string is provided.
303+
- **`purge_on_start`** (env: `CRAWLEE_PURGE_ON_START`, default: `True`) - Whether to purge default storages on start.
304+
305+
Options specific to the <ApiLink to="class/SqlStorageClient">`SqlStorageClient`</ApiLink> itself are set via constructor arguments:
306+
307+
- **`connection_string`** (default: SQLite in <ApiLink to="class/Configuration">`Configuration`</ApiLink> storage dir) – SQLAlchemy connection string, e.g. `sqlite+aiosqlite:///my.db` or `postgresql+asyncpg://user:pass@host/db`.
308+
- **`engine`** – Pre-configured SQLAlchemy AsyncEngine (optional).
309+
310+
For advanced scenarios, you can configure <ApiLink to="class/SqlStorageClient">`SqlStorageClient`</ApiLink> with a custom SQLAlchemy engine and additional options via the <ApiLink to="class/Configuration">`Configuration`</ApiLink> class. This is useful, for example, when connecting to an external PostgreSQL database or customizing connection pooling.
311+
312+
<CodeBlock className="language-python" language="python">
313+
{SQLStorageClientConfigurationExample}
314+
</CodeBlock>
315+
128316
## Creating a custom storage client
129317

130318
A storage client consists of two parts: the storage client factory and individual storage type clients. The <ApiLink to="class/StorageClient">`StorageClient`</ApiLink> acts as a factory that creates specific clients (<ApiLink to="class/DatasetClient">`DatasetClient`</ApiLink>, <ApiLink to="class/KeyValueStoreClient">`KeyValueStoreClient`</ApiLink>, <ApiLink to="class/RequestQueueClient">`RequestQueueClient`</ApiLink>) where the actual storage logic is implemented.

pyproject.toml

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ dependencies = [
4848
]
4949

5050
[project.optional-dependencies]
51-
all = ["crawlee[adaptive-crawler,beautifulsoup,cli,curl-impersonate,httpx,parsel,playwright,otel]"]
51+
all = ["crawlee[adaptive-crawler,beautifulsoup,cli,curl-impersonate,httpx,parsel,playwright,otel,sql_sqlite,sql_postgres]"]
5252
adaptive-crawler = [
5353
"jaro-winkler>=2.0.3",
5454
"playwright>=1.27.0",
@@ -71,6 +71,14 @@ otel = [
7171
"opentelemetry-semantic-conventions>=0.54",
7272
"wrapt>=1.17.0",
7373
]
74+
sql_postgres = [
75+
"sqlalchemy[asyncio]>=2.0.0,<3.0.0",
76+
"asyncpg>=0.24.0"
77+
]
78+
sql_sqlite = [
79+
"sqlalchemy[asyncio]>=2.0.0,<3.0.0",
80+
"aiosqlite>=0.21.0",
81+
]
7482

7583
[project.scripts]
7684
crawlee = "crawlee._cli:cli"
Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,21 @@
1+
from crawlee._utils.try_import import install_import_hook as _install_import_hook
2+
from crawlee._utils.try_import import try_import as _try_import
3+
4+
# These imports have only mandatory dependencies, so they are imported directly.
15
from ._base import StorageClient
26
from ._file_system import FileSystemStorageClient
37
from ._memory import MemoryStorageClient
48

9+
_install_import_hook(__name__)
10+
11+
# The following imports are wrapped in try_import to handle optional dependencies,
12+
# ensuring the module can still function even if these dependencies are missing.
13+
with _try_import(__name__, 'SqlStorageClient'):
14+
from ._sql import SqlStorageClient
15+
516
__all__ = [
617
'FileSystemStorageClient',
718
'MemoryStorageClient',
19+
'SqlStorageClient',
820
'StorageClient',
921
]
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
from ._dataset_client import SqlDatasetClient
2+
from ._key_value_store_client import SqlKeyValueStoreClient
3+
from ._request_queue_client import SqlRequestQueueClient
4+
from ._storage_client import SqlStorageClient
5+
6+
__all__ = ['SqlDatasetClient', 'SqlKeyValueStoreClient', 'SqlRequestQueueClient', 'SqlStorageClient']

0 commit comments

Comments
 (0)