Skip to content

Commit 0f1cf6f

Browse files
authored
fix: Fix the usage of Configuration (#899)
### Description

- Utilized all the `Configuration` fields, except for `chrome_executable_path` and `xvfb`.
- Enhanced the documentation of the `Configuration` fields.

### Issues

- Closes: #670
1 parent 0d3e474 commit 0f1cf6f

File tree

12 files changed

+205
-98
lines changed

12 files changed

+205
-98
lines changed

src/crawlee/_autoscaling/snapshotter.py

Lines changed: 64 additions & 53 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@
77
from operator import attrgetter
88
from typing import TYPE_CHECKING, TypeVar, cast
99

10-
import psutil
1110
from sortedcontainers import SortedList
1211

1312
from crawlee import service_locator
@@ -16,11 +15,14 @@
1615
from crawlee._utils.context import ensure_context
1716
from crawlee._utils.docs import docs_group
1817
from crawlee._utils.recurring_task import RecurringTask
18+
from crawlee._utils.system import get_memory_info
1919
from crawlee.events._types import Event, EventSystemInfoData
2020

2121
if TYPE_CHECKING:
2222
from types import TracebackType
2323

24+
from crawlee.configuration import Configuration
25+
2426
logger = getLogger(__name__)
2527

2628
T = TypeVar('T')
@@ -36,89 +38,98 @@ class Snapshotter:
3638
dynamically based on the current demand and system load.
3739
"""
3840

41+
_EVENT_LOOP_SNAPSHOT_INTERVAL = timedelta(milliseconds=500)
42+
"""The interval at which the event loop is sampled."""
43+
44+
_CLIENT_SNAPSHOT_INTERVAL = timedelta(milliseconds=1000)
45+
"""The interval at which the client is sampled."""
46+
47+
_SNAPSHOT_HISTORY = timedelta(seconds=30)
48+
"""The time interval for which the snapshots are kept."""
49+
50+
_RESERVE_MEMORY_RATIO = 0.5
51+
"""Fraction of memory kept in reserve. Used to calculate critical memory overload threshold."""
52+
53+
_MEMORY_WARNING_COOLDOWN_PERIOD = timedelta(milliseconds=10000)
54+
"""Minimum time interval between logging successive critical memory overload warnings."""
55+
56+
_CLIENT_RATE_LIMIT_ERROR_RETRY_COUNT = 2
57+
"""Number of retries for a client request before considering it a failure due to rate limiting."""
58+
3959
def __init__(
4060
self,
4161
*,
42-
event_loop_snapshot_interval: timedelta = timedelta(milliseconds=500),
43-
client_snapshot_interval: timedelta = timedelta(milliseconds=1000),
44-
max_used_cpu_ratio: float = 0.95,
45-
max_memory_size: ByteSize | None = None,
46-
max_used_memory_ratio: float = 0.9,
47-
max_event_loop_delay: timedelta = timedelta(milliseconds=50),
48-
max_client_errors: int = 1,
49-
snapshot_history: timedelta = timedelta(seconds=30),
50-
available_memory_ratio: float | None = None,
51-
reserve_memory_ratio: float = 0.5,
52-
memory_warning_cooldown_period: timedelta = timedelta(milliseconds=10000),
53-
client_rate_limit_error_retry_count: int = 2,
62+
max_used_cpu_ratio: float,
63+
max_used_memory_ratio: float,
64+
max_event_loop_delay: timedelta,
65+
max_client_errors: int,
66+
max_memory_size: ByteSize,
5467
) -> None:
5568
"""A default constructor.
5669
70+
In most cases, you should use the `from_config` constructor to create a new instance based on
71+
the provided configuration.
72+
5773
Args:
58-
event_loop_snapshot_interval: The interval at which the event loop is sampled.
59-
client_snapshot_interval: The interval at which the client is sampled.
6074
max_used_cpu_ratio: Sets the ratio, defining the maximum CPU usage. When the CPU usage is higher than
6175
the provided ratio, the CPU is considered overloaded.
62-
max_memory_size: Sets the maximum amount of system memory to be used by the `AutoscaledPool`. If `None`
63-
is provided, the max amount of memory to be used is set to one quarter of total system memory.
64-
I.e. on a system with 8192 MB, the `AutoscaledPool` will only use up to 2048 MB of memory.
6576
max_used_memory_ratio: Sets the ratio, defining the maximum ratio of memory usage. When the memory usage
6677
is higher than the provided ratio of `max_memory_size`, the memory is considered overloaded.
6778
max_event_loop_delay: Sets the maximum delay of the event loop. When the delay is higher than the provided
6879
value, the event loop is considered overloaded.
6980
max_client_errors: Sets the maximum number of client errors (HTTP 429). When the number of client errors
7081
is higher than the provided number, the client is considered overloaded.
71-
snapshot_history: Sets the time interval for which the snapshots are kept.
72-
available_memory_ratio: How big part of the system memory should be used if `max_memory_size` is not given.
73-
reserve_memory_ratio: Fraction of memory kept in reserve. Used to calculate critical memory overload
74-
threshold.
75-
memory_warning_cooldown_period: Minimum time interval between logging successive critical memory overload
76-
warnings.
77-
client_rate_limit_error_retry_count: Number of retries for a client request before considering it a failure
78-
due to rate limiting.
82+
max_memory_size: Sets the maximum amount of system memory to be used by the `AutoscaledPool`.
7983
"""
80-
if available_memory_ratio is None and max_memory_size is None:
81-
raise ValueError('At least one of `available_memory_ratio` or `max_memory_size` must be specified')
82-
83-
self._event_loop_snapshot_interval = event_loop_snapshot_interval
84-
self._client_snapshot_interval = client_snapshot_interval
85-
self._max_event_loop_delay = max_event_loop_delay
8684
self._max_used_cpu_ratio = max_used_cpu_ratio
8785
self._max_used_memory_ratio = max_used_memory_ratio
86+
self._max_event_loop_delay = max_event_loop_delay
8887
self._max_client_errors = max_client_errors
89-
self._snapshot_history = snapshot_history
90-
self._reserve_memory_ratio = reserve_memory_ratio
91-
self._memory_warning_cooldown_period = memory_warning_cooldown_period
92-
self._client_rate_limit_error_retry_count = client_rate_limit_error_retry_count
93-
self._max_memory_size = max_memory_size or self._get_default_max_memory_size(
94-
cast(float, available_memory_ratio)
95-
)
88+
self._max_memory_size = max_memory_size
9689

9790
self._cpu_snapshots = self._get_sorted_list_by_created_at(list[CpuSnapshot]())
9891
self._event_loop_snapshots = self._get_sorted_list_by_created_at(list[EventLoopSnapshot]())
9992
self._memory_snapshots = self._get_sorted_list_by_created_at(list[MemorySnapshot]())
10093
self._client_snapshots = self._get_sorted_list_by_created_at(list[ClientSnapshot]())
10194

102-
self._snapshot_event_loop_task = RecurringTask(self._snapshot_event_loop, self._event_loop_snapshot_interval)
103-
self._snapshot_client_task = RecurringTask(self._snapshot_client, self._client_snapshot_interval)
95+
self._snapshot_event_loop_task = RecurringTask(self._snapshot_event_loop, self._EVENT_LOOP_SNAPSHOT_INTERVAL)
96+
self._snapshot_client_task = RecurringTask(self._snapshot_client, self._CLIENT_SNAPSHOT_INTERVAL)
10497

10598
self._timestamp_of_last_memory_warning: datetime = datetime.now(timezone.utc) - timedelta(hours=1)
10699

107100
# Flag to indicate the context state.
108101
self._active = False
109102

103+
@classmethod
def from_config(cls, config: Configuration | None = None) -> Snapshotter:
    """Create a new instance based on the provided configuration.

    Args:
        config: The configuration object. Uses the global (default) configuration if not provided.

    Returns:
        A new instance whose CPU, memory, event loop and client thresholds are read from `config`.
    """
    # Fall back to the global configuration only when the caller did not pass one. Overwriting
    # the argument unconditionally would silently discard an explicitly supplied configuration.
    if config is None:
        config = service_locator.get_configuration()

    # Compute the maximum memory size based on the provided configuration. If `memory_mbytes` is provided,
    # it uses that value. Otherwise, it calculates the `max_memory_size` as a proportion of the system's
    # total available memory based on `available_memory_ratio`.
    max_memory_size = (
        ByteSize.from_mb(config.memory_mbytes)
        if config.memory_mbytes
        else ByteSize(int(get_memory_info().total_size.bytes * config.available_memory_ratio))
    )

    return cls(
        max_used_cpu_ratio=config.max_used_cpu_ratio,
        max_used_memory_ratio=config.max_used_memory_ratio,
        max_event_loop_delay=config.max_event_loop_delay,
        max_client_errors=config.max_client_errors,
        max_memory_size=max_memory_size,
    )
128+
110129
@staticmethod
def _get_sorted_list_by_created_at(input_list: list[T]) -> SortedList[T]:
    """Wrap the given items in a `SortedList` keyed by each item's `created_at` attribute."""
    by_created_at = attrgetter('created_at')
    return SortedList(input_list, key=by_created_at)
113132

114-
@staticmethod
115-
def _get_default_max_memory_size(available_memory_ratio: float) -> ByteSize:
116-
"""Default `memory_max_size` is 1/4 of the total system memory."""
117-
max_memory_size_in_bytes = int(psutil.virtual_memory().total * available_memory_ratio)
118-
max_memory_size = ByteSize(max_memory_size_in_bytes)
119-
logger.info(f'Setting max_memory_size of this run to {max_memory_size}.')
120-
return max_memory_size
121-
122133
@property
123134
def active(self) -> bool:
124135
"""Indicates whether the context is active."""
@@ -281,7 +292,7 @@ def _snapshot_event_loop(self) -> None:
281292
previous_snapshot = self._event_loop_snapshots[-1] if self._event_loop_snapshots else None
282293

283294
if previous_snapshot:
284-
event_loop_delay = snapshot.created_at - previous_snapshot.created_at - self._event_loop_snapshot_interval
295+
event_loop_delay = snapshot.created_at - previous_snapshot.created_at - self._EVENT_LOOP_SNAPSHOT_INTERVAL
285296
snapshot.delay = event_loop_delay
286297

287298
snapshots = cast(list[Snapshot], self._event_loop_snapshots)
@@ -319,7 +330,7 @@ def _prune_snapshots(self, snapshots: list[Snapshot], now: datetime) -> None:
319330
# We'll keep snapshots from this index onwards.
320331
keep_from_index = None
321332
for i, snapshot in enumerate(snapshots):
322-
if now - snapshot.created_at <= self._snapshot_history:
333+
if now - snapshot.created_at <= self._SNAPSHOT_HISTORY:
323334
keep_from_index = i
324335
break
325336

@@ -338,11 +349,11 @@ def _evaluate_memory_load(self, current_memory_usage_size: ByteSize, snapshot_ti
338349
snapshot_timestamp: The time at which the memory snapshot was taken.
339350
"""
340351
# Check if the warning has been logged recently to avoid spamming
341-
if snapshot_timestamp < self._timestamp_of_last_memory_warning + self._memory_warning_cooldown_period:
352+
if snapshot_timestamp < self._timestamp_of_last_memory_warning + self._MEMORY_WARNING_COOLDOWN_PERIOD:
342353
return
343354

344355
threshold_memory_size = self._max_used_memory_ratio * self._max_memory_size
345-
buffer_memory_size = self._max_memory_size * (1 - self._max_used_memory_ratio) * self._reserve_memory_ratio
356+
buffer_memory_size = self._max_memory_size * (1 - self._max_used_memory_ratio) * self._RESERVE_MEMORY_RATIO
346357
overload_memory_threshold_size = threshold_memory_size + buffer_memory_size
347358

348359
# Log a warning if current memory usage exceeds the critical overload threshold

src/crawlee/_service_locator.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,11 @@ def get_event_manager(self) -> EventManager:
5151
if self._event_manager is None:
5252
from crawlee.events import LocalEventManager
5353

54-
self._event_manager = LocalEventManager()
54+
# `from_config` is invoked on the class itself in both branches; there is no need to
# default-construct a throwaway `LocalEventManager` just to reach the alternate constructor.
self._event_manager = (
    LocalEventManager.from_config(config=self._configuration)
    if self._configuration
    else LocalEventManager.from_config()
)
5559

5660
self._event_manager_was_retrieved = True
5761
return self._event_manager
@@ -75,7 +79,11 @@ def get_storage_client(self) -> BaseStorageClient:
7579
if self._storage_client is None:
7680
from crawlee.storage_clients import MemoryStorageClient
7781

78-
self._storage_client = MemoryStorageClient.from_config()
82+
self._storage_client = (
83+
MemoryStorageClient.from_config(config=self._configuration)
84+
if self._configuration
85+
else MemoryStorageClient.from_config()
86+
)
7987

8088
self._storage_client_was_retrieved = True
8189
return self._storage_client

src/crawlee/browsers/_playwright_browser_plugin.py

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from playwright.async_api import Playwright, async_playwright
99
from typing_extensions import override
1010

11+
from crawlee import service_locator
1112
from crawlee._utils.context import ensure_context
1213
from crawlee._utils.docs import docs_group
1314
from crawlee.browsers._base_browser_plugin import BaseBrowserPlugin
@@ -35,8 +36,8 @@ def __init__(
3536
self,
3637
*,
3738
browser_type: BrowserType = 'chromium',
38-
browser_launch_options: Mapping[str, Any] | None = None,
39-
browser_new_context_options: Mapping[str, Any] | None = None,
39+
browser_launch_options: dict[str, Any] | None = None,
40+
browser_new_context_options: dict[str, Any] | None = None,
4041
max_open_pages_per_browser: int = 20,
4142
) -> None:
4243
"""A default constructor.
@@ -52,8 +53,17 @@ def __init__(
5253
max_open_pages_per_browser: The maximum number of pages that can be opened in a single browser instance.
5354
Once reached, a new browser instance will be launched to handle the excess.
5455
"""
56+
config = service_locator.get_configuration()
57+
58+
# Default browser launch options are based on the configuration.
59+
default_launch_browser_options = {
60+
'headless': config.headless,
61+
'executable_path': config.default_browser_path,
62+
'chromium_sandbox': not config.disable_browser_sandbox,
63+
}
64+
5565
self._browser_type = browser_type
56-
self._browser_launch_options = browser_launch_options or {}
66+
self._browser_launch_options = default_launch_browser_options | (browser_launch_options or {})
5767
self._browser_new_context_options = browser_new_context_options or {}
5868
self._max_open_pages_per_browser = max_open_pages_per_browser
5969

0 commit comments

Comments
 (0)