diff --git a/src/crawlee/_autoscaling/snapshotter.py b/src/crawlee/_autoscaling/snapshotter.py index 62e2c4d2df..a62d5e4df3 100644 --- a/src/crawlee/_autoscaling/snapshotter.py +++ b/src/crawlee/_autoscaling/snapshotter.py @@ -7,7 +7,6 @@ from operator import attrgetter from typing import TYPE_CHECKING, TypeVar, cast -import psutil from sortedcontainers import SortedList from crawlee import service_locator @@ -16,11 +15,14 @@ from crawlee._utils.context import ensure_context from crawlee._utils.docs import docs_group from crawlee._utils.recurring_task import RecurringTask +from crawlee._utils.system import get_memory_info from crawlee.events._types import Event, EventSystemInfoData if TYPE_CHECKING: from types import TracebackType + from crawlee.configuration import Configuration + logger = getLogger(__name__) T = TypeVar('T') @@ -36,89 +38,98 @@ class Snapshotter: dynamically based on the current demand and system load. """ + _EVENT_LOOP_SNAPSHOT_INTERVAL = timedelta(milliseconds=500) + """The interval at which the event loop is sampled.""" + + _CLIENT_SNAPSHOT_INTERVAL = timedelta(milliseconds=1000) + """The interval at which the client is sampled.""" + + _SNAPSHOT_HISTORY = timedelta(seconds=30) + """The time interval for which the snapshots are kept.""" + + _RESERVE_MEMORY_RATIO = 0.5 + """Fraction of memory kept in reserve. 
Used to calculate critical memory overload threshold.""" + + _MEMORY_WARNING_COOLDOWN_PERIOD = timedelta(milliseconds=10000) + """Minimum time interval between logging successive critical memory overload warnings.""" + + _CLIENT_RATE_LIMIT_ERROR_RETRY_COUNT = 2 + """Number of retries for a client request before considering it a failure due to rate limiting.""" + def __init__( self, *, - event_loop_snapshot_interval: timedelta = timedelta(milliseconds=500), - client_snapshot_interval: timedelta = timedelta(milliseconds=1000), - max_used_cpu_ratio: float = 0.95, - max_memory_size: ByteSize | None = None, - max_used_memory_ratio: float = 0.9, - max_event_loop_delay: timedelta = timedelta(milliseconds=50), - max_client_errors: int = 1, - snapshot_history: timedelta = timedelta(seconds=30), - available_memory_ratio: float | None = None, - reserve_memory_ratio: float = 0.5, - memory_warning_cooldown_period: timedelta = timedelta(milliseconds=10000), - client_rate_limit_error_retry_count: int = 2, + max_used_cpu_ratio: float, + max_used_memory_ratio: float, + max_event_loop_delay: timedelta, + max_client_errors: int, + max_memory_size: ByteSize, ) -> None: """A default constructor. + In most cases, you should use the `from_config` constructor to create a new instance based on + the provided configuration. + Args: - event_loop_snapshot_interval: The interval at which the event loop is sampled. - client_snapshot_interval: The interval at which the client is sampled. max_used_cpu_ratio: Sets the ratio, defining the maximum CPU usage. When the CPU usage is higher than the provided ratio, the CPU is considered overloaded. - max_memory_size: Sets the maximum amount of system memory to be used by the `AutoscaledPool`. If `None` - is provided, the max amount of memory to be used is set to one quarter of total system memory. - I.e. on a system with 8192 MB, the `AutoscaledPool` will only use up to 2048 MB of memory. 
max_used_memory_ratio: Sets the ratio, defining the maximum ratio of memory usage. When the memory usage is higher than the provided ratio of `max_memory_size`, the memory is considered overloaded. max_event_loop_delay: Sets the maximum delay of the event loop. When the delay is higher than the provided value, the event loop is considered overloaded. max_client_errors: Sets the maximum number of client errors (HTTP 429). When the number of client errors is higher than the provided number, the client is considered overloaded. - snapshot_history: Sets the time interval for which the snapshots are kept. - available_memory_ratio: How big part of the system memory should be used if `max_memory_size` is not given. - reserve_memory_ratio: Fraction of memory kept in reserve. Used to calculate critical memory overload - threshold. - memory_warning_cooldown_period: Minimum time interval between logging successive critical memory overload - warnings. - client_rate_limit_error_retry_count: Number of retries for a client request before considering it a failure - due to rate limiting. + max_memory_size: Sets the maximum amount of system memory to be used by the `AutoscaledPool`. 
""" - if available_memory_ratio is None and max_memory_size is None: - raise ValueError('At least one of `available_memory_ratio` or `max_memory_size` must be specified') - - self._event_loop_snapshot_interval = event_loop_snapshot_interval - self._client_snapshot_interval = client_snapshot_interval - self._max_event_loop_delay = max_event_loop_delay self._max_used_cpu_ratio = max_used_cpu_ratio self._max_used_memory_ratio = max_used_memory_ratio + self._max_event_loop_delay = max_event_loop_delay self._max_client_errors = max_client_errors - self._snapshot_history = snapshot_history - self._reserve_memory_ratio = reserve_memory_ratio - self._memory_warning_cooldown_period = memory_warning_cooldown_period - self._client_rate_limit_error_retry_count = client_rate_limit_error_retry_count - self._max_memory_size = max_memory_size or self._get_default_max_memory_size( - cast(float, available_memory_ratio) - ) + self._max_memory_size = max_memory_size self._cpu_snapshots = self._get_sorted_list_by_created_at(list[CpuSnapshot]()) self._event_loop_snapshots = self._get_sorted_list_by_created_at(list[EventLoopSnapshot]()) self._memory_snapshots = self._get_sorted_list_by_created_at(list[MemorySnapshot]()) self._client_snapshots = self._get_sorted_list_by_created_at(list[ClientSnapshot]()) - self._snapshot_event_loop_task = RecurringTask(self._snapshot_event_loop, self._event_loop_snapshot_interval) - self._snapshot_client_task = RecurringTask(self._snapshot_client, self._client_snapshot_interval) + self._snapshot_event_loop_task = RecurringTask(self._snapshot_event_loop, self._EVENT_LOOP_SNAPSHOT_INTERVAL) + self._snapshot_client_task = RecurringTask(self._snapshot_client, self._CLIENT_SNAPSHOT_INTERVAL) self._timestamp_of_last_memory_warning: datetime = datetime.now(timezone.utc) - timedelta(hours=1) # Flag to indicate the context state. 
self._active = False + @classmethod + def from_config(cls, config: Configuration | None = None) -> Snapshotter: + """Create a new instance based on the provided configuration. + + Args: + config: The configuration object. Uses the global (default) configuration if not provided. + """ + config = config or service_locator.get_configuration() + + # Compute the maximum memory size based on the provided configuration. If `memory_mbytes` is provided, + # it uses that value. Otherwise, it calculates the `max_memory_size` as a proportion of the system's + # total available memory based on `available_memory_ratio`. + max_memory_size = ( + ByteSize.from_mb(config.memory_mbytes) + if config.memory_mbytes + else ByteSize(int(get_memory_info().total_size.bytes * config.available_memory_ratio)) + ) + + return cls( + max_used_cpu_ratio=config.max_used_cpu_ratio, + max_used_memory_ratio=config.max_used_memory_ratio, + max_event_loop_delay=config.max_event_loop_delay, + max_client_errors=config.max_client_errors, + max_memory_size=max_memory_size, + ) + @staticmethod def _get_sorted_list_by_created_at(input_list: list[T]) -> SortedList[T]: return SortedList(input_list, key=attrgetter('created_at')) - @staticmethod - def _get_default_max_memory_size(available_memory_ratio: float) -> ByteSize: - """Default `memory_max_size` is 1/4 of the total system memory.""" - max_memory_size_in_bytes = int(psutil.virtual_memory().total * available_memory_ratio) - max_memory_size = ByteSize(max_memory_size_in_bytes) - logger.info(f'Setting max_memory_size of this run to {max_memory_size}.') - return max_memory_size - @property def active(self) -> bool: """Indicates whether the context is active.""" @@ -281,7 +292,7 @@ def _snapshot_event_loop(self) -> None: previous_snapshot = self._event_loop_snapshots[-1] if self._event_loop_snapshots else None if previous_snapshot: - event_loop_delay = snapshot.created_at - previous_snapshot.created_at - self._event_loop_snapshot_interval + event_loop_delay = 
snapshot.created_at - previous_snapshot.created_at - self._EVENT_LOOP_SNAPSHOT_INTERVAL snapshot.delay = event_loop_delay snapshots = cast(list[Snapshot], self._event_loop_snapshots) @@ -319,7 +330,7 @@ def _prune_snapshots(self, snapshots: list[Snapshot], now: datetime) -> None: # We'll keep snapshots from this index onwards. keep_from_index = None for i, snapshot in enumerate(snapshots): - if now - snapshot.created_at <= self._snapshot_history: + if now - snapshot.created_at <= self._SNAPSHOT_HISTORY: keep_from_index = i break @@ -338,11 +349,11 @@ def _evaluate_memory_load(self, current_memory_usage_size: ByteSize, snapshot_ti snapshot_timestamp: The time at which the memory snapshot was taken. """ # Check if the warning has been logged recently to avoid spamming - if snapshot_timestamp < self._timestamp_of_last_memory_warning + self._memory_warning_cooldown_period: + if snapshot_timestamp < self._timestamp_of_last_memory_warning + self._MEMORY_WARNING_COOLDOWN_PERIOD: return threshold_memory_size = self._max_used_memory_ratio * self._max_memory_size - buffer_memory_size = self._max_memory_size * (1 - self._max_used_memory_ratio) * self._reserve_memory_ratio + buffer_memory_size = self._max_memory_size * (1 - self._max_used_memory_ratio) * self._RESERVE_MEMORY_RATIO overload_memory_threshold_size = threshold_memory_size + buffer_memory_size # Log a warning if current memory usage exceeds the critical overload threshold diff --git a/src/crawlee/_service_locator.py b/src/crawlee/_service_locator.py index ed879a8d0d..b82b7fe00c 100644 --- a/src/crawlee/_service_locator.py +++ b/src/crawlee/_service_locator.py @@ -51,7 +51,11 @@ def get_event_manager(self) -> EventManager: if self._event_manager is None: from crawlee.events import LocalEventManager - self._event_manager = LocalEventManager() + self._event_manager = ( + LocalEventManager.from_config(config=self._configuration) + if self._configuration + else LocalEventManager.from_config() + ) 
self._event_manager_was_retrieved = True return self._event_manager @@ -75,7 +79,11 @@ def get_storage_client(self) -> BaseStorageClient: if self._storage_client is None: from crawlee.storage_clients import MemoryStorageClient - self._storage_client = MemoryStorageClient.from_config() + self._storage_client = ( + MemoryStorageClient.from_config(config=self._configuration) + if self._configuration + else MemoryStorageClient.from_config() + ) self._storage_client_was_retrieved = True return self._storage_client diff --git a/src/crawlee/browsers/_playwright_browser_plugin.py b/src/crawlee/browsers/_playwright_browser_plugin.py index 3945b0fbac..a8a4867c6f 100644 --- a/src/crawlee/browsers/_playwright_browser_plugin.py +++ b/src/crawlee/browsers/_playwright_browser_plugin.py @@ -8,6 +8,7 @@ from playwright.async_api import Playwright, async_playwright from typing_extensions import override +from crawlee import service_locator from crawlee._utils.context import ensure_context from crawlee._utils.docs import docs_group from crawlee.browsers._base_browser_plugin import BaseBrowserPlugin @@ -35,8 +36,8 @@ def __init__( self, *, browser_type: BrowserType = 'chromium', - browser_launch_options: Mapping[str, Any] | None = None, - browser_new_context_options: Mapping[str, Any] | None = None, + browser_launch_options: dict[str, Any] | None = None, + browser_new_context_options: dict[str, Any] | None = None, max_open_pages_per_browser: int = 20, ) -> None: """A default constructor. @@ -52,8 +53,17 @@ def __init__( max_open_pages_per_browser: The maximum number of pages that can be opened in a single browser instance. Once reached, a new browser instance will be launched to handle the excess. """ + config = service_locator.get_configuration() + + # Default browser launch options are based on the configuration. 
+ default_launch_browser_options = { + 'headless': config.headless, + 'executable_path': config.default_browser_path, + 'chromium_sandbox': not config.disable_browser_sandbox, + } + self._browser_type = browser_type - self._browser_launch_options = browser_launch_options or {} + self._browser_launch_options = default_launch_browser_options | (browser_launch_options or {}) self._browser_new_context_options = browser_new_context_options or {} self._max_open_pages_per_browser = max_open_pages_per_browser diff --git a/src/crawlee/configuration.py b/src/crawlee/configuration.py index fdd14c1953..5b6921eda4 100644 --- a/src/crawlee/configuration.py +++ b/src/crawlee/configuration.py @@ -44,7 +44,11 @@ class Configuration(BaseSettings): ) ), ] = None - """This setting is currently unused. For more details, see https://github.com/apify/crawlee-python/issues/670.""" + """Specifies the path to the browser executable. Currently primarily for Playwright-based features. This option + is passed directly to Playwright's `browser_type.launch` method as `executable_path` argument. For more details, + refer to the Playwright documentation: + https://playwright.dev/docs/api/class-browsertype#browser-type-launch. + """ disable_browser_sandbox: Annotated[ bool, @@ -55,7 +59,10 @@ class Configuration(BaseSettings): ) ), ] = False - """This setting is currently unused. For more details, see https://github.com/apify/crawlee-python/issues/670.""" + """Disables the sandbox for the browser. Currently primarily for Playwright-based features. This option + is passed directly to Playwright's `browser_type.launch` method as `chromium_sandbox`. For more details, + refer to the Playwright documentation: + https://playwright.dev/docs/api/class-browsertype#browser-type-launch.""" log_level: Annotated[ Literal['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], @@ -79,7 +86,7 @@ class Configuration(BaseSettings): ) ), ] = 'default' - """The default dataset ID.""" + """The default dataset ID. 
This option is utilized by the storage client.""" default_key_value_store_id: Annotated[ str, @@ -91,7 +98,7 @@ class Configuration(BaseSettings): ) ), ] = 'default' - """The default key-value store ID.""" + """The default key-value store ID. This option is utilized by the storage client.""" default_request_queue_id: Annotated[ str, @@ -103,7 +110,7 @@ class Configuration(BaseSettings): ) ), ] = 'default' - """The default request queue ID.""" + """The default request queue ID. This option is utilized by the storage client.""" purge_on_start: Annotated[ bool, @@ -114,10 +121,10 @@ class Configuration(BaseSettings): ) ), ] = True - """Whether to purge the storage on the start.""" + """Whether to purge the storage on the start. This option is utilized by the `MemoryStorageClient`.""" write_metadata: Annotated[bool, Field(alias='crawlee_write_metadata')] = True - """Whether to write the storage metadata.""" + """Whether to write the storage metadata. This option is utilized by the `MemoryStorageClient`.""" persist_storage: Annotated[ bool, @@ -128,7 +135,7 @@ class Configuration(BaseSettings): ) ), ] = True - """Whether to persist the storage.""" + """Whether to persist the storage. This option is utilized by the `MemoryStorageClient`.""" persist_state_interval: Annotated[ timedelta_ms, @@ -139,7 +146,8 @@ class Configuration(BaseSettings): ) ), ] = timedelta(minutes=1) - """This setting is currently unused. For more details, see https://github.com/apify/crawlee-python/issues/670.""" + """Interval at which `PersistState` events are emitted. The event ensures the state persistence during + the crawler run. This option is utilized by the `EventManager`.""" system_info_interval: Annotated[ timedelta_ms, @@ -150,7 +158,8 @@ class Configuration(BaseSettings): ) ), ] = timedelta(seconds=1) - """This setting is currently unused. For more details, see https://github.com/apify/crawlee-python/issues/670.""" + """Interval at which `SystemInfo` events are emitted. 
The event represents the current status of the system. + This option is utilized by the `LocalEventManager`.""" max_used_cpu_ratio: Annotated[ float, @@ -161,7 +170,44 @@ class Configuration(BaseSettings): ) ), ] = 0.95 - """This setting is currently unused. For more details, see https://github.com/apify/crawlee-python/issues/670.""" + """The maximum CPU usage ratio. If the CPU usage exceeds this value, the system is considered overloaded. + This option is used by the `Snapshotter`.""" + + max_used_memory_ratio: Annotated[ + float, + Field( + validation_alias=AliasChoices( + 'apify_max_used_memory_ratio', + 'crawlee_max_used_memory_ratio', + ) + ), + ] = 0.9 + """The maximum memory usage ratio. If the memory usage exceeds this ratio, it is considered overloaded. + This option is used by the `Snapshotter`.""" + + max_event_loop_delay: Annotated[ + timedelta_ms, + Field( + validation_alias=AliasChoices( + 'apify_max_event_loop_delay_millis', + 'crawlee_max_event_loop_delay_millis', + ) + ), + ] = timedelta(milliseconds=50) + """The maximum event loop delay. If the event loop delay exceeds this value, it is considered overloaded. + This option is used by the `Snapshotter`.""" + + max_client_errors: Annotated[ + int, + Field( + validation_alias=AliasChoices( + 'apify_max_client_errors', + 'crawlee_max_client_errors', + ) + ), + ] = 1 + """The maximum number of client errors (HTTP 429) allowed before the system is considered overloaded. + This option is used by the `Snapshotter`.""" memory_mbytes: Annotated[ int | None, @@ -173,7 +219,7 @@ class Configuration(BaseSettings): ) ), ] = None - """The maximum memory in megabytes. The `Snapshotter.max_memory_size` is set to this value.""" + """The maximum used memory in megabytes. This option is utilized by the `Snapshotter`.""" available_memory_ratio: Annotated[ float, @@ -184,8 +230,8 @@ class Configuration(BaseSettings): ) ), ] = 0.25 - """The ratio of system memory to use when memory_mbytes is not specified. 
The `Snapshotter.available_memory_ratio` - is set to this value.""" + """The maximum proportion of system memory to use. If `memory_mbytes` is not provided, this ratio is used to + calculate the maximum memory. This option is utilized by the `Snapshotter`.""" storage_dir: Annotated[ str, @@ -196,7 +242,7 @@ class Configuration(BaseSettings): ), ), ] = './storage' - """The path to the storage directory.""" + """The path to the storage directory. This option is utilized by the `MemoryStorageClient`.""" chrome_executable_path: Annotated[ str | None, @@ -207,7 +253,7 @@ class Configuration(BaseSettings): ) ), ] = None - """This setting is currently unused. For more details, see https://github.com/apify/crawlee-python/issues/670.""" + """This setting is currently unused.""" headless: Annotated[ bool, @@ -218,7 +264,11 @@ class Configuration(BaseSettings): ) ), ] = True - """This setting is currently unused. For more details, see https://github.com/apify/crawlee-python/issues/670.""" + """Whether to run the browser in headless mode. Currently primarily for Playwright-based features. This option + is passed directly to Playwright's `browser_type.launch` method as `headless`. For more details, + refer to the Playwright documentation: + https://playwright.dev/docs/api/class-browsertype#browser-type-launch. + """ xvfb: Annotated[ bool, @@ -229,7 +279,7 @@ class Configuration(BaseSettings): ) ), ] = False - """This setting is currently unused. 
For more details, see https://github.com/apify/crawlee-python/issues/670.""" + """This setting is currently unused.""" @classmethod def get_global_configuration(cls) -> Self: diff --git a/src/crawlee/crawlers/_basic/_basic_crawler.py b/src/crawlee/crawlers/_basic/_basic_crawler.py index 9cd660b4fd..6c8e4edc06 100644 --- a/src/crawlee/crawlers/_basic/_basic_crawler.py +++ b/src/crawlee/crawlers/_basic/_basic_crawler.py @@ -23,7 +23,6 @@ from crawlee._log_config import configure_logger, get_configured_log_level from crawlee._request import Request, RequestState from crawlee._types import BasicCrawlingContext, HttpHeaders, RequestHandlerRunResult, SendRequestFunction -from crawlee._utils.byte_size import ByteSize from crawlee._utils.docs import docs_group from crawlee._utils.urls import convert_to_absolute_url, is_url_absolute from crawlee._utils.wait import wait_for @@ -305,10 +304,7 @@ def __init__( # Internal, not explicitly configurable components self._tld_extractor = TLDExtract(cache_dir=tempfile.TemporaryDirectory().name) - self._snapshotter = Snapshotter( - max_memory_size=ByteSize.from_mb(config.memory_mbytes) if config.memory_mbytes else None, - available_memory_ratio=config.available_memory_ratio, - ) + self._snapshotter = Snapshotter.from_config(config) self._autoscaled_pool = AutoscaledPool( system_status=SystemStatus(self._snapshotter), is_finished_function=self.__is_finished_function, diff --git a/src/crawlee/events/_local_event_manager.py b/src/crawlee/events/_local_event_manager.py index d1cea37b39..b5d05ddc97 100644 --- a/src/crawlee/events/_local_event_manager.py +++ b/src/crawlee/events/_local_event_manager.py @@ -10,6 +10,7 @@ from crawlee._utils.docs import docs_group from crawlee._utils.recurring_task import RecurringTask from crawlee._utils.system import get_cpu_info, get_memory_info +from crawlee.configuration import Configuration from crawlee.events._event_manager import EventManager, EventManagerOptions from crawlee.events._types import 
Event, EventSystemInfoData @@ -32,6 +33,9 @@ def __init__( ) -> None: """A default constructor. + In most cases, you should use the `from_config` constructor to create a new instance based on + the provided configuration. + Args: system_info_interval: Interval at which `SystemInfo` events are emitted. event_manager_options: Additional options for the parent class. @@ -46,6 +50,20 @@ def __init__( super().__init__(**event_manager_options) + @classmethod + def from_config(cls, config: Configuration | None = None) -> LocalEventManager: + """Create a new instance based on the provided configuration. + + Args: + config: The configuration object. Uses the global (default) configuration if not provided. + """ + config = config or Configuration.get_global_configuration() + + return cls( + system_info_interval=config.system_info_interval, + persist_state_interval=config.persist_state_interval, + ) + async def __aenter__(self) -> LocalEventManager: """Initializes the local event manager upon entering the async context. diff --git a/src/crawlee/storage_clients/_memory/_memory_storage_client.py b/src/crawlee/storage_clients/_memory/_memory_storage_client.py index d66ab61815..7dfde57376 100644 --- a/src/crawlee/storage_clients/_memory/_memory_storage_client.py +++ b/src/crawlee/storage_clients/_memory/_memory_storage_client.py @@ -68,6 +68,9 @@ def __init__( ) -> None: """A default constructor. + In most cases, you should use the `from_config` constructor to create a new instance based on + the provided configuration. + Args: write_metadata: Whether to write metadata to the storage. persist_storage: Whether to persist the storage. @@ -95,10 +98,8 @@ def __init__( def from_config(cls, config: Configuration | None = None) -> MemoryStorageClient: """Create a new instance based on the provided configuration. - All the memory storage client parameters are taken from the configuration object. - Args: - config: The configuration object. + config: The configuration object. 
Uses the global (default) configuration if not provided. """ config = config or Configuration.get_global_configuration() diff --git a/tests/unit/_autoscaling/test_snapshotter.py b/tests/unit/_autoscaling/test_snapshotter.py index 3c491c5cd2..1bf14b4a9f 100644 --- a/tests/unit/_autoscaling/test_snapshotter.py +++ b/tests/unit/_autoscaling/test_snapshotter.py @@ -12,12 +12,14 @@ from crawlee._autoscaling.types import CpuSnapshot, EventLoopSnapshot, Snapshot from crawlee._utils.byte_size import ByteSize from crawlee._utils.system import CpuInfo, MemoryInfo +from crawlee.configuration import Configuration from crawlee.events._types import Event, EventSystemInfoData @pytest.fixture def snapshotter() -> Snapshotter: - return Snapshotter(available_memory_ratio=0.25) + config = Configuration(available_memory_ratio=0.25) + return Snapshotter.from_config(config) @pytest.fixture @@ -32,7 +34,9 @@ def event_system_data_info() -> EventSystemInfoData: async def test_start_stop_lifecycle() -> None: - async with Snapshotter(available_memory_ratio=0.25): + config = Configuration(available_memory_ratio=0.25) + + async with Snapshotter.from_config(config): pass @@ -92,9 +96,7 @@ async def test_get_cpu_sample(snapshotter: Snapshotter) -> None: assert len(samples) == len(cpu_snapshots) -async def test_methods_raise_error_when_not_active() -> None: - snapshotter = Snapshotter(available_memory_ratio=0.25) - +async def test_methods_raise_error_when_not_active(snapshotter: Snapshotter) -> None: assert snapshotter.active is False with pytest.raises(RuntimeError, match='Snapshotter is not active.'): @@ -124,7 +126,7 @@ async def test_methods_raise_error_when_not_active() -> None: def test_snapshot_pruning_removes_outdated_records(snapshotter: Snapshotter) -> None: # Set the snapshot history to 2 hours - snapshotter._snapshot_history = timedelta(hours=2) + snapshotter._SNAPSHOT_HISTORY = timedelta(hours=2) # Create timestamps for testing now = datetime.now(timezone.utc) @@ -164,7 +166,7 @@ 
def test_pruning_empty_snapshot_list_remains_empty(snapshotter: Snapshotter) -> def test_snapshot_pruning_keeps_recent_records_unaffected(snapshotter: Snapshotter) -> None: - snapshotter._snapshot_history = timedelta(hours=2) + snapshotter._SNAPSHOT_HISTORY = timedelta(hours=2) # Create timestamps for testing now = datetime.now(timezone.utc) @@ -192,7 +194,9 @@ def test_snapshot_pruning_keeps_recent_records_unaffected(snapshotter: Snapshott def test_memory_load_evaluation_logs_warning_on_high_usage(caplog: pytest.LogCaptureFixture) -> None: - snapshotter = Snapshotter(max_memory_size=ByteSize.from_gb(8)) + config = Configuration(memory_mbytes=8 * 1024)  # 8 GB, expressed in megabytes + + snapshotter = Snapshotter.from_config(config) high_memory_usage = ByteSize.from_gb(8) * 0.95 # 95% of 8 GB @@ -232,7 +236,7 @@ def test_memory_load_evaluation_silent_on_acceptable_usage( assert mock_logger_warn.call_count == 0 -async def test_snapshots_time_ordered() -> None: +async def test_snapshots_time_ordered(snapshotter: Snapshotter) -> None: # All internal snapshot list should be ordered by creation time in ascending order. # Scenario where older emitted event arrives after newer event. # Snapshotter should not trust the event order and check events' times. 
@@ -249,7 +253,7 @@ def create_event_data(creation_time: datetime) -> EventSystemInfoData: async with ( service_locator.get_event_manager() as event_manager, - Snapshotter(available_memory_ratio=0.25) as snapshotter, + snapshotter, ): event_manager.emit(event=Event.SYSTEM_INFO, event_data=create_event_data(time_new)) await event_manager.wait_for_all_listeners_to_complete() diff --git a/tests/unit/_autoscaling/test_system_status.py b/tests/unit/_autoscaling/test_system_status.py index acb6e35314..b6893f4ca0 100644 --- a/tests/unit/_autoscaling/test_system_status.py +++ b/tests/unit/_autoscaling/test_system_status.py @@ -15,6 +15,7 @@ SystemInfo, ) from crawlee._utils.byte_size import ByteSize +from crawlee.configuration import Configuration if TYPE_CHECKING: from collections.abc import AsyncGenerator @@ -22,7 +23,8 @@ @pytest.fixture async def snapshotter() -> AsyncGenerator[Snapshotter, None]: - async with Snapshotter(available_memory_ratio=0.25) as snapshotter: + config = Configuration(available_memory_ratio=0.25) + async with Snapshotter.from_config(config) as snapshotter: yield snapshotter @@ -32,7 +34,9 @@ def now() -> datetime: async def test_start_stop_lifecycle() -> None: - async with Snapshotter(available_memory_ratio=0.25) as snapshotter: + config = Configuration(available_memory_ratio=0.25) + + async with Snapshotter.from_config(config) as snapshotter: system_status = SystemStatus(snapshotter) system_status.get_current_system_info() system_status.get_historical_system_info() diff --git a/tests/unit/browsers/test_playwright_browser_plugin.py b/tests/unit/browsers/test_playwright_browser_plugin.py index addf2f1344..73be0103f4 100644 --- a/tests/unit/browsers/test_playwright_browser_plugin.py +++ b/tests/unit/browsers/test_playwright_browser_plugin.py @@ -28,7 +28,8 @@ async def test_initial_state() -> None: # Test initial state assert plugin.browser_type == 'chromium' - assert plugin.browser_launch_options == {'headless': False} + assert 'headless' in 
plugin.browser_launch_options + assert plugin.browser_launch_options['headless'] is False assert plugin.browser_new_context_options == {'viewport': {'width': 1920, 'height': 1080}} assert plugin.max_open_pages_per_browser == 10 diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py index cd7f54c9b3..f8ef842ab4 100644 --- a/tests/unit/conftest.py +++ b/tests/unit/conftest.py @@ -37,6 +37,10 @@ def prepare_test_env(monkeypatch: pytest.MonkeyPatch, tmp_path: Path) -> Callabl """ def _prepare_test_env() -> None: + # Disable the browser sandbox by setting the environment variable. This is required for running + # Playwright tests in the CI environment, where the sandbox is not supported. + monkeypatch.setenv('CRAWLEE_DISABLE_BROWSER_SANDBOX', 'true') + # Set the environment variable for the local storage directory to the temporary path. monkeypatch.setenv('CRAWLEE_STORAGE_DIR', str(tmp_path)) diff --git a/tests/unit/crawlers/_basic/test_basic_crawler.py b/tests/unit/crawlers/_basic/test_basic_crawler.py index 5d7ec04280..15c1780b7a 100644 --- a/tests/unit/crawlers/_basic/test_basic_crawler.py +++ b/tests/unit/crawlers/_basic/test_basic_crawler.py @@ -1005,7 +1005,7 @@ async def handler(context: BasicCrawlingContext) -> None: async def test_sets_services() -> None: custom_configuration = Configuration() - custom_event_manager = LocalEventManager() + custom_event_manager = LocalEventManager.from_config(custom_configuration) custom_storage_client = MemoryStorageClient.from_config(custom_configuration) crawler = BasicCrawler(