Skip to content

Commit 82ff69a

Browse files
authored
fix: Fix the order in which cookies are saved to the SessionCookies and the handler is executed for PlaywrightCrawler (#1163)
### Description

- For `PlaywrightCrawler`, cookies should be saved to the session store only after the handler has fully executed, because the browser may continue to set cookies while the handler is running.

### Testing

- Add a test that simulates a cookie being set in the browser while the `default_handler` is executing.
- Update the `test_isolation_cookies` test.
1 parent f937070 commit 82ff69a

File tree

2 files changed

+44
-9
lines changed

2 files changed

+44
-9
lines changed

src/crawlee/crawlers/_playwright/_playwright_crawler.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -232,10 +232,6 @@ async def _navigate(
232232
# Set the loaded URL to the actual URL after redirection.
233233
context.request.loaded_url = context.page.url
234234

235-
if context.session:
236-
pw_cookies = await self._get_cookies(context.page)
237-
context.session.cookies.set_cookies_from_playwright_format(pw_cookies)
238-
239235
extract_links = self._create_extract_links_function(context)
240236

241237
error = yield PlaywrightCrawlingContext(
@@ -256,6 +252,10 @@ async def _navigate(
256252
block_requests=partial(block_requests, page=context.page),
257253
)
258254

255+
if context.session:
256+
pw_cookies = await self._get_cookies(context.page)
257+
context.session.cookies.set_cookies_from_playwright_format(pw_cookies)
258+
259259
# Collect data in case of errors, before the page object is closed.
260260
if error:
261261
await self.statistics.error_tracker.add(error=error, context=context, early=True)

tests/unit/crawlers/_playwright/test_playwright_crawler.py

Lines changed: 40 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@
2222
from crawlee.fingerprint_suite._browserforge_adapter import get_available_header_values
2323
from crawlee.fingerprint_suite._consts import BROWSER_TYPE_HEADER_KEYWORD
2424
from crawlee.proxy_configuration import ProxyConfiguration
25-
from crawlee.sessions import SessionPool
25+
from crawlee.sessions import Session, SessionPool
2626
from crawlee.statistics import Statistics
2727
from crawlee.statistics._error_snapshotter import ErrorSnapshotter
2828
from tests.unit.server_endpoints import GENERIC_RESPONSE, HELLO_WORLD
@@ -304,6 +304,7 @@ async def some_hook(context: PlaywrightPreNavCrawlingContext) -> None:
304304
)
305305
async def test_isolation_cookies(*, use_incognito_pages: bool, server_url: URL) -> None:
306306
sessions_ids: list[str] = []
307+
sessions: dict[str, Session] = {}
307308
sessions_cookies: dict[str, dict[str, str]] = {}
308309
response_cookies: dict[str, dict[str, str]] = {}
309310

@@ -319,13 +320,11 @@ async def handler(context: PlaywrightCrawlingContext) -> None:
319320
return
320321

321322
sessions_ids.append(context.session.id)
323+
sessions[context.session.id] = context.session
322324

323325
if context.request.unique_key not in {'1', '2'}:
324326
return
325327

326-
sessions_cookies[context.session.id] = {
327-
cookie['name']: cookie['value'] for cookie in context.session.cookies.get_cookies_as_dicts()
328-
}
329328
response_data = json.loads(await context.response.text())
330329
response_cookies[context.session.id] = response_data.get('cookies')
331330

@@ -343,11 +342,20 @@ async def handler(context: PlaywrightCrawlingContext) -> None:
343342
]
344343
)
345344

346-
assert len(sessions_cookies) == 2
347345
assert len(response_cookies) == 2
346+
assert len(sessions) == 2
348347

349348
assert sessions_ids[0] == sessions_ids[1]
350349

350+
sessions_cookies = {
351+
sessions_id: {
352+
cookie['name']: cookie['value'] for cookie in sessions[sessions_id].cookies.get_cookies_as_dicts()
353+
}
354+
for sessions_id in sessions_ids
355+
}
356+
357+
assert len(sessions_cookies) == 2
358+
351359
cookie_session_id = sessions_ids[0]
352360
clean_session_id = sessions_ids[2]
353361

@@ -372,6 +380,33 @@ async def handler(context: PlaywrightCrawlingContext) -> None:
372380
assert sessions_cookies[clean_session_id] == response_cookies[clean_session_id] == {'a': '1'}
373381

374382

383+
async def test_save_cookies_after_handler_processing(server_url: URL) -> None:
384+
"""Test that cookies are saved correctly."""
385+
async with SessionPool(max_pool_size=1) as session_pool:
386+
crawler = PlaywrightCrawler(session_pool=session_pool)
387+
388+
session_ids = []
389+
390+
@crawler.router.default_handler
391+
async def request_handler(context: PlaywrightCrawlingContext) -> None:
392+
# Simulate cookies installed from an external source in the browser
393+
await context.page.context.add_cookies([{'name': 'check', 'value': 'test', 'url': str(server_url)}])
394+
395+
if context.session:
396+
session_ids.append(context.session.id)
397+
398+
await crawler.run([str(server_url)])
399+
400+
assert len(session_ids) == 1
401+
402+
check_session = await session_pool.get_session()
403+
404+
assert check_session.id == session_ids[0]
405+
session_cookies = {cookie['name']: cookie['value'] for cookie in check_session.cookies.get_cookies_as_dicts()}
406+
407+
assert session_cookies == {'check': 'test'}
408+
409+
375410
async def test_custom_fingerprint_uses_generator_options(server_url: URL) -> None:
376411
min_width = 300
377412
max_width = 600

0 commit comments

Comments
 (0)