Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 51 additions & 2 deletions src/crawlee/crawlers/_playwright/_playwright_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import asyncio
import logging
import warnings
from functools import partial
from typing import TYPE_CHECKING, Any, Callable, Generic, Literal, Union

Expand Down Expand Up @@ -34,11 +35,20 @@
from collections.abc import AsyncGenerator, Awaitable, Mapping, Sequence
from pathlib import Path

from playwright.async_api import Page
from playwright.async_api import Page, Route
from playwright.async_api import Request as PlaywrightRequest
from typing_extensions import Unpack

from crawlee import RequestTransformAction
from crawlee._types import BasicCrawlingContext, EnqueueLinksFunction, EnqueueLinksKwargs, ExtractLinksFunction
from crawlee._types import (
BasicCrawlingContext,
EnqueueLinksFunction,
EnqueueLinksKwargs,
ExtractLinksFunction,
HttpHeaders,
HttpMethod,
HttpPayload,
)
from crawlee.browsers._types import BrowserType


Expand Down Expand Up @@ -210,6 +220,27 @@ async def _open_page(
await hook(pre_navigation_context)
yield pre_navigation_context

def _prepare_request_interceptor(
    self,
    method: HttpMethod = 'GET',
    headers: HttpHeaders | dict[str, str] | None = None,
    payload: HttpPayload | None = None,
) -> Callable:
    """Build a Playwright route handler that continues a request with overridden parameters.

    Playwright's page navigation is used for GET requests; to issue other methods, or to attach
    custom headers or a body, the outgoing request is intercepted and continued with the values
    supplied here.

    Args:
        method: HTTP method the intercepted request is continued with.
        headers: Custom HTTP headers for the request. A missing or empty collection is forwarded
            as `None`.
        payload: Request body data, forwarded as the `post_data` of the continued request.

    Returns:
        An async callable taking the intercepted route and the original request, suitable for
        registering as a route handler.
    """

    async def continue_with_overrides(route: Route, _request: PlaywrightRequest) -> None:
        # The header mapping is copied into a plain dict on every invocation; falsy values
        # (None or an empty collection) are passed on as None.
        if headers:
            header_overrides = dict(headers)
        else:
            header_overrides = None

        await route.continue_(method=method, headers=header_overrides, post_data=payload)

    return continue_with_overrides

async def _navigate(
self,
context: PlaywrightPreNavCrawlingContext,
Expand All @@ -235,6 +266,24 @@ async def _navigate(
if context.request.headers:
await context.page.set_extra_http_headers(context.request.headers.model_dump())
# Navigate to the URL and get response.
if context.request.method != 'GET':
# Call the notification only once
warnings.warn(
'Using other request methods than GET or adding payloads has a high impact on performance'
' in recent versions of Playwright. Use only when necessary.',
category=UserWarning,
stacklevel=2,
)

route_handler = self._prepare_request_interceptor(
method=context.request.method,
headers=context.request.headers,
payload=context.request.payload,
)

# Set route_handler only for current request
await context.page.route(context.request.url, route_handler)

response = await context.page.goto(context.request.url)

if response is None:
Expand Down
20 changes: 13 additions & 7 deletions tests/unit/crawlers/_playwright/test_playwright_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,12 +35,21 @@
from yarl import URL

from crawlee._request import RequestOptions
from crawlee._types import HttpMethod, HttpPayload
from crawlee.browsers._types import BrowserType
from crawlee.crawlers import PlaywrightCrawlingContext, PlaywrightPreNavCrawlingContext


async def test_basic_request(server_url: URL) -> None:
requests = [str(server_url)]
@pytest.mark.parametrize(
('method', 'path', 'payload'),
[
pytest.param('GET', 'get', None, id='get request'),
pytest.param('POST', 'post', None, id='post request'),
pytest.param('POST', 'post', b'Hello, world!', id='post request with payload'),
],
)
async def test_basic_request(method: HttpMethod, path: str, payload: HttpPayload, server_url: URL) -> None:
requests = [Request.from_url(str(server_url / path), method=method, payload=payload)]
crawler = PlaywrightCrawler()
result: dict = {}

Expand All @@ -49,14 +58,11 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None:
assert context.page is not None
result['request_url'] = context.request.url
result['page_url'] = context.page.url
result['page_title'] = await context.page.title()
result['page_content'] = await context.page.content()

await crawler.run(requests)

assert result.get('request_url') == result.get('page_url') == requests[0]
assert 'Hello, world!' in result.get('page_title', '')
assert '<html' in result.get('page_content', '') # there is some HTML content
assert result.get('request_url') == result.get('page_url') == requests[0].url
assert (payload.decode() if payload else '') in result.get('page_content', '')


async def test_enqueue_links(redirect_server_url: URL, server_url: URL) -> None:
Expand Down
Loading