Skip to content

Commit dbb9f44

Browse files
authored
fix: add support for non-GET requests for PlaywrightCrawler (#1208)
### Description

- Add support for non-GET requests for `PlaywrightCrawler` using [`Route`](https://playwright.dev/python/docs/api/class-route).

### Issues

- Closes: #1201
1 parent 7d23580 commit dbb9f44

File tree

2 files changed

+64
-9
lines changed

2 files changed

+64
-9
lines changed

src/crawlee/crawlers/_playwright/_playwright_crawler.py

Lines changed: 51 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22

33
import asyncio
44
import logging
5+
import warnings
56
from functools import partial
67
from typing import TYPE_CHECKING, Any, Callable, Generic, Literal, Union
78

@@ -34,11 +35,20 @@
3435
from collections.abc import AsyncGenerator, Awaitable, Mapping, Sequence
3536
from pathlib import Path
3637

37-
from playwright.async_api import Page
38+
from playwright.async_api import Page, Route
39+
from playwright.async_api import Request as PlaywrightRequest
3840
from typing_extensions import Unpack
3941

4042
from crawlee import RequestTransformAction
41-
from crawlee._types import BasicCrawlingContext, EnqueueLinksFunction, EnqueueLinksKwargs, ExtractLinksFunction
43+
from crawlee._types import (
44+
BasicCrawlingContext,
45+
EnqueueLinksFunction,
46+
EnqueueLinksKwargs,
47+
ExtractLinksFunction,
48+
HttpHeaders,
49+
HttpMethod,
50+
HttpPayload,
51+
)
4252
from crawlee.browsers._types import BrowserType
4353

4454

@@ -210,6 +220,27 @@ async def _open_page(
210220
await hook(pre_navigation_context)
211221
yield pre_navigation_context
212222

223+
def _prepare_request_interceptor(
224+
self,
225+
method: HttpMethod = 'GET',
226+
headers: HttpHeaders | dict[str, str] | None = None,
227+
payload: HttpPayload | None = None,
228+
) -> Callable:
229+
"""Create a request interceptor for Playwright to support non-GET methods with custom parameters.
230+
231+
The interceptor modifies requests by adding custom headers and payload before they are sent.
232+
233+
Args:
234+
method: HTTP method to use for the request.
235+
headers: Custom HTTP headers to send with the request.
236+
payload: Request body data for POST/PUT requests.
237+
"""
238+
239+
async def route_handler(route: Route, _: PlaywrightRequest) -> None:
240+
await route.continue_(method=method, headers=dict(headers) if headers else None, post_data=payload)
241+
242+
return route_handler
243+
213244
async def _navigate(
214245
self,
215246
context: PlaywrightPreNavCrawlingContext,
@@ -235,6 +266,24 @@ async def _navigate(
235266
if context.request.headers:
236267
await context.page.set_extra_http_headers(context.request.headers.model_dump())
237268
# Navigate to the URL and get response.
269+
if context.request.method != 'GET':
270+
# Call the notification only once
271+
warnings.warn(
272+
'Using other request methods than GET or adding payloads has a high impact on performance'
273+
' in recent versions of Playwright. Use only when necessary.',
274+
category=UserWarning,
275+
stacklevel=2,
276+
)
277+
278+
route_handler = self._prepare_request_interceptor(
279+
method=context.request.method,
280+
headers=context.request.headers,
281+
payload=context.request.payload,
282+
)
283+
284+
# Set route_handler only for current request
285+
await context.page.route(context.request.url, route_handler)
286+
238287
response = await context.page.goto(context.request.url)
239288

240289
if response is None:

tests/unit/crawlers/_playwright/test_playwright_crawler.py

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -35,12 +35,21 @@
3535
from yarl import URL
3636

3737
from crawlee._request import RequestOptions
38+
from crawlee._types import HttpMethod, HttpPayload
3839
from crawlee.browsers._types import BrowserType
3940
from crawlee.crawlers import PlaywrightCrawlingContext, PlaywrightPreNavCrawlingContext
4041

4142

42-
async def test_basic_request(server_url: URL) -> None:
43-
requests = [str(server_url)]
43+
@pytest.mark.parametrize(
44+
('method', 'path', 'payload'),
45+
[
46+
pytest.param('GET', 'get', None, id='get request'),
47+
pytest.param('POST', 'post', None, id='post request'),
48+
pytest.param('POST', 'post', b'Hello, world!', id='post request with payload'),
49+
],
50+
)
51+
async def test_basic_request(method: HttpMethod, path: str, payload: HttpPayload, server_url: URL) -> None:
52+
requests = [Request.from_url(str(server_url / path), method=method, payload=payload)]
4453
crawler = PlaywrightCrawler()
4554
result: dict = {}
4655

@@ -49,14 +58,11 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None:
4958
assert context.page is not None
5059
result['request_url'] = context.request.url
5160
result['page_url'] = context.page.url
52-
result['page_title'] = await context.page.title()
5361
result['page_content'] = await context.page.content()
5462

5563
await crawler.run(requests)
56-
57-
assert result.get('request_url') == result.get('page_url') == requests[0]
58-
assert 'Hello, world!' in result.get('page_title', '')
59-
assert '<html' in result.get('page_content', '') # there is some HTML content
64+
assert result.get('request_url') == result.get('page_url') == requests[0].url
65+
assert (payload.decode() if payload else '') in result.get('page_content', '')
6066

6167

6268
async def test_enqueue_links(redirect_server_url: URL, server_url: URL) -> None:

0 commit comments

Comments
 (0)