Skip to content

Commit ffc6048

Browse files
Pijukatel and vdusek authored
refactor!: Rename PlaywrightCrawler kwargs: browser_options, page_options (#831)
### Description The motivation is for the argument names to match their actual usage. The same renaming is applied in the internal components. The `PlaywrightBrowserController.close` method now relies on the browser context to close its pages, as recommended by the Playwright documentation. ### Breaking changes - Renamed the `PlaywrightCrawler.__init__` keyword arguments: `browser_options` to `browser_launch_options`, and `page_options` to `browser_new_context_options`. - The same keyword arguments were renamed in `PlaywrightBrowserPlugin`, `BaseBrowserPlugin`, `PlaywrightBrowserController`, and `BaseBrowserController`, in every method where the previously named arguments were present. --------- Co-authored-by: Vlada Dusek <[email protected]>
1 parent 108ca00 commit ffc6048

File tree

10 files changed

+126
-66
lines changed

10 files changed

+126
-66
lines changed

docs/examples/code/playwright_crawler_with_camoufox.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@ async def new_browser(self) -> PlaywrightBrowserController:
1818
raise RuntimeError('Playwright browser plugin is not initialized.')
1919

2020
return PlaywrightBrowserController(
21-
browser=await AsyncNewBrowser(self._playwright, headless=True, **self._browser_options),
21+
browser=await AsyncNewBrowser(self._playwright, headless=True, **self._browser_launch_options),
2222
max_open_pages_per_browser=1, # Increase, if camoufox can handle it in your use case.
2323
header_generator=None, # This turns off the crawlee header_generation. Camoufox has its own.
2424
)

docs/upgrading/upgrading_to_v0x.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,10 @@ This section summarizes the breaking changes between v0.4.x and v0.5.0.
4040
### PlaywrightCrawler
4141

4242
- The `PlaywrightPreNavigationContext` was renamed to `PlaywrightPreNavCrawlingContext`.
43+
- The input arguments in `PlaywrightCrawler.__init__` have been renamed:
44+
- `browser_options` is now `browser_launch_options`,
45+
- `page_options` is now `browser_new_context_options`.
46+
- These argument renaming changes have also been applied to `BrowserPool`, `PlaywrightBrowserPlugin`, and `PlaywrightBrowserController`.
4347

4448
## Upgrading to v0.4
4549

src/crawlee/browsers/_base_browser_controller.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -59,13 +59,15 @@ def browser_type(self) -> BrowserType:
5959
@abstractmethod
6060
async def new_page(
6161
self,
62-
page_options: Mapping[str, Any] | None = None,
62+
browser_new_context_options: Mapping[str, Any] | None = None,
6363
proxy_info: ProxyInfo | None = None,
6464
) -> Page:
6565
"""Create a new page with the given context options.
6666
6767
Args:
68-
page_options: Options to configure the new page.
68+
browser_new_context_options: Keyword arguments to pass to the browser new context method. These options
69+
are provided directly to Playwright's `browser.new_context` method. For more details, refer to the
70+
Playwright documentation: https://playwright.dev/python/docs/api/class-browser#browser-new-context.
6971
proxy_info: The proxy configuration to use for the new page.
7072
7173
Returns:

src/crawlee/browsers/_base_browser_plugin.py

Lines changed: 14 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -35,13 +35,23 @@ def browser_type(self) -> BrowserType:
3535

3636
@property
3737
@abstractmethod
38-
def browser_options(self) -> Mapping[str, Any]:
39-
"""Return the options for a new browser."""
38+
def browser_launch_options(self) -> Mapping[str, Any]:
39+
"""Return the options for the `browser.launch` method.
40+
41+
Keyword arguments to pass to the browser launch method. These options are provided directly to Playwright's
42+
`browser_type.launch` method. For more details, refer to the Playwright documentation:
43+
https://playwright.dev/python/docs/api/class-browsertype#browser-type-launch.
44+
"""
4045

4146
@property
4247
@abstractmethod
43-
def page_options(self) -> Mapping[str, Any]:
44-
"""Return the options for a new page."""
48+
def browser_new_context_options(self) -> Mapping[str, Any]:
49+
"""Return the options for the `browser.new_context` method.
50+
51+
Keyword arguments to pass to the browser new context method. These options are provided directly to Playwright's
52+
`browser.new_context` method. For more details, refer to the Playwright documentation:
53+
https://playwright.dev/python/docs/api/class-browser#browser-new-context.
54+
"""
4555

4656
@property
4757
@abstractmethod

src/crawlee/browsers/_browser_pool.py

Lines changed: 16 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -100,30 +100,30 @@ def with_default_plugin(
100100
cls,
101101
*,
102102
browser_type: BrowserType | None = None,
103-
browser_options: Mapping[str, Any] | None = None,
104-
page_options: Mapping[str, Any] | None = None,
103+
browser_launch_options: Mapping[str, Any] | None = None,
104+
browser_new_context_options: Mapping[str, Any] | None = None,
105105
headless: bool | None = None,
106106
**kwargs: Any,
107107
) -> BrowserPool:
108108
"""Create a new instance with a single `PlaywrightBrowserPlugin` configured with the provided options.
109109
110110
Args:
111111
browser_type: The type of browser to launch ('chromium', 'firefox', or 'webkit').
112-
browser_options: Keyword arguments to pass to the browser launch method. These options are provided
112+
browser_launch_options: Keyword arguments to pass to the browser launch method. These options are provided
113113
directly to Playwright's `browser_type.launch` method. For more details, refer to the Playwright
114114
documentation: https://playwright.dev/python/docs/api/class-browsertype#browser-type-launch.
115-
page_options: Keyword arguments to pass to the page object is set at the playwright context level.
116-
These options are provided directly to Playwright's `browser.new_context` method. For more details,
117-
refer to the Playwright documentation: https://playwright.dev/python/docs/api/class-browser#browser-new-context.
115+
browser_new_context_options: Keyword arguments to pass to the browser new context method. These options
116+
are provided directly to Playwright's `browser.new_context` method. For more details, refer to the
117+
Playwright documentation: https://playwright.dev/python/docs/api/class-browser#browser-new-context.
118118
headless: Whether to run the browser in headless mode.
119119
kwargs: Additional arguments for default constructor.
120120
"""
121121
plugin_options: dict = defaultdict(dict)
122-
plugin_options['browser_options'] = browser_options or {}
123-
plugin_options['page_options'] = page_options or {}
122+
plugin_options['browser_launch_options'] = browser_launch_options or {}
123+
plugin_options['browser_new_context_options'] = browser_new_context_options or {}
124124

125125
if headless is not None:
126-
plugin_options['browser_options']['headless'] = headless
126+
plugin_options['browser_launch_options']['headless'] = headless
127127

128128
if browser_type:
129129
plugin_options['browser_type'] = browser_type
@@ -262,13 +262,16 @@ async def _get_new_page(
262262
) -> CrawleePage:
263263
"""Internal method to initialize a new page in a browser using the specified plugin."""
264264
timeout = self._operation_timeout.total_seconds()
265-
browser = self._pick_browser_with_free_capacity(plugin)
265+
browser_controller = self._pick_browser_with_free_capacity(plugin)
266266

267267
try:
268-
if not browser:
269-
browser = await asyncio.wait_for(self._launch_new_browser(plugin), timeout)
268+
if not browser_controller:
269+
browser_controller = await asyncio.wait_for(self._launch_new_browser(plugin), timeout)
270270
page = await asyncio.wait_for(
271-
browser.new_page(page_options=plugin.page_options, proxy_info=proxy_info),
271+
browser_controller.new_page(
272+
browser_new_context_options=plugin.browser_new_context_options,
273+
proxy_info=proxy_info,
274+
),
272275
timeout,
273276
)
274277
except asyncio.TimeoutError as exc:

src/crawlee/browsers/_playwright_browser_controller.py

Lines changed: 41 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,10 @@
2020

2121
from crawlee.proxy_configuration import ProxyInfo
2222

23+
from logging import getLogger
24+
25+
logger = getLogger(__name__)
26+
2327

2428
@docs_group('Classes')
2529
class PlaywrightBrowserController(BaseBrowserController):
@@ -94,11 +98,25 @@ def browser_type(self) -> BrowserType:
9498
@override
9599
async def new_page(
96100
self,
97-
page_options: Mapping[str, Any] | None = None,
101+
browser_new_context_options: Mapping[str, Any] | None = None,
98102
proxy_info: ProxyInfo | None = None,
99103
) -> Page:
104+
"""Create a new page with the given context options.
105+
106+
Args:
107+
browser_new_context_options: Keyword arguments to pass to the browser new context method. These options
108+
are provided directly to Playwright's `browser.new_context` method. For more details, refer to the
109+
Playwright documentation: https://playwright.dev/python/docs/api/class-browser#browser-new-context.
110+
proxy_info: The proxy configuration to use for the new page.
111+
112+
Returns:
113+
Page: The newly created page.
114+
115+
Raises:
116+
ValueError: If the browser has reached the maximum number of open pages.
117+
"""
100118
if not self._browser_context:
101-
self._browser_context = await self._create_browser_context(page_options, proxy_info)
119+
self._browser_context = await self._create_browser_context(browser_new_context_options, proxy_info)
102120

103121
if not self.has_free_capacity:
104122
raise ValueError('Cannot open more pages in this browser.')
@@ -116,21 +134,27 @@ async def new_page(
116134

117135
@override
118136
async def close(self, *, force: bool = False) -> None:
119-
if force:
120-
for page in self._pages:
121-
await page.close()
137+
"""Close the browser.
122138
123-
if self.pages_count > 0:
139+
Args:
140+
force: Whether to force close all open pages before closing the browser.
141+
142+
Raises:
143+
ValueError: If there are still open pages when trying to close the browser.
144+
"""
145+
if self.pages_count > 0 and not force:
124146
raise ValueError('Cannot close the browser while there are open pages.')
125147

148+
if self._browser_context:
149+
await self._browser_context.close()
126150
await self._browser.close()
127151

128152
def _on_page_close(self, page: Page) -> None:
129153
"""Handle actions after a page is closed."""
130154
self._pages.remove(page)
131155

132156
async def _create_browser_context(
133-
self, page_options: Mapping[str, Any] | None = None, proxy_info: ProxyInfo | None = None
157+
self, browser_new_context_options: Mapping[str, Any] | None = None, proxy_info: ProxyInfo | None = None
134158
) -> BrowserContext:
135159
"""Create a new browser context with the specified proxy settings."""
136160
if self._header_generator:
@@ -141,17 +165,19 @@ async def _create_browser_context(
141165
else:
142166
extra_http_headers = None
143167

144-
page_options = dict(page_options) if page_options else {}
145-
page_options['extra_http_headers'] = page_options.get('extra_http_headers', extra_http_headers)
168+
browser_new_context_options = dict(browser_new_context_options) if browser_new_context_options else {}
169+
browser_new_context_options['extra_http_headers'] = browser_new_context_options.get(
170+
'extra_http_headers', extra_http_headers
171+
)
172+
173+
if proxy_info:
174+
if browser_new_context_options['proxy']:
175+
logger.warning("browser_new_context_options['proxy'] overriden by explicit `proxy_info` argument.")
146176

147-
proxy = (
148-
ProxySettings(
177+
browser_new_context_options['proxy'] = ProxySettings(
149178
server=f'{proxy_info.scheme}://{proxy_info.hostname}:{proxy_info.port}',
150179
username=proxy_info.username,
151180
password=proxy_info.password,
152181
)
153-
if proxy_info
154-
else None
155-
)
156182

157-
return await self._browser.new_context(proxy=proxy, **page_options)
183+
return await self._browser.new_context(**browser_new_context_options)

src/crawlee/browsers/_playwright_browser_plugin.py

Lines changed: 27 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -35,26 +35,26 @@ def __init__(
3535
self,
3636
*,
3737
browser_type: BrowserType = 'chromium',
38-
browser_options: Mapping[str, Any] | None = None,
39-
page_options: Mapping[str, Any] | None = None,
38+
browser_launch_options: Mapping[str, Any] | None = None,
39+
browser_new_context_options: Mapping[str, Any] | None = None,
4040
max_open_pages_per_browser: int = 20,
4141
) -> None:
4242
"""A default constructor.
4343
4444
Args:
4545
browser_type: The type of browser to launch ('chromium', 'firefox', or 'webkit').
46-
browser_options: Keyword arguments to pass to the browser launch method. These options are provided
46+
browser_launch_options: Keyword arguments to pass to the browser launch method. These options are provided
4747
directly to Playwright's `browser_type.launch` method. For more details, refer to the Playwright
4848
documentation: https://playwright.dev/python/docs/api/class-browsertype#browser-type-launch.
49-
page_options: Keyword arguments to pass to the page object is set at the playwright context level.
50-
These options are provided directly to Playwright's `browser.new_context` method. For more details,
51-
refer to the Playwright documentation: https://playwright.dev/python/docs/api/class-browser#browser-new-context.
49+
browser_new_context_options: Keyword arguments to pass to the browser new context method. These options
50+
are provided directly to Playwright's `browser.new_context` method. For more details, refer to the
51+
Playwright documentation: https://playwright.dev/python/docs/api/class-browser#browser-new-context.
5252
max_open_pages_per_browser: The maximum number of pages that can be opened in a single browser instance.
5353
Once reached, a new browser instance will be launched to handle the excess.
5454
"""
5555
self._browser_type = browser_type
56-
self._browser_options = browser_options or {}
57-
self._page_options = page_options or {}
56+
self._browser_launch_options = browser_launch_options or {}
57+
self._browser_new_context_options = browser_new_context_options or {}
5858
self._max_open_pages_per_browser = max_open_pages_per_browser
5959

6060
self._playwright_context_manager = async_playwright()
@@ -75,13 +75,25 @@ def browser_type(self) -> BrowserType:
7575

7676
@property
7777
@override
78-
def browser_options(self) -> Mapping[str, Any]:
79-
return self._browser_options
78+
def browser_launch_options(self) -> Mapping[str, Any]:
79+
"""Return the options for the `browser.launch` method.
80+
81+
Keyword arguments to pass to the browser launch method. These options are provided directly to Playwright's
82+
`browser_type.launch` method. For more details, refer to the Playwright documentation:
83+
https://playwright.dev/python/docs/api/class-browsertype#browser-type-launch.
84+
"""
85+
return self._browser_launch_options
8086

8187
@property
8288
@override
83-
def page_options(self) -> Mapping[str, Any]:
84-
return self._page_options
89+
def browser_new_context_options(self) -> Mapping[str, Any]:
90+
"""Return the options for the `browser.new_context` method.
91+
92+
Keyword arguments to pass to the browser new context method. These options are provided directly to Playwright's
93+
`browser.new_context` method. For more details, refer to the Playwright documentation:
94+
https://playwright.dev/python/docs/api/class-browser#browser-new-context.
95+
"""
96+
return self._browser_new_context_options
8597

8698
@property
8799
@override
@@ -117,11 +129,11 @@ async def new_browser(self) -> PlaywrightBrowserController:
117129
raise RuntimeError('Playwright browser plugin is not initialized.')
118130

119131
if self._browser_type == 'chromium':
120-
browser = await self._playwright.chromium.launch(**self._browser_options)
132+
browser = await self._playwright.chromium.launch(**self._browser_launch_options)
121133
elif self._browser_type == 'firefox':
122-
browser = await self._playwright.firefox.launch(**self._browser_options)
134+
browser = await self._playwright.firefox.launch(**self._browser_launch_options)
123135
elif self._browser_type == 'webkit':
124-
browser = await self._playwright.webkit.launch(**self._browser_options)
136+
browser = await self._playwright.webkit.launch(**self._browser_launch_options)
125137
else:
126138
raise ValueError(f'Invalid browser type: {self._browser_type}')
127139

src/crawlee/playwright_crawler/_playwright_crawler.py

Lines changed: 14 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -71,8 +71,8 @@ def __init__(
7171
self,
7272
browser_pool: BrowserPool | None = None,
7373
browser_type: BrowserType | None = None,
74-
browser_options: Mapping[str, Any] | None = None,
75-
page_options: Mapping[str, Any] | None = None,
74+
browser_launch_options: Mapping[str, Any] | None = None,
75+
browser_new_context_options: Mapping[str, Any] | None = None,
7676
headless: bool | None = None,
7777
**kwargs: Unpack[BasicCrawlerOptions[PlaywrightCrawlingContext]],
7878
) -> None:
@@ -82,33 +82,36 @@ def __init__(
8282
browser_pool: A `BrowserPool` instance to be used for launching the browsers and getting pages.
8383
browser_type: The type of browser to launch ('chromium', 'firefox', or 'webkit').
8484
This option should not be used if `browser_pool` is provided.
85-
browser_options: Keyword arguments to pass to the browser launch method. These options are provided
85+
browser_launch_options: Keyword arguments to pass to the browser launch method. These options are provided
8686
directly to Playwright's `browser_type.launch` method. For more details, refer to the Playwright
8787
documentation: https://playwright.dev/python/docs/api/class-browsertype#browser-type-launch.
8888
This option should not be used if `browser_pool` is provided.
89-
page_options: Keyword arguments to pass to the new page method. These options are provided directly to
90-
Playwright's `browser_context.new_page` method. For more details, refer to the Playwright documentation:
91-
https://playwright.dev/python/docs/api/class-browsercontext#browser-context-new-page.
89+
browser_new_context_options: Keyword arguments to pass to the browser new context method. These options
90+
are provided directly to Playwright's `browser.new_context` method. For more details, refer to the
91+
Playwright documentation: https://playwright.dev/python/docs/api/class-browser#browser-new-context.
9292
This option should not be used if `browser_pool` is provided.
9393
headless: Whether to run the browser in headless mode.
9494
This option should not be used if `browser_pool` is provided.
9595
kwargs: Additional keyword arguments to pass to the underlying `BasicCrawler`.
9696
"""
9797
if browser_pool:
9898
# Raise an exception if browser_pool is provided together with other browser-related arguments.
99-
if any(param is not None for param in (headless, browser_type, browser_options, page_options)):
99+
if any(
100+
param is not None
101+
for param in (headless, browser_type, browser_launch_options, browser_new_context_options)
102+
):
100103
raise ValueError(
101-
'You cannot provide `headless`, `browser_type`, `browser_options` or `page_options` '
102-
'arguments when `browser_pool` is provided.'
104+
'You cannot provide `headless`, `browser_type`, `browser_launch_options` or '
105+
'`browser_new_context_options` arguments when `browser_pool` is provided.'
103106
)
104107

105108
# If browser_pool is not provided, create a new instance of BrowserPool with specified arguments.
106109
else:
107110
browser_pool = BrowserPool.with_default_plugin(
108111
headless=headless,
109112
browser_type=browser_type,
110-
browser_options=browser_options,
111-
page_options=page_options,
113+
browser_launch_options=browser_launch_options,
114+
browser_new_context_options=browser_new_context_options,
112115
)
113116

114117
self._browser_pool = browser_pool

tests/unit/browsers/test_browser_pool.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -154,7 +154,7 @@ async def test_methods_raise_error_when_not_active() -> None:
154154

155155

156156
async def test_with_plugin_contains_page_options(httpbin: URL) -> None:
157-
plugin = PlaywrightBrowserPlugin(page_options={'user_agent': 'My Best User-Agent'})
157+
plugin = PlaywrightBrowserPlugin(browser_new_context_options={'user_agent': 'My Best User-Agent'})
158158
async with BrowserPool(plugins=[plugin]) as browser_pool:
159159
test_page = await browser_pool.new_page()
160160
await test_page.page.goto(str(httpbin / 'user-agent'))

0 commit comments

Comments
 (0)