Skip to content

Commit 2b156b4

Browse files
Pijukatelbarjinvdusek
authored
feat: Integrate browserforge fingerprints (#829)
### Description Integrate `browserforge` package and use its fingerprint and header generation capabilities to enable fingerprint generation in PlaywrightCrawler ### Issues - Will be documented later by #481 - Closes: #549 --------- Co-authored-by: Jindřich Bär <[email protected]> Co-authored-by: Vlada Dusek <[email protected]>
1 parent 4c362cb commit 2b156b4

16 files changed

+510
-81
lines changed

Makefile

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@ install-dev:
1010
poetry install --all-extras
1111
poetry run pre-commit install
1212
poetry run playwright install
13+
poetry run python -m browserforge update
1314

1415
build:
1516
poetry build --no-interaction -vv
Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
import asyncio
2+
3+
from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext
4+
from crawlee.fingerprint_suite import DefaultFingerprintGenerator, HeaderGeneratorOptions, ScreenOptions
5+
6+
7+
async def main() -> None:
8+
# Use default fingerprint generator with desired fingerprint options.
9+
# The generator will try to generate a realistic-looking browser fingerprint based on the options.
10+
# Unspecified fingerprint options will be automatically selected by the generator.
11+
fingerprint_generator = DefaultFingerprintGenerator(
12+
header_options=HeaderGeneratorOptions(browsers=['chromium']),
13+
screen_options=ScreenOptions(min_width=400),
14+
)
15+
16+
crawler = PlaywrightCrawler(
17+
# Limit the crawl to max requests. Remove or increase it for crawling all links.
18+
max_requests_per_crawl=10,
19+
# Headless mode, set to False to see the browser in action.
20+
headless=False,
21+
# Browser types supported by Playwright.
22+
browser_type='chromium',
23+
# Fingerprint generator to be used. By default no fingerprint generation is done.
24+
fingerprint_generator=fingerprint_generator,
25+
)
26+
27+
# Define the default request handler, which will be called for every request.
28+
@crawler.router.default_handler
29+
async def request_handler(context: PlaywrightCrawlingContext) -> None:
30+
context.log.info(f'Processing {context.request.url} ...')
31+
32+
# Find a link to the next page and enqueue it if it exists.
33+
await context.enqueue_links(selector='.morelink')
34+
35+
# Run the crawler with the initial list of URLs.
36+
await crawler.run(['https://news.ycombinator.com/'])
37+
38+
39+
if __name__ == '__main__':
40+
asyncio.run(main())
Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
---
2+
id: playwright-crawler-with-fingeprint-generator
3+
title: Playwright crawler with fingerprint generator
4+
---
5+
6+
import ApiLink from '@site/src/components/ApiLink';
7+
import CodeBlock from '@theme/CodeBlock';
8+
9+
import PlaywrightCrawlerExample from '!!raw-loader!./code/playwright_crawler_with_fingerprint_generator.py';
10+
11+
This example demonstrates how to use <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> together with <ApiLink to="class/FingerprintGenerator">`FingerprintGenerator`</ApiLink>, which populates several browser attributes to mimic a real browser fingerprint. To read more about fingerprints, see: https://docs.apify.com/academy/anti-scraping/techniques/fingerprinting.
12+
13+
You can implement your own fingerprint generator or use <ApiLink to="class/BrowserforgeFingerprintGenerator">`DefaultFingerprintGenerator`</ApiLink>. To use the generator, initialize it with the desired fingerprint options. The generator will try to create a fingerprint based on those options. Unspecified options will be automatically selected by the generator from a set of reasonable values. If an option is important to you, do not rely on the default; define it explicitly.
14+
15+
<CodeBlock className="language-python">
16+
{PlaywrightCrawlerExample}
17+
</CodeBlock>

poetry.lock

Lines changed: 64 additions & 45 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pyproject.toml

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ keywords = [
4444
python = "^3.9"
4545
apify = { version = ">=2.0.0", optional = true }
4646
beautifulsoup4 = { version = ">=4.12.0", optional = true }
47+
browserforge = { version = ">=1.2.3", optional = true }
4748
colorama = ">=0.4.0"
4849
cookiecutter = ">=2.6.0"
4950
curl-cffi = { version = ">=0.7.2", optional = true }
@@ -100,10 +101,10 @@ types-python-dateutil = "~2.9.0.20240316"
100101
# Support for re-using groups in other groups https://peps.python.org/pep-0735/ in poetry:
101102
# https://github.com/python-poetry/poetry/issues/9751
102103
adaptive-playwright = ["jaro-winkler", "playwright", "scikit-learn"]
103-
all = ["beautifulsoup4", "curl-cffi", "html5lib", "jaro-winkler", "lxml", "parsel", "playwright", "scikit-learn"]
104+
all = ["beautifulsoup4", "browserforge", "curl-cffi", "html5lib", "jaro-winkler", "lxml", "parsel", "playwright", "scikit-learn"]
104105
beautifulsoup = ["beautifulsoup4", "lxml", "html5lib"]
105106
curl-impersonate = ["curl-cffi"]
106-
playwright = ["playwright"]
107+
playwright = ["browserforge", "playwright"]
107108
parsel = ["parsel"]
108109

109110
[tool.poetry.scripts]

src/crawlee/browsers/_browser_pool.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
from types import TracebackType
2424

2525
from crawlee.browsers._base_browser_plugin import BaseBrowserPlugin
26+
from crawlee.fingerprint_suite import FingerprintGenerator
2627
from crawlee.proxy_configuration import ProxyInfo
2728

2829
logger = getLogger(__name__)
@@ -103,6 +104,7 @@ def with_default_plugin(
103104
browser_launch_options: Mapping[str, Any] | None = None,
104105
browser_new_context_options: Mapping[str, Any] | None = None,
105106
headless: bool | None = None,
107+
fingerprint_generator: FingerprintGenerator | None = None,
106108
use_incognito_pages: bool | None = False,
107109
**kwargs: Any,
108110
) -> BrowserPool:
@@ -117,6 +119,8 @@ def with_default_plugin(
117119
are provided directly to Playwright's `browser.new_context` method. For more details, refer to the
118120
Playwright documentation: https://playwright.dev/python/docs/api/class-browser#browser-new-context.
119121
headless: Whether to run the browser in headless mode.
122+
fingerprint_generator: An optional instance of a `FingerprintGenerator` implementation that is used
123+
to generate browser fingerprints together with consistent headers.
120124
use_incognito_pages: By default pages share the same browser context. If set to True each page uses its
121125
own context that is destroyed once the page is closed or crashes.
122126
kwargs: Additional arguments for default constructor.
@@ -134,7 +138,10 @@ def with_default_plugin(
134138
if browser_type:
135139
plugin_options['browser_type'] = browser_type
136140

137-
plugin = PlaywrightBrowserPlugin(**plugin_options)
141+
plugin = PlaywrightBrowserPlugin(
142+
**plugin_options,
143+
fingerprint_generator=fingerprint_generator,
144+
)
138145
return cls(plugins=[plugin], **kwargs)
139146

140147
@property

0 commit comments

Comments
 (0)