Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions docs/guides/avoid_blocking.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ Browser fingerprint is a collection of browser attributes and significant featur

## Using browser fingerprints

Changing browser fingerprints can be a tedious job. Luckily, Crawlee provides this feature with minimal configuration necessary - the usage of fingerprints can be enabled in <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> by using the `fingerprint_generator` argument of the <ApiLink to="class/PlaywrightCrawler#__init__">`PlaywrightCrawler.__init__`</ApiLink>. You can either pass your own implementation of <ApiLink to="class/FingerprintGenerator">`FingerprintGenerator`</ApiLink> or use <ApiLink to="class/BrowserforgeFingerprintGenerator">`DefaultFingerprintGenerator`</ApiLink>.
Changing browser fingerprints can be a tedious job. Luckily, Crawlee provides this feature with minimal configuration necessary - the usage of fingerprints in <ApiLink to="class/PlaywrightCrawler">`PlaywrightCrawler`</ApiLink> is enabled by default. You can customize the fingerprints by using the `fingerprint_generator` argument of the <ApiLink to="class/PlaywrightCrawler#__init__">`PlaywrightCrawler.__init__`</ApiLink>: either pass your own implementation of <ApiLink to="class/FingerprintGenerator">`FingerprintGenerator`</ApiLink> or use <ApiLink to="class/BrowserforgeFingerprintGenerator">`DefaultFingerprintGenerator`</ApiLink>.

<CodeBlock className="language-python">
{PlaywrightDefaultFingerprintGenerator}
Expand All @@ -29,7 +29,7 @@ In certain cases we want to narrow down the fingerprints used - e.g. specify a c
{PlaywrightDefaultFingerprintGeneratorWithArgs}
</CodeBlock>

If you do not want to use fingerprints, then do not pass `fingerprint_generator` argument to the <ApiLink to="class/PlaywrightCrawler#__init__">`PlaywrightCrawler.__init__`</ApiLink>. By default, fingerprints are disabled.
If you do not want to use fingerprints, then pass the `fingerprint_generator=None` argument to <ApiLink to="class/PlaywrightCrawler#__init__">`PlaywrightCrawler.__init__`</ApiLink>.

## Using Camoufox

Expand Down
Original file line number Diff line number Diff line change
@@ -1,14 +1,11 @@
import asyncio

from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext
from crawlee.fingerprint_suite import DefaultFingerprintGenerator


async def main() -> None:
crawler = PlaywrightCrawler(
# Fingerprint generator to be used. By default no fingerprint generation is done.
fingerprint_generator=DefaultFingerprintGenerator(),
)
# Fingerprint generator is used by default.
crawler = PlaywrightCrawler()

# Define the default request handler, which will be called for every request.
@crawler.router.default_handler
Expand Down
14 changes: 10 additions & 4 deletions src/crawlee/crawlers/_playwright/_playwright_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import logging
from functools import partial
from typing import TYPE_CHECKING, Any, Callable, Generic
from typing import TYPE_CHECKING, Any, Callable, Generic, Literal

from pydantic import ValidationError
from typing_extensions import NotRequired, TypedDict, TypeVar
Expand All @@ -14,6 +14,7 @@
from crawlee.browsers import BrowserPool
from crawlee.crawlers._basic import BasicCrawler, BasicCrawlerOptions, ContextPipeline
from crawlee.errors import SessionError
from crawlee.fingerprint_suite import DefaultFingerprintGenerator, FingerprintGenerator, HeaderGeneratorOptions
from crawlee.sessions._cookies import PlaywrightCookieParam
from crawlee.statistics import StatisticsState

Expand All @@ -34,7 +35,6 @@
from crawlee import RequestTransformAction
from crawlee._types import BasicCrawlingContext, EnqueueLinksKwargs
from crawlee.browsers._types import BrowserType
from crawlee.fingerprint_suite import FingerprintGenerator


@docs_group('Classes')
Expand Down Expand Up @@ -86,7 +86,7 @@ def __init__(
user_data_dir: str | Path | None = None,
browser_launch_options: Mapping[str, Any] | None = None,
browser_new_context_options: Mapping[str, Any] | None = None,
fingerprint_generator: FingerprintGenerator | None = None,
fingerprint_generator: FingerprintGenerator | None | Literal['default'] = 'default',
headless: bool | None = None,
use_incognito_pages: bool | None = None,
**kwargs: Unpack[BasicCrawlerOptions[PlaywrightCrawlingContext, StatisticsState]],
Expand Down Expand Up @@ -119,7 +119,7 @@ def __init__(
if browser_pool:
# Raise an exception if browser_pool is provided together with other browser-related arguments.
if any(
param is not None
param not in [None, 'default']
for param in (
user_data_dir,
use_incognito_pages,
Expand All @@ -138,6 +138,12 @@ def __init__(

# If browser_pool is not provided, create a new instance of BrowserPool with specified arguments.
else:
if fingerprint_generator == 'default':
generator_browser_type = None if browser_type is None else [browser_type]
fingerprint_generator = DefaultFingerprintGenerator(
header_options=HeaderGeneratorOptions(browsers=generator_browser_type)
)

browser_pool = BrowserPool.with_default_plugin(
headless=headless,
browser_type=browser_type,
Expand Down
154 changes: 117 additions & 37 deletions src/crawlee/fingerprint_suite/_browserforge_adapter.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,17 @@
from __future__ import annotations

from collections.abc import Iterable
from copy import deepcopy
from typing import TYPE_CHECKING, Any
from functools import reduce
from operator import or_
from typing import TYPE_CHECKING, Any, Literal

from browserforge.bayesian_network import extract_json
from browserforge.fingerprints import Fingerprint as bf_Fingerprint
from browserforge.fingerprints import FingerprintGenerator as bf_FingerprintGenerator
from browserforge.fingerprints import Screen
from browserforge.headers.generator import DATA_DIR
from browserforge.headers import Browser
from browserforge.headers.generator import DATA_DIR, ListOrString
from browserforge.headers.generator import HeaderGenerator as bf_HeaderGenerator
from typing_extensions import override

Expand All @@ -21,7 +25,7 @@


class PatchedHeaderGenerator(bf_HeaderGenerator):
"""Browserforge `HeaderGenerator` that contains patches not accepted in upstream repo."""
"""Browserforge `HeaderGenerator` that contains patches specific for our usage of the generator."""

def _get_accept_language_header(self, locales: tuple[str, ...]) -> str:
"""Generates the Accept-Language header based on the given locales.
Expand All @@ -38,6 +42,114 @@ def _get_accept_language_header(self, locales: tuple[str, ...]) -> str:
additional_locales = [f'{locale};q={0.9 - index * 0.1:.1f}' for index, locale in enumerate(locales[1:])]
return ','.join((locales[0], *additional_locales))

def generate(
    self,
    *,
    browser: Iterable[str | Browser] | None = None,
    os: ListOrString | None = None,
    device: ListOrString | None = None,
    locale: ListOrString | None = None,
    http_version: Literal[1, 2] | None = None,
    user_agent: ListOrString | None = None,
    strict: bool | None = None,
    request_dependent_headers: dict[str, str] | None = None,
) -> dict[str, str]:
    """Generate HTTP headers based on the specified parameters.

    For detailed description of the original method see: `browserforge.headers.generator.HeaderGenerator.generate`
    This patched version of the method adds additional quality checks on the output of the original method. It tries
    to generate headers several times until they match the requirements.

    The `browser` parameter accepts `chromium` as a general category, which includes not only Google Chrome
    but also other Chromium-based browsers. As a result, a Safari-like user agent may be generated for a `chromium`
    input, such as:
    ```
    Mozilla/5.0 (iPhone; CPU iPhone OS 18_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko)
    CriOS/130.0.6723.90 Mobile/15E148 Safari/604.1
    ```
    To maintain consistency with previous implementations, only a subset of Chromium headers will be allowed.

    Only the first entry of `browser` is taken into account; all other arguments are forwarded unchanged to
    the original method.

    Returns:
        The generated headers.

    Raises:
        RuntimeError: If none of the attempts produced headers that pass the checks.
    """
    # browserforge header generation can be flaky. Enforce basic QA on generated headers
    max_attempts = 10

    single_browser = self._get_single_browser_type(browser)

    if single_browser == 'chromium':
        # `BrowserForge` header generator considers `chromium` in general sense and therefore will generate also
        # other `Chromium` based browser headers. This adapter desires only specific subset of `chromium` headers
        # that contain all 'sec-ch-ua', 'sec-ch-ua-mobile', 'sec-ch-ua-platform' headers.
        # Increase max attempts as from `BrowserForge` header generator perspective even `chromium`
        # headers without `sec-...` headers are valid.
        max_attempts += 50

    # Browserforge uses term 'safari', we use term 'webkit'
    bf_browser_type = 'safari' if single_browser == 'webkit' else single_browser

    # The expected keywords do not change between attempts, so resolve them once.
    expected_keywords = self._get_expected_browser_keywords(single_browser)

    # Use browserforge to generate headers until it satisfies our additional requirements.
    for _attempt in range(max_attempts):
        generated_header: dict[str, str] = super().generate(
            browser=bf_browser_type,
            os=os,
            device=device,
            locale=locale,
            http_version=http_version,
            user_agent=user_agent,
            strict=strict,
            request_dependent_headers=request_dependent_headers,
        )

        # Read the user agent once and tolerate its absence, so that a header set without `User-Agent` is
        # rejected and retried like any other unsuitable candidate instead of raising `KeyError`.
        candidate_user_agent = generated_header.get('User-Agent', '')

        if ('headless' in candidate_user_agent.lower()) or (
            'headless' in generated_header.get('sec-ch-ua', '').lower()
        ):
            # It can be a valid header, but we never want to leak "headless". Get a different one.
            continue

        if not any(keyword in candidate_user_agent for keyword in expected_keywords):
            # The user agent does not look like the requested browser type. Get a different one.
            continue

        if single_browser == 'chromium' and not self._contains_all_sec_headers(generated_header):
            # Accept chromium header only with all sec headers.
            continue

        return generated_header
    raise RuntimeError('Failed to generate header.')

def _contains_all_sec_headers(self, headers: dict[str, str]) -> bool:
return all(header_name in headers for header_name in ('sec-ch-ua', 'sec-ch-ua-mobile', 'sec-ch-ua-platform'))

def _get_expected_browser_keywords(self, browser: str | None) -> set[str]:
    """Return the keywords, one of which must appear in the generated user agent for `browser`.

    A falsy `browser` means there is no preferred browser type; in that case the keyword sets of all entries
    in `BROWSER_TYPE_HEADER_KEYWORD` are merged into one.
    """
    if browser:
        return BROWSER_TYPE_HEADER_KEYWORD[browser]

    # Allow all possible keywords when there is no preference for specific browser type.
    all_keyword_sets = BROWSER_TYPE_HEADER_KEYWORD.values()
    return reduce(or_, all_keyword_sets)

def _get_single_browser_type(self, browser: Iterable[str | Browser] | None) -> str | None:
"""Get single browser type.

Browserforge header generator accepts wider range of possible types.
Narrow it to single optional string as that is how we use it.
Handling the original multitype would be pointlessly complex.
"""
# In our case we never pass more than one browser type. In general case more browsers are just bigger pool to
# select from, so narrowing it to the first one is still a valid action.
first_browser = (
next(iter(browser)) if (isinstance(browser, Iterable) and not isinstance(browser, str)) else browser
)

if isinstance(first_browser, str):
single_name = first_browser
elif isinstance(first_browser, Browser):
single_name = first_browser.name
else:
single_name = None

return single_name


class PatchedFingerprintGenerator(bf_FingerprintGenerator):
"""Browserforge `FingerprintGenerator` that contains patches not accepted in upstream repo."""
Expand Down Expand Up @@ -91,8 +203,6 @@ def __init__(
screen_options: Defines the screen constrains for the fingerprint generator.
mock_web_rtc: Whether to mock WebRTC when injecting the fingerprint.
slim: Disables performance-heavy evasions when injecting the fingerprint.
strict: If set to `True`, it will raise error if it is not possible to generate fingerprints based on the
`options`. Default behavior is relaxation of `options` until it is possible to generate a fingerprint.
"""
bf_options: dict[str, Any] = {'mock_webrtc': mock_web_rtc, 'slim': slim}

Expand Down Expand Up @@ -136,38 +246,8 @@ def __init__(self) -> None:
self._generator = PatchedHeaderGenerator(locale=['en-US', 'en'])

def generate(self, browser_type: SupportedBrowserType = 'chromium') -> dict[str, str]:
"""Generate headers.

browser_type = `chromium` is in general sense not just Google Chrome, but also other chromium based browsers.
For example this Safari user agent can be generated for `chromium` input:
`Mozilla/5.0 (iPhone; CPU iPhone OS 18_0 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko)
CriOS/130.0.6723.90 Mobile/15E148 Safari/604.1`
To remain consistent with previous implementation only subset of `chromium` header will be allowed.
"""
# browserforge header generation can be flaky. Enforce basic QA on generated headers
max_attempts = 10

if browser_type == 'chromium':
# `BrowserForge` header generator considers `chromium` in general sense and therefore will generate also
# other `Chromium` based browser headers. This adapter desires only specific subset of `chromium` headers
# that contain all 'sec-ch-ua', 'sec-ch-ua-mobile', 'sec-ch-ua-platform' headers.
# Increase max attempts as from `BrowserForge` header generator perspective even `chromium`
# headers without `sec-...` headers are valid.
max_attempts += 50

bf_browser_type = 'safari' if browser_type == 'webkit' else browser_type

for _attempt in range(max_attempts):
generated_header: dict[str, str] = self._generator.generate(browser=bf_browser_type)
if any(keyword in generated_header['User-Agent'] for keyword in BROWSER_TYPE_HEADER_KEYWORD[browser_type]):
if browser_type == 'chromium' and not self._contains_all_sec_headers(generated_header):
continue

return generated_header
raise RuntimeError('Failed to generate header.')

def _contains_all_sec_headers(self, headers: dict[str, str]) -> bool:
return all(header_name in headers for header_name in ('sec-ch-ua', 'sec-ch-ua-mobile', 'sec-ch-ua-platform'))
"""Generate headers."""
return self._generator.generate(browser=browser_type)


def get_available_header_network() -> dict:
Expand Down
22 changes: 18 additions & 4 deletions tests/unit/crawlers/_playwright/test_playwright_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from __future__ import annotations

import json
from typing import TYPE_CHECKING, Any
from typing import TYPE_CHECKING, Any, Literal
from unittest import mock
from unittest.mock import Mock

Expand All @@ -15,6 +15,7 @@
from crawlee.crawlers import PlaywrightCrawler
from crawlee.fingerprint_suite import (
DefaultFingerprintGenerator,
FingerprintGenerator,
HeaderGeneratorOptions,
ScreenOptions,
)
Expand Down Expand Up @@ -145,9 +146,22 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None:
assert handled_urls == set()


async def test_chromium_headless_headers(header_network: dict) -> None:
@pytest.mark.parametrize(
'fingerprint_generator',
[
pytest.param(None, id='No fingerprint generator. Headers generated by header generator.'),
pytest.param(
DefaultFingerprintGenerator(header_options=HeaderGeneratorOptions(browsers=['chromium'])),
id='Explicitly passed fingerprint generator.',
),
pytest.param('default', id='Default fingerprint generator.'),
],
)
async def test_chromium_headless_headers(
header_network: dict, fingerprint_generator: None | FingerprintGenerator | Literal['default']
) -> None:
browser_type: BrowserType = 'chromium'
crawler = PlaywrightCrawler(headless=True, browser_type=browser_type)
crawler = PlaywrightCrawler(headless=True, browser_type=browser_type, fingerprint_generator=fingerprint_generator)
headers = dict[str, str]()

@crawler.pre_navigation_hook
Expand All @@ -169,7 +183,7 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None:

user_agent = headers.get('user-agent')
assert user_agent in get_available_header_values(header_network, {'user-agent', 'User-Agent'})
assert any(keyword in user_agent for keyword in BROWSER_TYPE_HEADER_KEYWORD[browser_type])
assert any(keyword in user_agent for keyword in BROWSER_TYPE_HEADER_KEYWORD[browser_type]), user_agent

assert headers.get('sec-ch-ua') in get_available_header_values(header_network, 'sec-ch-ua')
assert headers.get('sec-ch-ua-mobile') in get_available_header_values(header_network, 'sec-ch-ua-mobile')
Expand Down
Loading