Skip to content

Commit 0fbd017

Browse files
authored
feat: Add possibility to use None as no proxy in tiered proxies (#760)
Using None in input lists for tiered proxy configuration will be interpreted as no proxy. This allows, for example, having the lowest tier without a proxy. Add tests for it. Update docs. - Closes: #687
1 parent 19cd328 commit 0fbd017

File tree

5 files changed

+83
-20
lines changed

5 files changed

+83
-20
lines changed

docs/guides/code/proxy_management/tiers_bs_example.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@ async def main() -> None:
88
# Create a ProxyConfiguration object and pass it to the crawler.
99
proxy_configuration = ProxyConfiguration(
1010
tiered_proxy_urls=[
11+
# No proxy tier. (Optional; include it if you do not want to use any proxy on the lowest tier.)
12+
[None],
1113
# lower tier, cheaper, preferred as long as they work
1214
['http://cheap-datacenter-proxy-1.com/', 'http://cheap-datacenter-proxy-2.com/'],
1315
# higher tier, more expensive, used as a fallback

docs/guides/code/proxy_management/tiers_pw_example.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@ async def main() -> None:
88
# Create a ProxyConfiguration object and pass it to the crawler.
99
proxy_configuration = ProxyConfiguration(
1010
tiered_proxy_urls=[
11+
# No proxy tier. (Optional; include it if you do not want to use any proxy on the lowest tier.)
12+
[None],
1113
# lower tier, cheaper, preferred as long as they work
1214
['http://cheap-datacenter-proxy-1.com/', 'http://cheap-datacenter-proxy-2.com/'],
1315
# higher tier, more expensive, used as a fallback

src/crawlee/proxy_configuration.py

Lines changed: 17 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import inspect
44
from collections import defaultdict
55
from dataclasses import dataclass
6-
from typing import TYPE_CHECKING
6+
from typing import TYPE_CHECKING, Union
77

88
from more_itertools import flatten
99
from pydantic import AnyHttpUrl, TypeAdapter
@@ -68,9 +68,9 @@ class ProxyConfiguration:
6868
def __init__(
6969
self,
7070
*,
71-
proxy_urls: list[str] | None = None,
71+
proxy_urls: list[str | None] | None = None,
7272
new_url_function: _NewUrlFunction | None = None,
73-
tiered_proxy_urls: list[list[str]] | None = None,
73+
tiered_proxy_urls: list[list[str | None]] | None = None,
7474
) -> None:
7575
"""A default constructor.
7676
@@ -85,7 +85,7 @@ def __init__(
8585
the proxy selection mechanism.
8686
"""
8787
self._next_custom_url_index = 0
88-
self._used_proxy_urls = dict[str, URL]()
88+
self._used_proxy_urls = dict[str, Union[URL, None]]()
8989
self._url_validator = TypeAdapter(AnyHttpUrl)
9090

9191
# Validation
@@ -95,18 +95,22 @@ def __init__(
9595
'must be specified (and non-empty).'
9696
)
9797

98-
self._proxy_urls = (
99-
[URL(url) for url in proxy_urls if self._url_validator.validate_python(url)] if proxy_urls else []
100-
)
98+
self._proxy_urls = [self._create_url(url) for url in proxy_urls] if proxy_urls else []
10199
self._proxy_tier_tracker = (
102-
_ProxyTierTracker(
103-
[[URL(url) for url in tier if self._url_validator.validate_python(url)] for tier in tiered_proxy_urls]
104-
)
100+
_ProxyTierTracker([[self._create_url(url) for url in tier] for tier in tiered_proxy_urls])
105101
if tiered_proxy_urls
106102
else None
107103
)
108104
self._new_url_function = new_url_function
109105

106+
def _create_url(self, url: str | None) -> URL | None:
107+
"""Create URL from input string. None means that intentionally no proxy should be used."""
108+
if url is None:
109+
return None
110+
111+
self._url_validator.validate_python(url)
112+
return URL(url)
113+
110114
async def new_proxy_info(
111115
self, session_id: str | None, request: Request | None, proxy_tier: int | None
112116
) -> ProxyInfo | None:
@@ -208,16 +212,16 @@ async def _pick_url(
208212
class _ProxyTierTracker:
209213
"""Tracks the state of currently used proxy tiers and their error frequency for individual crawled domains."""
210214

211-
def __init__(self, tiered_proxy_urls: list[list[URL]]) -> None:
215+
def __init__(self, tiered_proxy_urls: list[list[URL | None]]) -> None:
212216
self._tiered_proxy_urls = tiered_proxy_urls
213217
self._histogram_by_domain = defaultdict[str, list[int]](lambda: [0 for _tier in tiered_proxy_urls])
214218
self._current_tier_by_domain = defaultdict[str, int](lambda: 0)
215219

216220
@property
217-
def all_urls(self) -> Sequence[URL]:
221+
def all_urls(self) -> Sequence[URL | None]:
218222
return list(flatten(self._tiered_proxy_urls))
219223

220-
def get_tier_urls(self, tier_number: int) -> Sequence[URL]:
224+
def get_tier_urls(self, tier_number: int) -> Sequence[URL | None]:
221225
return self._tiered_proxy_urls[tier_number]
222226

223227
def add_error(self, domain: str, tier: int) -> None:

tests/unit/proxy_configuration/test_new_proxy_info.py

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
from __future__ import annotations
22

3+
from itertools import cycle
34
from typing import TYPE_CHECKING
45

56
import pytest
@@ -11,10 +12,13 @@
1112

1213

1314
async def test_returns_proxy_info() -> None:
14-
config = ProxyConfiguration(proxy_urls=['http://proxy.com:1111'])
15+
"""Test that proxy_urls can contain both strings and None."""
16+
config = ProxyConfiguration(proxy_urls=[None, 'http://proxy.com:1111'])
1517

1618
proxy_info = await config.new_proxy_info(None, None, None)
19+
assert proxy_info is None
1720

21+
proxy_info = await config.new_proxy_info(None, None, None)
1822
assert proxy_info is not None
1923
assert proxy_info.url == 'http://proxy.com:1111'
2024
assert proxy_info.hostname == 'proxy.com'
@@ -33,10 +37,15 @@ async def test_throws_on_invalid_new_url_function() -> None:
3337

3438

3539
async def test_returns_proxy_info_with_new_url_function() -> None:
36-
config = ProxyConfiguration(new_url_function=lambda session_id=None, request=None: 'http://proxy.com:1111') # noqa: ARG005
40+
"""Test that new_url_function can return both a string and None."""
41+
proxy_iterator = cycle([None, 'http://proxy.com:1111'])
42+
43+
config = ProxyConfiguration(new_url_function=lambda session_id=None, request=None: next(proxy_iterator)) # noqa: ARG005
3744

3845
proxy_info = await config.new_proxy_info(None, None, None)
46+
assert proxy_info is None
3947

48+
proxy_info = await config.new_proxy_info(None, None, None)
4049
assert proxy_info is not None
4150
assert proxy_info.url == 'http://proxy.com:1111'
4251
assert proxy_info.hostname == 'proxy.com'
@@ -62,7 +71,7 @@ async def new_url(session_id: str | None = None, request: Request | None = None)
6271

6372

6473
async def test_rotates_proxies() -> None:
65-
proxy_urls = ['http://proxy:1111', 'http://proxy:2222', 'http://proxy:3333']
74+
proxy_urls: list[str | None] = ['http://proxy:1111', 'http://proxy:2222', 'http://proxy:3333']
6675
config = ProxyConfiguration(proxy_urls=proxy_urls)
6776

6877
info = await config.new_proxy_info(None, None, None)
@@ -79,7 +88,7 @@ async def test_rotates_proxies() -> None:
7988

8089

8190
async def test_rotates_proxies_with_sessions() -> None:
82-
proxy_urls = ['http://proxy:1111', 'http://proxy:2222', 'http://proxy:3333']
91+
proxy_urls: list[str | None] = ['http://proxy:1111', 'http://proxy:2222', 'http://proxy:3333']
8392
sessions = [f'session_{i}' for i in range(6)]
8493

8594
config = ProxyConfiguration(proxy_urls=proxy_urls)

tests/unit/proxy_configuration/test_tiers.py

Lines changed: 49 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55

66

77
async def test_rotates_proxies_uniformly_with_no_request() -> None:
8-
tiered_proxy_urls = [
8+
tiered_proxy_urls: list[list[str | None]] = [
99
['http://proxy:1111', 'http://proxy:2222'],
1010
['http://proxy:3333', 'http://proxy:4444'],
1111
]
@@ -34,7 +34,7 @@ async def test_rotates_proxies_uniformly_with_no_request() -> None:
3434

3535

3636
async def test_retrying_request_makes_tier_go_up() -> None:
37-
tiered_proxy_urls = [
37+
tiered_proxy_urls: list[list[str | None]] = [
3838
['http://proxy:1111'],
3939
['http://proxy:2222'],
4040
['http://proxy:3333'],
@@ -71,7 +71,7 @@ async def test_successful_request_makes_tier_go_down() -> None:
7171
ProxyConfiguration assumes those are retries. Then, requesting a proxy for different requests to the same domain
7272
will cause the tier to drop back down."""
7373

74-
tiered_proxy_urls = [
74+
tiered_proxy_urls: list[list[str | None]] = [
7575
['http://proxy:1111'],
7676
['http://proxy:2222'],
7777
['http://proxy:3333'],
@@ -94,3 +94,49 @@ async def test_successful_request_makes_tier_go_down() -> None:
9494

9595
assert info is not None
9696
assert info.url == tiered_proxy_urls[0][0]
97+
98+
99+
async def test_none_proxy_retrying_request_makes_tier_go_up() -> None:
100+
tiered_proxy_urls: list[list[str | None]] = [
101+
[None],
102+
['http://proxy:1111'],
103+
]
104+
105+
config = ProxyConfiguration(tiered_proxy_urls=tiered_proxy_urls)
106+
107+
# Calling `new_proxy_info` with the same request most probably means it's being retried
108+
request_1 = Request(url='http://some.domain/abc', unique_key='1', id='1')
109+
110+
# No proxy used.
111+
info = await config.new_proxy_info(None, request_1, None)
112+
assert info is None, 'First entry in tiered_proxy_urls is None. config.new_proxy_info is expected to generate None.'
113+
114+
# Proxy should go up one tier for same request that was already sent before.
115+
info = await config.new_proxy_info(None, request_1, None)
116+
assert info is not None, (
117+
'config.new_proxy_info is expected to generate non-none proxy info from non-none ' 'tiered_proxy_urls.'
118+
)
119+
assert info.url == tiered_proxy_urls[1][0]
120+
121+
122+
async def test_none_proxy_rotates_proxies_uniformly_with_no_request() -> None:
123+
tiered_proxy_urls = [
124+
[None, 'http://proxy:1111'],
125+
]
126+
127+
config = ProxyConfiguration(tiered_proxy_urls=tiered_proxy_urls)
128+
129+
# No proxy used.
130+
info = await config.new_proxy_info(None, None, None)
131+
assert info is None, 'First entry in tiered_proxy_urls is None. config.new_proxy_info is expected to generate None.'
132+
133+
# Proxy should be rotated on the same proxy tier for a new request.
134+
info = await config.new_proxy_info(None, None, None)
135+
assert info is not None, (
136+
'config.new_proxy_info is expected to generate non-none proxy info from non-none ' 'tiered_proxy_urls.'
137+
)
138+
assert info.url == tiered_proxy_urls[0][1]
139+
140+
# Proxy rotation starts from the beginning of the proxy list after last proxy in tier was used. No proxy used again.
141+
info = await config.new_proxy_info(None, None, None)
142+
assert info is None, 'First entry in tiered_proxy_urls is None. config.new_proxy_info is expected to generate None.'

0 commit comments

Comments
 (0)