3
3
import inspect
4
4
from collections import defaultdict
5
5
from dataclasses import dataclass
6
- from typing import TYPE_CHECKING
6
+ from typing import TYPE_CHECKING , Union
7
7
8
8
from more_itertools import flatten
9
9
from pydantic import AnyHttpUrl , TypeAdapter
@@ -68,9 +68,9 @@ class ProxyConfiguration:
68
68
def __init__ (
69
69
self ,
70
70
* ,
71
- proxy_urls : list [str ] | None = None ,
71
+ proxy_urls : list [str | None ] | None = None ,
72
72
new_url_function : _NewUrlFunction | None = None ,
73
- tiered_proxy_urls : list [list [str ]] | None = None ,
73
+ tiered_proxy_urls : list [list [str | None ]] | None = None ,
74
74
) -> None :
75
75
"""A default constructor.
76
76
@@ -85,7 +85,7 @@ def __init__(
85
85
the proxy selection mechanism.
86
86
"""
87
87
self ._next_custom_url_index = 0
88
- self ._used_proxy_urls = dict [str , URL ]()
88
+ self ._used_proxy_urls = dict [str , Union [ URL , None ] ]()
89
89
self ._url_validator = TypeAdapter (AnyHttpUrl )
90
90
91
91
# Validation
@@ -95,18 +95,22 @@ def __init__(
95
95
'must be specified (and non-empty).'
96
96
)
97
97
98
- self ._proxy_urls = (
99
- [URL (url ) for url in proxy_urls if self ._url_validator .validate_python (url )] if proxy_urls else []
100
- )
98
+ self ._proxy_urls = [self ._create_url (url ) for url in proxy_urls ] if proxy_urls else []
101
99
self ._proxy_tier_tracker = (
102
- _ProxyTierTracker (
103
- [[URL (url ) for url in tier if self ._url_validator .validate_python (url )] for tier in tiered_proxy_urls ]
104
- )
100
+ _ProxyTierTracker ([[self ._create_url (url ) for url in tier ] for tier in tiered_proxy_urls ])
105
101
if tiered_proxy_urls
106
102
else None
107
103
)
108
104
self ._new_url_function = new_url_function
109
105
106
+ def _create_url (self , url : str | None ) -> URL | None :
107
+ """Create URL from input string. None means that intentionally no proxy should be used."""
108
+ if url is None :
109
+ return None
110
+
111
+ self ._url_validator .validate_python (url )
112
+ return URL (url )
113
+
110
114
async def new_proxy_info (
111
115
self , session_id : str | None , request : Request | None , proxy_tier : int | None
112
116
) -> ProxyInfo | None :
@@ -208,16 +212,16 @@ async def _pick_url(
208
212
class _ProxyTierTracker :
209
213
"""Tracks the state of currently used proxy tiers and their error frequency for individual crawled domains."""
210
214
211
- def __init__ (self , tiered_proxy_urls : list [list [URL ]]) -> None :
215
+ def __init__ (self , tiered_proxy_urls : list [list [URL | None ]]) -> None :
212
216
self ._tiered_proxy_urls = tiered_proxy_urls
213
217
self ._histogram_by_domain = defaultdict [str , list [int ]](lambda : [0 for _tier in tiered_proxy_urls ])
214
218
self ._current_tier_by_domain = defaultdict [str , int ](lambda : 0 )
215
219
216
220
@property
def all_urls(self) -> Sequence[URL | None]:
    """Return the entries of every tier as one flat list, in tier order.

    Entries may be `None`, since tiers are typed as lists of `URL | None`.
    """
    return list(flatten(self._tiered_proxy_urls))
219
223
220
def get_tier_urls(self, tier_number: int) -> Sequence[URL | None]:
    """Return the list of proxy URLs stored for the given tier.

    Args:
        tier_number: Index into the tiered URL list passed to the constructor.

    Returns:
        The stored inner list for that tier (not a copy). Entries may be `None`.

    Raises:
        IndexError: If `tier_number` is outside the range of stored tiers.
    """
    return self._tiered_proxy_urls[tier_number]
222
226
223
227
def add_error (self , domain : str , tier : int ) -> None :
0 commit comments