
Commit 83fa8a4

refactor!: Change method HttpResponse.read from sync to async (#1296)
### Description

- Makes the `read` method of `HttpResponse` asynchronous. This is more consistent with a fully asynchronous framework and more flexible than a synchronous method.
1 parent 0c4cfc9 commit 83fa8a4
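
For code that uses Crawlee, the practical impact is that every call site consuming the response body must now await it. A minimal migration sketch (not taken from the commit; it assumes a plain `HttpCrawler` request handler like the ones updated in the diffs below, and `https://crawlee.dev` is only a placeholder URL):

import asyncio

from crawlee.crawlers import HttpCrawler, HttpCrawlingContext


async def main() -> None:
    crawler = HttpCrawler()

    @crawler.router.default_handler
    async def request_handler(context: HttpCrawlingContext) -> None:
        # Before this change the body was read synchronously:
        #     body = context.http_response.read()
        # After this change `read` is a coroutine and must be awaited:
        body = await context.http_response.read()
        context.log.info(f'Fetched {len(body)} bytes from {context.request.url}')

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())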

22 files changed (+36, -31 lines)

docs/examples/code_examples/fill_and_submit_web_form_crawler.py

Lines changed: 1 addition & 1 deletion
@@ -12,7 +12,7 @@ async def main() -> None:
     @crawler.router.default_handler
     async def request_handler(context: HttpCrawlingContext) -> None:
         context.log.info(f'Processing {context.request.url} ...')
-        response = context.http_response.read().decode('utf-8')
+        response = (await context.http_response.read()).decode('utf-8')
         context.log.info(f'Response: {response}')  # To see the response in the logs.

     # Prepare a POST request to the form endpoint.

docs/guides/code_examples/creating_web_archive/manual_archiving_parsel_crawler.py

Lines changed: 3 additions & 3 deletions
@@ -8,10 +8,10 @@
 from crawlee.crawlers import ParselCrawler, ParselCrawlingContext


-def archive_response(context: ParselCrawlingContext, writer: WARCWriter) -> None:
+async def archive_response(context: ParselCrawlingContext, writer: WARCWriter) -> None:
     """Helper function for archiving response in WARC format."""
     # Create WARC records for response
-    response_body = context.http_response.read()
+    response_body = await context.http_response.read()
     response_payload_stream = io.BytesIO(response_body)

     response_headers = StatusAndHeaders(
@@ -51,7 +51,7 @@ async def main() -> None:
     @crawler.router.default_handler
     async def request_handler(context: ParselCrawlingContext) -> None:
         context.log.info(f'Archiving {context.request.url} ...')
-        archive_response(context=context, writer=writer)
+        await archive_response(context=context, writer=writer)
         await context.enqueue_links(strategy='same-domain')

     await crawler.run(['https://crawlee.dev/'])

docs/guides/code_examples/error_handling/change_handle_error_status.py

Lines changed: 1 addition & 1 deletion
@@ -30,7 +30,7 @@ async def default_handler(context: HttpCrawlingContext) -> None:
         response = await context.send_request(
             'https://placeholder.org/refresh', headers=headers
         )
-        data = json.loads(response.read())
+        data = json.loads(await response.read())
         # Add the new token to our `Request` headers
         new_headers = {
             **context.request.headers,

docs/guides/code_examples/login_crawler/http_login.py

Lines changed: 1 addition & 1 deletion
@@ -46,7 +46,7 @@ async def login_handler(context: HttpCrawlingContext) -> None:
             raise RuntimeError('Session not found')

         # Parse the API response containing authentication tokens and user data
-        data = json.loads(context.http_response.read())
+        data = json.loads(await context.http_response.read())

         # Extract authentication data from the response
         token = data['token']

docs/guides/code_examples/session_management/sm_basic.py

Lines changed: 1 addition & 1 deletion
@@ -30,7 +30,7 @@ async def default_handler(context: BasicCrawlingContext) -> None:
         # and `context.proxy_info`.
         response = await context.send_request(context.request.url)

-        page_content = response.read().decode()
+        page_content = (await response.read()).decode()
         title_match = re.search(r'<title(?:.*?)>(.*?)</title>', page_content)

         if context.session and (title := title_match.group(1) if title_match else None):

docs/guides/code_examples/session_management/sm_http.py

Lines changed: 1 addition & 1 deletion
@@ -26,7 +26,7 @@ async def main() -> None:
     # based on the response content and potential blocking
     @crawler.router.default_handler
     async def default_handler(context: HttpCrawlingContext) -> None:
-        page_content = context.http_response.read().decode()
+        page_content = (await context.http_response.read()).decode()
         title_match = re.search(r'<title(?:.*?)>(.*?)</title>', page_content)

         if context.session and (title := title_match.group(1) if title_match else None):

docs/upgrading/upgrading_to_v1.md

Lines changed: 2 additions & 0 deletions
@@ -121,6 +121,8 @@ dataset = await Dataset.open(
 The `persist_storage` and `persist_metadata` fields have been removed from the `Configuration` class.
 Persistence is now determined solely by the storage client class you use.

+The `read` method for `HttpResponse` has been changed from synchronous to asynchronous.
+
 ### Storage client instance behavior

 Instance caching is implemented for the storage open methods: `Dataset.open()`, `KeyValueStore.open()`,
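
As a companion to the migration note added to the upgrading guide above, one pattern worth spelling out: a synchronous helper that used to call `read()` itself does not have to become async; the handler can read the body once and pass the bytes down. A hedged sketch, with `summarize_body` as a hypothetical helper that is not part of Crawlee:

from crawlee.crawlers import HttpCrawler, HttpCrawlingContext


def summarize_body(body: bytes) -> str:
    # Hypothetical synchronous helper: it receives bytes instead of calling `read()` itself.
    return f'{len(body)} bytes'


crawler = HttpCrawler()


@crawler.router.default_handler
async def request_handler(context: HttpCrawlingContext) -> None:
    body = await context.http_response.read()  # await once, in the async handler
    context.log.info(summarize_body(body))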

src/crawlee/_utils/robots.py

Lines changed: 3 additions & 1 deletion
@@ -57,7 +57,9 @@ async def load(cls, url: str, http_client: HttpClient, proxy_info: ProxyInfo | N
             proxy_info: Optional `ProxyInfo` to be used when fetching the robots.txt file. If None, no proxy is used.
         """
         response = await http_client.send_request(url, proxy_info=proxy_info)
-        body = b'User-agent: *\nAllow: /' if is_status_code_client_error(response.status_code) else response.read()
+        body = (
+            b'User-agent: *\nAllow: /' if is_status_code_client_error(response.status_code) else await response.read()
+        )

         robots = Protego.parse(body.decode('utf-8'))

src/crawlee/crawlers/_abstract_http/_http_crawling_context.py

Lines changed: 1 addition & 1 deletion
@@ -26,7 +26,7 @@ def from_basic_crawling_context(cls, context: BasicCrawlingContext, http_respons

     async def get_snapshot(self) -> PageSnapshot:
         """Get snapshot of crawled page."""
-        return PageSnapshot(html=self.http_response.read().decode('utf-8'))
+        return PageSnapshot(html=(await self.http_response.read()).decode('utf-8'))


 @dataclass(frozen=True)

src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py

Lines changed: 1 addition & 1 deletion
@@ -21,7 +21,7 @@ def __init__(self, parser: BeautifulSoupParserType = 'lxml') -> None:

     @override
     async def parse(self, response: HttpResponse) -> BeautifulSoup:
-        return BeautifulSoup(response.read(), features=self._parser)
+        return BeautifulSoup(await response.read(), features=self._parser)

     @override
     async def parse_text(self, text: str) -> BeautifulSoup:

0 commit comments
