
Commit 83fa8a4

refactor!: Change method HttpResponse.read from sync to async (#1296)
### Description

- Makes the `read` method of `HttpResponse` asynchronous. This is more consistent with a fully asynchronous framework and more flexible than a synchronous method.
1 parent 0c4cfc9 commit 83fa8a4
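
For code that uses Crawlee, the practical impact is that every call site consuming the response body must now await it. A minimal migration sketch (not taken from the commit; it assumes a plain `HttpCrawler` request handler like the ones updated in the diffs below, and `https://crawlee.dev` is only a placeholder URL):

import asyncio

from crawlee.crawlers import HttpCrawler, HttpCrawlingContext


async def main() -> None:
    crawler = HttpCrawler()

    @crawler.router.default_handler
    async def request_handler(context: HttpCrawlingContext) -> None:
        # Before this change the body was read synchronously:
        #     body = context.http_response.read()
        # After this change `read` is a coroutine and must be awaited:
        body = await context.http_response.read()
        context.log.info(f'Fetched {len(body)} bytes from {context.request.url}')

    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())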

22 files changed (+36, -31 lines)

docs/examples/code_examples/fill_and_submit_web_form_crawler.py

Lines changed: 1 addition & 1 deletion
@@ -12,7 +12,7 @@ async def main() -> None:
     @crawler.router.default_handler
     async def request_handler(context: HttpCrawlingContext) -> None:
         context.log.info(f'Processing {context.request.url} ...')
-        response = context.http_response.read().decode('utf-8')
+        response = (await context.http_response.read()).decode('utf-8')
         context.log.info(f'Response: {response}')  # To see the response in the logs.

     # Prepare a POST request to the form endpoint.

docs/guides/code_examples/creating_web_archive/manual_archiving_parsel_crawler.py

Lines changed: 3 additions & 3 deletions
@@ -8,10 +8,10 @@
 from crawlee.crawlers import ParselCrawler, ParselCrawlingContext


-def archive_response(context: ParselCrawlingContext, writer: WARCWriter) -> None:
+async def archive_response(context: ParselCrawlingContext, writer: WARCWriter) -> None:
     """Helper function for archiving response in WARC format."""
     # Create WARC records for response
-    response_body = context.http_response.read()
+    response_body = await context.http_response.read()
     response_payload_stream = io.BytesIO(response_body)

     response_headers = StatusAndHeaders(
@@ -51,7 +51,7 @@ async def main() -> None:
     @crawler.router.default_handler
     async def request_handler(context: ParselCrawlingContext) -> None:
         context.log.info(f'Archiving {context.request.url} ...')
-        archive_response(context=context, writer=writer)
+        await archive_response(context=context, writer=writer)
         await context.enqueue_links(strategy='same-domain')

     await crawler.run(['https://crawlee.dev/'])

docs/guides/code_examples/error_handling/change_handle_error_status.py

Lines changed: 1 addition & 1 deletion
@@ -30,7 +30,7 @@ async def default_handler(context: HttpCrawlingContext) -> None:
         response = await context.send_request(
             'https://placeholder.org/refresh', headers=headers
         )
-        data = json.loads(response.read())
+        data = json.loads(await response.read())
         # Add the new token to our `Request` headers
         new_headers = {
             **context.request.headers,

docs/guides/code_examples/login_crawler/http_login.py

Lines changed: 1 addition & 1 deletion
@@ -46,7 +46,7 @@ async def login_handler(context: HttpCrawlingContext) -> None:
             raise RuntimeError('Session not found')

         # Parse the API response containing authentication tokens and user data
-        data = json.loads(context.http_response.read())
+        data = json.loads(await context.http_response.read())

         # Extract authentication data from the response
         token = data['token']

docs/guides/code_examples/session_management/sm_basic.py

Lines changed: 1 addition & 1 deletion
@@ -30,7 +30,7 @@ async def default_handler(context: BasicCrawlingContext) -> None:
         # and `context.proxy_info`.
         response = await context.send_request(context.request.url)

-        page_content = response.read().decode()
+        page_content = (await response.read()).decode()
         title_match = re.search(r'<title(?:.*?)>(.*?)</title>', page_content)

         if context.session and (title := title_match.group(1) if title_match else None):

docs/guides/code_examples/session_management/sm_http.py

Lines changed: 1 addition & 1 deletion
@@ -26,7 +26,7 @@ async def main() -> None:
     # based on the response content and potential blocking
     @crawler.router.default_handler
     async def default_handler(context: HttpCrawlingContext) -> None:
-        page_content = context.http_response.read().decode()
+        page_content = (await context.http_response.read()).decode()
         title_match = re.search(r'<title(?:.*?)>(.*?)</title>', page_content)

         if context.session and (title := title_match.group(1) if title_match else None):

docs/upgrading/upgrading_to_v1.md

Lines changed: 2 additions & 0 deletions
@@ -121,6 +121,8 @@ dataset = await Dataset.open(
 The `persist_storage` and `persist_metadata` fields have been removed from the `Configuration` class.
 Persistence is now determined solely by the storage client class you use.

+The `read` method for `HttpResponse` has been changed from synchronous to asynchronous.
+
 ### Storage client instance behavior

 Instance caching is implemented for the storage open methods: `Dataset.open()`, `KeyValueStore.open()`,
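
As a companion to the migration note added to the upgrading guide above, one pattern worth spelling out: a synchronous helper that used to call `read()` itself does not have to become async; the handler can read the body once and pass the bytes down. A hedged sketch, with `summarize_body` as a hypothetical helper that is not part of Crawlee:

from crawlee.crawlers import HttpCrawler, HttpCrawlingContext


def summarize_body(body: bytes) -> str:
    # Hypothetical synchronous helper: it receives bytes instead of calling `read()` itself.
    return f'{len(body)} bytes'


crawler = HttpCrawler()


@crawler.router.default_handler
async def request_handler(context: HttpCrawlingContext) -> None:
    body = await context.http_response.read()  # await once, in the async handler
    context.log.info(summarize_body(body))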

src/crawlee/_utils/robots.py

Lines changed: 3 additions & 1 deletion
@@ -57,7 +57,9 @@ async def load(cls, url: str, http_client: HttpClient, proxy_info: ProxyInfo | N
             proxy_info: Optional `ProxyInfo` to be used when fetching the robots.txt file. If None, no proxy is used.
         """
         response = await http_client.send_request(url, proxy_info=proxy_info)
-        body = b'User-agent: *\nAllow: /' if is_status_code_client_error(response.status_code) else response.read()
+        body = (
+            b'User-agent: *\nAllow: /' if is_status_code_client_error(response.status_code) else await response.read()
+        )

         robots = Protego.parse(body.decode('utf-8'))

src/crawlee/crawlers/_abstract_http/_http_crawling_context.py

Lines changed: 1 addition & 1 deletion
@@ -26,7 +26,7 @@ def from_basic_crawling_context(cls, context: BasicCrawlingContext, http_respons

     async def get_snapshot(self) -> PageSnapshot:
         """Get snapshot of crawled page."""
-        return PageSnapshot(html=self.http_response.read().decode('utf-8'))
+        return PageSnapshot(html=(await self.http_response.read()).decode('utf-8'))


 @dataclass(frozen=True)

src/crawlee/crawlers/_beautifulsoup/_beautifulsoup_parser.py

Lines changed: 1 addition & 1 deletion
@@ -21,7 +21,7 @@ def __init__(self, parser: BeautifulSoupParserType = 'lxml') -> None:

     @override
     async def parse(self, response: HttpResponse) -> BeautifulSoup:
-        return BeautifulSoup(response.read(), features=self._parser)
+        return BeautifulSoup(await response.read(), features=self._parser)

     @override
     async def parse_text(self, text: str) -> BeautifulSoup:

0 commit comments
