Commit fc8444c

feat: Add requests argument to EnqueueLinksFunction (#1024)
Add `requests` argument to `EnqueueLinksFunction`. Split `EnqueueLinksFunction` implementations into `extract_links` and `add_requests`. Add overload variants of `EnqueueLinksFunction`. Raise an error in `EnqueueLinksFunction` implementations if called with mutually exclusive arguments.
1 parent 64ba292 commit fc8444c
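
In practice, the change gives `enqueue_links` two mutually exclusive call shapes. A minimal sketch of both, assuming a handler receiving a crawling context such as `BeautifulSoupCrawlingContext`; the URLs and label are illustrative and the snippet is not taken from this diff:

from crawlee.crawlers import BeautifulSoupCrawlingContext


async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
    # Selector-based shape, as before: extract links from the page and enqueue them.
    await context.enqueue_links(selector='a', label='DETAIL')

    # New shape added by this commit: enqueue explicitly passed requests.
    await context.enqueue_links(requests=['https://crawlee.dev'])

    # Mixing the two shapes is rejected: the overloads steer type checkers away from it,
    # and the implementations raise ValueError at runtime.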

File tree

10 files changed: +381, -77 lines changed
docs/examples/code_examples/extract_and_add_specific_links_on_website_bs.py

Lines changed: 36 additions & 0 deletions

@@ -0,0 +1,36 @@
import asyncio

from crawlee import Glob
from crawlee.crawlers import BeautifulSoupCrawler, BeautifulSoupCrawlingContext


async def main() -> None:
    crawler = BeautifulSoupCrawler(
        # Limit the crawl to max requests. Remove or increase it for crawling all links.
        max_requests_per_crawl=10,
    )

    # Define the default request handler, which will be called for every request.
    @crawler.router.default_handler
    async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Extract all the documentation links found on the page, except for the examples.
        extracted_links = await context.extract_links(
            include=[Glob('https://crawlee.dev/docs/**')],
            exclude=[Glob('https://crawlee.dev/docs/examples')],
        )
        # Some very custom filtering which can't be achieved by `extract_links` arguments.
        max_link_length = 30
        filtered_links = [
            link for link in extracted_links if len(link.url) < max_link_length
        ]
        # Add filtered links to the request queue.
        await context.add_requests(filtered_links)

    # Run the crawler with the initial list of requests.
    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
docs/examples/code_examples/extract_and_add_specific_links_on_website_pw.py

Lines changed: 36 additions & 0 deletions

@@ -0,0 +1,36 @@
import asyncio

from crawlee import Glob
from crawlee.crawlers import PlaywrightCrawler, PlaywrightCrawlingContext


async def main() -> None:
    crawler = PlaywrightCrawler(
        # Limit the crawl to max requests. Remove or increase it for crawling all links.
        max_requests_per_crawl=10,
    )

    # Define the default request handler, which will be called for every request.
    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

        # Extract all the documentation links found on the page, except for the examples.
        extracted_links = await context.extract_links(
            include=[Glob('https://crawlee.dev/docs/**')],
            exclude=[Glob('https://crawlee.dev/docs/examples')],
        )
        # Some very custom filtering which can't be achieved by `extract_links` arguments.
        max_link_length = 30
        filtered_links = [
            link for link in extracted_links if len(link.url) < max_link_length
        ]
        # Add filtered links to the request queue.
        await context.add_requests(filtered_links)

    # Run the crawler with the initial list of requests.
    await crawler.run(['https://crawlee.dev'])


if __name__ == '__main__':
    asyncio.run(main())
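
For contrast with the two examples above: when no extra post-filtering is needed, the same include/exclude targeting still fits in a single `enqueue_links` call. A sketch, assuming the same kind of handler context as in the examples:

from crawlee import Glob
from crawlee.crawlers import BeautifulSoupCrawlingContext


async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
    # One call covers both extraction and enqueueing when no custom filtering is needed.
    await context.enqueue_links(
        include=[Glob('https://crawlee.dev/docs/**')],
        exclude=[Glob('https://crawlee.dev/docs/examples')],
    )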

docs/examples/crawl_specific_links_on_website.mdx

Lines changed: 21 additions & 1 deletion

@@ -11,9 +11,12 @@ import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';
 import BeautifulSoupExample from '!!raw-loader!roa-loader!./code_examples/crawl_specific_links_on_website_bs.py';
 import PlaywrightExample from '!!raw-loader!roa-loader!./code_examples/crawl_specific_links_on_website_pw.py';

+import BeautifulSoupExampleExtractAndAdd from '!!raw-loader!roa-loader!./code_examples/extract_and_add_specific_links_on_website_bs.py';
+import PlaywrightExampleExtractAndAdd from '!!raw-loader!roa-loader!./code_examples/extract_and_add_specific_links_on_website_pw.py';
+
 This example demonstrates how to crawl a website while targeting specific patterns of links. By utilizing the <ApiLink to="class/EnqueueLinksFunction">`enqueue_links`</ApiLink> helper, you can pass `include` or `exclude` parameters to improve your crawling strategy. This approach ensures that only the links matching the specified patterns are added to the <ApiLink to="class/RequestQueue">`RequestQueue`</ApiLink>. Both `include` and `exclude` support lists of globs or regular expressions. This functionality is great for focusing on relevant sections of a website and avoiding scraping unnecessary or irrelevant content.

-<Tabs groupId="main">
+<Tabs groupId="first-example">
     <TabItem value="BeautifulSoupCrawler" label="BeautifulSoupCrawler">
         <RunnableCodeBlock className="language-python" language="python">
             {BeautifulSoupExample}
@@ -25,3 +28,20 @@ This example demonstrates how to crawl a website while targeting specific patterns of links.
         </RunnableCodeBlock>
     </TabItem>
 </Tabs>
+
+## Even more control over the enqueued links
+
+<ApiLink to="class/EnqueueLinksFunction">`enqueue_links`</ApiLink> is a convenience helper: internally it calls <ApiLink to="class/ExtractLinksFunction">`extract_links`</ApiLink> to find the links and <ApiLink to="class/AddRequestsFunction">`add_requests`</ApiLink> to add them to the queue. If you need additional custom filtering of the extracted links before enqueuing them, consider using <ApiLink to="class/ExtractLinksFunction">`extract_links`</ApiLink> and <ApiLink to="class/AddRequestsFunction">`add_requests`</ApiLink> instead of <ApiLink to="class/EnqueueLinksFunction">`enqueue_links`</ApiLink>.
+
+<Tabs groupId="second-example">
+    <TabItem value="BeautifulSoupCrawler" label="BeautifulSoupCrawler">
+        <RunnableCodeBlock className="language-python">
+            {BeautifulSoupExampleExtractAndAdd}
+        </RunnableCodeBlock>
+    </TabItem>
+    <TabItem value="PlaywrightCrawler" label="PlaywrightCrawler">
+        <RunnableCodeBlock className="language-python">
+            {PlaywrightExampleExtractAndAdd}
+        </RunnableCodeBlock>
+    </TabItem>
+</Tabs>

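The new docs section states that `enqueue_links` delegates to `extract_links` and `add_requests`. Inside a handler, that roughly corresponds to the following simplified sketch (the actual wiring is in the `_abstract_http_crawler.py` diff below; this snippet is not from the commit):

from crawlee.crawlers import BeautifulSoupCrawlingContext


async def request_handler(context: BeautifulSoupCrawlingContext) -> None:
    # Roughly what a selector-based `enqueue_links` call does, spelled out with the split helpers.
    links = await context.extract_links(selector='a')
    # Any custom filtering of `links` slots in here, which is what the examples above rely on.
    await context.add_requests(links)
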
src/crawlee/_types.py

Lines changed: 64 additions & 5 deletions

@@ -324,24 +324,83 @@ def __call__(

 @docs_group('Functions')
 class EnqueueLinksFunction(Protocol):
-    """A function for enqueueing new URLs to crawl based on elements selected by a given selector.
+    """A function for enqueueing new URLs to crawl based on elements selected by a given selector or explicit requests.

-    It extracts URLs from the current page and enqueues them for further crawling. It allows filtering through
-    selectors and other options. You can also specify labels and user data to be associated with the newly
-    created `Request` objects.
+    It adds explicitly passed `requests` to the `RequestManager` or it extracts URLs from the current page and enqueues
+    them for further crawling. It allows filtering through selectors and other options. You can also specify labels and
+    user data to be associated with the newly created `Request` objects.
+
+    It should not be called with `selector`, `label`, `user_data` or `transform_request_function` arguments together
+    with the `requests` argument.
+
+    For even more control over the enqueued links you can use a combination of `ExtractLinksFunction` and
+    `AddRequestsFunction`.
     """

+    @overload
     def __call__(
         self,
         *,
-        selector: str = 'a',
+        selector: str | None = None,
         label: str | None = None,
         user_data: dict[str, Any] | None = None,
         transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None,
         **kwargs: Unpack[EnqueueLinksKwargs],
+    ) -> Coroutine[None, None, None]: ...
+
+    @overload
+    def __call__(
+        self, *, requests: Sequence[str | Request] | None = None, **kwargs: Unpack[EnqueueLinksKwargs]
+    ) -> Coroutine[None, None, None]: ...
+
+    def __call__(
+        self,
+        *,
+        selector: str | None = None,
+        label: str | None = None,
+        user_data: dict[str, Any] | None = None,
+        transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None,
+        requests: Sequence[str | Request] | None = None,
+        **kwargs: Unpack[EnqueueLinksKwargs],
     ) -> Coroutine[None, None, None]:
         """Call enqueue links function.

+        Args:
+            selector: A selector used to find the elements containing the links. The behaviour differs based
+                on the crawler used:
+                - `PlaywrightCrawler` supports CSS and XPath selectors.
+                - `ParselCrawler` supports CSS selectors.
+                - `BeautifulSoupCrawler` supports CSS selectors.
+            label: Label for the newly created `Request` objects, used for request routing.
+            user_data: User data to be provided to the newly created `Request` objects.
+            transform_request_function: A function that takes `RequestOptions` and returns either:
+                - Modified `RequestOptions` to update the request configuration,
+                - `'skip'` to exclude the request from being enqueued,
+                - `'unchanged'` to use the original request options without modification.
+            requests: Requests to be added to the `RequestManager`.
+            **kwargs: Additional keyword arguments.
+        """
+
+
+@docs_group('Functions')
+class ExtractLinksFunction(Protocol):
+    """A function for extracting URLs to crawl based on elements selected by a given selector.
+
+    It extracts URLs from the current page and allows filtering through selectors and other options. You can also
+    specify labels and user data to be associated with the newly created `Request` objects.
+    """
+
+    def __call__(
+        self,
+        *,
+        selector: str = 'a',
+        label: str | None = None,
+        user_data: dict[str, Any] | None = None,
+        transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction] | None = None,
+        **kwargs: Unpack[EnqueueLinksKwargs],
+    ) -> Coroutine[None, None, list[Request]]:
+        """Call extract links function.
+
         Args:
             selector: A selector used to find the elements containing the links. The behaviour differs based
                 on the crawler used:

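The three `__call__` definitions added to `EnqueueLinksFunction` follow the standard `typing.overload` pattern: the two `@overload` signatures are what type checkers match calls against, while the final wide signature carries the docstring and the full parameter list. A toy protocol with the same structure, using hypothetical names rather than crawlee API:

from __future__ import annotations

from typing import Protocol, overload


class TwoModeFunction(Protocol):
    """Toy protocol: callable either with `selector` or with `requests`, never with both."""

    @overload
    def __call__(self, *, selector: str) -> None: ...

    @overload
    def __call__(self, *, requests: list[str]) -> None: ...

    def __call__(self, *, selector: str | None = None, requests: list[str] | None = None) -> None:
        """Only the two overloads are visible to type checkers, so mixing both keywords is flagged."""
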
src/crawlee/crawlers/_abstract_http/_abstract_http_crawler.py

Lines changed: 59 additions & 11 deletions

@@ -2,7 +2,7 @@

 import logging
 from abc import ABC
-from typing import TYPE_CHECKING, Any, Callable, Generic
+from typing import TYPE_CHECKING, Any, Callable, Generic, Union

 from pydantic import ValidationError
 from typing_extensions import TypeVar
@@ -17,12 +17,12 @@
 from ._http_crawling_context import HttpCrawlingContext, ParsedHttpCrawlingContext, TParseResult, TSelectResult

 if TYPE_CHECKING:
-    from collections.abc import AsyncGenerator, Awaitable
+    from collections.abc import AsyncGenerator, Awaitable, Sequence

     from typing_extensions import Unpack

     from crawlee import RequestTransformAction
-    from crawlee._types import BasicCrawlingContext, EnqueueLinksFunction, EnqueueLinksKwargs
+    from crawlee._types import BasicCrawlingContext, EnqueueLinksFunction, EnqueueLinksKwargs, ExtractLinksFunction

     from ._abstract_http_parser import AbstractHttpParser

@@ -124,34 +124,36 @@ async def _parse_http_response(
             The original crawling context enhanced by the parsing result and enqueue links function.
         """
         parsed_content = await self._parser.parse(context.http_response)
+        extract_links = self._create_extract_links_function(context, parsed_content)
         yield ParsedHttpCrawlingContext.from_http_crawling_context(
             context=context,
             parsed_content=parsed_content,
-            enqueue_links=self._create_enqueue_links_function(context, parsed_content),
+            enqueue_links=self._create_enqueue_links_function(context, extract_links),
+            extract_links=extract_links,
         )

-    def _create_enqueue_links_function(
+    def _create_extract_links_function(
         self, context: HttpCrawlingContext, parsed_content: TParseResult
-    ) -> EnqueueLinksFunction:
-        """Create a callback function for extracting links from parsed content and enqueuing them to the crawl.
+    ) -> ExtractLinksFunction:
+        """Create a callback function for extracting links from parsed content.

         Args:
             context: The current crawling context.
             parsed_content: The parsed http response.

         Returns:
-            Awaitable that is used for extracting links from parsed content and enqueuing them to the crawl.
+            Awaitable that is used for extracting links from parsed content.
         """

-        async def enqueue_links(
+        async def extract_links(
            *,
            selector: str = 'a',
            label: str | None = None,
            user_data: dict[str, Any] | None = None,
            transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction]
            | None = None,
            **kwargs: Unpack[EnqueueLinksKwargs],
-        ) -> None:
+        ) -> list[Request]:
            kwargs.setdefault('strategy', 'same-hostname')

            requests = list[Request]()
@@ -183,8 +185,54 @@ async def enqueue_links(
                    continue

                requests.append(request)
+            return requests
+
+        return extract_links
+
+    def _create_enqueue_links_function(
+        self, context: HttpCrawlingContext, extract_links: ExtractLinksFunction
+    ) -> EnqueueLinksFunction:
+        """Create a callback function for extracting links from parsed content and enqueuing them to the crawl.
+
+        Args:
+            context: The current crawling context.
+            extract_links: Function used to extract links from the page.

-            await context.add_requests(requests, **kwargs)
+        Returns:
+            Awaitable that is used for extracting links from parsed content and enqueuing them to the crawl.
+        """
+
+        async def enqueue_links(
+            *,
+            selector: str | None = None,
+            label: str | None = None,
+            user_data: dict[str, Any] | None = None,
+            transform_request_function: Callable[[RequestOptions], RequestOptions | RequestTransformAction]
+            | None = None,
+            requests: Sequence[str | Request] | None = None,
+            **kwargs: Unpack[EnqueueLinksKwargs],
+        ) -> None:
+            kwargs.setdefault('strategy', 'same-hostname')
+
+            if requests:
+                if any((selector, label, user_data, transform_request_function)):
+                    raise ValueError(
+                        'You cannot provide `selector`, `label`, `user_data` or '
+                        '`transform_request_function` arguments when `requests` is provided.'
+                    )
+                # Add directly passed requests.
+                await context.add_requests(requests or list[Union[str, Request]](), **kwargs)
+            else:
+                # Add requests from extracted links.
+                await context.add_requests(
+                    await extract_links(
+                        selector=selector or 'a',
+                        label=label,
+                        user_data=user_data,
+                        transform_request_function=transform_request_function,
+                    ),
+                    **kwargs,
+                )

         return enqueue_links


src/crawlee/crawlers/_abstract_http/_http_crawling_context.py

Lines changed: 6 additions & 2 deletions

@@ -5,7 +5,7 @@

 from typing_extensions import Self, TypeVar

-from crawlee._types import BasicCrawlingContext, EnqueueLinksFunction
+from crawlee._types import BasicCrawlingContext, EnqueueLinksFunction, ExtractLinksFunction
 from crawlee._utils.docs import docs_group
 from crawlee.http_clients import HttpCrawlingResult, HttpResponse

@@ -35,14 +35,18 @@ class ParsedHttpCrawlingContext(Generic[TParseResult], HttpCrawlingContext):

     parsed_content: TParseResult
     enqueue_links: EnqueueLinksFunction
+    extract_links: ExtractLinksFunction

     @classmethod
     def from_http_crawling_context(
         cls,
         context: HttpCrawlingContext,
         parsed_content: TParseResult,
         enqueue_links: EnqueueLinksFunction,
+        extract_links: ExtractLinksFunction,
     ) -> Self:
         """Initialize a new instance from an existing `HttpCrawlingContext`."""
         context_kwargs = {field.name: getattr(context, field.name) for field in fields(context)}
-        return cls(parsed_content=parsed_content, enqueue_links=enqueue_links, **context_kwargs)
+        return cls(
+            parsed_content=parsed_content, enqueue_links=enqueue_links, extract_links=extract_links, **context_kwargs
+        )

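The `from_http_crawling_context` change keeps the existing pattern of copying every field of the narrower context via `dataclasses.fields` and then attaching the new callables. A self-contained sketch of that pattern with toy classes, not crawlee's own types:

from dataclasses import dataclass, fields


@dataclass(frozen=True)
class BaseContext:
    url: str


@dataclass(frozen=True)
class ParsedContext(BaseContext):
    note: str

    @classmethod
    def from_base(cls, base: BaseContext, note: str) -> 'ParsedContext':
        # Copy every field of the narrower context, then add the new ones,
        # mirroring the `from_http_crawling_context` pattern above.
        base_kwargs = {f.name: getattr(base, f.name) for f in fields(base)}
        return cls(note=note, **base_kwargs)


print(ParsedContext.from_base(BaseContext(url='https://crawlee.dev'), note='parsed'))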