
Commit 635ae4a

feat: Add statistics_log_format parameter to BasicCrawler (#1061)
### Description

- Add the `statistics_log_format` parameter that allows disabling table-formatted statistics in logs. This makes log parsing easier when needed.

### Issues

- Closes: #700
1 parent a166334 commit 635ae4a
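In short, the new option is a constructor argument on the crawler. A minimal usage sketch, condensed from the example file added in this commit (the `HttpCrawler` class and target URL are taken from that example):

```python
import asyncio

from crawlee.crawlers import HttpCrawler, HttpCrawlingContext


async def main() -> None:
    # 'inline' logs statistics as plain messages (values attached as `extra`
    # fields on the log record) instead of the default 'table' rendering.
    crawler = HttpCrawler(statistics_log_format='inline')

    @crawler.router.default_handler
    async def handler(context: HttpCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

    await crawler.run(['https://www.crawlee.dev/'])


if __name__ == '__main__':
    asyncio.run(main())
```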

File tree

7 files changed: +218 -23 lines changed

docs/examples/code_examples/configure_json_logging.py

Lines changed: 91 additions & 0 deletions
@@ -0,0 +1,91 @@
from __future__ import annotations

import asyncio
import inspect
import logging
import sys
from typing import TYPE_CHECKING

from loguru import logger

from crawlee.crawlers import HttpCrawler, HttpCrawlingContext

if TYPE_CHECKING:
    from loguru import Record


# Configure loguru interceptor to capture standard logging output
class InterceptHandler(logging.Handler):
    def emit(self, record: logging.LogRecord) -> None:
        # Get corresponding Loguru level if it exists
        try:
            level: str | int = logger.level(record.levelname).name
        except ValueError:
            level = record.levelno

        # Find caller from where originated the logged message
        frame, depth = inspect.currentframe(), 0
        while frame:
            filename = frame.f_code.co_filename
            is_logging = filename == logging.__file__
            is_frozen = 'importlib' in filename and '_bootstrap' in filename
            if depth > 0 and not (is_logging | is_frozen):
                break
            frame = frame.f_back
            depth += 1

        dummy_record = logging.LogRecord('dummy', 0, 'dummy', 0, 'dummy', None, None)
        standard_attrs = set(dummy_record.__dict__.keys())
        extra_dict = {
            key: value
            for key, value in record.__dict__.items()
            if key not in standard_attrs
        }

        (
            logger.bind(**extra_dict)
            .opt(depth=depth, exception=record.exc_info)
            .patch(lambda loguru_record: loguru_record.update({'name': record.name}))
            .log(level, record.getMessage())
        )


# Configure loguru formatter
def formatter(record: Record) -> str:
    basic_format = '[{name}] | <level>{level: ^8}</level> | - {message}'
    if record['extra']:
        basic_format = basic_format + ' {extra}'
    return f'{basic_format}\n'


# Remove default loguru logger
logger.remove()

# Set up loguru with JSONL serialization in file `crawler.log`
logger.add('crawler.log', format=formatter, serialize=True, level='INFO')

# Set up loguru logger for console
logger.add(sys.stderr, format=formatter, colorize=True, level='INFO')

# Configure standard logging to use our interceptor
logging.basicConfig(handlers=[InterceptHandler()], level=logging.INFO, force=True)


async def main() -> None:
    # Initialize crawler with disabled table logs
    crawler = HttpCrawler(
        configure_logging=False,  # Disable default logging configuration
        statistics_log_format='inline',  # Set inline formatting for statistics logs
    )

    # Define the default request handler, which will be called for every request.
    @crawler.router.default_handler
    async def request_handler(context: HttpCrawlingContext) -> None:
        context.log.info(f'Processing {context.request.url} ...')

    # Run the crawler
    await crawler.run(['https://www.crawlee.dev/'])


if __name__ == '__main__':
    asyncio.run(main())

docs/examples/json_logging.mdx

Lines changed: 57 additions & 0 deletions
@@ -0,0 +1,57 @@
---
id: configure-json-logging
title: Configure JSON logging
---

import ApiLink from '@site/src/components/ApiLink';
import CodeBlock from '@theme/CodeBlock';

import JsonLoggingExample from '!!raw-loader!./code_examples/configure_json_logging.py';

This example demonstrates how to configure JSON line (JSONL) logging with Crawlee. By using the `statistics_log_format='inline'` parameter, you can disable table-formatted statistics logs, which makes it easier to parse logs with external tools or to serialize them as JSON.

The example shows how to integrate with the popular [`loguru`](https://github.com/delgan/loguru) library to capture Crawlee logs and format them as JSONL (one JSON object per line). This approach works well when you need to collect logs for analysis, monitoring, or when integrating with logging platforms like ELK Stack, Grafana Loki, or similar systems.

<CodeBlock className="language-python">
    {JsonLoggingExample}
</CodeBlock>

Here's an example of what a crawler statistics log entry looks like in JSONL format:

```json
{
  "text": "[HttpCrawler] | INFO | - Final request statistics: {'requests_finished': 1, 'requests_failed': 0, 'retry_histogram': [1], 'request_avg_failed_duration': None, 'request_avg_finished_duration': 3.57098, 'requests_finished_per_minute': 17, 'requests_failed_per_minute': 0, 'request_total_duration': 3.57098, 'requests_total': 1, 'crawler_runtime': 3.59165}\n",
  "record": {
    "elapsed": { "repr": "0:00:05.604568", "seconds": 5.604568 },
    "exception": null,
    "extra": {
      "requests_finished": 1,
      "requests_failed": 0,
      "retry_histogram": [1],
      "request_avg_failed_duration": null,
      "request_avg_finished_duration": 3.57098,
      "requests_finished_per_minute": 17,
      "requests_failed_per_minute": 0,
      "request_total_duration": 3.57098,
      "requests_total": 1,
      "crawler_runtime": 3.59165
    },
    "file": {
      "name": "_basic_crawler.py",
      "path": "/crawlers/_basic/_basic_crawler.py"
    },
    "function": "run",
    "level": { "icon": "ℹ️", "name": "INFO", "no": 20 },
    "line": 583,
    "message": "Final request statistics:",
    "module": "_basic_crawler",
    "name": "HttpCrawler",
    "process": { "id": 198383, "name": "MainProcess" },
    "thread": { "id": 135312814966592, "name": "MainThread" },
    "time": {
      "repr": "2025-03-17 17:14:45.339150+00:00",
      "timestamp": 1742231685.33915
    }
  }
}
```

pyproject.toml

Lines changed: 1 addition & 0 deletions
@@ -226,6 +226,7 @@ module = [
     "apify_fingerprint_datapoints", # Untyped and stubs not available
     "camoufox", # Example code shows integration of camoufox and crawlee.
     "jaro", # Untyped and stubs not available
+    "loguru", # Example code shows integration of loguru and crawlee for JSON logging.
     "sklearn.linear_model", # Untyped and stubs not available
 ]
 ignore_missing_imports = true

src/crawlee/crawlers/_basic/_basic_crawler.py

Lines changed: 15 additions & 3 deletions
@@ -13,7 +13,7 @@
 from datetime import timedelta
 from functools import partial
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Callable, Generic, Union, cast
+from typing import TYPE_CHECKING, Any, Callable, Generic, Literal, Union, cast
 from urllib.parse import ParseResult, urlparse
 from weakref import WeakKeyDictionary

@@ -135,6 +135,11 @@ class _BasicCrawlerOptions(TypedDict):
     configure_logging: NotRequired[bool]
     """If True, the crawler will set up logging infrastructure automatically."""

+    statistics_log_format: NotRequired[Literal['table', 'inline']]
+    """If 'table', displays crawler statistics as formatted tables in logs. If 'inline', outputs statistics as plain
+    text log messages.
+    """
+
     keep_alive: NotRequired[bool]
     """Flag that can keep crawler running even when there are no requests in queue."""

@@ -231,6 +236,7 @@ def __init__(
         abort_on_error: bool = False,
         keep_alive: bool = False,
         configure_logging: bool = True,
+        statistics_log_format: Literal['table', 'inline'] = 'table',
         _context_pipeline: ContextPipeline[TCrawlingContext] | None = None,
         _additional_context_managers: Sequence[AbstractAsyncContextManager] | None = None,
         _logger: logging.Logger | None = None,

@@ -271,6 +277,8 @@ def __init__(
             keep_alive: If True, it will keep crawler alive even if there are no requests in queue.
                 Use `crawler.stop()` to exit the crawler.
             configure_logging: If True, the crawler will set up logging infrastructure automatically.
+            statistics_log_format: If 'table', displays crawler statistics as formatted tables in logs. If 'inline',
+                outputs statistics as plain text log messages.
             _context_pipeline: Enables extending the request lifecycle and modifying the crawling context.
                 Intended for use by subclasses rather than direct instantiation of `BasicCrawler`.
             _additional_context_managers: Additional context managers used throughout the crawler lifecycle.

@@ -346,12 +354,14 @@ def __init__(
             httpx_logger = logging.getLogger('httpx')  # Silence HTTPX logger
             httpx_logger.setLevel(logging.DEBUG if get_configured_log_level() <= logging.DEBUG else logging.WARNING)
         self._logger = _logger or logging.getLogger(__name__)
+        self._statistics_log_format = statistics_log_format

         # Statistics
         self._statistics = statistics or cast(
             'Statistics[TStatisticsState]',
             Statistics.with_default_state(
                 periodic_message_logger=self._logger,
+                statistics_log_format=self._statistics_log_format,
                 log_message='Current request statistics:',
             ),
         )

@@ -567,8 +577,10 @@ def sigint_handler() -> None:
         await self._save_crawler_state()

         final_statistics = self._statistics.calculate()
-        self._logger.info(f'Final request statistics:\n{final_statistics.to_table()}')
-
+        if self._statistics_log_format == 'table':
+            self._logger.info(f'Final request statistics:\n{final_statistics.to_table()}')
+        else:
+            self._logger.info('Final request statistics:', extra=final_statistics.to_dict())
         return final_statistics

     async def _run_crawler(self) -> None:
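To show what the 'inline' branch above means in practice, here is a standalone sketch (not part of the commit) of how the statistics passed via `extra` can be rendered as JSON lines with only the standard library; the `JsonLineFormatter` class and the sample values are illustrative assumptions:

```python
import json
import logging
import sys


class JsonLineFormatter(logging.Formatter):
    """Illustrative formatter: emits each record as one JSON object per line."""

    # Attributes present on every LogRecord; anything else arrived via `extra`.
    _standard_attrs = set(logging.LogRecord('x', 0, 'x', 0, 'x', None, None).__dict__)

    def format(self, record: logging.LogRecord) -> str:
        extra = {k: v for k, v in record.__dict__.items() if k not in self._standard_attrs}
        return json.dumps({'level': record.levelname, 'message': record.getMessage(), **extra})


handler = logging.StreamHandler(sys.stdout)
handler.setFormatter(JsonLineFormatter())
logging.basicConfig(handlers=[handler], level=logging.INFO, force=True)

# With statistics_log_format='inline', the crawler logs statistics roughly like this:
logging.getLogger('HttpCrawler').info(
    'Final request statistics:',
    extra={'requests_finished': 1, 'requests_failed': 0, 'crawler_runtime': 3.59},
)
# -> {"level": "INFO", "message": "Final request statistics:", "requests_finished": 1, ...}
```

This is the same mechanism the loguru-based example relies on: every key from `final_statistics.to_dict()` ends up as an attribute on the emitted `LogRecord`, where any structured formatter can pick it up.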

src/crawlee/statistics/_models.py

Lines changed: 3 additions & 0 deletions
@@ -35,6 +35,9 @@ def to_table(self) -> str:

         return make_table([(str(k), str(v)) for k, v in str_dict.items()], width=60)

+    def to_dict(self) -> dict[str, float | int | list[int]]:
+        return {k: v.total_seconds() if isinstance(v, timedelta) else v for k, v in asdict(self).items()}
+
     @override
     def __str__(self) -> str:
         return json.dumps(
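As a standalone illustration of the conversion `to_dict` performs (the dataclass below is a made-up stand-in, not the real `FinalStatistics` model):

```python
from __future__ import annotations

from dataclasses import asdict, dataclass
from datetime import timedelta


@dataclass
class DemoStats:
    requests_total: int
    crawler_runtime: timedelta


def to_dict(stats: DemoStats) -> dict[str, float | int]:
    # timedelta fields are flattened to seconds so the result is JSON-serializable
    return {k: v.total_seconds() if isinstance(v, timedelta) else v for k, v in asdict(stats).items()}


print(to_dict(DemoStats(requests_total=37, crawler_runtime=timedelta(minutes=5))))
# {'requests_total': 37, 'crawler_runtime': 300.0}
```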

src/crawlee/statistics/_statistics.py

Lines changed: 9 additions & 2 deletions
@@ -4,7 +4,7 @@
 import math
 from datetime import datetime, timedelta, timezone
 from logging import Logger, getLogger
-from typing import TYPE_CHECKING, Any, Generic, cast
+from typing import TYPE_CHECKING, Any, Generic, Literal, cast

 from typing_extensions import Self, TypeVar

@@ -76,6 +76,7 @@ def __init__(
         periodic_message_logger: Logger | None = None,
         log_interval: timedelta = timedelta(minutes=1),
         state_model: type[TStatisticsState],
+        statistics_log_format: Literal['table', 'inline'] = 'table',
     ) -> None:
         self._id = Statistics.__next_id
         Statistics.__next_id += 1

@@ -99,6 +100,7 @@ def __init__(
         self._key_value_store: KeyValueStore | None = key_value_store

         self._log_message = log_message
+        self._statistics_log_format = statistics_log_format
         self._periodic_message_logger = periodic_message_logger or logger
         self._periodic_logger = RecurringTask(self._log, log_interval)

@@ -129,6 +131,7 @@ def with_default_state(
         log_message: str = 'Statistics',
         periodic_message_logger: Logger | None = None,
         log_interval: timedelta = timedelta(minutes=1),
+        statistics_log_format: Literal['table', 'inline'] = 'table',
     ) -> Statistics[StatisticsState]:
         """Convenience constructor for creating a `Statistics` with default state model `StatisticsState`."""
         return Statistics[StatisticsState](

@@ -140,6 +143,7 @@ def with_default_state(
             periodic_message_logger=periodic_message_logger,
             log_interval=log_interval,
             state_model=StatisticsState,
+            statistics_log_format=statistics_log_format,
         )

     @property

@@ -281,7 +285,10 @@ async def reset(self) -> None:

     def _log(self) -> None:
         stats = self.calculate()
-        self._periodic_message_logger.info(f'{self._log_message}\n{stats.to_table()}')
+        if self._statistics_log_format == 'table':
+            self._periodic_message_logger.info(f'{self._log_message}\n{stats.to_table()}')
+        else:
+            self._periodic_message_logger.info(self._log_message, extra=stats.to_dict())

     async def _maybe_load_statistics(self) -> None:
         if not self._persistence_enabled:

tests/unit/crawlers/_basic/test_basic_crawler.py

Lines changed: 42 additions & 18 deletions
@@ -11,7 +11,7 @@
 from dataclasses import dataclass
 from datetime import timedelta
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, cast
+from typing import TYPE_CHECKING, Any, Literal, cast
 from unittest.mock import AsyncMock, Mock, call

 import httpx

@@ -889,11 +889,20 @@ async def handler(context: BasicCrawlingContext) -> None:


 @pytest.mark.skipif(os.name == 'nt' and 'CI' in os.environ, reason='Skipped in Windows CI')
-async def test_logs_final_statistics(monkeypatch: pytest.MonkeyPatch, caplog: pytest.LogCaptureFixture) -> None:
+@pytest.mark.parametrize(
+    ('statistics_log_format'),
+    [
+        pytest.param('table', id='With table for logs'),
+        pytest.param('inline', id='With inline logs'),
+    ],
+)
+async def test_logs_final_statistics(
+    monkeypatch: pytest.MonkeyPatch, caplog: pytest.LogCaptureFixture, statistics_log_format: Literal['table', 'inline']
+) -> None:
     # Set the log level to INFO to capture the final statistics log.
     caplog.set_level(logging.INFO)

-    crawler = BasicCrawler(configure_logging=False)
+    crawler = BasicCrawler(configure_logging=False, statistics_log_format=statistics_log_format)

     @crawler.router.default_handler
     async def handler(context: BasicCrawlingContext) -> None:

@@ -923,21 +932,36 @@ async def handler(context: BasicCrawlingContext) -> None:
     )

     assert final_statistics is not None
-    assert final_statistics.msg.splitlines() == [
-        'Final request statistics:',
-        '┌───────────────────────────────┬───────────┐',
-        '│ requests_finished │ 4 │',
-        '│ requests_failed │ 33 │',
-        '│ retry_histogram │ [1, 4, 8] │',
-        '│ request_avg_failed_duration │ 99.0 │',
-        '│ request_avg_finished_duration │ 0.483 │',
-        '│ requests_finished_per_minute │ 0.33 │',
-        '│ requests_failed_per_minute │ 0.1 │',
-        '│ request_total_duration │ 720.0 │',
-        '│ requests_total │ 37 │',
-        '│ crawler_runtime │ 300.0 │',
-        '└───────────────────────────────┴───────────┘',
-    ]
+    if statistics_log_format == 'table':
+        assert final_statistics.msg.splitlines() == [
+            'Final request statistics:',
+            '┌───────────────────────────────┬───────────┐',
+            '│ requests_finished │ 4 │',
+            '│ requests_failed │ 33 │',
+            '│ retry_histogram │ [1, 4, 8] │',
+            '│ request_avg_failed_duration │ 99.0 │',
+            '│ request_avg_finished_duration │ 0.483 │',
+            '│ requests_finished_per_minute │ 0.33 │',
+            '│ requests_failed_per_minute │ 0.1 │',
+            '│ request_total_duration │ 720.0 │',
+            '│ requests_total │ 37 │',
+            '│ crawler_runtime │ 300.0 │',
+            '└───────────────────────────────┴───────────┘',
+        ]
+    else:
+        assert final_statistics.msg == 'Final request statistics:'
+
+        # ignore[attr-defined] since `extra` parameters are not defined for `LogRecord`
+        assert final_statistics.requests_finished == 4  # type: ignore[attr-defined]
+        assert final_statistics.requests_failed == 33  # type: ignore[attr-defined]
+        assert final_statistics.retry_histogram == [1, 4, 8]  # type: ignore[attr-defined]
+        assert final_statistics.request_avg_failed_duration == 99.0  # type: ignore[attr-defined]
+        assert final_statistics.request_avg_finished_duration == 0.483  # type: ignore[attr-defined]
+        assert final_statistics.requests_finished_per_minute == 0.33  # type: ignore[attr-defined]
+        assert final_statistics.requests_failed_per_minute == 0.1  # type: ignore[attr-defined]
+        assert final_statistics.request_total_duration == 720.0  # type: ignore[attr-defined]
+        assert final_statistics.requests_total == 37  # type: ignore[attr-defined]
+        assert final_statistics.crawler_runtime == 300.0  # type: ignore[attr-defined]


 async def test_crawler_manual_stop(httpbin: URL) -> None:
