Skip to content

Commit 1eb6da5

Browse files
authored
fix: Improve crawler statistics logging (#1364)
### Description - Increase statistics log table width (so that the default maximum number of retries fits there). - Add `format_duration` util function for human-readable duration values in stats. ### Issues - Closes: #1317
1 parent 727cbb8 commit 1eb6da5

File tree

6 files changed

+105
-48
lines changed

6 files changed

+105
-48
lines changed

src/crawlee/_utils/measure_time.py

Lines changed: 0 additions & 31 deletions
This file was deleted.

src/crawlee/_utils/time.py

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
from __future__ import annotations
2+
3+
import time
4+
from contextlib import contextmanager
5+
from dataclasses import dataclass
6+
from typing import TYPE_CHECKING
7+
8+
if TYPE_CHECKING:
9+
from collections.abc import Iterator
10+
from datetime import timedelta
11+
12+
_SECONDS_PER_MINUTE = 60
13+
_SECONDS_PER_HOUR = 3600
14+
15+
16+
@dataclass
17+
class TimerResult:
18+
wall: float | None = None
19+
cpu: float | None = None
20+
21+
22+
@contextmanager
23+
def measure_time() -> Iterator[TimerResult]:
24+
"""Measure the execution time (wall-clock and CPU) between the start and end of the with-block."""
25+
result = TimerResult()
26+
before_wall = time.monotonic()
27+
before_cpu = time.thread_time()
28+
29+
try:
30+
yield result
31+
finally:
32+
after_wall = time.monotonic()
33+
after_cpu = time.thread_time()
34+
result.wall = after_wall - before_wall
35+
result.cpu = after_cpu - before_cpu
36+
37+
38+
def format_duration(duration: timedelta | None) -> str:
39+
"""Format a timedelta into a human-readable string with appropriate units."""
40+
if duration is None:
41+
return 'None'
42+
43+
total_seconds = duration.total_seconds()
44+
45+
if total_seconds == 0:
46+
return '0s'
47+
48+
# For very small durations, show in milliseconds
49+
if total_seconds < 1:
50+
milliseconds = total_seconds * 1000
51+
if milliseconds < 1:
52+
microseconds = total_seconds * 1_000_000
53+
return f'{microseconds:.1f}μs'
54+
return f'{milliseconds:.1f}ms'
55+
56+
# For durations less than 60 seconds, show in seconds
57+
if total_seconds < _SECONDS_PER_MINUTE:
58+
return f'{total_seconds:.2f}s'
59+
60+
# For durations less than 1 hour, show in minutes and seconds
61+
if total_seconds < _SECONDS_PER_HOUR:
62+
minutes = int(total_seconds // _SECONDS_PER_MINUTE)
63+
seconds = total_seconds % _SECONDS_PER_MINUTE
64+
if seconds == 0:
65+
return f'{minutes}min'
66+
return f'{minutes}min {seconds:.1f}s'
67+
68+
# For longer durations, show in hours, minutes, and seconds
69+
hours = int(total_seconds // _SECONDS_PER_HOUR)
70+
remaining_seconds = total_seconds % _SECONDS_PER_HOUR
71+
minutes = int(remaining_seconds // _SECONDS_PER_MINUTE)
72+
seconds = remaining_seconds % _SECONDS_PER_MINUTE
73+
74+
result = f'{hours}h'
75+
if minutes > 0:
76+
result += f' {minutes}min'
77+
if seconds > 0:
78+
result += f' {seconds:.1f}s'
79+
80+
return result

src/crawlee/statistics/_models.py

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,9 @@
1111
from crawlee._utils.console import make_table
1212
from crawlee._utils.docs import docs_group
1313
from crawlee._utils.models import timedelta_ms
14+
from crawlee._utils.time import format_duration
15+
16+
_STATISTICS_TABLE_WIDTH = 100
1417

1518

1619
@dataclass(frozen=True)
@@ -31,9 +34,14 @@ class FinalStatistics:
3134

3235
def to_table(self) -> str:
3336
"""Print out the Final Statistics data as a table."""
34-
str_dict = {k: v.total_seconds() if isinstance(v, timedelta) else v for k, v in asdict(self).items()}
35-
36-
return make_table([(str(k), str(v)) for k, v in str_dict.items()], width=60)
37+
formatted_dict = {}
38+
for k, v in asdict(self).items():
39+
if isinstance(v, timedelta):
40+
formatted_dict[k] = format_duration(v)
41+
else:
42+
formatted_dict[k] = v
43+
44+
return make_table([(str(k), str(v)) for k, v in formatted_dict.items()], width=_STATISTICS_TABLE_WIDTH)
3745

3846
def to_dict(self) -> dict[str, float | int | list[int]]:
3947
return {k: v.total_seconds() if isinstance(v, timedelta) else v for k, v in asdict(self).items()}

tests/unit/_autoscaling/test_autoscaled_pool.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
from crawlee._autoscaling import AutoscaledPool, SystemStatus
1515
from crawlee._autoscaling._types import LoadRatioInfo, SystemInfo
1616
from crawlee._types import ConcurrencySettings
17-
from crawlee._utils.measure_time import measure_time
17+
from crawlee._utils.time import measure_time
1818

1919
if TYPE_CHECKING:
2020
from collections.abc import Awaitable

tests/unit/_utils/test_measure_time.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
import asyncio
44
import time
55

6-
from crawlee._utils.measure_time import measure_time
6+
from crawlee._utils.time import measure_time
77

88

99
def test_measure_time_wall_sync() -> None:

tests/unit/crawlers/_basic/test_basic_crawler.py

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -951,18 +951,18 @@ async def handler(context: BasicCrawlingContext) -> None:
951951
if statistics_log_format == 'table':
952952
assert final_statistics.msg.splitlines() == [
953953
'Final request statistics:',
954-
'┌───────────────────────────────┬───────────┐',
955-
'│ requests_finished │ 4 │',
956-
'│ requests_failed │ 33 │',
957-
'│ retry_histogram │ [1, 4, 8] │',
958-
'│ request_avg_failed_duration │ 99.0 │',
959-
'│ request_avg_finished_duration │ 0.483 │',
960-
'│ requests_finished_per_minute │ 0.33 │',
961-
'│ requests_failed_per_minute │ 0.1 │',
962-
'│ request_total_duration │ 720.0 │',
963-
'│ requests_total │ 37 │',
964-
'│ crawler_runtime │ 300.0 │',
965-
'└───────────────────────────────┴───────────┘',
954+
'┌───────────────────────────────┬───────────┐',
955+
'│ requests_finished │ 4 │',
956+
'│ requests_failed │ 33 │',
957+
'│ retry_histogram │ [1, 4, 8] │',
958+
'│ request_avg_failed_duration │ 1min 39.0s │',
959+
'│ request_avg_finished_duration │ 483.0ms │',
960+
'│ requests_finished_per_minute │ 0.33 │',
961+
'│ requests_failed_per_minute │ 0.1 │',
962+
'│ request_total_duration │ 12min │',
963+
'│ requests_total │ 37 │',
964+
'│ crawler_runtime │ 5min │',
965+
'└───────────────────────────────┴───────────┘',
966966
]
967967
else:
968968
assert final_statistics.msg == 'Final request statistics:'

0 commit comments

Comments
 (0)