1
+ import math
1
2
import threading
2
3
from abc import ABC , abstractmethod
3
4
from datetime import datetime , timedelta
7
8
8
9
from redis .multidb .circuit import State as CBState
9
10
10
- DEFAULT_FAILURES_THRESHOLD = 1000
11
- DEFAULT_FAILURES_DURATION = 2
11
# Default minimum number of counted failures before a failover can be triggered.
DEFAULT_MIN_NUM_FAILURES = 1000
# Default failure rate, expressed as a fraction of executed commands (0.1 == 10%).
DEFAULT_FAILURE_RATE_THRESHOLD = 0.1
# Default length, in seconds, of the window in which failures are counted.
DEFAULT_FAILURES_DETECTION_WINDOW = 2
12
14
13
15
class FailureDetector (ABC ):
14
16
@@ -17,6 +19,11 @@ def register_failure(self, exception: Exception, cmd: tuple) -> None:
17
19
"""Register a failure that occurred during command execution."""
18
20
pass
19
21
22
@abstractmethod
def register_command_execution(self, cmd: tuple) -> None:
    """
    Register a command execution.

    Args:
        cmd: The command that was executed, as a tuple.
    """
26
+
20
27
@abstractmethod
21
28
def set_command_executor (self , command_executor ) -> None :
22
29
"""Set the command executor for this failure."""
@@ -28,56 +35,65 @@ class CommandFailureDetector(FailureDetector):
28
35
"""
29
36
def __init__(
    self,
    min_num_failures: int = DEFAULT_MIN_NUM_FAILURES,
    failure_rate_threshold: float = DEFAULT_FAILURE_RATE_THRESHOLD,
    failure_detection_window: float = DEFAULT_FAILURES_DETECTION_WINDOW,
    error_types: Optional[List[Type[Exception]]] = None,
) -> None:
    """
    Initialize a new CommandFailureDetector instance.

    Args:
        min_num_failures: Minimum number of counted failures in the current
            window before the circuit may be opened.
        failure_rate_threshold: Fraction of executed commands (e.g. 0.1 for
            10%) that must have failed before the circuit may be opened.
        failure_detection_window: Length, in seconds, of the window in which
            failures and command executions are counted.
        error_types: Optional list of exception types that count as
            failures. If None or empty, every exception is counted.

    The detector keeps two counters for the current window: failures and
    executed commands. The circuit is opened only when the failure count
    reaches both the absolute minimum and the configured share of executed
    commands; the counters are then reset and a new window starts.
    """
    # Collaborator is injected later through set_command_executor().
    self._command_executor = None

    # Static configuration.
    self._error_types = error_types
    self._min_num_failures = min_num_failures
    self._failure_rate_threshold = failure_rate_threshold
    self._failure_detection_window = failure_detection_window

    # Mutable per-window state, guarded by a re-entrant lock.
    self._lock = threading.RLock()
    self._failures_count: int = 0
    self._commands_executed: int = 0
    window_start = datetime.now()
    self._start_time: datetime = window_start
    self._end_time: datetime = window_start + timedelta(
        seconds=self._failure_detection_window
    )
54
65
55
66
def register_failure(self, exception: Exception, cmd: tuple) -> None:
    """
    Count a failed command and re-evaluate the failover thresholds.

    Args:
        exception: The exception raised while executing the command.
        cmd: The command that failed (not inspected here).

    When a non-empty list of error types is configured, the failure is
    counted only if the exception is an instance of one of them; subclasses
    match too, mirroring how ``except`` clauses work. With no error types
    configured, every exception is counted. The thresholds are checked on
    every call, whether or not this exception was counted.
    """
    with self._lock:
        error_types = self._error_types
        # isinstance() instead of an exact type() match, so that subclasses
        # of a configured error type are not silently ignored.
        if not error_types or isinstance(exception, tuple(error_types)):
            self._failures_count += 1

        self._check_threshold()
69
75
70
76
def set_command_executor(self, command_executor) -> None:
    """
    Attach the command executor used by this detector.

    Args:
        command_executor: Executor whose ``active_database.circuit`` is
            opened when the failure thresholds are reached.
    """
    self._command_executor = command_executor
72
78
73
- def _check_threshold (self ) :
79
def register_command_execution(self, cmd: tuple) -> None:
    """
    Count one executed command in the current detection window.

    Args:
        cmd: The executed command (not inspected here).

    If the current time is not strictly inside the active window, the
    counters are reset and a new window is started before the command is
    counted.
    """
    with self._lock:
        now = datetime.now()
        inside_window = self._start_time < now < self._end_time
        if not inside_window:
            # Window expired: start a fresh one before counting this command.
            self._reset()

        self._commands_executed += 1
85
+
86
+ def _check_threshold (self ):
87
+ if (
88
+ self ._failures_count >= self ._min_num_failures
89
+ and self ._failures_count >= (math .ceil (self ._commands_executed * self ._failure_rate_threshold ))
90
+ ):
91
+ self ._command_executor .active_database .circuit .state = CBState .OPEN
92
+ self ._reset ()
93
+
79
94
def _reset (self ) -> None :
80
95
with self ._lock :
81
96
self ._start_time = datetime .now ()
82
- self ._end_time = self ._start_time + timedelta (seconds = self ._duration )
83
- self ._failures_within_duration = []
97
+ self ._end_time = self ._start_time + timedelta (seconds = self ._failure_detection_window )
98
+ self ._failures_count = 0
99
+ self ._commands_executed = 0
0 commit comments