@@ -63,6 +63,45 @@ def _calculate_messages_token_count(messages: list[MemoryMessage]) -> int:
63
63
return total_tokens
64
64
65
65
66
def _calculate_context_usage_percentages(
    messages: list[MemoryMessage],
    model_name: ModelNameLiteral | None,
    context_window_max: int | None,
) -> tuple[float | None, float | None]:
    """
    Calculate context usage percentages for total usage and until summarization triggers.

    Args:
        messages: List of messages to calculate token count for
        model_name: The client's LLM model name for context window determination
        context_window_max: Direct specification of context window max tokens

    Returns:
        Tuple of (total_percentage, until_summarization_percentage)
        - total_percentage: Percentage (0-100) of total context window used
        - until_summarization_percentage: Percentage (0-100) until summarization triggers
        Both values are None if no model info provided (or the limit is unusable)
    """
    # Nothing to report without messages or any way to determine a context window.
    if not messages or (not model_name and not context_window_max):
        return None, None

    # Current token usage of the conversation so far.
    current_tokens = _calculate_messages_token_count(messages)

    # Effective context window for the client's model (or the explicit override).
    max_tokens = _get_effective_token_limit(model_name, context_window_max)
    if max_tokens <= 0:
        # Defensive: a malformed/zero limit would otherwise raise ZeroDivisionError.
        return None, None

    # Percentage of the total context window used.
    total_percentage = (current_tokens / max_tokens) * 100.0

    # Percentage of the summarization threshold used. Guard against the
    # threshold rounding down to 0 tokens (small window or tiny
    # settings.summarization_threshold), which would raise ZeroDivisionError.
    token_threshold = int(max_tokens * settings.summarization_threshold)
    if token_threshold <= 0:
        # Any usage already exceeds a zero threshold.
        until_summarization_percentage = 100.0
    else:
        until_summarization_percentage = (current_tokens / token_threshold) * 100.0

    # Cap both at 100% for display purposes
    return min(total_percentage, 100.0), min(until_summarization_percentage, 100.0)
103
+
104
+
66
105
async def _summarize_working_memory (
67
106
memory : WorkingMemory ,
68
107
model_name : ModelNameLiteral | None = None ,
@@ -88,8 +127,8 @@ async def _summarize_working_memory(
88
127
max_tokens = _get_effective_token_limit (model_name , context_window_max )
89
128
90
129
# Reserve space for new messages, function calls, and response generation
91
- # Use 70% of context window to leave room for new content
92
- token_threshold = int (max_tokens * 0.7 )
130
+ # Use configurable threshold to leave room for new content
131
+ token_threshold = int (max_tokens * settings . summarization_threshold )
93
132
94
133
if current_tokens <= token_threshold :
95
134
return memory
@@ -269,7 +308,22 @@ async def get_working_memory(
269
308
270
309
logger .debug (f"Working mem: { working_mem } " )
271
310
272
- return working_mem
311
+ # Calculate context usage percentages
312
+ total_percentage , until_summarization_percentage = (
313
+ _calculate_context_usage_percentages (
314
+ messages = working_mem .messages ,
315
+ model_name = model_name ,
316
+ context_window_max = context_window_max ,
317
+ )
318
+ )
319
+
320
+ # Return WorkingMemoryResponse with both percentage values
321
+ working_mem_data = working_mem .model_dump ()
322
+ working_mem_data ["context_percentage_total_used" ] = total_percentage
323
+ working_mem_data ["context_percentage_until_summarization" ] = (
324
+ until_summarization_percentage
325
+ )
326
+ return WorkingMemoryResponse (** working_mem_data )
273
327
274
328
275
329
@router .put ("/v1/working-memory/{session_id}" , response_model = WorkingMemoryResponse )
@@ -348,7 +402,22 @@ async def put_working_memory(
348
402
namespace = updated_memory .namespace ,
349
403
)
350
404
351
- return updated_memory
405
+ # Calculate context usage percentages based on the final state (after potential summarization)
406
+ total_percentage , until_summarization_percentage = (
407
+ _calculate_context_usage_percentages (
408
+ messages = updated_memory .messages ,
409
+ model_name = model_name ,
410
+ context_window_max = context_window_max ,
411
+ )
412
+ )
413
+
414
+ # Return WorkingMemoryResponse with both percentage values
415
+ updated_memory_data = updated_memory .model_dump ()
416
+ updated_memory_data ["context_percentage_total_used" ] = total_percentage
417
+ updated_memory_data ["context_percentage_until_summarization" ] = (
418
+ until_summarization_percentage
419
+ )
420
+ return WorkingMemoryResponse (** updated_memory_data )
352
421
353
422
354
423
@router .delete ("/v1/working-memory/{session_id}" , response_model = AckResponse )
0 commit comments