Skip to content

Commit 26b2cb6

Browse files
authored
Fix SnapKV scoring (#2591)
Adds an option to disable SnapKV entirely and fixes token occurrence counts in the SnapKV case. Also adds unit tests for the associated logic.
1 parent e725053 commit 26b2cb6

File tree

6 files changed

+280
-64
lines changed

6 files changed

+280
-64
lines changed

src/cpp/include/openvino/genai/cache_eviction.hpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -30,7 +30,6 @@ class CacheEvictionConfig {
3030
OPENVINO_ASSERT(start_size, "CacheEvictionConfig.start_size must be non-zero");
3131
OPENVINO_ASSERT(recent_size, "CacheEvictionConfig.recent_size must be non-zero");
3232
OPENVINO_ASSERT(max_cache_size, "CacheEvictionConfig.max_cache_size must be non-zero");
33-
OPENVINO_ASSERT(snapkv_window_size, "CacheEvictionConfig.snapkv_window_size must be non-zero");
3433

3534
OPENVINO_ASSERT(max_cache_size > (start_size + recent_size),
3635
"CacheEvictionConfig.max_cache_size must be larger than CacheEvictionConfig.start_size + CacheEvictionConfig.recent_size");
@@ -73,7 +72,8 @@ class CacheEvictionConfig {
7372

7473
/** The size of the importance score aggregation window (in token positions from the end of the prompt) for
7574
* computing initial importance scores at the beginning of the generation phase for purposes of eviction,
76-
* following the SnapKV article approach (https://arxiv.org/abs/2404.14469). **/
75+
* following the SnapKV article approach (https://arxiv.org/abs/2404.14469). Setting this to 0 disables the SnapKV
76+
* score aggregation. **/
7777
size_t snapkv_window_size = 8;
7878

7979
private:

src/cpp/src/continuous_batching/cache_eviction.cpp

Lines changed: 72 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,15 @@ namespace ov::genai {
5050

5151
void EvictionScoreManager::register_new_token_scores(
5252
const AttentionScoresForEachDecoderLayer &attention_scores_for_all_decoder_layers,
53-
const std::set<size_t>& skipped_logical_block_ids) {
53+
const std::set<size_t>& skipped_logical_block_ids, size_t num_snapkv_scores) {
54+
55+
if (m_num_registered_snapkv_aggregated_scores < m_snapkv_window_size) {
56+
OPENVINO_ASSERT(num_snapkv_scores + m_num_registered_snapkv_aggregated_scores <= m_snapkv_window_size, "Total number of aggregated SnapKV scores during prefill phase may not be larger than the configured SnapKV window size");
57+
m_num_registered_snapkv_aggregated_scores += num_snapkv_scores;
58+
}
59+
60+
// FIXME (vshampor): currently in terms of counters we do not discern between the cases when the last chunk has been prefill-only
61+
// or last-prefill-chunk-plus-one-generation_token
5462
for (size_t decoder_layer_idx = 0; decoder_layer_idx < m_num_decoder_layers; decoder_layer_idx++) {
5563

5664
const auto &attention_scores = attention_scores_for_all_decoder_layers[decoder_layer_idx];
@@ -105,9 +113,14 @@ namespace ov::genai {
105113
max_pooled_hh_scores[idx] = max_val;
106114
}
107115

108-
auto &accumulated_scores_for_current_decoder_layer = m_scores[decoder_layer_idx];
116+
auto& accumulated_scores_for_current_decoder_layer = m_scores[decoder_layer_idx];
109117

110118
if (accumulated_scores_for_current_decoder_layer.empty()) {
119+
if (m_snapkv_window_size != 0 && num_snapkv_scores == 0) {
120+
// SnapKV window not yet reached, no meaningful scores to accumulate
121+
continue;
122+
}
123+
// New sequence to track
111124
if (skipped_logical_block_ids.empty()) {
112125
accumulated_scores_for_current_decoder_layer = max_pooled_hh_scores;
113126
}
@@ -127,12 +140,20 @@ namespace ov::genai {
127140
}
128141

129142
if (m_aggregation_mode == AggregationMode::NORM_SUM) {
130-
// New sequence to track - will simulate that the tokens comprising the sequence were added one-by-one
131-
// from the standpoint of the occurrence tracker
132143
std::size_t new_scores_size = num_hh_scores;
133144
std::vector<std::size_t> counter(new_scores_size);
134-
std::generate(counter.begin(), counter.begin() + new_scores_size,
135-
[&new_scores_size] { return new_scores_size--; });
145+
if (m_snapkv_window_size == 0) {
146+
// Will simulate that the tokens comprising the sequence were added one-by-one
147+
// from the standpoint of the occurrence tracker
148+
std::generate(counter.begin(), counter.begin() + new_scores_size,
149+
[&new_scores_size] { return new_scores_size--; });
150+
}
151+
else {
152+
OPENVINO_ASSERT(num_snapkv_scores > 0);
153+
OPENVINO_ASSERT(new_scores_size >= num_snapkv_scores);
154+
std::fill(counter.begin(), counter.end() - num_snapkv_scores, num_snapkv_scores);
155+
std::iota(counter.rbegin(), counter.rbegin() + num_snapkv_scores, 1);
156+
}
136157
m_cache_counter[decoder_layer_idx] = counter;
137158
}
138159
} else {
@@ -142,18 +163,26 @@ namespace ov::genai {
142163
OPENVINO_ASSERT(new_size_in_tokens >= old_size_in_tokens);
143164
size_t num_new_tokens = new_size_in_tokens - old_size_in_tokens;
144165
if (m_aggregation_mode == AggregationMode::NORM_SUM) {
145-
// Increment occurrence counts of all currently tracked cache blocks
146166
auto &counter_for_current_decoder_layer = m_cache_counter[decoder_layer_idx];
147-
for (auto it = counter_for_current_decoder_layer.begin();
148-
it != counter_for_current_decoder_layer.end(); it++) {
149-
*it += num_new_tokens;
150-
}
151-
// Add occurrence counts for new tokens like above
152167
counter_for_current_decoder_layer.resize(new_size_in_tokens);
153-
for (size_t i = 0; i < num_new_tokens; i++) {
154-
auto idx = old_size_in_tokens + i;
155-
counter_for_current_decoder_layer[idx] = num_new_tokens - i;
168+
if (m_snapkv_window_size == 0 || m_num_registered_snapkv_aggregated_scores == m_snapkv_window_size) {
169+
// Increment occurrence counts of all currently tracked cache blocks
170+
for (auto it = counter_for_current_decoder_layer.begin();
171+
it != counter_for_current_decoder_layer.end(); it++) {
172+
*it += num_new_tokens;
173+
}
174+
// Add occurrence counts for new tokens like above
175+
for (size_t i = 0; i < num_new_tokens; i++) {
176+
auto idx = old_size_in_tokens + i;
177+
counter_for_current_decoder_layer[idx] = num_new_tokens - i;
178+
}
156179
}
180+
else {
181+
OPENVINO_ASSERT(new_size_in_tokens >= m_num_registered_snapkv_aggregated_scores);
182+
std::fill(counter_for_current_decoder_layer.begin(), counter_for_current_decoder_layer.end() - m_num_registered_snapkv_aggregated_scores, m_num_registered_snapkv_aggregated_scores);
183+
std::iota(counter_for_current_decoder_layer.rbegin(), counter_for_current_decoder_layer.rbegin() + m_num_registered_snapkv_aggregated_scores, 1);
184+
}
185+
157186
}
158187
accumulated_scores_for_current_decoder_layer.resize(new_size_in_tokens);
159188
add_with_skips(accumulated_scores_for_current_decoder_layer, max_pooled_hh_scores, skip_set_adjusted);
@@ -191,7 +220,7 @@ namespace ov::genai {
191220
CacheEvictionAlgorithm::CacheEvictionAlgorithm(const CacheEvictionConfig &eviction_config, size_t block_size,
192221
size_t num_decoder_layers, size_t max_pool_window_size) :
193222
m_eviction_config(eviction_config), m_block_size(block_size), m_num_decoder_layers(num_decoder_layers),
194-
m_score_manager(block_size, num_decoder_layers, max_pool_window_size, eviction_config.aggregation_mode, eviction_config.get_start_size() / block_size)
223+
m_score_manager(block_size, num_decoder_layers, max_pool_window_size, eviction_config.aggregation_mode, eviction_config.get_start_size() / block_size, eviction_config.snapkv_window_size)
195224
{
196225
OPENVINO_ASSERT(!(m_eviction_config.get_start_size() % m_block_size),
197226
"CacheEvictionConfig.start_size in tokens must be a multiple of block size ", m_block_size);
@@ -263,14 +292,15 @@ namespace ov::genai {
263292
}
264293

265294
void CacheEvictionAlgorithm::register_new_token_scores(
266-
const AttentionScoresForEachDecoderLayer &attention_scores_for_all_decoder_layers) {
267-
register_new_token_scores(attention_scores_for_all_decoder_layers, {});
295+
const AttentionScoresForEachDecoderLayer &attention_scores_for_all_decoder_layers, size_t num_snapkv_scores_aggregated) {
296+
register_new_token_scores(attention_scores_for_all_decoder_layers, {}, num_snapkv_scores_aggregated);
268297
}
269298

270299
void CacheEvictionAlgorithm::register_new_token_scores(
271300
const AttentionScoresForEachDecoderLayer &attention_scores_for_all_decoder_layers,
272-
const std::set<size_t>& skipped_logical_block_ids) {
273-
m_score_manager.register_new_token_scores(attention_scores_for_all_decoder_layers, skipped_logical_block_ids);
301+
const std::set<size_t>& skipped_logical_block_ids,
302+
size_t num_snapkv_scores_aggregated) {
303+
m_score_manager.register_new_token_scores(attention_scores_for_all_decoder_layers, skipped_logical_block_ids, num_snapkv_scores_aggregated);
274304
}
275305

276306

@@ -457,4 +487,26 @@ namespace ov::genai {
457487
return retval;
458488
}
459489

490+
size_t SnapKVScoreAggregationCalculator::get_num_token_scores_to_aggregate(size_t prompt_len, size_t num_scheduled_tokens, size_t num_processed_tokens) {
491+
if (m_snapkv_window_size == 0) {
492+
// If SnapKV is disabled, aggregate all available scores in this chunk
493+
return num_scheduled_tokens;
494+
}
495+
size_t first_scored_token_position = m_snapkv_window_size > prompt_len ? 0 : prompt_len - m_snapkv_window_size;
496+
size_t num_scored_token_positions_in_this_chunk = 0;
497+
size_t num_processed_tokens_before_this_chunk = num_processed_tokens;
498+
size_t num_processed_tokens_after_this_chunk = num_processed_tokens_before_this_chunk + num_scheduled_tokens;
499+
if (num_processed_tokens_after_this_chunk > first_scored_token_position) {
500+
if (num_processed_tokens_before_this_chunk > first_scored_token_position) {
501+
num_scored_token_positions_in_this_chunk = num_scheduled_tokens;
502+
}
503+
else {
504+
num_scored_token_positions_in_this_chunk = num_processed_tokens_after_this_chunk - first_scored_token_position;
505+
}
506+
507+
}
508+
return num_scored_token_positions_in_this_chunk;
509+
}
510+
460511
}
512+

src/cpp/src/continuous_batching/cache_eviction.hpp

Lines changed: 32 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -29,9 +29,17 @@ class EvictionScoreManager {
2929
* @param num_decoder_layers Number of independent KV caches (each corresponding to a single attention layer) in the underlying LLM.
3030
* @param max_pool_window_size Window size for the max pooling step applied to the newly registered scores before aggregation.
3131
* @param aggregation_mode Aggregation mode for the scores across register calls.
32-
* @param ignore_first_n_blocks Number of blocks from the beginning of the per-token score vector, the scores for which will be disregarded and never aggregated.
32+
* @param ignore_first_n_blocks Number of blocks from the beginning of the per-token score vector, the scores for which will
33+
* be disregarded and never aggregated.
34+
* @param snapkv_window_size Window size for the SnapKV algorithm in effect. If non-zero, then by the start of the generation phase
35+
* for the tracked sequence (when the total number of `num_snapkv_scores` passed to each `register_new_token_scores` call reaches
36+
* the `snapkv_window_size`) the internal occurrence counters will be:
37+
* `| S | S | ... | S | S - 1 | S - 2 | ... | 2 | 1 |`,
38+
* where `S` is equal to `snapkv_window_size`. In contrast, if this is set to 0, then the initial counter state would be
39+
* `| L | L - 1 | ... | 2 | 1 |`,
40+
* where L is the prompt size of the sequence in tokens.
3341
*/
34-
explicit EvictionScoreManager(size_t block_size, size_t num_decoder_layers, size_t max_pool_window_size, AggregationMode aggregation_mode, size_t ignore_first_n_blocks = 0) : m_block_size(block_size), m_num_decoder_layers(num_decoder_layers), m_scores(num_decoder_layers), m_cache_counter(num_decoder_layers), m_max_pool_window_size(max_pool_window_size), m_aggregation_mode(aggregation_mode), m_ignore_first_n_blocks(ignore_first_n_blocks) {}
42+
explicit EvictionScoreManager(size_t block_size, size_t num_decoder_layers, size_t max_pool_window_size, AggregationMode aggregation_mode, size_t ignore_first_n_blocks = 0, size_t snapkv_window_size = 0) : m_block_size(block_size), m_num_decoder_layers(num_decoder_layers), m_scores(num_decoder_layers), m_cache_counter(num_decoder_layers), m_max_pool_window_size(max_pool_window_size), m_aggregation_mode(aggregation_mode), m_ignore_first_n_blocks(ignore_first_n_blocks), m_snapkv_window_size(snapkv_window_size), m_num_registered_snapkv_aggregated_scores(0) {}
3543

3644
/**
3745
* Registers new token scores and aggregates them internally as necessary. The token scores provided may be corresponding not to all
@@ -42,8 +50,9 @@ class EvictionScoreManager {
4250
* scores in a corresponding decoder layer.
4351
* @param skipped_logical_block_ids Logical block indices which had been skipped during inference call that produced the new scores, and
4452
* which are missing from the new scores.
53+
* @param num_snapkv_scores Number of latest token scores that were aggregated together when computing the registered score. If SnapKV is not used, this should be set to 0.
4554
*/
46-
void register_new_token_scores(const AttentionScoresForEachDecoderLayer& attention_scores_for_all_decoder_layers, const std::set<size_t>& skipped_logical_block_ids);
55+
void register_new_token_scores(const AttentionScoresForEachDecoderLayer& attention_scores_for_all_decoder_layers, const std::set<size_t>& skipped_logical_block_ids, size_t num_snapkv_scores = 0);
4756

4857
/**
4958
* Removes the scores from tracking for given block indices and given decoder layer.
@@ -88,6 +97,22 @@ class EvictionScoreManager {
8897
std::size_t m_max_pool_window_size;
8998
AggregationMode m_aggregation_mode;
9099
std::size_t m_ignore_first_n_blocks;
100+
std::size_t m_snapkv_window_size;
101+
std::size_t m_num_registered_snapkv_aggregated_scores;
102+
};
103+
104+
class SnapKVScoreAggregationCalculator {
105+
public:
106+
SnapKVScoreAggregationCalculator() = default;
107+
SnapKVScoreAggregationCalculator(const SnapKVScoreAggregationCalculator& rhs) = default;
108+
SnapKVScoreAggregationCalculator& operator=(const SnapKVScoreAggregationCalculator& rhs) = default;
109+
SnapKVScoreAggregationCalculator(size_t snapkv_window_size) : m_snapkv_window_size(snapkv_window_size) {}
110+
111+
size_t get_num_token_scores_to_aggregate(size_t prompt_len, size_t num_scheduled_tokens, size_t num_processed_tokens);
112+
113+
private:
114+
size_t m_snapkv_window_size;
115+
91116
};
92117

93118
/**
@@ -160,10 +185,12 @@ class CacheEvictionAlgorithm {
160185
* @param attention_scores_for_all_decoder_layers A vector with a size equal to the configured num_decoder_layers, where each entry is a
161186
* vector of per-token attention scores calculated within this layer.
162187
* @param skipped_logical_block_ids The set of logical indices that have been skipped from the scores as part of the sparse attention prefill process
188+
* @param num_snapkv_scores The number of SnapKV-aggregated scores in this score chunk. Set to 0 if SnapKV is not used
189+
* (i.e. eviction_config.snapkv_window_size == 0)
163190
*/
164-
void register_new_token_scores(const AttentionScoresForEachDecoderLayer& attention_scores_for_all_decoder_layers, const std::set<size_t>& skipped_logical_block_ids);
191+
void register_new_token_scores(const AttentionScoresForEachDecoderLayer& attention_scores_for_all_decoder_layers, const std::set<size_t>& skipped_logical_block_ids, size_t num_snapkv_scores = 0);
165192

166-
void register_new_token_scores(const AttentionScoresForEachDecoderLayer& attention_scores_across_decoder_layers_for_current_sequence);
193+
void register_new_token_scores(const AttentionScoresForEachDecoderLayer& attention_scores_across_decoder_layers_for_current_sequence, size_t num_snapkv_scores = 0);
167194
/**
168195
* Returns the per-layer sets of logical block indices that should be evicted according to the internally computed importance scores
169196
* and removes the corresponding blocks from the internal algorithm tracking.

src/cpp/src/continuous_batching/pipeline_impl.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -673,7 +673,7 @@ void ContinuousBatchingPipeline::ContinuousBatchingImpl::_maybe_evict_cache_bloc
673673

674674
if (skip_set.empty()) {
675675
// For now, will only register token scores from the dense attention stages
676-
cache_eviction_algo.register_new_token_scores(attention_scores_for_all_decoder_layers, skip_set);
676+
cache_eviction_algo.register_new_token_scores(attention_scores_for_all_decoder_layers, skip_set, scheduler_output.m_score_aggregation_windows.at(seq_id));
677677
}
678678

679679
auto seq_group_ptr_it = std::find_if(m_requests.begin(), m_requests.end(), [seq_id](const SequenceGroup::Ptr& val) { return val->has_sequence_with_id(seq_id); });

src/cpp/src/continuous_batching/scheduler.hpp

Lines changed: 7 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
#include "continuous_batching/timer.hpp"
1616
#include "continuous_batching/sparse_attention.hpp"
1717
#include "utils.hpp"
18+
#include "continuous_batching/cache_eviction.hpp"
1819

1920
namespace ov::genai {
2021
class Scheduler {
@@ -394,10 +395,7 @@ class Scheduler {
394395
// block tables for each running sequence within a group
395396
scheduler_output.m_block_tables[seq_id] = m_block_manager->get_block_tables(seq_id);
396397

397-
if (seq->get_generated_len() == 0) {
398-
// full prompt or its remaining tail part fit completely into the next inference
399-
scheduler_output.m_score_aggregation_windows[seq_id] = _schedule_scores_to_aggregate(sequence_group);
400-
}
398+
scheduler_output.m_score_aggregation_windows[seq_id] = _schedule_scores_to_aggregate(sequence_group);
401399
scheduler_output.m_xattention_block_size = m_config.sparse_attention_config.xattention_block_size;
402400
scheduler_output.m_xattention_stride = m_config.sparse_attention_config.xattention_stride;
403401
}
@@ -585,22 +583,13 @@ class Scheduler {
585583
}
586584

587585
size_t _schedule_scores_to_aggregate(SequenceGroup::Ptr sequence_group) {
588-
size_t prompt_len = sequence_group->get_prompt_len();
589-
size_t first_scored_token_position = m_snapkv_window_size > prompt_len ? 0 : prompt_len - m_snapkv_window_size;
590-
size_t num_scored_token_positions_in_this_chunk = 0;
591-
size_t num_processed_tokens_before_this_chunk = sequence_group->get_num_processed_tokens();
586+
auto calculator = SnapKVScoreAggregationCalculator(m_snapkv_window_size);
587+
592588
size_t num_scheduled_tokens = sequence_group->get_num_scheduled_tokens();
593-
size_t num_processed_tokens_after_this_chunk = num_processed_tokens_before_this_chunk + num_scheduled_tokens;
594-
if (num_processed_tokens_after_this_chunk > first_scored_token_position) {
595-
if (num_processed_tokens_before_this_chunk > first_scored_token_position) {
596-
num_scored_token_positions_in_this_chunk = num_scheduled_tokens;
597-
}
598-
else {
599-
num_scored_token_positions_in_this_chunk = num_processed_tokens_after_this_chunk - first_scored_token_position;
600-
}
589+
size_t num_processed_tokens = sequence_group->get_num_processed_tokens();
590+
size_t prompt_len = sequence_group->get_prompt_len();
601591

602-
}
603-
return num_scored_token_positions_in_this_chunk;
592+
return calculator.get_num_token_scores_to_aggregate(prompt_len, num_scheduled_tokens, num_processed_tokens);
604593
}
605594

606595
float _schedule_xattention_threshold(SequenceGroup::Ptr sequence_group) {

0 commit comments

Comments
 (0)