openvinotoolkit
diff --git a/‎site/docs/concepts/optimization-techniques/kvcache-eviction-algorithm.md‎
Lines changed: 83 additions & 0 deletions b/‎site/docs/concepts/optimization-techniques/kvcache-eviction-algorithm.md‎
Lines changed: 83 additions & 0 deletions
diff --git a/‎src/cpp/include/openvino/genai/cache_eviction.hpp‎
Lines changed: 68 additions & 2 deletions b/‎src/cpp/include/openvino/genai/cache_eviction.hpp‎
Lines changed: 68 additions & 2 deletions
diff --git a/‎src/cpp/src/continuous_batching/cache_eviction.cpp‎
Lines changed: 33 additions & 1 deletion b/‎src/cpp/src/continuous_batching/cache_eviction.cpp‎
Lines changed: 33 additions & 1 deletion
diff --git a/‎src/cpp/src/continuous_batching/cache_eviction.hpp‎
Lines changed: 2 additions & 0 deletions b/‎src/cpp/src/continuous_batching/cache_eviction.hpp‎
Lines changed: 2 additions & 0 deletions
@@ -60,3 +60,86 @@ It can be enabled by setting the `CacheEvictionConfig.apply_rotation` field to `
 * Cache rotation is only targeted for the regular, linear LLaMa-like RoPE application and may degrade accuracy on models that use other RoPE schemes.
 
 * Cache rotation is currently only supported for the models with uniform V embedding sizes across the layers.
+
+## (Optional) KVCrush
+
+KVCrush enhances the standard H2O/SnapKV eviction by selecting the most representative blocks from the evictable area using clustering analysis, rather than simply evicting the lowest-scoring blocks.
+
+### Algorithm Overview
+
+1. **Indicator Creation**: Generate binary indicators for tokens based on importance scores
+2. **Anchor Point Generation**: Create reference patterns using configurable modes
+3. **Distance Calculation**: Measure Hamming distance between block patterns and the anchor point
+4. **Representative Selection**: Select blocks to best represent context diversity
+
+### Configuration
+Set up the KVCrush config parameters and pass them to `CacheEvictionConfig`. The following sample code allocates a KVCrush budget of 2 blocks and uses the `MEAN` anchor point mode.
+```cpp
+const ov::genai::CacheEvictionConfig EXAMPLE_CACHE_EVICTION_CONFIG =
+    {32, 32, 192, ov::genai::AggregationMode::NORM_SUM, false, 8, ov::genai::KVCrushConfig(2, ov::genai::KVCrushAnchorPointMode::MEAN)};
+```
+```python
+CacheEvictionConfig(
+        start_size=32, 
+        recent_size=128, 
+        max_cache_size=448, 
+        aggregation_mode=AggregationMode.NORM_SUM,
+        apply_rotation=False,
+        snapkv_window_size=8,
+        kvcrush_config=KVCrushConfig(budget=2, anchor_point_mode=KVCrushAnchorPointMode.MEAN)
+    )
+```
+
+**Anchor Point Modes:**
+- `RANDOM`: Random binary pattern
+- `ZEROS`: All zeros pattern  
+- `ONES`: All ones pattern
+- `MEAN`: Mean of indicators across blocks
+- `ALTERNATE`: Alternating 0-1 pattern
+
+### Performance Comparison on LongBench
+
+**Note:** Values in **`this style`** indicate performance equal to or better than the "512, 0" configuration.
+
+#### H2O
+The following table shows accuracy results comparing standard H2O eviction with KVCrush. 
+
+Configuration format: H2O budget (tokens), KVCrush budget (blocks), Anchor Point
+
+| Configuration | qasper | samsum | trec |
+|---------------|--------|--------|------|
+| **FP16 (baseline)** | 21.43 | 34.83 | 1.00 |
+| **512, 0** | 12.40 | 34.39 | 0.50 |
+| **384, 128/32, MEAN** | **`12.91`** | 34.15 | **`0.50`** |
+| **384, 128/32, ALTERNATE** | **`12.55`** | **`34.39`** | **`0.50`** |
+| **384, 128/32, RANDOM** | 12.25 | 34.16 | **`0.50`** |
+| **480, 32/32, MEAN** | **`12.54`** | 33.79 | **`1.00`** |
+| **480, 32/32, ALTERNATE** | **`12.49`** | **`34.59`** | **`1.00`** |
+| **480, 32/32, RANDOM** | 12.37 | **`34.83`** | **`0.50`** |
+| **448, 64/32, MEAN** | **`12.85`** | **`34.61`** | **`1.00`** |
+| **448, 64/32, ALTERNATE** | **`12.61`** | **`34.41`** | **`1.00`** |
+| **448, 64/32, RANDOM** | **`12.43`** | 34.38 | **`1.00`** |
+| **KVCrush - Best** | **`12.91`** | **`34.83`** | **`1.00`** |
+
+#### SnapKV
+The following table shows accuracy results comparing standard SnapKV eviction with KVCrush.
+
+Configuration format: SnapKV budget (tokens), KVCrush budget (blocks), Anchor Point
+
+| Configuration | qasper | samsum | trec |
+|---------------|--------|--------|------|
+| **FP16 (baseline)** | 21.43 | 34.83 | 0.50 |
+| **512, 0** | 12.33 | 34.21 | 1.00 |
+| **384, 128/32, MEAN** | **`12.78`** | **`34.32`** | **`1.00`** |
+| **384, 128/32, ALTERNATE** | 11.87 | **`34.42`** | **`1.00`** |
+| **384, 128/32, RANDOM** | **`12.66`** | 34.05 | 0.50 |
+| **480, 32/32, MEAN** | **`12.97`** | 34.12 | 0.50 |
+| **480, 32/32, ALTERNATE** | **`13.14`** | **`34.22`** | 0.50 |
+| **480, 32/32, RANDOM** | **`13.01`** | **`34.40`** | 0.50 |
+| **448, 64/32, MEAN** | **`12.83`** | **`34.69`** | 0.50 |
+| **448, 64/32, ALTERNATE** | **`13.57`** | **`34.55`** | **`1.00`** |
+| **448, 64/32, RANDOM** | **`13.38`** | **`34.26`** | **`1.00`** |
+| **KVCrush - Best** | **`13.57`** | **`34.69`** | **`1.00`** |
+
+
+
@@ -19,14 +19,76 @@ enum class AggregationMode {
                 * of a given token in cache */
 };
 
+/**
+ * @brief Represents the mode in which anchor points are formed in the KVCrush cache eviction algorithm.
+ */
+enum class KVCrushAnchorPointMode {
+    RANDOM, /**< In this mode the anchor point is a random binary vector of 0s and 1s */
+    ZEROS,  /**< In this mode the anchor point is a vector of 0s */
+    ONES,   /**< In this mode the anchor point is a vector of 1s */
+    MEAN, /**< In this mode the anchor point is a binary vector of 0s and 1s, where individual values are decided
+             based on the majority value */
+    ALTERNATE /**< In this mode the anchor point is a vector of alternating 0s and 1s */
+};
+
+class KVCrushConfig {
+public:
+    /**
+     * @class KVCrushConfig
+     * @brief Configuration class for the KVCrush cache eviction algorithm.
+     *
+     * This class encapsulates the configuration parameters for KVCrush:
+     * the cache budget (in blocks), the anchor point mode, and the random seed.
+     */
+    /**
+     * @brief Constructs a KVCrushConfig with the default values: a budget of 0 blocks,
+     * the RANDOM anchor point mode and a random seed of 0.
+     */
+
+    KVCrushConfig() = default;
+
+    /**
+     * @brief Constructs a KVCrushConfig with the specified parameters.
+     * @param budget_ The cache budget, representing the number of blocks to store.
+     * @param anchor_point_mode_ The anchor point mode for KVCrush (see KVCrushAnchorPointMode).
+     * @param rng_seed_ Optional random seed for reproducibility (default is 0).
+     */
+
+    KVCrushConfig(size_t budget_, KVCrushAnchorPointMode anchor_point_mode_, size_t rng_seed_ = 0)
+        : budget(budget_),
+          anchor_point_mode(anchor_point_mode_),
+          rng_seed(rng_seed_) {}
+
+    /** KVCrush cache budget, expressed as a number of blocks. The eviction algorithm only applies
+     * KVCrush when this value is greater than zero. */
+    std::size_t budget = 0;
+    /** KVCrush anchor point mode (see KVCrushAnchorPointMode). */
+    KVCrushAnchorPointMode anchor_point_mode = KVCrushAnchorPointMode::RANDOM;
+    size_t rng_seed = 0;  /**< Random seed for reproducibility. */
+    std::size_t get_budget() const {  // Returns the KVCrush cache budget (number of blocks).
+        return budget;
+    }
+};
+
 /**
 * @brief Configuration struct for the cache eviction algorithm.
 */
 class CacheEvictionConfig {
 public:
     CacheEvictionConfig() = default;
 
-    CacheEvictionConfig(size_t start_size, size_t recent_size, size_t max_cache_size, AggregationMode aggregation_mode_, bool apply_rotation_ = false, size_t snapkv_window_size_ = 8) : aggregation_mode(aggregation_mode_), apply_rotation(apply_rotation_), snapkv_window_size(snapkv_window_size_), m_start_size(start_size), m_recent_size(recent_size), m_max_cache_size(max_cache_size) {
+    CacheEvictionConfig(size_t start_size,
+                        size_t recent_size,
+                        size_t max_cache_size,
+                        AggregationMode aggregation_mode_,
+                        bool apply_rotation_ = false,
+                        size_t snapkv_window_size_ = 8,
+                        const KVCrushConfig& kvcrush_config_ = KVCrushConfig(0, KVCrushAnchorPointMode::RANDOM))
+        : aggregation_mode(aggregation_mode_),
+          apply_rotation(apply_rotation_),
+          snapkv_window_size(snapkv_window_size_),
+          m_start_size(start_size),
+          m_recent_size(recent_size),
+          m_max_cache_size(max_cache_size),
+          kvcrush_config(kvcrush_config_) {
         OPENVINO_ASSERT(start_size, "CacheEvictionConfig.start_size must be non-zero");
         OPENVINO_ASSERT(recent_size, "CacheEvictionConfig.recent_size must be non-zero");
         OPENVINO_ASSERT(max_cache_size, "CacheEvictionConfig.max_cache_size must be non-zero");
@@ -35,7 +97,6 @@ class CacheEvictionConfig {
         OPENVINO_ASSERT(max_cache_size > (start_size + recent_size),
                         "CacheEvictionConfig.max_cache_size must be larger than CacheEvictionConfig.start_size + CacheEvictionConfig.recent_size");
         m_evictable_size = m_max_cache_size - m_start_size - m_recent_size;
-
     }
 
     /** @return Number of tokens between the "start" and "recent" areas of KV cache that
@@ -76,6 +137,11 @@ class CacheEvictionConfig {
      * following the SnapKV article approach (https://arxiv.org/abs/2404.14469). **/
     size_t snapkv_window_size = 8;
 
+    /** KVCrush configuration for this cache eviction algorithm.
+     * KVCrush is an additional mechanism that allows some tokens to be retained in the cache
+     * even if they are not among the most important ones. */
+    KVCrushConfig kvcrush_config;
+
 private:
     /** Number of tokens in the *beginning* of KV cache that should be retained
      * in the KV cache for this sequence during generation. Must be non-zero and a multiple of the KV cache block size for
 
@@ -191,7 +191,7 @@ namespace ov::genai {
     CacheEvictionAlgorithm::CacheEvictionAlgorithm(const CacheEvictionConfig &eviction_config, size_t block_size,
                                                    size_t num_decoder_layers, size_t max_pool_window_size) :
             m_eviction_config(eviction_config), m_block_size(block_size), m_num_decoder_layers(num_decoder_layers),
-            m_score_manager(block_size, num_decoder_layers, max_pool_window_size, eviction_config.aggregation_mode, eviction_config.get_start_size() / block_size)
+            m_score_manager(block_size, num_decoder_layers, max_pool_window_size, eviction_config.aggregation_mode, eviction_config.get_start_size() / block_size), m_kvcrush_algo(eviction_config.kvcrush_config, block_size)
     {
             OPENVINO_ASSERT(!(m_eviction_config.get_start_size() % m_block_size),
                             "CacheEvictionConfig.start_size in tokens must be a multiple of block size ", m_block_size);
@@ -236,6 +236,38 @@ namespace ov::genai {
             size_t num_blocks_to_evict = get_num_blocks_to_evict(decoder_layer_idx);
             auto evicted_block_indices = get_indices_of_blocks_to_evict(scores_for_all_evictable_blocks, num_blocks_to_evict);
 
+            // KVCrush: start
+            bool should_apply_kvcrush = (m_eviction_config.kvcrush_config.budget > 0) &&
+                                        (evicted_block_indices.size() >= m_eviction_config.kvcrush_config.budget);
+            if (should_apply_kvcrush) {
+                size_t num_tokens_in_evictable_blocks = scores_for_all_evictable_blocks.size() * m_block_size;
+
+                auto kvcrush_retained_block_indices = m_kvcrush_algo.get_indices_of_blocks_to_retain_using_kvcrush(
+                    num_tokens_in_evictable_blocks,
+                    evicted_block_indices,
+                    m_score_manager.get_scores()[decoder_layer_idx]);
+
+                // Remove the indices in kvcrush_retained_block_indices from evicted_block_indices
+                if (!kvcrush_retained_block_indices.empty()) {
+                    // Convert both vectors to sets for efficient operations
+                    std::unordered_set<std::size_t> retained_set(kvcrush_retained_block_indices.begin(),
+                                                                 kvcrush_retained_block_indices.end());
+
+                    // Create a new vector containing only elements not in retained_set
+                    std::vector<std::size_t> filtered_evicted_indices;
+                    filtered_evicted_indices.reserve(evicted_block_indices.size());
+
+                    for (const auto& idx : evicted_block_indices) {
+                        if (retained_set.find(idx) == retained_set.end()) {
+                            filtered_evicted_indices.push_back(idx);
+                        }
+                    }
+                    // Replace the original vector with the filtered one
+                    evicted_block_indices = std::move(filtered_evicted_indices);
+                }
+            }
+            // KVCrush: end
+
             m_num_evicted_tokens += evicted_block_indices.size() * m_block_size;
 
             // No longer need to track the overall "heavy-hitter" attention scores for freshly evicted blocks
 
@@ -11,6 +11,7 @@
 #include "openvino/openvino.hpp"
 #include "continuous_batching/attention_output.hpp"
 #include "openvino/genai/cache_eviction.hpp"
+#include "continuous_batching/kvcrush.hpp"
 
 namespace ov::genai {
 
@@ -188,6 +189,7 @@ class CacheEvictionAlgorithm {
     void remove_scores_of_evicted_blocks(const std::vector<std::size_t>& evicted_block_indices, size_t decoder_layer_idx);
 
     CacheEvictionConfig m_eviction_config;
+    KVCrushAlgorithm m_kvcrush_algo;
     std::size_t m_block_size;
     std::size_t m_num_evicted_tokens = 0;
     std::size_t m_num_decoder_layers;