Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -60,3 +60,40 @@ It can be enabled by setting the `CacheEvictionConfig.apply_rotation` field to `
* Cache rotation is only targeted for the regular, linear LLaMa-like RoPE application and may degrade accuracy on models that use other RoPE schemes.

* Cache rotation is currently only supported for the models with uniform V embedding sizes across the layers.

## (Optional) KVCrush

KVCrush enhances the standard H2O/SnapKV eviction by selecting the most representative blocks from the evictable area using clustering analysis, rather than simply evicting the lowest-scoring blocks.

### Algorithm Overview

1. **Indicator Creation**: Generate binary indicators for tokens based on importance scores
2. **Anchor Point Generation**: Create reference patterns using configurable modes
3. **Distance Calculation**: Measure Hamming distance between block patterns and the anchor point
4. **Representative Selection**: Select blocks to best represent context diversity

### Configuration
Set up the KVCrush config parameters and pass them to `CacheEvictionConfig`. The sample code below allocates a KVCrush budget of 2 blocks and uses the `MEAN` anchor point mode.
```cpp
const ov::genai::CacheEvictionConfig EXAMPLE_CACHE_EVICTION_CONFIG =
{32, 32, 192, ov::genai::AggregationMode::NORM_SUM, false, 8, KVCrushConfig(2, KVCrushAnchorPointMode::MEAN)};
```
```python
CacheEvictionConfig(
start_size=32,
recent_size=128,
max_cache_size=448,
aggregation_mode=AggregationMode.NORM_SUM,
apply_rotation=False,
snapkv_window_size=8,
kvcrush_config=KVCrushConfig(budget=2, anchor_point_mode=KVCrushAnchorPointMode.MEAN)
)
```

**Anchor Point Modes:**
- `RANDOM`: Random binary pattern
- `ZEROS`: All zeros pattern
- `ONES`: All ones pattern
- `MEAN`: Per-position majority value (mean above 0.5) of the indicators across blocks
- `ALTERNATE`: Alternating 0-1 pattern

70 changes: 68 additions & 2 deletions src/cpp/include/openvino/genai/cache_eviction.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,22 +19,83 @@ enum class AggregationMode {
* of a given token in cache */
};

/**
 * @brief Represents the mode of how anchor points are formed in the KVCrush cache eviction algorithm.
 *
 * The anchor point is a binary vector that the per-block indicator patterns are compared against.
 */
enum class KVCrushAnchorPointMode {
RANDOM, /**< In this mode the anchor point is a random binary vector of 0s and 1s */
ZEROS, /**< In this mode the anchor point is a vector of 0s */
ONES, /**< In this mode the anchor point is a vector of 1s */
MEAN, /**< In this mode the anchor point is a binary vector where each individual value is decided
based on the majority value of the indicators at that position across the blocks */
ALTERNATE /**< In this mode the anchor point is a vector of alternating 0s and 1s */
};

/**
 * @class KVCrushConfig
 * @brief Configuration class for the KVCrush cache eviction mechanism.
 *
 * This class encapsulates the configuration parameters for KVCrush:
 * the cache budget, the anchor point mode and the random seed.
 */
class KVCrushConfig {
public:
    /** @brief Constructs a configuration with a zero budget, RANDOM anchor point mode and seed 0. */
    KVCrushConfig() = default;

    /**
     * @brief Constructs a KVCrushConfig with the specified parameters.
     * @param budget_ The cache budget, representing the number of blocks to store.
     * @param anchor_point_mode_ The anchor point mode for KVCrush (see KVCrushAnchorPointMode).
     * @param rng_seed_ Optional random seed for reproducibility (default is 0).
     */
    KVCrushConfig(std::size_t budget_, KVCrushAnchorPointMode anchor_point_mode_, std::size_t rng_seed_ = 0)
        : budget(budget_),
          anchor_point_mode(anchor_point_mode_),
          rng_seed(rng_seed_) {}

    /** KVCrush cache budget - number of blocks. */
    std::size_t budget = 0;

    /** KVCrush anchor point mode. */
    KVCrushAnchorPointMode anchor_point_mode = KVCrushAnchorPointMode::RANDOM;

    /** Seed of the random number generator, for reproducibility. */
    std::size_t rng_seed = 0;

    /** @return The KVCrush cache budget (number of blocks). */
    std::size_t get_budget() const {
        return budget;
    }
};

/**
* @brief Configuration struct for the cache eviction algorithm.
*/
class CacheEvictionConfig {
public:
CacheEvictionConfig() = default;

CacheEvictionConfig(size_t start_size, size_t recent_size, size_t max_cache_size, AggregationMode aggregation_mode_, bool apply_rotation_ = false, size_t snapkv_window_size_ = 8) : aggregation_mode(aggregation_mode_), apply_rotation(apply_rotation_), snapkv_window_size(snapkv_window_size_), m_start_size(start_size), m_recent_size(recent_size), m_max_cache_size(max_cache_size) {
/**
 * @brief Constructs a cache eviction configuration and validates the size parameters.
 *
 * @param start_size Number of tokens in the "start" area of the KV cache. Must be non-zero.
 * @param recent_size Number of tokens in the "recent" area of the KV cache. Must be non-zero.
 * @param max_cache_size Overall size in tokens that the start, evictable and recent areas add up to.
 *        Must be larger than start_size + recent_size; the remainder becomes the evictable size.
 * @param aggregation_mode_ Value stored in the aggregation_mode field.
 * @param apply_rotation_ Value stored in the apply_rotation field (default is false).
 * @param snapkv_window_size_ Value stored in the snapkv_window_size field (default is 8).
 * @param kvcrush_config_ KVCrush configuration; the default has a zero budget.
 */
CacheEvictionConfig(size_t start_size,
                    size_t recent_size,
                    size_t max_cache_size,
                    AggregationMode aggregation_mode_,
                    bool apply_rotation_ = false,
                    size_t snapkv_window_size_ = 8,
                    const KVCrushConfig& kvcrush_config_ = KVCrushConfig(0, KVCrushAnchorPointMode::RANDOM))
    // Initializers follow the member declaration order: kvcrush_config is a public field declared
    // before the private m_* members, so listing it after them would only trigger -Wreorder.
    : aggregation_mode(aggregation_mode_),
      apply_rotation(apply_rotation_),
      snapkv_window_size(snapkv_window_size_),
      kvcrush_config(kvcrush_config_),
      m_start_size(start_size),
      m_recent_size(recent_size),
      m_max_cache_size(max_cache_size) {
    OPENVINO_ASSERT(start_size, "CacheEvictionConfig.start_size must be non-zero");
    OPENVINO_ASSERT(recent_size, "CacheEvictionConfig.recent_size must be non-zero");
    OPENVINO_ASSERT(max_cache_size, "CacheEvictionConfig.max_cache_size must be non-zero");

    OPENVINO_ASSERT(max_cache_size > (start_size + recent_size),
                    "CacheEvictionConfig.max_cache_size must be larger than CacheEvictionConfig.start_size + CacheEvictionConfig.recent_size");
    // Whatever is left between the start and recent areas is the evictable area.
    m_evictable_size = m_max_cache_size - m_start_size - m_recent_size;
}

/** @return Number of tokens between the "start" and "recent" areas of KV cache that
Expand Down Expand Up @@ -76,6 +137,11 @@ class CacheEvictionConfig {
* score aggregation. **/
size_t snapkv_window_size = 8;

/** KVCrush configuration for this cache eviction algorithm.
 * KVCrush is an additional mechanism that allows retaining some blocks in the cache
 * even if they are not among the most important ones. */
KVCrushConfig kvcrush_config;

private:
/** Number of tokens in the *beginning* of KV cache that should be retained
* in the KV cache for this sequence during generation. Must be non-zero and a multiple of the KV cache block size for
Expand Down
34 changes: 33 additions & 1 deletion src/cpp/src/continuous_batching/cache_eviction.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -220,7 +220,7 @@ namespace ov::genai {
CacheEvictionAlgorithm::CacheEvictionAlgorithm(const CacheEvictionConfig &eviction_config, size_t block_size,
size_t num_decoder_layers, size_t max_pool_window_size) :
m_eviction_config(eviction_config), m_block_size(block_size), m_num_decoder_layers(num_decoder_layers),
m_score_manager(block_size, num_decoder_layers, max_pool_window_size, eviction_config.aggregation_mode, eviction_config.get_start_size() / block_size, eviction_config.snapkv_window_size)
m_score_manager(block_size, num_decoder_layers, max_pool_window_size, eviction_config.aggregation_mode, eviction_config.get_start_size() / block_size, eviction_config.snapkv_window_size), m_kvcrush_algo(eviction_config.kvcrush_config, block_size)
{
OPENVINO_ASSERT(!(m_eviction_config.get_start_size() % m_block_size),
"CacheEvictionConfig.start_size in tokens must be a multiple of block size ", m_block_size);
Expand Down Expand Up @@ -265,6 +265,38 @@ namespace ov::genai {
size_t num_blocks_to_evict = get_num_blocks_to_evict(decoder_layer_idx);
auto evicted_block_indices = get_indices_of_blocks_to_evict(scores_for_all_evictable_blocks, num_blocks_to_evict);

// KVCrush: start
// KVCrush is applied only when a non-zero budget is configured and the score-based selection
// produced at least as many eviction candidates as that budget.
bool should_apply_kvcrush = m_eviction_config.kvcrush_config.budget > 0 &&
                            m_eviction_config.kvcrush_config.budget <= evicted_block_indices.size();
if (should_apply_kvcrush) {
    const size_t num_tokens_in_evictable_blocks = m_block_size * scores_for_all_evictable_blocks.size();

    // Ask KVCrush which of the eviction candidates should stay in the cache after all.
    const auto blocks_to_keep = m_kvcrush_algo.get_indices_of_blocks_to_retain_using_kvcrush(
        num_tokens_in_evictable_blocks,
        evicted_block_indices,
        m_score_manager.get_scores()[decoder_layer_idx]);

    if (!blocks_to_keep.empty()) {
        // Only the retained indices go into a set; the candidate list is scanned once against it.
        const std::unordered_set<std::size_t> keep_lookup(blocks_to_keep.begin(), blocks_to_keep.end());

        // Collect, in their original order, the candidates that KVCrush did not retain.
        std::vector<std::size_t> still_evicted;
        still_evicted.reserve(evicted_block_indices.size());
        for (const auto& candidate : evicted_block_indices) {
            if (keep_lookup.count(candidate) == 0) {
                still_evicted.push_back(candidate);
            }
        }

        evicted_block_indices = std::move(still_evicted);
    }
}
// KVCrush: end

m_num_evicted_tokens += evicted_block_indices.size() * m_block_size;

// No longer need to track the overall "heavy-hitter" attention scores for freshly evicted blocks
Expand Down
2 changes: 2 additions & 0 deletions src/cpp/src/continuous_batching/cache_eviction.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#include "openvino/openvino.hpp"
#include "continuous_batching/attention_output.hpp"
#include "openvino/genai/cache_eviction.hpp"
#include "continuous_batching/kvcrush.hpp"

namespace ov::genai {

Expand Down Expand Up @@ -215,6 +216,7 @@ class CacheEvictionAlgorithm {
void remove_scores_of_evicted_blocks(const std::vector<std::size_t>& evicted_block_indices, size_t decoder_layer_idx);

CacheEvictionConfig m_eviction_config;
KVCrushAlgorithm m_kvcrush_algo;
std::size_t m_block_size;
std::size_t m_num_evicted_tokens = 0;
std::size_t m_num_decoder_layers;
Expand Down
176 changes: 176 additions & 0 deletions src/cpp/src/continuous_batching/kvcrush.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,176 @@
// Copyright (C) 2023-2025 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#include "continuous_batching/kvcrush.hpp"

#include <algorithm>
#include <numeric>
#include <random>
#include <utility>
#include <vector>
namespace ov::genai {

/**
 * @brief Stores the KVCrush configuration and block size and seeds the random engine.
 * @param kvcrush_config KVCrush configuration; its rng_seed is used to seed the engine.
 * @param block_size Value stored as the block size used by the algorithm steps.
 */
KVCrushAlgorithm::KVCrushAlgorithm(const KVCrushConfig& kvcrush_config, size_t block_size)
    : m_kvcrush_config(kvcrush_config),
      m_block_size(block_size),
      // The seed is converted to the engine's seed type explicitly instead of implicitly.
      rng(std::mt19937(static_cast<std::mt19937::result_type>(kvcrush_config.rng_seed))) {}

// step 1: create_indicators_kvcrush()
std::vector<size_t> KVCrushAlgorithm::create_indicators_kvcrush(size_t num_tokens_in_evictable_blocks,

std::vector<size_t>& evicted_block_indices,
const std::vector<double>& layer_scores) {
// Step 1: Sort the scores of the blocks to be evicted
const auto& blocks_eligible_for_kvcrush = evicted_block_indices;
std::vector<size_t> indices(num_tokens_in_evictable_blocks);
std::iota(indices.begin(), indices.end(), 0);
std::partial_sort(indices.begin(),
indices.begin() + num_tokens_in_evictable_blocks / 2,
indices.end(),
[&](size_t i, size_t j) {
return layer_scores[i] > layer_scores[j];
});

std::vector<size_t> indicators(num_tokens_in_evictable_blocks, 0);
for (size_t i = 0; i < num_tokens_in_evictable_blocks / 2; ++i) {
indicators[indices[i]] = 1;
}
return indicators;
}
// step 2: create_anchor_point_kvcrush()
/**
 * @brief Creates the binary anchor point of block size length for the configured anchor point mode.
 *
 * @param num_tokens_in_evictable_blocks Number of tokens covered by indicators; only read in MEAN mode.
 * @param indicators Per-token indicators; only read in MEAN mode.
 * @return Anchor point vector with one value (0 or 1) per position inside a block.
 */
std::vector<size_t> KVCrushAlgorithm::create_anchor_point_kvcrush(size_t num_tokens_in_evictable_blocks,
                                                                  std::vector<size_t>& indicators) {
    // The vector starts out as all zeros; each mode only writes what differs from that.
    std::vector<size_t> anchor_point(m_block_size);

    switch (m_kvcrush_config.anchor_point_mode) {
    case KVCrushAnchorPointMode::RANDOM: {
        // One draw from the shared engine per position, in position order.
        std::uniform_int_distribution<int> bit_distribution(0, 1);
        for (auto& bit : anchor_point) {
            bit = bit_distribution(rng);
        }
        break;
    }
    case KVCrushAnchorPointMode::ZEROS: {
        anchor_point.assign(m_block_size, 0);
        break;
    }
    case KVCrushAnchorPointMode::ONES: {
        anchor_point.assign(m_block_size, 1);
        break;
    }
    case KVCrushAnchorPointMode::MEAN: {
        const size_t block_count = num_tokens_in_evictable_blocks / m_block_size;
        for (size_t offset = 0; offset < m_block_size; ++offset) {
            // Count how many blocks have indicator 1 at this in-block position.
            size_t ones_at_offset = 0;
            for (size_t block = 0; block < block_count; ++block) {
                ones_at_offset += indicators[block * m_block_size + offset];
            }

            // The position becomes 1 only when the mean over the blocks exceeds 0.5.
            const double mean = static_cast<double>(ones_at_offset) / block_count;
            if (mean > 0.5) {
                anchor_point[offset] = 1;
            } else {
                anchor_point[offset] = 0;
            }
        }
        break;
    }
    case KVCrushAnchorPointMode::ALTERNATE: {
        // Pattern 0, 1, 0, 1, ...
        for (size_t position = 0; position < m_block_size; ++position) {
            anchor_point[position] = (position % 2 == 0) ? 0 : 1;
        }
        break;
    }
    default:
        OPENVINO_THROW("Invalid anchor point type");
    }

    return anchor_point;
}

// step 3: calculate_hamming_distance()
/**
 * @brief Calculates the Hamming distance between the anchor point and the indicators of each block.
 *
 * Only whole blocks are considered: the number of blocks is num_tokens_in_evictable_blocks
 * divided by the block size, rounded down.
 *
 * @param num_tokens_in_evictable_blocks Number of tokens covered by indicators.
 * @param indicators Per-token indicators, laid out block after block.
 * @param anchor_point Anchor point with one value per position inside a block.
 * @return One pair<hamming_distance, block_idx> per block, in ascending block_idx order.
 */
std::vector<std::pair<size_t, size_t>> KVCrushAlgorithm::calculate_hamming_distance_kvcrush(
    size_t num_tokens_in_evictable_blocks,
    std::vector<size_t>& indicators,
    std::vector<size_t>& anchor_point) {
    const size_t num_blocks = num_tokens_in_evictable_blocks / m_block_size;
    std::vector<std::pair<size_t, size_t>> block_distances;  // pair<hamming_distance, block_idx>
    block_distances.reserve(num_blocks);

    for (size_t block_idx = 0; block_idx < num_blocks; ++block_idx) {
        // Every token of a whole block lies below num_tokens_in_evictable_blocks, so no
        // per-token bounds check is needed here.
        const size_t block_begin = block_idx * m_block_size;
        size_t hamming_distance = 0;
        for (size_t j = 0; j < m_block_size; ++j) {
            // Compare as size_t on both sides to avoid a signed/unsigned comparison.
            if (indicators[block_begin + j] != anchor_point[j]) {
                ++hamming_distance;
            }
        }
        block_distances.emplace_back(hamming_distance, block_idx);
    }
    return block_distances;
}

// step 4: get_representative_blocks()
/**
 * @brief Selects the blocks that KVCrush retains out of the blocks eligible for it.
 *
 * The eligible blocks are ordered by their Hamming distance and up to the configured budget of
 * them is picked at a uniform step over that ordering.
 *
 * @param num_tokens_in_evictable_blocks Not used by this step.
 * @param block_distances pair<hamming_distance, block_idx> entries; the entry of block i is
 *        expected at position i, because distances are looked up by block index.
 * @param blocks_eligible_for_kvcrush Indices of the blocks that may be retained.
 * @return Indices of the retained blocks; empty when no block is eligible or the budget is 0.
 */
std::vector<std::size_t> KVCrushAlgorithm::get_representative_blocks_kvcrush(
    [[maybe_unused]] size_t num_tokens_in_evictable_blocks,
    std::vector<std::pair<size_t, size_t>>& block_distances,
    const std::vector<size_t>& blocks_eligible_for_kvcrush) {
    // Membership table indexed by block index, so that the eligibility check per block is O(1)
    // instead of a linear search through blocks_eligible_for_kvcrush.
    std::vector<char> is_eligible(block_distances.size(), 0);
    for (const size_t block_idx : blocks_eligible_for_kvcrush) {
        if (block_idx < is_eligible.size()) {
            is_eligible[block_idx] = 1;
        }
    }

    // Keep only the block indices that are eligible for KVCrush
    std::vector<size_t> filtered_block_indices;
    filtered_block_indices.reserve(block_distances.size());
    for (const auto& entry : block_distances) {
        const size_t block_idx = entry.second;
        if (block_idx < is_eligible.size() && is_eligible[block_idx] != 0) {
            filtered_block_indices.push_back(block_idx);
        }
    }

    // Sort filtered_block_indices based on Hamming distance
    std::sort(filtered_block_indices.begin(), filtered_block_indices.end(), [&](size_t a, size_t b) {
        return block_distances[a].first < block_distances[b].first;
    });

    std::vector<std::size_t> kvcrush_retained_block_indices;
    const size_t num_blocks_to_retain = std::min(filtered_block_indices.size(), m_kvcrush_config.get_budget());
    if (num_blocks_to_retain == 0) {
        // Nothing eligible or zero budget: return early instead of dividing by zero below.
        return kvcrush_retained_block_indices;
    }

    // Select num_blocks_to_retain blocks from filtered_block_indices, uniformly spaced.
    // i * step stays below filtered_block_indices.size() since step is the rounded-down quotient.
    const size_t step = filtered_block_indices.size() / num_blocks_to_retain;
    kvcrush_retained_block_indices.reserve(num_blocks_to_retain);
    for (size_t i = 0; i < num_blocks_to_retain; ++i) {
        kvcrush_retained_block_indices.push_back(filtered_block_indices[i * step]);
    }

    return kvcrush_retained_block_indices;
}

/**
 * @brief Runs the four KVCrush steps and returns the indices of the blocks to retain.
 *
 * @param num_tokens_in_evictable_blocks Number of tokens passed on to every step.
 * @param evicted_block_indices Blocks selected by the score-based eviction; only these are
 *        eligible for being retained by KVCrush.
 * @param layer_scores Per-token scores used to create the indicators.
 * @return Indices of the blocks that KVCrush retains.
 */
std::vector<std::size_t> KVCrushAlgorithm::get_indices_of_blocks_to_retain_using_kvcrush(
    size_t num_tokens_in_evictable_blocks,
    std::vector<std::size_t>& evicted_block_indices,
    const std::vector<double>& layer_scores) {
    // Step 1: binary feature vector (indicators) with one entry per token, derived from the scores
    auto token_indicators =
        create_indicators_kvcrush(num_tokens_in_evictable_blocks, evicted_block_indices, layer_scores);

    // Step 2: anchor point for the configured anchor point mode
    auto anchor = create_anchor_point_kvcrush(num_tokens_in_evictable_blocks, token_indicators);

    // Step 3: Hamming distance of each block's indicators to the anchor point
    auto distances = calculate_hamming_distance_kvcrush(num_tokens_in_evictable_blocks, token_indicators, anchor);

    // Step 4: representative blocks, chosen only among the blocks picked by the score-based eviction
    return get_representative_blocks_kvcrush(num_tokens_in_evictable_blocks, distances, evicted_block_indices);
}

} // namespace ov::genai
Loading
Loading