1
+ // Copyright (C) 2023-2025 Intel Corporation
2
+ // SPDX-License-Identifier: Apache-2.0
3
+
4
+ #include " continuous_batching/kvcrush.hpp"
5
+
6
+ #include < random>
7
+ namespace ov ::genai {
8
+
9
+ KVCrushAlgorithm::KVCrushAlgorithm (const KVCrushConfig& kvcrush_config, size_t block_size)
10
+ : m_kvcrush_config(kvcrush_config),
11
+ m_block_size (block_size),
12
+ rng(std::mt19937(kvcrush_config.rng_seed)) {}
13
+
14
+ // step 1: create_indicators_kvcrush()
15
+ std::vector<size_t > KVCrushAlgorithm::create_indicators_kvcrush (size_t num_tokens_in_evictable_blocks,
16
+
17
+ std::vector<size_t >& evicted_block_indices,
18
+ const std::vector<double >& layer_scores) {
19
+ // Step 1: Sort the scores of the blocks to be evicted
20
+ const auto & blocks_eligible_for_kvcrush = evicted_block_indices;
21
+ std::vector<size_t > indices (num_tokens_in_evictable_blocks);
22
+ std::iota (indices.begin (), indices.end (), 0 );
23
+ std::partial_sort (indices.begin (),
24
+ indices.begin () + num_tokens_in_evictable_blocks / 2 ,
25
+ indices.end (),
26
+ [&](size_t i, size_t j) {
27
+ return layer_scores[i] > layer_scores[j];
28
+ });
29
+
30
+ std::vector<size_t > indicators (num_tokens_in_evictable_blocks, 0 );
31
+ for (size_t i = 0 ; i < num_tokens_in_evictable_blocks / 2 ; ++i) {
32
+ indicators[indices[i]] = 1 ;
33
+ }
34
+ return indicators;
35
+ }
36
+ // step 2: create_anchor_point_kvcrush()
37
+ std::vector<size_t > KVCrushAlgorithm::create_anchor_point_kvcrush (size_t num_tokens_in_evictable_blocks,
38
+
39
+ std::vector<size_t >& indicators) {
40
+ // Step 2: Create a binary vector of size block_size as anchor point
41
+ std::vector<size_t > anchor_point (m_block_size);
42
+ // Initialize anchor_point based on anchor using switch-case
43
+ switch (m_kvcrush_config.anchor_point_mode ) {
44
+ case KVCrushAnchorPointMode::RANDOM: {
45
+ std::uniform_int_distribution<int > dist (0 , 1 );
46
+ std::generate (anchor_point.begin (), anchor_point.end (), [&]() {
47
+ return dist (rng);
48
+ });
49
+ } break ;
50
+ case KVCrushAnchorPointMode::ZEROS:
51
+ std::fill (anchor_point.begin (), anchor_point.end (), 0 );
52
+ break ;
53
+ case KVCrushAnchorPointMode::ONES:
54
+ std::fill (anchor_point.begin (), anchor_point.end (), 1 );
55
+ break ;
56
+ case KVCrushAnchorPointMode::MEAN: {
57
+ size_t num_blocks = num_tokens_in_evictable_blocks / m_block_size;
58
+ for (size_t pos = 0 ; pos < m_block_size; pos++) {
59
+ // Calculate sum of indicators at this position across all blocks
60
+ size_t sum = 0 ;
61
+ for (size_t block_idx = 0 ; block_idx < num_blocks; block_idx++) {
62
+ size_t idx = block_idx * m_block_size + pos;
63
+ sum += indicators[idx];
64
+ }
65
+
66
+ // Calculate mean and set anchor point based on threshold (0.5)
67
+ double mean = static_cast <double >(sum) / num_blocks;
68
+ anchor_point[pos] = (mean > 0.5 ) ? 1 : 0 ;
69
+ }
70
+ break ;
71
+ }
72
+ case KVCrushAnchorPointMode::ALTERNATE:
73
+ for (size_t i = 0 ; i < m_block_size; ++i) {
74
+ anchor_point[i] = i % 2 ;
75
+ }
76
+ break ;
77
+ default :
78
+ OPENVINO_THROW (" Invalid anchor point type" );
79
+ }
80
+ return anchor_point;
81
+ }
82
+
83
+ // step 3: calculate_hamming_distance()
84
+ std::vector<std::pair<size_t , size_t >> KVCrushAlgorithm::calculate_hamming_distance_kvcrush (
85
+ size_t num_tokens_in_evictable_blocks,
86
+
87
+ std::vector<size_t >& indicators,
88
+ std::vector<size_t >& anchor_point) {
89
+ // Step 3: Calculate Hamming distances between anchor point and each block
90
+ size_t num_blocks = num_tokens_in_evictable_blocks / m_block_size;
91
+ std::vector<std::pair<size_t , size_t >> block_distances; // pair<hamming_distance, block_idx>
92
+ block_distances.reserve (num_blocks);
93
+
94
+ for (size_t block_idx = 0 ; block_idx < num_blocks; ++block_idx) {
95
+ size_t hamming_distance = 0 ;
96
+ for (size_t j = 0 ; j < m_block_size; ++j) {
97
+ size_t token_idx = block_idx * m_block_size + j;
98
+ if (token_idx < num_tokens_in_evictable_blocks) {
99
+ // Use the indicators vector to determine the bit value of this position
100
+ int bit_value = indicators[token_idx];
101
+ if (bit_value != anchor_point[j]) {
102
+ hamming_distance++;
103
+ }
104
+ }
105
+ }
106
+ block_distances.emplace_back (hamming_distance, block_idx);
107
+ }
108
+ return block_distances;
109
+ }
110
+
111
+ // step 4: get_representative_blocks()
112
+ std::vector<std::size_t > KVCrushAlgorithm::get_representative_blocks_kvcrush (
113
+
114
+ size_t num_tokens_in_evictable_blocks,
115
+ std::vector<std::pair<size_t , size_t >>& block_distances,
116
+ const std::vector<size_t >& blocks_eligible_for_kvcrush) {
117
+ // Step 4: Find the representative blocks
118
+ // Filter block indices that are in blocks_eligible_for_kvcrush
119
+ std::vector<size_t > filtered_block_indices;
120
+ filtered_block_indices.reserve (block_distances.size ());
121
+
122
+ for (const auto & entry : block_distances) {
123
+ size_t block_idx = entry.second ;
124
+ // Check if block_idx is in blocks_eligible_for_kvcrush
125
+ if (std::find (blocks_eligible_for_kvcrush.begin (), blocks_eligible_for_kvcrush.end (), block_idx) !=
126
+ blocks_eligible_for_kvcrush.end ()) {
127
+ filtered_block_indices.push_back (block_idx);
128
+ }
129
+ }
130
+ // Sort filtered_block_indices based on Hamming distance
131
+ std::sort (filtered_block_indices.begin (), filtered_block_indices.end (), [&](size_t a, size_t b) {
132
+ return block_distances[a].first < block_distances[b].first ;
133
+ });
134
+ // select kvcrush_budget number of blocks from filtered_block_indices, uniformly spaced
135
+ size_t num_blocks_to_retain = std::min (filtered_block_indices.size (), m_kvcrush_config.get_budget ());
136
+ size_t step = filtered_block_indices.size () / num_blocks_to_retain;
137
+ std::vector<std::size_t > kvcrush_retained_block_indices;
138
+ kvcrush_retained_block_indices.reserve (num_blocks_to_retain);
139
+ for (size_t i = 0 ; i < num_blocks_to_retain; ++i) {
140
+ size_t idx = i * step;
141
+ if (idx < filtered_block_indices.size ()) {
142
+ kvcrush_retained_block_indices.push_back (filtered_block_indices[idx]);
143
+ }
144
+ }
145
+
146
+ return kvcrush_retained_block_indices;
147
+ }
148
+
149
+ std::vector<std::size_t > KVCrushAlgorithm::get_indices_of_blocks_to_retain_using_kvcrush (
150
+
151
+ size_t num_tokens_in_evictable_blocks,
152
+ std::vector<std::size_t >& evicted_block_indices,
153
+ const std::vector<double >& layer_scores) {
154
+ // step 1: Create indicators_kvcrush makes binary feature vectors based on top-k/2 scores
155
+ const auto & blocks_eligible_for_kvcrush = evicted_block_indices; // only the blocks that are evicted by the score
156
+ // based eviction are eligible for kvcrush
157
+
158
+ std::vector<size_t > indicators =
159
+ create_indicators_kvcrush (num_tokens_in_evictable_blocks, evicted_block_indices, layer_scores);
160
+
161
+ // Step 2: Create anchor_point based on the selected anchor point type
162
+ std::vector<size_t > anchor_point = create_anchor_point_kvcrush (num_tokens_in_evictable_blocks, indicators);
163
+
164
+ // Step 3: Calculate Hamming distances between anchor point and each block, where each block is represented by
165
+ // its binary feature vector called indicators
166
+ std::vector<std::pair<size_t , size_t >> block_distances =
167
+ calculate_hamming_distance_kvcrush (num_tokens_in_evictable_blocks, indicators, anchor_point);
168
+
169
+ // Step 4: Find the representative blocks
170
+ // Filter block indices that are in blocks_eligible_for_kvcrush
171
+ return get_representative_blocks_kvcrush (num_tokens_in_evictable_blocks,
172
+ block_distances,
173
+ blocks_eligible_for_kvcrush);
174
+ }
175
+
176
+ } // namespace ov::genai
0 commit comments