-
Notifications
You must be signed in to change notification settings - Fork 13.2k
imatrix: calculate activation-based statistics for new format (GGUF) imatrices #14891
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from 1 commit
09bc7c2
2097f03
78ddb47
9744a4a
cce514a
b7fb362
9b841eb
ee2509f
fc8f925
4c01f51
a32a2ec
4d1325e
5324558
fce05aa
be60469
a6155a8
2117c4e
90cb1be
f1c2a4c
c39c4e2
adbff66
5e40cf4
b373934
906548a
aea9b31
49996a1
4c3fea8
88854c9
030ed3c
c7959ed
3e9d53c
e0d6471
dadd90e
5bb2def
c5ecdaa
59af503
9467963
6fe51e1
dcac206
89051cd
2756617
42bfe3b
240a965
8589ef4
030ec53
d4b0d89
e3149a2
4a487ea
97d839c
d19e6c9
12607d3
a96013f
2e80323
44ea7dd
f6934b9
1f72bc1
630750f
5aca256
3e26364
69b351b
6371902
70dd25b
8f1aa78
8d0e276
7448bdb
0c3a019
63f3449
193d5bb
5932eef
a28ee30
bc38936
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -38,10 +38,12 @@ static const char * const LLM_KV_IMATRIX_CHUNK_COUNT = "imatrix.chunk_count"; | |
static const char * const LLM_KV_IMATRIX_CHUNK_SIZE = "imatrix.chunk_size"; | ||
|
||
struct Stats { | ||
std::vector<float> activations; | ||
std::vector<float> values; | ||
std::vector<int64_t> counts; | ||
}; | ||
|
||
//ToDo: rename sqract variables to be more generic like 'values' | ||
struct tensor_statistics { | ||
std::string tensor; | ||
Stats stats; | ||
|
@@ -139,14 +141,28 @@ static void compute_statistics(std::vector<tensor_statistics> & tstats, const st | |
const int row_size = e.values.size() / n_mat; | ||
|
||
std::vector<float> activations; | ||
activations.reserve(e.values.size()); | ||
|
||
for (int i = 0; i < n_mat; ++i) { | ||
for (int j = 0; j < row_size; ++j) { | ||
activations.push_back(e.values[i*row_size + j] / e.counts[i]); | ||
if (e.activations.empty()) { | ||
activations.reserve(e.values.size()); | ||
|
||
for (int i = 0; i < n_mat; ++i) { | ||
for (int j = 0; j < row_size; ++j) { | ||
activations.push_back(e.values[i*row_size + j] / e.counts[i]); | ||
} | ||
} | ||
} else { | ||
activations.reserve(e.activations.size()); | ||
|
||
for (int i = 0; i < n_mat; ++i) { | ||
for (int j = 0; j < row_size; ++j) { | ||
activations.push_back(e.activations[i*row_size + j] / e.counts[i]); | ||
} | ||
} | ||
} | ||
Comment on lines
+178
to
194
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. When the sums of activations are available, this is completely ignoring the sums of squared activations???? All of the new statistics are done over the per-channel means. This doesn't seem right. The sums of squared activations are used for quantization importance, and if they're completely ignored, then the statistics are possibly meaningless for importance purposes. The mean and mean of squared activations together should allow calculating per-channel variance and stuff like that. Not sure how to turn that into per-tensor stats, though it's likely possible somehow. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. That's correct, and by design. When available, using mean activations instead yields better statistics since the direction of the change (minus sign) is now available. The ECS stat (dot product of cossim and l2 norm), for example, correctly identifies attn_output and ffn_down as the most sensitive to quantisation. This is not possible with mean of squared activations. The idea of deriving the per-channel variance through mean and mean of squared activations is quite interesting. I'll look into it for a future release. |
||
|
||
|
||
|
||
//ToDo: rename act_ variables to be more generic like 'values' | ||
const float act_total = std::accumulate(activations.begin(), activations.end(), 0.0f); | ||
const float act_max = *std::max_element(activations.begin(), activations.end()); | ||
const float act_min = *std::min_element(activations.begin(), activations.end()); | ||
|
@@ -282,6 +298,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * | |
e.counts.resize(n_as, e.counts[0]); | ||
} | ||
if (e.values.empty()) { | ||
e.activations.resize(src1->ne[0]*n_as, 0); | ||
e.values.resize(src1->ne[0]*n_as, 0); | ||
e.counts.resize(n_as, 0); | ||
} | ||
|
@@ -313,6 +330,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * | |
e.counts[ex]++; | ||
|
||
for (int64_t j = 0; j < src1->ne[0]; ++j) { | ||
e.activations[e_start + j] += x[j]; | ||
e.values[e_start + j] += x[j] * x[j]; | ||
if (!std::isfinite((float)e.values[e_start + j])) { | ||
LOG_ERR("%f detected in %s\n", (float)e.values[e_start + j], wname.c_str()); | ||
|
@@ -338,6 +356,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * | |
const int64_t n_mat = src1->ne[2] * src1->ne[3]; | ||
|
||
if (e.values.empty()) { | ||
e.activations.resize(src1->ne[0] * n_mat, 0); | ||
e.values.resize(src1->ne[0] * n_mat, 0); | ||
e.counts.resize(n_mat, 0); | ||
} | ||
|
@@ -359,6 +378,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void * | |
const float * x = (const float *) (data + row * src1->nb[1] + i2 * src1->nb[2] + i3 * src1->ne[3]); | ||
e.counts[mat_id]++; | ||
for (int64_t j = 0; j < src1->ne[0]; ++j) { | ||
e.activations[mat_start + j] += x[j]; | ||
e.values[mat_start + j] += x[j] * x[j]; | ||
if (!std::isfinite((float)e.values[j])) { | ||
LOG_ERR("%f detected in %s\n", (float)e.values[j], wname.c_str()); | ||
|
@@ -532,6 +552,7 @@ void IMatrixCollector::save_imatrix(int32_t n_chunk) const { | |
} | ||
|
||
to_store.push_back(kv.first); | ||
data_size += GGML_PAD(ggml_tensor_overhead() + sizeof(float) * kv.second.activations.size(), GGML_MEM_ALIGN); | ||
data_size += GGML_PAD(ggml_tensor_overhead() + sizeof(float) * kv.second.values.size(), GGML_MEM_ALIGN); | ||
data_size += GGML_PAD(ggml_tensor_overhead() + sizeof(float) * kv.second.counts.size(), GGML_MEM_ALIGN); | ||
} | ||
|
@@ -584,6 +605,16 @@ void IMatrixCollector::save_imatrix(int32_t n_chunk) const { | |
|
||
gguf_add_tensor(ctx_gguf, in_sum2); | ||
gguf_add_tensor(ctx_gguf, counts); | ||
|
||
if (!stat.activations.empty()) { | ||
const int32_t nact = (int32_t) stat.activations.size(); | ||
struct ggml_tensor * in_sum = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, nact / nmat, nmat); | ||
ggml_format_name(in_sum, "%s.in_sum", name.c_str()); // ToDo: consider a better name. 'in_act' maybe? | ||
EAddario marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
for (int32_t j = 0; j < nval; ++j) { | ||
EAddario marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
((float *) in_sum->data)[j] = (float) stat.activations[j]; | ||
} | ||
gguf_add_tensor(ctx_gguf, in_sum); | ||
} | ||
} | ||
} | ||
|
||
|
@@ -722,14 +753,15 @@ bool IMatrixCollector::load_imatrix(const char * file_name) { | |
} | ||
} | ||
|
||
const std::string in_sum_suffix{ ".in_sum" }; | ||
const std::string in_sum2_suffix{ ".in_sum2" }; | ||
const std::string counts_suffix{ ".counts" }; | ||
|
||
// Could re-use m_stats instead, but this allows | ||
// checking for completeness of *each* loaded imatrix file | ||
// and also makes it easier to re-use a similar implementation in quantize.cpp | ||
// Using an ordered map to get a deterministic iteration order. | ||
std::map<std::string, std::pair<struct ggml_tensor *, struct ggml_tensor *>> sums_counts_for; | ||
std::map<std::string, std::tuple<struct ggml_tensor *, struct ggml_tensor *, struct ggml_tensor *>> sums_counts_for; | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Instead of a tuple, a small struct with named fields could be used here. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I think keeping it as a tuple simplifies the code and aids maintainability, but if this approach would be a blocker for merging, happy to change. |
||
|
||
for (struct ggml_tensor * cur = ggml_get_first_tensor(ctx); cur; cur = ggml_get_next_tensor(ctx, cur)) { | ||
std::string name = cur->name; | ||
|
@@ -738,19 +770,24 @@ bool IMatrixCollector::load_imatrix(const char * file_name) { | |
|
||
if (string_remove_suffix(name, in_sum2_suffix)) { | ||
// in_sum2 | ||
sums_counts_for[std::move(name)].first = cur; | ||
std::get<0>(sums_counts_for[std::move(name)]) = cur; | ||
} else if (string_remove_suffix(name, counts_suffix)) { | ||
// counts | ||
sums_counts_for[std::move(name)].second = cur; | ||
} else { | ||
std::get<1>(sums_counts_for[std::move(name)]) = cur; | ||
} else if (string_remove_suffix(name, in_sum_suffix)) { | ||
// in_sum | ||
std::get<2>(sums_counts_for[std::move(name)]) = cur; | ||
} | ||
else { | ||
// ignore other tensors | ||
} | ||
} | ||
|
||
for (const auto & sc : sums_counts_for) { | ||
const std::string & name = sc.first; | ||
const struct ggml_tensor * in_sum2 = sc.second.first; | ||
const struct ggml_tensor * counts = sc.second.second; | ||
const struct ggml_tensor * in_sum2 = std::get<0>(sc.second); | ||
const struct ggml_tensor * counts = std::get<1>(sc.second); | ||
const struct ggml_tensor * in_sum = std::get<2>(sc.second); | ||
EAddario marked this conversation as resolved.
Outdated
Show resolved
Hide resolved
|
||
|
||
if (!in_sum2 || !counts) { | ||
LOG_ERR("%s: mismatched sums and counts for %s\n", __func__, name.c_str()); | ||
|
@@ -764,6 +801,7 @@ bool IMatrixCollector::load_imatrix(const char * file_name) { | |
int64_t nval = ggml_nelements(in_sum2); | ||
if (e.values.empty()) { | ||
e.values.resize(nval, 0.0f); | ||
e.activations.resize(nval, 0.0f); | ||
} else if ((size_t) nval != e.values.size()) { | ||
LOG_ERR("%s: mismatched sums size for %s: %zu != %zu\n", __func__, name.c_str(), (size_t) nval, e.values.size()); | ||
gguf_free(ctx_gguf); | ||
|
@@ -791,6 +829,12 @@ bool IMatrixCollector::load_imatrix(const char * file_name) { | |
for (int64_t j = 0; j < ncounts; j++) { | ||
e.counts[j] += std::lround(((const float *) counts->data)[j]); | ||
} | ||
// ToDo: fix blow up when GGUF does not have in_sum | ||
if (in_sum->data != nullptr) { | ||
for (int64_t j = 0; j < nval; j++) { | ||
e.activations[j] += ((const float *) in_sum->data)[j]; | ||
} | ||
} | ||
} | ||
|
||
// TODO: extract into its own method; this is also used by the legacy format | ||
|
Uh oh!
There was an error while loading. Please reload this page.