
Commit 00b7a4b

talk-llama : sync llama.cpp
1 parent 04b0a76 commit 00b7a4b

File tree: 2 files changed, +85 −22 lines changed


examples/talk-llama/llama.cpp

Lines changed: 71 additions & 22 deletions
@@ -1903,6 +1903,28 @@ static void llama_kv_cache_seq_shift(
     cache.head = new_head != cache.size ? new_head : 0;
 }
 
+static void llama_kv_cache_seq_div(
+        struct llama_kv_cache & cache,
+        llama_seq_id seq_id,
+        llama_pos p0,
+        llama_pos p1,
+        int d) {
+    if (p0 < 0) p0 = 0;
+    if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
+
+    for (uint32_t i = 0; i < cache.size; ++i) {
+        if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
+            cache.has_shift = true;
+
+            {
+                llama_pos p_old = cache.cells[i].pos;
+                cache.cells[i].pos /= d;
+                cache.cells[i].delta += cache.cells[i].pos - p_old;
+            }
+        }
+    }
+}
+
 //
 // model loading and saving
 //
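
The new helper above compresses the cached positions of a sequence by integer division and records the per-cell delta so the later K-shift pass can re-rotate RoPEd data. A minimal standalone sketch of that arithmetic (illustration only, not part of the commit), assuming d = 2 and a sequence occupying positions 0..7:

#include <cstdio>

int main() {
    const int d = 2;
    for (int pos = 0; pos < 8; ++pos) {
        const int p_new = pos / d;        // integer division, as in llama_kv_cache_seq_div
        const int delta = p_new - pos;    // accumulated into the cell's delta for the later K shift
        printf("pos %d -> %d (delta %d)\n", pos, p_new, delta);
    }
    return 0;
}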
@@ -2180,7 +2202,11 @@ struct llama_model_loader {
                 type_max = type;
             }
 
-            // LLAMA_LOG_INFO("%s: - tensor %4d: %32s %-8s [ %s ]\n", __func__, i, name, ggml_type_name(meta->type), llama_format_tensor_shape(meta).c_str());
+            // TODO: make runtime configurable
+#if 0
+            struct ggml_tensor * meta = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i));
+            LLAMA_LOG_INFO("%s: - tensor %4d: %32s %-8s [ %s ]\n", __func__, i, ggml_get_name(meta), ggml_type_name(type), llama_format_tensor_shape(meta).c_str());
+#endif
         }
 
         switch (type_max) {
@@ -2196,6 +2222,8 @@ struct llama_model_loader {
             case GGML_TYPE_Q4_K: ftype = LLAMA_FTYPE_MOSTLY_Q4_K_M; break;
             case GGML_TYPE_Q5_K: ftype = LLAMA_FTYPE_MOSTLY_Q5_K_M; break;
             case GGML_TYPE_Q6_K: ftype = LLAMA_FTYPE_MOSTLY_Q6_K; break;
+            case GGML_TYPE_IQ2_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XXS; break;
+            case GGML_TYPE_IQ2_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XS; break;
             default:
                 {
                     LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
@@ -2558,7 +2586,8 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_Q8_0: return "Q8_0";
 
         // K-quants
-        case LLAMA_FTYPE_MOSTLY_Q2_K: return "Q2_K";
+        case LLAMA_FTYPE_MOSTLY_Q2_K: return "Q2_K - Medium";
+        case LLAMA_FTYPE_MOSTLY_Q2_K_S: return "Q2_K - Small";
         case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "Q3_K - Small";
         case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "Q3_K - Medium";
         case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "Q3_K - Large";
@@ -2567,6 +2596,8 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "Q5_K - Small";
         case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "Q5_K - Medium";
         case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K";
+        case LLAMA_FTYPE_MOSTLY_IQ2_XXS:return "IQ2_XSS - 2.0625 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw";
 
         default: return "unknown, may not work";
     }
@@ -2801,6 +2832,7 @@ static void llm_load_hparams(
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
 
                 switch (hparams.n_layer) {
+                    case 24: model.type = e_model::MODEL_1B; break;
                     case 32: model.type = e_model::MODEL_3B; break;
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
@@ -3117,7 +3149,15 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown");
     LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type));
     LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str());
-    LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, ml.n_elements*1e-9);
+    if (ml.n_elements >= 1e12) {
+        LLAMA_LOG_INFO("%s: model params = %.2f T\n", __func__, ml.n_elements*1e-12);
+    } else if (ml.n_elements >= 1e9) {
+        LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, ml.n_elements*1e-9);
+    } else if (ml.n_elements >= 1e6) {
+        LLAMA_LOG_INFO("%s: model params = %.2f M\n", __func__, ml.n_elements*1e-6);
+    } else {
+        LLAMA_LOG_INFO("%s: model params = %.2f K\n", __func__, ml.n_elements*1e-3);
+    }
     if (ml.n_bytes < GiB) {
         LLAMA_LOG_INFO("%s: model size = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
     } else {
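
As a worked example (with an assumed element count, not taken from the diff): for ml.n_elements = 1 102 053 888, the new branch falls into the 1e9 tier and prints "model params = 1.10 B", while a 600-million-parameter model now prints "600.00 M" instead of the old "0.60 B".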
@@ -4772,7 +4812,6 @@ struct llm_build_context {
         const int64_t n_embd_head = hparams.n_embd_head_v;
         const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-        GGML_ASSERT(n_embd_gqa == n_embd);
 
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
@@ -4896,7 +4935,6 @@ struct llm_build_context {
         const int64_t n_embd_head = hparams.n_embd_head_v;
         const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-        GGML_ASSERT(n_embd_gqa == n_embd);
 
         struct ggml_tensor * cur;
         struct ggml_tensor * pos;
@@ -4995,9 +5033,7 @@ struct llm_build_context {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
         const int64_t n_embd_head = hparams.n_embd_head_v;
-        const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-        GGML_ASSERT(n_embd_gqa == n_embd);
 
         const int64_t n_rot = n_embd_head_k / 2;
 
@@ -5209,9 +5245,7 @@ struct llm_build_context {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
 
         const int64_t n_embd_head = hparams.n_embd_head_v;
-        const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-        GGML_ASSERT(n_embd_gqa == n_embd);
 
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
@@ -5304,7 +5338,6 @@ struct llm_build_context {
         const int64_t n_embd_head = hparams.n_embd_head_v;
         const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-        GGML_ASSERT(n_embd_gqa == n_embd);
 
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
@@ -5400,7 +5433,6 @@ struct llm_build_context {
         const int64_t n_embd_head = hparams.n_embd_head_v;
         const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-        GGML_ASSERT(n_embd_gqa == n_embd);
 
         struct ggml_tensor * cur;
         struct ggml_tensor * inpL;
@@ -5727,7 +5759,6 @@ struct llm_build_context {
         const int64_t n_embd_head = hparams.n_embd_head_v;
         const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-        GGML_ASSERT(n_embd_gqa == n_embd);
 
         struct ggml_tensor * cur;
         struct ggml_tensor * attn_norm_output;
@@ -5951,7 +5982,6 @@ struct llm_build_context {
         const int64_t n_embd_head = hparams.n_embd_head_v;
         const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
         GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-        GGML_ASSERT(n_embd_gqa == n_embd);
 
         struct ggml_tensor * cur;
         struct ggml_tensor * pos;
@@ -8926,10 +8956,13 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
             // TODO: explore better strategies
             new_type = GGML_TYPE_Q8_0;
         }
-    } else if (name.find("ffn_down.weight") != std::string::npos) {
+    } else if (name.find("ffn_down") != std::string::npos) {
         if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
+            if (qs.i_feed_forward_w2 < qs.n_feed_forward_w2/8) new_type = GGML_TYPE_Q4_K;
+        }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
-            new_type = qs.i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K
+            new_type = qs.i_feed_forward_w2 < qs.n_feed_forward_w2/16 ? GGML_TYPE_Q5_K
                      : arch != LLM_ARCH_FALCON || use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2) ? GGML_TYPE_Q4_K
                      : GGML_TYPE_Q3_K;
         }
@@ -8938,14 +8971,14 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
             if (arch == LLM_ARCH_FALCON) {
-                new_type = qs.i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K :
+                new_type = qs.i_feed_forward_w2 < qs.n_feed_forward_w2/16 ? GGML_TYPE_Q6_K :
                            use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
             } else {
                 if (use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
             }
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && qs.i_feed_forward_w2 < 4) {
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && qs.i_feed_forward_w2 < qs.n_feed_forward_w2/8) {
             new_type = GGML_TYPE_Q5_K;
         }
         ++qs.i_feed_forward_w2;
@@ -8963,9 +8996,10 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
     }
-    else if (name.find("ffn_gate.weight") != std::string::npos || name.find("ffn_up.weight") != std::string::npos) {
-        if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
-    }
+    // IK: let's remove this, else Q2_K is almost the same as Q3_K_S
+    //else if (name.find("ffn_gate") != std::string::npos || name.find("ffn_up") != std::string::npos) {
+    //    if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+    //}
     // This can be used to reduce the size of the Q5_K_S model.
     // The associated PPL increase is fully in line with the size reduction
     //else {
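
The selection logic above replaces fixed layer cut-offs (i_feed_forward_w2 < 2, < 4) with cut-offs expressed as a fraction of the layer count, so the same rule scales across model sizes. A simplified standalone sketch of that pattern for the Q3_K_M ffn_down case (illustration only; pick_ffn_down_type is hypothetical and the Falcon/use_more_bits branches are omitted), assuming ggml.h is available:

#include "ggml.h"

// Hypothetical helper mirroring the fraction-based rule used above for Q3_K_M:
// the first 1/16 of the ffn_down tensors are promoted to Q5_K, the rest use Q4_K.
static enum ggml_type pick_ffn_down_type(int i_ffn_down, int n_ffn_down) {
    return i_ffn_down < n_ffn_down/16 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
}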
@@ -9014,6 +9048,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
         // K-quants
         case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = GGML_TYPE_Q2_K; break;
+        case LLAMA_FTYPE_MOSTLY_Q2_K_S: quantized_type = GGML_TYPE_Q2_K; break;
         case LLAMA_FTYPE_MOSTLY_Q3_K_S:
         case LLAMA_FTYPE_MOSTLY_Q3_K_M:
         case LLAMA_FTYPE_MOSTLY_Q3_K_L: quantized_type = GGML_TYPE_Q3_K; break;
@@ -9022,6 +9057,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         case LLAMA_FTYPE_MOSTLY_Q5_K_S:
         case LLAMA_FTYPE_MOSTLY_Q5_K_M: quantized_type = GGML_TYPE_Q5_K; break;
         case LLAMA_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_TYPE_Q6_K; break;
+        case LLAMA_FTYPE_MOSTLY_IQ2_XXS:quantized_type = GGML_TYPE_IQ2_XXS; break;
+        case LLAMA_FTYPE_MOSTLY_IQ2_XS :quantized_type = GGML_TYPE_IQ2_XS; break;
 
         default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
     }
@@ -9070,7 +9107,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         if (name.find("attn_v.weight") != std::string::npos || name.find("attn_qkv.weight") != std::string::npos) {
             ++qs.n_attention_wv;
         }
-        else if (name.find("ffn_down.weight") != std::string::npos) {
+        else if (name.find("ffn_down") != std::string::npos) {
             ++qs.n_feed_forward_w2;
         }
     }
@@ -10146,9 +10183,21 @@ void llama_kv_cache_seq_keep(struct llama_context * ctx, llama_seq_id seq_id) {
 }
 
 void llama_kv_cache_seq_shift(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) {
+    if (delta == 0) {
+        return;
+    }
+
     llama_kv_cache_seq_shift(ctx->kv_self, seq_id, p0, p1, delta);
 }
 
+void llama_kv_cache_seq_div(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
+    if (d == 1) {
+        return;
+    }
+
+    llama_kv_cache_seq_div(ctx->kv_self, seq_id, p0, p1, d);
+}
+
 // Returns the *maximum* size of the state
 size_t llama_get_state_size(const struct llama_context * ctx) {
     // we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state.
@@ -10881,7 +10930,7 @@ void llama_print_timings(struct llama_context * ctx) {
             __func__, timings.t_p_eval_ms, timings.n_p_eval, timings.t_p_eval_ms / timings.n_p_eval, 1e3 / timings.t_p_eval_ms * timings.n_p_eval);
     LLAMA_LOG_INFO("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
             __func__, timings.t_eval_ms, timings.n_eval, timings.t_eval_ms / timings.n_eval, 1e3 / timings.t_eval_ms * timings.n_eval);
-    LLAMA_LOG_INFO("%s: total time = %10.2f ms\n", __func__, (timings.t_end_ms - timings.t_start_ms));
+    LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (timings.t_end_ms - timings.t_start_ms), (timings.n_p_eval + timings.n_eval));
 }
 
 void llama_reset_timings(struct llama_context * ctx) {

examples/talk-llama/llama.h

Lines changed: 14 additions & 0 deletions
@@ -103,6 +103,9 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_Q5_K_S = 16, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q5_K_M = 17, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_Q6_K = 18, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ2_XXS = 19, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_IQ2_XS = 20, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q2_K_S = 21, // except 1d tensors
 
         LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
     };
@@ -484,6 +487,17 @@ extern "C" {
             llama_pos p1,
             llama_pos delta);
 
+    // Integer division of the positions by factor of `d > 1`
+    // If the KV cache is RoPEd, the KV data is updated accordingly
+    // p0 < 0 : [0, p1]
+    // p1 < 0 : [p0, inf)
+    LLAMA_API void llama_kv_cache_seq_div(
+            struct llama_context * ctx,
+            llama_seq_id seq_id,
+            llama_pos p0,
+            llama_pos p1,
+            int d);
+
     //
     // State / sessions
     //
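
A minimal usage sketch of the new call (not part of the diff), assuming ctx is an initialized llama_context that already holds decoded tokens for sequence 0; the factor 4 is an arbitrary illustrative choice:

// Divide all cached positions of sequence 0 by 4; per the header comment above,
// a RoPEd KV cache has its K data updated accordingly.
llama_kv_cache_seq_div(ctx, 0, /*p0=*/0, /*p1=*/-1, /*d=*/4);

// The existing shift API remains available alongside it, e.g. shifting the
// positions from 16 onward back by 16 after discarding the first 16 tokens:
llama_kv_cache_seq_shift(ctx, 0, /*p0=*/16, /*p1=*/-1, /*delta=*/-16);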
