@@ -1903,6 +1903,28 @@ static void llama_kv_cache_seq_shift(
    cache.head = new_head != cache.size ? new_head : 0;
}

+static void llama_kv_cache_seq_div(
+        struct llama_kv_cache & cache,
+                 llama_seq_id   seq_id,
+                    llama_pos   p0,
+                    llama_pos   p1,
+                          int   d) {
+    if (p0 < 0) p0 = 0;
+    if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
+
+    for (uint32_t i = 0; i < cache.size; ++i) {
+        if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
+            cache.has_shift = true;
+
+            {
+                llama_pos p_old = cache.cells[i].pos;
+                cache.cells[i].pos   /= d;
+                cache.cells[i].delta += cache.cells[i].pos - p_old;
+            }
+        }
+    }
+}
+
//
// model loading and saving
//
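
Note that the kernel does not rewrite the cached K tensors in place: it updates each matching cell's pos, accumulates the difference into delta, and raises cache.has_shift so the deferred RoPE re-rotation is applied on the next decode. The integer division is intentionally lossy; with d = 2, positions 0,1,2,3 collapse to 0,0,1,1, which is exactly the behavior grouped / self-extend attention schemes rely on.
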
@@ -2180,7 +2202,11 @@ struct llama_model_loader {
                type_max = type;
            }

-           // LLAMA_LOG_INFO("%s: - tensor %4d: %32s %-8s [ %s ]\n", __func__, i, name, ggml_type_name(meta->type), llama_format_tensor_shape(meta).c_str());
+           // TODO: make runtime configurable
+#if 0
+           struct ggml_tensor * meta = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i));
+           LLAMA_LOG_INFO("%s: - tensor %4d: %32s %-8s [ %s ]\n", __func__, i, ggml_get_name(meta), ggml_type_name(type), llama_format_tensor_shape(meta).c_str());
+#endif
        }

        switch (type_max) {
@@ -2196,6 +2222,8 @@ struct llama_model_loader {
            case GGML_TYPE_Q4_K:    ftype = LLAMA_FTYPE_MOSTLY_Q4_K_M;  break;
            case GGML_TYPE_Q5_K:    ftype = LLAMA_FTYPE_MOSTLY_Q5_K_M;  break;
            case GGML_TYPE_Q6_K:    ftype = LLAMA_FTYPE_MOSTLY_Q6_K;    break;
+           case GGML_TYPE_IQ2_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XXS; break;
+           case GGML_TYPE_IQ2_XS:  ftype = LLAMA_FTYPE_MOSTLY_IQ2_XS;  break;
            default:
                {
                    LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
@@ -2558,7 +2586,8 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
        case LLAMA_FTYPE_MOSTLY_Q8_0: return "Q8_0";

        // K-quants
-       case LLAMA_FTYPE_MOSTLY_Q2_K:   return "Q2_K";
+       case LLAMA_FTYPE_MOSTLY_Q2_K:   return "Q2_K - Medium";
+       case LLAMA_FTYPE_MOSTLY_Q2_K_S: return "Q2_K - Small";
        case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "Q3_K - Small";
        case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "Q3_K - Medium";
        case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "Q3_K - Large";
@@ -2567,6 +2596,8 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
        case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "Q5_K - Small";
        case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "Q5_K - Medium";
        case LLAMA_FTYPE_MOSTLY_Q6_K:   return "Q6_K";
+       case LLAMA_FTYPE_MOSTLY_IQ2_XXS:return "IQ2_XXS - 2.0625 bpw";
+       case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw";

        default: return "unknown, may not work";
    }
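
The bits-per-weight figures follow from the block layouts (assuming the usual QK_K = 256 super-blocks): an IQ2_XXS block packs 256 weights into 66 bytes (one fp16 scale plus 32 packed 16-bit entries), i.e. 66*8/256 = 2.0625 bpw, while IQ2_XS carries 8 extra bytes of per-group scales for 74 bytes, i.e. 74*8/256 = 2.3125 bpw.
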
@@ -2801,6 +2832,7 @@ static void llm_load_hparams(
                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);

                switch (hparams.n_layer) {
+                   case 24: model.type = e_model::MODEL_1B; break;
                    case 32: model.type = e_model::MODEL_3B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
                }
@@ -3117,7 +3149,15 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
    LLAMA_LOG_INFO("%s: rope_finetuned   = %s\n",     __func__, hparams.rope_finetuned ? "yes" : "unknown");
    LLAMA_LOG_INFO("%s: model type       = %s\n",     __func__, llama_model_type_name(model.type));
    LLAMA_LOG_INFO("%s: model ftype      = %s\n",     __func__, llama_model_ftype_name(model.ftype).c_str());
-   LLAMA_LOG_INFO("%s: model params     = %.2f B\n", __func__, ml.n_elements*1e-9);
+   if (ml.n_elements >= 1e12) {
+       LLAMA_LOG_INFO("%s: model params     = %.2f T\n", __func__, ml.n_elements*1e-12);
+   } else if (ml.n_elements >= 1e9) {
+       LLAMA_LOG_INFO("%s: model params     = %.2f B\n", __func__, ml.n_elements*1e-9);
+   } else if (ml.n_elements >= 1e6) {
+       LLAMA_LOG_INFO("%s: model params     = %.2f M\n", __func__, ml.n_elements*1e-6);
+   } else {
+       LLAMA_LOG_INFO("%s: model params     = %.2f K\n", __func__, ml.n_elements*1e-3);
+   }
    if (ml.n_bytes < GiB) {
        LLAMA_LOG_INFO("%s: model size       = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
    } else {
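
For example, a 124M-parameter model is now reported as "model params = 124.00 M" rather than the previous "0.12 B", and sub-million models fall through to the K branch.
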
@@ -4772,7 +4812,6 @@ struct llm_build_context {
        const int64_t n_embd_head = hparams.n_embd_head_v;
        const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-       GGML_ASSERT(n_embd_gqa == n_embd);

        struct ggml_tensor * cur;
        struct ggml_tensor * inpL;
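
The dropped assertion required n_embd_gqa == n_embd, i.e. as many KV heads as query heads. Removing it here and in the analogous graph builders below lets these architectures load grouped-query-attention variants, where hparams.n_embd_v_gqa() is smaller than n_embd.
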
@@ -4896,7 +4935,6 @@ struct llm_build_context {
        const int64_t n_embd_head = hparams.n_embd_head_v;
        const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-       GGML_ASSERT(n_embd_gqa == n_embd);

        struct ggml_tensor * cur;
        struct ggml_tensor * pos;
@@ -4995,9 +5033,7 @@ struct llm_build_context {
        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

        const int64_t n_embd_head = hparams.n_embd_head_v;
-       const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-       GGML_ASSERT(n_embd_gqa == n_embd);

        const int64_t n_rot = n_embd_head_k / 2;
@@ -5209,9 +5245,7 @@ struct llm_build_context {
        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

        const int64_t n_embd_head = hparams.n_embd_head_v;
-       const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-       GGML_ASSERT(n_embd_gqa == n_embd);

        struct ggml_tensor * cur;
        struct ggml_tensor * inpL;
@@ -5304,7 +5338,6 @@ struct llm_build_context {
        const int64_t n_embd_head = hparams.n_embd_head_v;
        const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-       GGML_ASSERT(n_embd_gqa == n_embd);

        struct ggml_tensor * cur;
        struct ggml_tensor * inpL;
@@ -5400,7 +5433,6 @@ struct llm_build_context {
        const int64_t n_embd_head = hparams.n_embd_head_v;
        const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-       GGML_ASSERT(n_embd_gqa == n_embd);

        struct ggml_tensor * cur;
        struct ggml_tensor * inpL;
@@ -5727,7 +5759,6 @@ struct llm_build_context {
        const int64_t n_embd_head = hparams.n_embd_head_v;
        const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-       GGML_ASSERT(n_embd_gqa == n_embd);

        struct ggml_tensor * cur;
        struct ggml_tensor * attn_norm_output;
@@ -5951,7 +5982,6 @@ struct llm_build_context {
        const int64_t n_embd_head = hparams.n_embd_head_v;
        const int64_t n_embd_gqa  = hparams.n_embd_v_gqa();
        GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
-       GGML_ASSERT(n_embd_gqa == n_embd);

        struct ggml_tensor * cur;
        struct ggml_tensor * pos;
@@ -8926,10 +8956,13 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
            // TODO: explore better strategies
            new_type = GGML_TYPE_Q8_0;
        }
-   } else if (name.find("ffn_down.weight") != std::string::npos) {
+   } else if (name.find("ffn_down") != std::string::npos) {
        if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+       else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
+           if (qs.i_feed_forward_w2 < qs.n_feed_forward_w2/8) new_type = GGML_TYPE_Q4_K;
+       }
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
-           new_type = qs.i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K
+           new_type = qs.i_feed_forward_w2 < qs.n_feed_forward_w2/16 ? GGML_TYPE_Q5_K
                     : arch != LLM_ARCH_FALCON || use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2) ? GGML_TYPE_Q4_K
                     : GGML_TYPE_Q3_K;
        }
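
Tying the threshold to the layer count keeps the old behavior for 7B-class models while scaling up for deeper ones: with 32 ffn_down tensors, n_feed_forward_w2/16 = 2 reproduces the previous hard-coded "< 2", whereas an 80-layer 70B model now promotes its first 80/16 = 5 layers instead of a fixed 2.
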
@@ -8938,14 +8971,14 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
            if (arch == LLM_ARCH_FALCON) {
-               new_type = qs.i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K :
+               new_type = qs.i_feed_forward_w2 < qs.n_feed_forward_w2/16 ? GGML_TYPE_Q6_K :
                           use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
            } else {
                if (use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
            }
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
-       else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && qs.i_feed_forward_w2 < 4) {
+       else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && qs.i_feed_forward_w2 < qs.n_feed_forward_w2/8) {
            new_type = GGML_TYPE_Q5_K;
        }
        ++qs.i_feed_forward_w2;
@@ -8963,9 +8996,10 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
        else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
    }
-   else if (name.find("ffn_gate.weight") != std::string::npos || name.find("ffn_up.weight") != std::string::npos) {
-       if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
-   }
+   // IK: let's remove this, else Q2_K is almost the same as Q3_K_S
+   //else if (name.find("ffn_gate") != std::string::npos || name.find("ffn_up") != std::string::npos) {
+   //    if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+   //}
    // This can be used to reduce the size of the Q5_K_S model.
    // The associated PPL increase is fully in line with the size reduction
    //else {
@@ -9014,6 +9048,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s

        // K-quants
        case LLAMA_FTYPE_MOSTLY_Q2_K:   quantized_type = GGML_TYPE_Q2_K; break;
+       case LLAMA_FTYPE_MOSTLY_Q2_K_S: quantized_type = GGML_TYPE_Q2_K; break;
        case LLAMA_FTYPE_MOSTLY_Q3_K_S:
        case LLAMA_FTYPE_MOSTLY_Q3_K_M:
        case LLAMA_FTYPE_MOSTLY_Q3_K_L: quantized_type = GGML_TYPE_Q3_K; break;
@@ -9022,6 +9057,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
        case LLAMA_FTYPE_MOSTLY_Q5_K_S:
        case LLAMA_FTYPE_MOSTLY_Q5_K_M: quantized_type = GGML_TYPE_Q5_K; break;
        case LLAMA_FTYPE_MOSTLY_Q6_K:   quantized_type = GGML_TYPE_Q6_K; break;
+       case LLAMA_FTYPE_MOSTLY_IQ2_XXS:quantized_type = GGML_TYPE_IQ2_XXS; break;
+       case LLAMA_FTYPE_MOSTLY_IQ2_XS: quantized_type = GGML_TYPE_IQ2_XS;  break;

        default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
    }
@@ -9070,7 +9107,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
        if (name.find("attn_v.weight") != std::string::npos || name.find("attn_qkv.weight") != std::string::npos) {
            ++qs.n_attention_wv;
        }
-       else if (name.find("ffn_down.weight") != std::string::npos) {
+       else if (name.find("ffn_down") != std::string::npos) {
            ++qs.n_feed_forward_w2;
        }
    }
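
Matching on "ffn_down" rather than "ffn_down.weight" also counts per-expert tensors (for example ffn_down.0.weight in Mixtral-style MoE models; the exact naming is an assumption here), so MoE feed-forward tensors participate in the same quantization rules as dense ones.
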
@@ -10146,9 +10183,21 @@ void llama_kv_cache_seq_keep(struct llama_context * ctx, llama_seq_id seq_id) {
}

void llama_kv_cache_seq_shift(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) {
+   if (delta == 0) {
+       return;
+   }
+
    llama_kv_cache_seq_shift(ctx->kv_self, seq_id, p0, p1, delta);
}

+void llama_kv_cache_seq_div(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
+   if (d == 1) {
+       return;
+   }
+
+   llama_kv_cache_seq_div(ctx->kv_self, seq_id, p0, p1, d);
+}
+
// Returns the *maximum* size of the state
size_t llama_get_state_size(const struct llama_context * ctx) {
    // we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state.
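
A minimal sketch of driving the new entry point from application code; seq, n_past, and the divisor are made-up values, and the caller-side bookkeeping is an assumption rather than part of this diff:

    // compress all cached positions of sequence `seq` by a factor of 2
    // (assumes n_past > 0 tokens have already been decoded)
    const llama_seq_id seq = 0;
    llama_kv_cache_seq_div(ctx, seq, 0, n_past, 2);  // 0,1,2,3,... -> 0,0,1,1,...
    n_past = (n_past - 1)/2 + 1;                     // next position after the largest compressed one
    // the deferred RoPE re-rotation is materialized inside the next llama_decode(ctx, batch)
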
@@ -10881,7 +10930,7 @@ void llama_print_timings(struct llama_context * ctx) {
            __func__, timings.t_p_eval_ms, timings.n_p_eval, timings.t_p_eval_ms / timings.n_p_eval, 1e3 / timings.t_p_eval_ms * timings.n_p_eval);
    LLAMA_LOG_INFO("%s:        eval time = %10.2f ms / %5d runs   (%8.2f ms per token, %8.2f tokens per second)\n",
            __func__, timings.t_eval_ms, timings.n_eval, timings.t_eval_ms / timings.n_eval, 1e3 / timings.t_eval_ms * timings.n_eval);
-   LLAMA_LOG_INFO("%s:       total time = %10.2f ms\n", __func__, (timings.t_end_ms - timings.t_start_ms));
+   LLAMA_LOG_INFO("%s:       total time = %10.2f ms / %5d tokens\n", __func__, (timings.t_end_ms - timings.t_start_ms), (timings.n_p_eval + timings.n_eval));
}

void llama_reset_timings(struct llama_context * ctx) {