@@ -1792,8 +1792,9 @@ struct llama_model_loader {
1792
1792
1793
1793
llama_model_loader (const std::string & fname, bool use_mmap) : file(fname.c_str(), " rb" ) {
1794
1794
struct gguf_init_params params = {
1795
- /* .no_alloc = */ true ,
1796
- /* .ctx = */ &ctx_meta,
1795
+ /* .no_alloc = */ true ,
1796
+ /* .ctx = */ &ctx_meta,
1797
+ /* .extra_tensors = */ 1 ,
1797
1798
};
1798
1799
1799
1800
ctx_gguf = gguf_init_from_file (fname.c_str (), params);
@@ -2100,6 +2101,25 @@ struct llama_model_loader {
2100
2101
done_size += ggml_nbytes (cur);
2101
2102
}
2102
2103
}
2104
+
2105
+ // must be called before calc_sizes
2106
+ void clone_tensor (const char * src_name, const char * dst_name) {
2107
+ int src_idx = gguf_find_tensor (ctx_gguf, src_name);
2108
+ GGML_ASSERT (src_idx >= 0 );
2109
+
2110
+ struct ggml_tensor * src = ggml_get_tensor (ctx_meta, src_name);
2111
+ size_t src_offset = gguf_get_tensor_offset (ctx_gguf, src_idx);
2112
+
2113
+ struct ggml_tensor * cur = ggml_new_tensor (ctx_meta, src->type , src->n_dims , src->ne );
2114
+ GGML_ASSERT (cur);
2115
+
2116
+ ggml_set_name (cur, dst_name);
2117
+ gguf_add_tensor (ctx_gguf, cur);
2118
+ gguf_set_tensor_offset (ctx_gguf, n_tensors, src_offset);
2119
+ n_tensors++;
2120
+ n_elements += ggml_nelements (cur);
2121
+ n_bytes += ggml_nbytes (cur);
2122
+ }
2103
2123
};
2104
2124
2105
2125
//
@@ -2666,6 +2686,11 @@ static void llm_load_tensors(
2666
2686
2667
2687
model.n_gpu_layers = n_gpu_layers;
2668
2688
2689
+ // MPT output is tied to (same as) wte in original model
2690
+ if (model.arch == LLM_ARCH_MPT) {
2691
+ ml.clone_tensor (" token_embd.weight" , " output.weight" );
2692
+ }
2693
+
2669
2694
size_t ctx_size;
2670
2695
size_t mmapped_size;
2671
2696
0 commit comments