
Commit 0cbcb60

cmp-nct authored and ggerganov committed
llava : support for Yi-VL and fix for mobileVLM (ggml-org#5093)
* Support for Yi-VL, templating fix for mobileVLM
* ws
* Update examples/llava/clip.cpp
Co-authored-by: Georgi Gerganov <[email protected]>
* Update llava-cli.cpp
* Update clip.cpp
bugfix for new conversions
---------
Co-authored-by: Georgi Gerganov <[email protected]>
1 parent b8bdfb3 commit 0cbcb60

File tree

2 files changed: +92, -10 lines changed


examples/llava/clip.cpp

Lines changed: 63 additions & 7 deletions
@@ -98,6 +98,7 @@ static std::string format(const char * fmt, ...) {
 
 enum projector_type {
     PROJECTOR_TYPE_MLP,
+    PROJECTOR_TYPE_MLP_NORM,
     PROJECTOR_TYPE_LDP,
     PROJECTOR_TYPE_UNKNOWN,
 };
@@ -304,10 +305,18 @@ struct clip_vision_model {
     struct ggml_tensor * projection;
 
     // LLaVA projection
-    struct ggml_tensor * mm_0_w;
-    struct ggml_tensor * mm_0_b;
-    struct ggml_tensor * mm_2_w;
-    struct ggml_tensor * mm_2_b;
+    struct ggml_tensor * mm_0_w = NULL;
+    struct ggml_tensor * mm_0_b = NULL;
+    struct ggml_tensor * mm_2_w = NULL;
+    struct ggml_tensor * mm_2_b = NULL;
+
+    // Yi type models with mlp+normalization projection
+    struct ggml_tensor * mm_1_w = NULL; // Yi type models have 0, 1, 3, 4
+    struct ggml_tensor * mm_1_b = NULL;
+    struct ggml_tensor * mm_3_w = NULL;
+    struct ggml_tensor * mm_3_b = NULL;
+    struct ggml_tensor * mm_4_w = NULL;
+    struct ggml_tensor * mm_4_b = NULL;
 
     // MobileVLM projection
     struct ggml_tensor * mm_model_mlp_1_w;
@@ -460,6 +469,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     // pre-layernorm
     {
         embeddings = ggml_norm(ctx0, embeddings, eps);
+        ggml_set_name(embeddings, "pre_ln");
 
         embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.pre_ln_w), model.pre_ln_b);
     }
@@ -575,6 +585,27 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
 
         embeddings = ggml_mul_mat(ctx0, model.mm_2_w, embeddings);
         embeddings = ggml_add(ctx0, embeddings, model.mm_2_b);
+
+    } else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
+        embeddings = ggml_mul_mat(ctx0, model.mm_0_w, embeddings);
+        embeddings = ggml_add(ctx0, embeddings, model.mm_0_b);
+        // ggml_tensor_printf(embeddings, "mm_0_w",0,true,false);
+        // First LayerNorm
+        embeddings = ggml_norm(ctx0, embeddings, eps);
+        embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_1_w),
+                              model.mm_1_b);
+
+        // GELU activation
+        embeddings = ggml_gelu(ctx0, embeddings);
+
+        // Second linear layer
+        embeddings = ggml_mul_mat(ctx0, model.mm_3_w, embeddings);
+        embeddings = ggml_add(ctx0, embeddings, model.mm_3_b);
+
+        // Second LayerNorm
+        embeddings = ggml_norm(ctx0, embeddings, eps);
+        embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.mm_4_w),
+                              model.mm_4_b);
     }
     else if (ctx->proj_type == PROJECTOR_TYPE_LDP) {
         // MobileVLM projector
@@ -808,6 +839,11 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         else {
             new_clip->proj_type = PROJECTOR_TYPE_MLP;
         }
+        if (new_clip->proj_type == PROJECTOR_TYPE_MLP) {
+            if (gguf_find_tensor(ctx, format(TN_LLAVA_PROJ, 3, "weight").c_str()) != -1) {
+                new_clip->proj_type = PROJECTOR_TYPE_MLP_NORM;
+            }
+        }
     }
 
 #ifdef GGML_USE_CUBLAS
@@ -956,11 +992,29 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
     vision_model.pre_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias"));
 
     // LLaVA projection
-    if (new_clip->proj_type == PROJECTOR_TYPE_MLP) {
+    if (new_clip->proj_type == PROJECTOR_TYPE_MLP || new_clip->proj_type == PROJECTOR_TYPE_MLP_NORM) {
         vision_model.mm_0_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "weight"));
         vision_model.mm_0_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 0, "bias"));
-        vision_model.mm_2_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight"));
-        vision_model.mm_2_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias"));
+        try {
+            // Yi-type llava
+            vision_model.mm_1_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 1, "weight"));
+            vision_model.mm_1_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 1, "bias"));
+        } catch (std::runtime_error & e) { }
+        try {
+            // missing in Yi-type llava
+            vision_model.mm_2_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "weight"));
+            vision_model.mm_2_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 2, "bias"));
+        } catch (std::runtime_error & e) { }
+        try {
+            // Yi-type llava
+            vision_model.mm_3_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 3, "weight"));
+            vision_model.mm_3_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 3, "bias"));
+        } catch (std::runtime_error & e) { }
+        try {
+            // Yi-type llava
+            vision_model.mm_4_w = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 4, "weight"));
+            vision_model.mm_4_b = get_tensor(new_clip->ctx_data, format(TN_LLAVA_PROJ, 4, "bias"));
+        } catch (std::runtime_error & e) { }
     }
     else if (new_clip->proj_type == PROJECTOR_TYPE_LDP) {
         // MobileVLM projection
@@ -1432,6 +1486,8 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
     }
     else if (ctx->proj_type == PROJECTOR_TYPE_MLP) {
         return ctx->vision_model.mm_2_b->ne[0];
+    } else if (ctx->proj_type == PROJECTOR_TYPE_MLP_NORM) {
+        return ctx->vision_model.mm_3_b->ne[0];
     }
     else {
         std::string proj_type = PROJECTOR_TYPE_NAMES[ctx->proj_type];
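
For readers following the clip.cpp changes above: the new PROJECTOR_TYPE_MLP_NORM path applies linear (mm.0), LayerNorm (mm.1), GELU, linear (mm.3) and LayerNorm (mm.4) to each vision embedding. The following is a minimal plain-C++ sketch of that computation, not part of the commit; the vector-based tensor layout, the helper names and the eps default are illustrative assumptions (the real code builds ggml graph ops instead).

// Minimal sketch (not from the commit) of the Yi-VL projector:
// linear (mm.0) -> LayerNorm (mm.1) -> GELU -> linear (mm.3) -> LayerNorm (mm.4).
#include <cmath>
#include <vector>

using vec = std::vector<float>;
using mat = std::vector<vec>;   // row-major: mat[out][in]

static vec linear(const mat & W, const vec & b, const vec & x) {
    vec y(b);                                       // start from the bias
    for (size_t i = 0; i < W.size(); ++i)
        for (size_t j = 0; j < x.size(); ++j)
            y[i] += W[i][j] * x[j];
    return y;
}

static vec layer_norm(const vec & x, const vec & gain, const vec & bias, float eps = 1e-5f) {
    float mean = 0.0f, var = 0.0f;
    for (float v : x) mean += v;
    mean /= x.size();
    for (float v : x) var += (v - mean) * (v - mean);
    var /= x.size();
    vec y(x.size());
    for (size_t i = 0; i < x.size(); ++i)
        y[i] = (x[i] - mean) / std::sqrt(var + eps) * gain[i] + bias[i];
    return y;
}

static vec gelu(vec x) {                            // tanh approximation of GELU
    for (float & v : x)
        v = 0.5f * v * (1.0f + std::tanh(0.7978845608f * (v + 0.044715f * v * v * v)));
    return x;
}

// mm_0/mm_3 are the two linear layers, mm_1/mm_4 the two LayerNorms.
static vec yi_vl_project(const mat & mm_0_w, const vec & mm_0_b,
                         const vec & mm_1_w, const vec & mm_1_b,
                         const mat & mm_3_w, const vec & mm_3_b,
                         const vec & mm_4_w, const vec & mm_4_b,
                         const vec & embedding) {
    vec h = linear(mm_0_w, mm_0_b, embedding);
    h = layer_norm(h, mm_1_w, mm_1_b);
    h = gelu(h);
    h = linear(mm_3_w, mm_3_b, h);
    return layer_norm(h, mm_4_w, mm_4_b);
}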

examples/llava/llava-cli.cpp

Lines changed: 29 additions & 3 deletions
@@ -148,10 +148,35 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
     const int max_tgt_len = params->n_predict < 0 ? 256 : params->n_predict;
     const bool add_bos = llama_should_add_bos_token(llama_get_model(ctx_llava->ctx_llama));
 
-    // llava chat format is "<system_prompt>\nUSER:<image_embeddings>\n<textual_prompt>\nASSISTANT:"
-    eval_string(ctx_llava->ctx_llama, "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:", params->n_batch, &n_past, add_bos);
+    std::string system_prompt, user_prompt;
+    size_t image_pos = prompt.find("<image>");
+    if (image_pos != std::string::npos) {
+        // new templating mode: Provide the full prompt including system message and use <image> as a placeholder for the image
+
+        system_prompt = prompt.substr(0, image_pos);
+        user_prompt = prompt.substr(image_pos + std::string("<image>").length());
+        // We replace \n with actual newlines in user_prompt, just in case -e was not used in templating string
+        size_t pos = 0;
+        while ((pos = user_prompt.find("\\n", pos)) != std::string::npos) {
+            user_prompt.replace(pos, 2, "\n");
+            pos += 1; // Advance past the replaced newline
+        }
+        while ((pos = system_prompt.find("\\n", pos)) != std::string::npos) {
+            system_prompt.replace(pos, 2, "\n");
+            pos += 1; // Advance past the replaced newline
+        }
+
+        printf("system_prompt: %s\n", system_prompt.c_str());
+        printf("user_prompt: %s\n", user_prompt.c_str());
+    } else {
+        // llava-1.5 native mode
+        system_prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\nUSER:";
+        user_prompt = prompt + "\nASSISTANT:";
+    }
+
+    eval_string(ctx_llava->ctx_llama, system_prompt.c_str(), params->n_batch, &n_past, add_bos);
     llava_eval_image_embed(ctx_llava->ctx_llama, image_embed, params->n_batch, &n_past);
-    eval_string(ctx_llava->ctx_llama, (prompt + "\nASSISTANT:").c_str(), params->n_batch, &n_past, false);
+    eval_string(ctx_llava->ctx_llama, user_prompt.c_str(), params->n_batch, &n_past, false);
 
     // generate the response
 
@@ -162,6 +187,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
     for (int i = 0; i < max_tgt_len; i++) {
        const char * tmp = sample(ctx_sampling, ctx_llava->ctx_llama, &n_past);
        if (strcmp(tmp, "</s>") == 0) break;
+       if (strstr(tmp, "###")) break; // Yi-VL behavior
 
        printf("%s", tmp);
        fflush(stdout);
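
Taken together, the llava-cli.cpp changes mean the tool now accepts two prompt styles: the original LLaVA-1.5 format (no placeholder; the built-in system prompt and the "\nASSISTANT:" suffix are added automatically) and a new templating mode used whenever the prompt contains "<image>". In the new mode, everything before "<image>" is evaluated as the system prompt, the image embeddings are inserted in its place, the remainder is evaluated as the user prompt, and generation stops on "</s>" or on "###". A hypothetical Yi-VL-style prompt for the new mode could look like the sketch below; the wording and the "### Human:"/"### Assistant:" turn markers are assumptions about the Yi-VL template, not something specified by this commit.

// Hypothetical prompt for the new <image> templating mode (illustrative only).
#include <string>

const std::string prompt =
    "This is a chat between an inquisitive human and an AI assistant.\n"
    "### Human: <image>\n"
    "Describe the image in detail.\n"
    "### Assistant:";

// process_prompt() splits this at "<image>":
//   system_prompt = "This is a chat ... assistant.\n### Human: "
//   user_prompt   = "\nDescribe the image in detail.\n### Assistant:"
// The image embeddings are evaluated between the two parts, and generation
// stops on "</s>" or on "###".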
