20 changes: 16 additions & 4 deletions ggml/src/ggml-cpu/ops.cpp
@@ -4580,9 +4580,15 @@ static void ggml_compute_forward_get_rows_f16(

         GGML_ASSERT(i01 >= 0 && i01 < ne01);

-        ggml_cpu_fp16_to_fp32(
-            (const ggml_fp16_t*) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
-            (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
+        // Supports both F16 and F32 as dst type.
+        if (dst->type == GGML_TYPE_F16)
+            ggml_vec_cpy_f16(nc,
+                (ggml_fp16_t *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3),
+                (ggml_fp16_t *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03));
+        else
+            ggml_cpu_fp16_to_fp32(
+                (const ggml_fp16_t*) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
+                (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
     }
 }
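
Note: the dispatch added above can be pictured, stripped of ggml's stride arithmetic, as the sketch below. It is only an illustration: copy_row_from_f16 is an invented helper, and the public scalar ggml_fp16_to_fp32 from ggml.h stands in for the bulk ggml_cpu_fp16_to_fp32.

    #include "ggml.h"
    #include <string.h>

    // Sketch only: per-row dispatch on the destination type when the source row is F16.
    static void copy_row_from_f16(enum ggml_type dst_type, void * dst,
                                  const ggml_fp16_t * src, int64_t nc) {
        if (dst_type == GGML_TYPE_F16) {
            // same type: a plain element copy, which is all ggml_vec_cpy_f16 does
            memcpy(dst, src, nc*sizeof(ggml_fp16_t));
        } else {
            // widen to F32 element by element (ggml_cpu_fp16_to_fp32 does this in bulk)
            float * d = (float *) dst;
            for (int64_t i = 0; i < nc; ++i) {
                d[i] = ggml_fp16_to_fp32(src[i]);
            }
        }
    }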

@@ -4662,9 +4668,15 @@ static void ggml_compute_forward_get_rows_f32(

         GGML_ASSERT(i01 >= 0 && i01 < ne01);

-        ggml_vec_cpy_f32(nc,
+        // Supports both F16 and F32 as dst type.
+        if (dst->type == GGML_TYPE_F32)
+            ggml_vec_cpy_f32(nc,
                 (float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3),
                 (float *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03));
+        else
+            ggml_cpu_fp32_to_fp16(
+                (const float*) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
+                (ggml_fp16_t *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
     }
 }
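
Note: the else branch above is the narrowing direction — F32 activations are squeezed into F16 rows, which loses precision. A minimal round trip through the public scalar helpers shows the size of the error (sketch):

    #include "ggml.h"
    #include <stdio.h>

    int main(void) {
        const float x  = 0.1f;
        const float xr = ggml_fp16_to_fp32(ggml_fp32_to_fp16(x)); // F32 -> F16 -> F32
        printf("%.10f -> %.10f (abs err %.2g)\n", x, xr, (double) (x - xr)); // error on the order of 1e-5
        return 0;
    }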

Expand Down
1 change: 1 addition & 0 deletions ggml/src/ggml-cpu/vec.h
@@ -87,6 +87,7 @@ inline static void ggml_vec_sub_f16 (const int n, ggml_fp16_t * z, const ggml_fp
 }
 inline static void ggml_vec_set_f32 (const int n, float * x, const float v) { for (int i = 0; i < n; ++i) x[i] = v; }
 inline static void ggml_vec_cpy_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]; }
+inline static void ggml_vec_cpy_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) { for (int i = 0; i < n; ++i) y[i] = x[i]; }
 inline static void ggml_vec_neg_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = -x[i]; }
 inline static void ggml_vec_neg_f16 (const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
     for (int i = 0; i < n; ++i) {
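Note: ggml_fp16_t is a plain 16-bit storage type, so for contiguous rows the new helper has the same effect as a raw memcpy; the loop form simply mirrors the other ggml_vec_* helpers (sketch):

    #include "ggml.h"
    #include <string.h>

    // same effect as ggml_vec_cpy_f16(n, y, x) for contiguous data (sketch)
    static void cpy_f16_via_memcpy(const int n, ggml_fp16_t * y, const ggml_fp16_t * x) {
        memcpy(y, x, (size_t) n*sizeof(ggml_fp16_t));
    }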
21 changes: 16 additions & 5 deletions ggml/src/ggml.c
@@ -3024,7 +3024,10 @@ struct ggml_tensor * ggml_mul_mat(
     GGML_ASSERT(!ggml_is_transposed(a));

     const int64_t ne[4] = { a->ne[1], b->ne[1], b->ne[2], b->ne[3] };
-    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
+    // Tensor a is the weight, with its type determined by the model file.
+    // Tensor b is the activation, i.e., the intermediate computation result.
+    // Here, the destination type (dst) is kept the same as the input activation type.
+    struct ggml_tensor * result = ggml_new_tensor(ctx, b->type, 4, ne);

     result->op = GGML_OP_MUL_MAT;
     result->src[0] = a;
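
Note: a hedged caller-side sketch of what this means, assuming an already-initialized ggml_context * ctx and arbitrary sizes — the result type now follows the activation instead of always being F32:

    struct ggml_tensor * w = ggml_new_tensor_2d(ctx, GGML_TYPE_Q4_0, 4096, 4096); // weight, type from the model file
    struct ggml_tensor * x = ggml_new_tensor_2d(ctx, GGML_TYPE_F16,  4096, 8);    // activation
    struct ggml_tensor * y = ggml_mul_mat(ctx, w, x);
    // before this change: y->type was always GGML_TYPE_F32
    // after this change:  y->type == x->type, i.e. GGML_TYPE_F16 here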
@@ -3073,7 +3076,9 @@ struct ggml_tensor * ggml_mul_mat_id(
     GGML_ASSERT(ids->ne[0] % b->ne[1] == 0); // can broadcast

     const int64_t ne[4] = { as->ne[1], ids->ne[0], b->ne[2], 1 };
-    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
+    // Tensor b is the activation, i.e., the intermediate computation result.
+    // Here, the destination type (dst) is kept the same as the input activation type.
+    struct ggml_tensor * result = ggml_new_tensor(ctx, b->type, 4, ne);

     result->op = GGML_OP_MUL_MAT_ID;
     result->src[0] = as;
@@ -3628,7 +3633,9 @@ struct ggml_tensor * ggml_get_rows(
     GGML_ASSERT(b->type == GGML_TYPE_I32);

     // TODO: implement non F32 return
-    enum ggml_type type = GGML_TYPE_F32;
+    // TODO: automatically select the destination type based on parameters,
+    // environment variables, or backend support; F16 is hard-coded here as an example.
+    enum ggml_type type = GGML_TYPE_F16;
     if (a->type == GGML_TYPE_I32) {
         type = a->type;
     }
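
Note: one possible shape of the TODO above is to read the destination type from the environment; this is purely hypothetical — GGML_GET_ROWS_DST_TYPE is an invented variable that ggml does not read today (sketch):

    #include "ggml.h"
    #include <stdlib.h>
    #include <string.h>

    // hypothetical sketch for the TODO; GGML_GET_ROWS_DST_TYPE is an invented env var
    static enum ggml_type get_rows_dst_type(enum ggml_type fallback) {
        const char * s = getenv("GGML_GET_ROWS_DST_TYPE");
        if (s == NULL)             return fallback;
        if (strcmp(s, "f32") == 0) return GGML_TYPE_F32;
        if (strcmp(s, "f16") == 0) return GGML_TYPE_F16;
        return fallback;
    }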
@@ -3676,7 +3683,8 @@ struct ggml_tensor * ggml_set_rows(
     GGML_ASSERT(b->ne[2] % c->ne[1] == 0);
     GGML_ASSERT(b->ne[3] % c->ne[2] == 0);
     GGML_ASSERT(c->ne[3] == 1);
-    GGML_ASSERT(b->type == GGML_TYPE_F32);
+    // b->type can also be F16.
+    //GGML_ASSERT(b->type == GGML_TYPE_F32);
     GGML_ASSERT(c->type == GGML_TYPE_I64 || c->type == GGML_TYPE_I32);

     GGML_ASSERT(ggml_is_contiguous_rows(a));
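
Note: rather than commenting the check out entirely, a relaxed assertion would keep some validation while admitting F16 activations; a possible alternative, not part of this diff:

    GGML_ASSERT(b->type == GGML_TYPE_F32 || b->type == GGML_TYPE_F16);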
@@ -5003,7 +5011,10 @@ struct ggml_tensor * ggml_flash_attn_ext(

     // permute(0, 2, 1, 3)
     int64_t ne[4] = { v->ne[0], q->ne[2], q->ne[1], q->ne[3] };
-    struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne);
+    // The types of k and v are the same as those in the KV cache,
+    // while q is an intermediate computation result.
+    // Here, the destination type (dst) is kept the same as the type of q.
+    struct ggml_tensor * result = ggml_new_tensor(ctx, q->type, 4, ne);

     float params[] = { scale, max_bias, logit_softcap };
     ggml_set_op_params(result, params, sizeof(params));
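
Note: a caller-side consequence, sketched under the assumption of an existing context ctx, tensors q/k/v/mask, and a float scale — with q in F16 the attention output is now F16 as well, so a consumer that needs F32 must cast explicitly:

    struct ggml_tensor * attn = ggml_flash_attn_ext(ctx, q, k, v, mask, scale, 0.0f, 0.0f);
    struct ggml_tensor * attn_f32 = attn->type == GGML_TYPE_F32
            ? attn
            : ggml_cast(ctx, attn, GGML_TYPE_F32); // only needed where F32 is required downstream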
32 changes: 32 additions & 0 deletions src/llama-graph.cpp
@@ -1937,6 +1937,38 @@ void llm_graph_context::build_pooling(
     ggml_build_forward_expand(gf, cur);
 }

+void llm_graph_context::cast_outputs() const {
+    ggml_tensor * ori_embd = res->t_embd;
+    if (cparams.embeddings && res->t_embd->type != GGML_TYPE_F32) {
+        ggml_tensor * embd = res->t_embd;
+        embd = ggml_cast(ctx0, embd, GGML_TYPE_F32);
+        cb(embd, "result_embd_cast", -1);
+        ggml_build_forward_expand(gf, embd);
+        res->t_embd = embd;
+    }
+
+    if (cparams.embeddings && res->t_embd_pooled->type != GGML_TYPE_F32) {
+        // if LLAMA_POOLING_TYPE_NONE, embd_pooled == embd
+        if (res->t_embd_pooled == ori_embd) {
+            res->t_embd_pooled = res->t_embd;
+        } else {
+            ggml_tensor * embd_pooled = res->t_embd_pooled;
+            embd_pooled = ggml_cast(ctx0, embd_pooled, GGML_TYPE_F32);
+            cb(embd_pooled, "result_embd_pooled_cast", -1);
+            ggml_build_forward_expand(gf, embd_pooled);
+            res->t_embd_pooled = embd_pooled;
+        }
+    }
+
+    // guard: not every graph produces logits
+    if (res->t_logits != nullptr && res->t_logits->type != GGML_TYPE_F32) {
+        ggml_tensor * logits = res->t_logits;
+        logits = ggml_cast(ctx0, logits, GGML_TYPE_F32);
+        cb(logits, "result_logits_cast", -1);
+        ggml_build_forward_expand(gf, logits);
+        res->t_logits = logits;
+    }
+}
+
 int32_t llama_relative_position_bucket(llama_pos x, llama_pos y, uint64_t n_buckets, bool bidirectional) {
     // TODO move to hparams if a T5 variant appears that uses a different value
     const int64_t max_distance = 128;
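Note: the three branches above repeat the same cast-and-rewire pattern; a shared helper along these lines could shorten it. This is only a sketch, not part of the change, and it leaves out the cb() naming callback:

    // sketch of a possible shared helper (not part of this change)
    static ggml_tensor * cast_to_f32_if_needed(ggml_context * ctx0, ggml_cgraph * gf, ggml_tensor * t) {
        if (t == nullptr || t->type == GGML_TYPE_F32) {
            return t; // nothing to do
        }
        ggml_tensor * out = ggml_cast(ctx0, t, GGML_TYPE_F32);
        ggml_build_forward_expand(gf, out);
        return out;
    }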
2 changes: 2 additions & 0 deletions src/llama-graph.h
@@ -814,6 +814,8 @@ struct llm_graph_context {
             ggml_tensor * cls_b,
             ggml_tensor * cls_out,
             ggml_tensor * cls_out_b) const;
+
+    void cast_outputs() const;
 };

 // TODO: better name
3 changes: 3 additions & 0 deletions src/llama-model.cpp
@@ -19618,6 +19618,9 @@ ggml_cgraph * llama_model::build_graph(const llm_graph_params & params) const {
     // add on pooling layer
     llm->build_pooling(cls, cls_b, cls_out, cls_out_b);

+    // cast output to F32
+    llm->cast_outputs();
+
     return llm->res->get_gf();
 }
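
Note: the cast matters because the public C API hands out float buffers; a hedged usage sketch, assuming a llama_context * lctx after decoding and a valid output index i:

    float * logits = llama_get_logits_ith(lctx, i);      // always float*, hence the cast to F32 above
    float * embd   = llama_get_embeddings_ith(lctx, i);  // likewise for embeddings, when enabled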
