Commit 4524290

Use correct type of pooling for embedding models (#5500)
1 parent c06e45d commit 4524290

File tree: 5 files changed, +94 -31 lines changed


convert-hf-to-gguf.py

Lines changed: 23 additions & 1 deletion
@@ -1650,7 +1650,29 @@ def __init__(self, *args, **kwargs):
     def set_gguf_parameters(self):
         super().set_gguf_parameters()
         self.gguf_writer.add_causal_attention(False)
-        self.gguf_writer.add_pooling_layer(True)
+
+        # get pooling path
+        with open(self.dir_model / "modules.json", encoding="utf-8") as f:
+            modules = json.load(f)
+        pooling_path = None
+        for mod in modules:
+            if mod["type"] == "sentence_transformers.models.Pooling":
+                pooling_path = mod["path"]
+                break
+
+        # get pooling type
+        pooling_type = gguf.PoolingType.NONE
+        if pooling_path is not None:
+            with open(self.dir_model / pooling_path / "config.json", encoding="utf-8") as f:
+                pooling = json.load(f)
+            if pooling["pooling_mode_mean_tokens"]:
+                pooling_type = gguf.PoolingType.MEAN
+            elif pooling["pooling_mode_cls_token"]:
+                pooling_type = gguf.PoolingType.CLS
+            else:
+                raise NotImplementedError("Only MEAN and CLS pooling types supported")
+
+        self.gguf_writer.add_pooling_type(pooling_type.value)
 
     def set_vocab(self):
         path = self.dir_model
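
The converter now derives the pooling type from the sentence-transformers metadata shipped with the model instead of hard-coding a pooling layer. As a rough standalone sketch of the same lookup (the helper name and the example directory are hypothetical; the file names and JSON keys are the ones read in the diff above):

import json
from pathlib import Path

import gguf  # gguf-py from this repository


def detect_pooling_type(model_dir: Path) -> gguf.PoolingType:
    # modules.json lists the sentence-transformers pipeline stages; find the Pooling module
    with open(model_dir / "modules.json", encoding="utf-8") as f:
        modules = json.load(f)
    pooling_path = None
    for mod in modules:
        if mod["type"] == "sentence_transformers.models.Pooling":
            pooling_path = mod["path"]
            break
    if pooling_path is None:
        return gguf.PoolingType.NONE

    # the Pooling module's own config.json records which pooling mode the model uses
    with open(model_dir / pooling_path / "config.json", encoding="utf-8") as f:
        pooling = json.load(f)
    if pooling["pooling_mode_mean_tokens"]:
        return gguf.PoolingType.MEAN
    if pooling["pooling_mode_cls_token"]:
        return gguf.PoolingType.CLS
    raise NotImplementedError("Only MEAN and CLS pooling types supported")


# e.g. detect_pooling_type(Path("path/to/sentence-transformers-model"))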

gguf-py/gguf/constants.py

Lines changed: 7 additions & 1 deletion
@@ -40,7 +40,7 @@ class LLM:
         TENSOR_DATA_LAYOUT    = "{arch}.tensor_data_layout"
         EXPERT_COUNT          = "{arch}.expert_count"
         EXPERT_USED_COUNT     = "{arch}.expert_used_count"
-        POOLING_LAYER         = "{arch}.pooling_layer"
+        POOLING_TYPE          = "{arch}.pooling_type"
 
     class Attention:
         HEAD_COUNT        = "{arch}.attention.head_count"
@@ -561,6 +561,12 @@ class RopeScalingType(Enum):
     YARN = 'yarn'
 
 
+class PoolingType(IntEnum):
+    NONE = 0
+    MEAN = 1
+    CLS  = 2
+
+
 class GGMLQuantizationType(IntEnum):
     F32 = 0
     F16 = 1
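
The metadata key is templated per architecture and now carries a PoolingType value instead of a boolean. A small illustration, assuming gguf-py from this tree is importable and using "bert" purely as an example architecture name:

from gguf.constants import Keys, PoolingType

# the key the converter writes, formatted for an example architecture
print(Keys.LLM.POOLING_TYPE.format(arch="bert"))   # -> "bert.pooling_type"

# the integer values stored under that key
print(int(PoolingType.NONE), int(PoolingType.MEAN), int(PoolingType.CLS))   # -> 0 1 2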

gguf-py/gguf/gguf_writer.py

Lines changed: 3 additions & 2 deletions
@@ -19,6 +19,7 @@
     GGUFValueType,
     Keys,
     RopeScalingType,
+    PoolingType,
     TokenType,
 )
 
@@ -360,8 +361,8 @@ def add_layer_norm_rms_eps(self, value: float) -> None:
     def add_causal_attention(self, value: bool) -> None:
         self.add_bool(Keys.Attention.CAUSAL.format(arch=self.arch), value)
 
-    def add_pooling_layer(self, value: bool) -> None:
-        self.add_bool(Keys.LLM.POOLING_LAYER.format(arch=self.arch), value)
+    def add_pooling_type(self, value: PoolingType) -> None:
+        self.add_uint32(Keys.LLM.POOLING_TYPE.format(arch=self.arch), value)
 
     def add_rope_dimension_count(self, count: int) -> None:
         self.add_uint32(Keys.Rope.DIMENSION_COUNT.format(arch=self.arch), count)
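
On the writer side, the boolean add_pooling_layer(True) call is replaced by an integer-valued add_pooling_type(...). A minimal usage sketch, assuming gguf-py is importable; the output file name and the "bert" architecture string are placeholders, and a real converter sets many more fields before writing:

import gguf

writer = gguf.GGUFWriter("model.gguf", "bert")
writer.add_pooling_type(gguf.PoolingType.MEAN.value)  # stored as uint32 under "bert.pooling_type"

writer.write_header_to_file()
writer.write_kv_data_to_file()
writer.close()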

llama.cpp

Lines changed: 55 additions & 27 deletions
@@ -256,7 +256,7 @@ enum llm_kv {
     LLM_KV_TENSOR_DATA_LAYOUT,
     LLM_KV_EXPERT_COUNT,
     LLM_KV_EXPERT_USED_COUNT,
-    LLM_KV_POOLING_LAYER,
+    LLM_KV_POOLING_TYPE,
 
     LLM_KV_ATTENTION_HEAD_COUNT,
     LLM_KV_ATTENTION_HEAD_COUNT_KV,
@@ -314,7 +314,7 @@ static std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_TENSOR_DATA_LAYOUT,      "%s.tensor_data_layout" },
     { LLM_KV_EXPERT_COUNT,            "%s.expert_count"       },
     { LLM_KV_EXPERT_USED_COUNT,       "%s.expert_used_count"  },
-    { LLM_KV_POOLING_LAYER,           "%s.pooling_layer"      },
+    { LLM_KV_POOLING_TYPE ,           "%s.pooling_type"       },
 
     { LLM_KV_ATTENTION_HEAD_COUNT,    "%s.attention.head_count"    },
     { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
@@ -1561,7 +1561,7 @@ struct llama_hparams {
     float f_max_alibi_bias;
 
     bool causal_attn = true;
-    bool pooling_layer = false;
+    uint32_t pooling_type = LLAMA_POOLING_NONE;
 
 
     bool operator!=(const llama_hparams & other) const {
@@ -1924,7 +1924,8 @@ struct llama_context {
     struct ggml_tensor * inp_pos;      // I32 [n_batch]
     struct ggml_tensor * inp_KQ_mask;  // F32 [n_ctx, n_batch]
     struct ggml_tensor * inp_K_shift;  // I32 [n_ctx]
-    struct ggml_tensor * inp_sum;      // F32 [n_batch, n_batch]
+    struct ggml_tensor * inp_mean;     // F32 [n_batch, n_batch]
+    struct ggml_tensor * inp_cls;      // I32 [n_batch]
 
 #ifdef GGML_USE_MPI
     ggml_mpi_context * ctx_mpi = NULL;
@@ -3086,7 +3087,7 @@ static void llm_load_hparams(
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                 ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
                 ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
-                ml.get_key(LLM_KV_POOLING_LAYER, hparams.pooling_layer);
+                ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
 
                 switch (hparams.n_layer) {
                     case 3:
@@ -3107,7 +3108,7 @@ static void llm_load_hparams(
                 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                 ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
                 ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
-                ml.get_key(LLM_KV_POOLING_LAYER, hparams.pooling_layer);
+                ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
 
                 if (hparams.n_layer == 12 && hparams.n_embd == 768) {
                     model.type = e_model::MODEL_137M;
@@ -4934,7 +4935,7 @@ struct llm_build_context {
     const int32_t n_orig_ctx;
 
     const bool do_rope_shift;
-    const bool do_pooling;
+    const uint32_t pooling_type;
 
     const llm_build_cb & cb;
 
@@ -4978,7 +4979,7 @@ struct llm_build_context {
         kv_head          (worst_case ? n_ctx - n_tokens : kv_self.head),
         n_orig_ctx       (cparams.n_yarn_orig_ctx),
         do_rope_shift    (worst_case || kv_self.has_shift),
-        do_pooling       (hparams.pooling_layer && cparams.do_pooling),
+        pooling_type     (cparams.do_pooling ? hparams.pooling_type : (uint32_t)LLAMA_POOLING_NONE),
         cb               (cb),
         buf_compute_meta (lctx.buf_compute_meta) {
             // all initializations should be done in init()
@@ -5835,7 +5836,8 @@ struct llm_build_context {
         // get input vectors with right size
         const size_t stride1 = n_tokens * ggml_type_size(lctx.inp_tokens->type);
         struct ggml_tensor * inp_pos  = ggml_view_1d(ctx0, lctx.inp_pos, n_tokens, 0);
-        struct ggml_tensor * inp_sum  = ggml_view_2d(ctx0, lctx.inp_sum, n_tokens, n_tokens, stride1, 0);
+        struct ggml_tensor * inp_mean = ggml_view_2d(ctx0, lctx.inp_mean, n_tokens, n_tokens, stride1, 0);
+        struct ggml_tensor * inp_cls  = ggml_view_1d(ctx0, lctx.inp_cls, n_tokens, 0);
 
         // construct input embeddings (token, type, position)
         inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
@@ -5952,8 +5954,12 @@ struct llm_build_context {
         cur = inpL;
 
         // pooling layer
-        if (do_pooling) {
-            cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), inp_sum);
+        if (pooling_type == LLAMA_POOLING_MEAN) {
+            cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), inp_mean);
+        } else if (pooling_type == LLAMA_POOLING_CLS) {
+            cur = ggml_get_rows(ctx0, cur, inp_cls);
+        } else {
+            GGML_ASSERT(pooling_type == LLAMA_POOLING_NONE && "Invalid pooling type");
         }
         cb(cur, "result_embd", -1);
 
@@ -7501,15 +7507,6 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
         }
     }
 
-    {
-        assert(ggml_backend_buffer_is_host(lctx.inp_sum->buffer));
-        float * data = (float *) lctx.inp_sum->data;
-
-        for (int i = 0; i < batch.n_tokens; ++i) {
-            data[i] = 1.0f/float(batch.n_tokens);
-        }
-    }
-
     if (kv_self.has_shift) {
         const int64_t n_ctx = cparams.n_ctx;
 
@@ -7522,17 +7519,46 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
         }
     }
 
-    if (hparams.pooling_layer && cparams.do_pooling) {
+    if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_MEAN) {
         const int64_t n_tokens = batch.n_tokens;
 
-        GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_sum->buffer));
-        float * data = (float *) lctx.inp_sum->data;
+        GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_mean->buffer));
+        float * data = (float *) lctx.inp_mean->data;
 
-        memset(lctx.inp_sum->data, 0, batch.n_tokens * batch.n_tokens * ggml_element_size(lctx.inp_sum));
+        memset(lctx.inp_mean->data, 0, n_tokens * n_tokens * ggml_element_size(lctx.inp_mean));
 
+        std::vector<uint64_t> sum(n_tokens, 0);
         for (int i = 0; i < n_tokens; ++i) {
             const llama_seq_id seq_id = batch.seq_id[i][0];
-            data[seq_id*n_tokens + i] = 1.0f;
+            sum[seq_id] += 1;
+        }
+
+        std::vector<float> div(n_tokens, 0.0f);
+        for (int i = 0; i < n_tokens; ++i) {
+            const uint64_t s = sum[i];
+            if (s > 0) {
+                div[i] = 1.0f/float(s);
+            }
+        }
+
+        for (int i = 0; i < n_tokens; ++i) {
+            const llama_seq_id seq_id = batch.seq_id[i][0];
+            data[seq_id*n_tokens + i] = div[seq_id];
+        }
+    }
+
+    if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_CLS) {
+        const int64_t n_tokens = batch.n_tokens;
+
+        GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer));
+        uint32_t * data = (uint32_t *) lctx.inp_cls->data;
+
+        for (int i = 0; i < n_tokens; ++i) {
+            const llama_seq_id seq_id = batch.seq_id[i][0];
+            const llama_pos pos = batch.pos[i];
+            if (pos == 0) {
+                data[seq_id] = i;
+            }
         }
     }
 }
@@ -11417,14 +11443,16 @@ struct llama_context * llama_new_context_with_model(
         ctx->inp_pos     = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);
         ctx->inp_KQ_mask = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx, cparams.n_batch);
         ctx->inp_K_shift = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_ctx);
-        ctx->inp_sum     = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_batch, cparams.n_batch);
+        ctx->inp_mean    = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_batch, cparams.n_batch);
+        ctx->inp_cls     = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);
 
         ggml_set_name(ctx->inp_tokens,  "inp_tokens");
         ggml_set_name(ctx->inp_embd,    "inp_embd");
         ggml_set_name(ctx->inp_pos,     "inp_pos");
         ggml_set_name(ctx->inp_KQ_mask, "inp_KQ_mask");
         ggml_set_name(ctx->inp_K_shift, "inp_K_shift");
-        ggml_set_name(ctx->inp_sum,     "inp_sum");
+        ggml_set_name(ctx->inp_mean,    "inp_mean");
+        ggml_set_name(ctx->inp_cls,     "inp_cls");
 
         ctx->buf_input = ggml_backend_alloc_ctx_tensors_from_buft(ctx->ctx_input, llama_default_buffer_type_cpu(true));
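
The effect of the two new inputs is easiest to see on a toy batch. The following is an illustration in plain Python of what llama_set_inputs fills in above (not the C++ implementation): inp_mean is an n_tokens x n_tokens matrix whose row for sequence s holds 1/len(s) in the columns of that sequence's tokens, so multiplying it with the token embeddings yields one mean vector per sequence, while inp_cls records, per sequence, the index of its position-0 token for CLS pooling.

# toy batch: token i belongs to sequence seq_id[i] and sits at position pos[i]
seq_id = [0, 0, 0, 1, 1]
pos    = [0, 1, 2, 0, 1]
n_tokens = len(seq_id)

# inp_mean: row s averages the tokens of sequence s
inp_mean = [[0.0] * n_tokens for _ in range(n_tokens)]
count = [seq_id.count(s) for s in range(n_tokens)]
for i, s in enumerate(seq_id):
    inp_mean[s][i] = 1.0 / count[s]

# inp_cls: for each sequence, the index of its first (position 0) token
inp_cls = [0] * n_tokens
for i, s in enumerate(seq_id):
    if pos[i] == 0:
        inp_cls[s] = i

print(inp_mean[0])   # [0.333..., 0.333..., 0.333..., 0.0, 0.0]
print(inp_cls[:2])   # [0, 3]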

llama.h

Lines changed: 6 additions & 0 deletions
@@ -112,6 +112,12 @@ extern "C" {
         LLAMA_ROPE_SCALING_MAX_VALUE = LLAMA_ROPE_SCALING_YARN,
     };
 
+    enum llama_pooling_type {
+        LLAMA_POOLING_NONE = 0,
+        LLAMA_POOLING_MEAN = 1,
+        LLAMA_POOLING_CLS  = 2,
+    };
+
     enum llama_split_mode {
         LLAMA_SPLIT_NONE  = 0, // single GPU
         LLAMA_SPLIT_LAYER = 1, // split layers and KV across GPUs
