@@ -256,7 +256,7 @@ enum llm_kv {
     LLM_KV_TENSOR_DATA_LAYOUT,
     LLM_KV_EXPERT_COUNT,
     LLM_KV_EXPERT_USED_COUNT,
-    LLM_KV_POOLING_LAYER,
+    LLM_KV_POOLING_TYPE,

     LLM_KV_ATTENTION_HEAD_COUNT,
     LLM_KV_ATTENTION_HEAD_COUNT_KV,
@@ -314,7 +314,7 @@ static std::map<llm_kv, const char *> LLM_KV_NAMES = {
     { LLM_KV_TENSOR_DATA_LAYOUT,      "%s.tensor_data_layout"      },
     { LLM_KV_EXPERT_COUNT,            "%s.expert_count"            },
     { LLM_KV_EXPERT_USED_COUNT,       "%s.expert_used_count"       },
-    { LLM_KV_POOLING_LAYER,           "%s.pooling_layer"           },
+    { LLM_KV_POOLING_TYPE,            "%s.pooling_type"            },

     { LLM_KV_ATTENTION_HEAD_COUNT,    "%s.attention.head_count"    },
     { LLM_KV_ATTENTION_HEAD_COUNT_KV, "%s.attention.head_count_kv" },
@@ -1561,7 +1561,7 @@ struct llama_hparams {
     float f_max_alibi_bias;

     bool causal_attn = true;
-    bool pooling_layer = false;
+    uint32_t pooling_type = LLAMA_POOLING_NONE;


     bool operator!=(const llama_hparams & other) const {
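Note: this hunk assumes a llama_pooling_type enum on the llama.h side, which is not shown in this excerpt. Based on the default assigned to hparams.pooling_type above and the comparisons later in the diff, it presumably looks like the following sketch (the numeric values are an assumption):

    // assumed counterpart in llama.h (not part of this excerpt)
    enum llama_pooling_type {
        LLAMA_POOLING_NONE = 0, // keep per-token embeddings as-is
        LLAMA_POOLING_MEAN = 1, // average each sequence's token embeddings
        LLAMA_POOLING_CLS  = 2, // take the first (CLS) token's embedding
    };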
@@ -1924,7 +1924,8 @@ struct llama_context {
     struct ggml_tensor * inp_pos;     // I32 [n_batch]
     struct ggml_tensor * inp_KQ_mask; // F32 [n_ctx, n_batch]
     struct ggml_tensor * inp_K_shift; // I32 [n_ctx]
-    struct ggml_tensor * inp_sum;     // F32 [n_batch, n_batch]
+    struct ggml_tensor * inp_mean;    // F32 [n_batch, n_batch]
+    struct ggml_tensor * inp_cls;     // I32 [n_batch]

 #ifdef GGML_USE_MPI
     ggml_mpi_context * ctx_mpi = NULL;
@@ -3086,7 +3087,7 @@ static void llm_load_hparams(
         ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,    hparams.f_norm_eps);
         ml.get_key(LLM_KV_ATTENTION_CAUSAL,           hparams.causal_attn);
         ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
-        ml.get_key(LLM_KV_POOLING_LAYER,              hparams.pooling_layer);
+        ml.get_key(LLM_KV_POOLING_TYPE,               hparams.pooling_type);

         switch (hparams.n_layer) {
             case 3:
@@ -3107,7 +3108,7 @@ static void llm_load_hparams(
         ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,    hparams.f_norm_eps);
         ml.get_key(LLM_KV_ATTENTION_CAUSAL,           hparams.causal_attn);
         ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
-        ml.get_key(LLM_KV_POOLING_LAYER,              hparams.pooling_layer);
+        ml.get_key(LLM_KV_POOLING_TYPE,               hparams.pooling_type);

         if (hparams.n_layer == 12 && hparams.n_embd == 768) {
             model.type = e_model::MODEL_137M;
@@ -4934,7 +4935,7 @@ struct llm_build_context {
     const int32_t n_orig_ctx;

     const bool do_rope_shift;
-    const bool do_pooling;
+    const uint32_t pooling_type;

     const llm_build_cb & cb;

@@ -4978,7 +4979,7 @@ struct llm_build_context {
         kv_head          (worst_case ? n_ctx - n_tokens : kv_self.head),
         n_orig_ctx       (cparams.n_yarn_orig_ctx),
         do_rope_shift    (worst_case || kv_self.has_shift),
-        do_pooling       (hparams.pooling_layer && cparams.do_pooling),
+        pooling_type     (cparams.do_pooling ? hparams.pooling_type : (uint32_t)LLAMA_POOLING_NONE),
         cb               (cb),
         buf_compute_meta (lctx.buf_compute_meta) {
         // all initializations should be done in init()
@@ -5835,7 +5836,8 @@ struct llm_build_context {
         // get input vectors with right size
         const size_t stride1 = n_tokens * ggml_type_size(lctx.inp_tokens->type);
         struct ggml_tensor * inp_pos  = ggml_view_1d(ctx0, lctx.inp_pos,  n_tokens, 0);
-        struct ggml_tensor * inp_sum  = ggml_view_2d(ctx0, lctx.inp_sum,  n_tokens, n_tokens, stride1, 0);
+        struct ggml_tensor * inp_mean = ggml_view_2d(ctx0, lctx.inp_mean, n_tokens, n_tokens, stride1, 0);
+        struct ggml_tensor * inp_cls  = ggml_view_1d(ctx0, lctx.inp_cls,  n_tokens, 0);

         // construct input embeddings (token, type, position)
         inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb);
@@ -5952,8 +5954,12 @@ struct llm_build_context {
         cur = inpL;

         // pooling layer
-        if (do_pooling) {
-            cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), inp_sum);
+        if (pooling_type == LLAMA_POOLING_MEAN) {
+            cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), inp_mean);
+        } else if (pooling_type == LLAMA_POOLING_CLS) {
+            cur = ggml_get_rows(ctx0, cur, inp_cls);
+        } else {
+            GGML_ASSERT(pooling_type == LLAMA_POOLING_NONE && "Invalid pooling type");
         }
         cb(cur, "result_embd", -1);
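How the MEAN branch works: cur holds one n_embd-sized column per token, and inp_mean is an n_tokens x n_tokens matrix whose row s carries weight 1/len(s) at the positions of sequence s's tokens (filled in llama_set_inputs below). The matmul therefore writes the mean embedding of sequence s into output column s. For a hypothetical batch of four tokens where tokens 0-2 form sequence 0 and token 3 forms sequence 1, the fill below would produce:

    inp_mean = [ 1/3  1/3  1/3  0 ]   // row 0 averages sequence 0
               [ 0    0    0    1 ]   // row 1 "averages" the single token of sequence 1
               [ 0    0    0    0 ]   // remaining rows stay zero
               [ 0    0    0    0 ]

The CLS branch instead gathers one embedding per sequence with ggml_get_rows, using the token indices stored in inp_cls.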
@@ -7501,15 +7507,6 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
         }
     }

-    {
-        assert(ggml_backend_buffer_is_host(lctx.inp_sum->buffer));
-        float * data = (float *) lctx.inp_sum->data;
-
-        for (int i = 0; i < batch.n_tokens; ++i) {
-            data[i] = 1.0f/float(batch.n_tokens);
-        }
-    }
-
     if (kv_self.has_shift) {
         const int64_t n_ctx = cparams.n_ctx;
@@ -7522,17 +7519,46 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
         }
     }

-    if (hparams.pooling_layer && cparams.do_pooling) {
+    if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_MEAN) {
         const int64_t n_tokens = batch.n_tokens;

-        GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_sum->buffer));
-        float * data = (float *) lctx.inp_sum->data;
+        GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_mean->buffer));
+        float * data = (float *) lctx.inp_mean->data;

-        memset(lctx.inp_sum->data, 0, batch.n_tokens * batch.n_tokens * ggml_element_size(lctx.inp_sum));
+        memset(lctx.inp_mean->data, 0, n_tokens * n_tokens * ggml_element_size(lctx.inp_mean));

+        std::vector<uint64_t> sum(n_tokens, 0);
         for (int i = 0; i < n_tokens; ++i) {
             const llama_seq_id seq_id = batch.seq_id[i][0];
-            data[seq_id*n_tokens + i] = 1.0f;
+            sum[seq_id] += 1;
+        }
+
+        std::vector<float> div(n_tokens, 0.0f);
+        for (int i = 0; i < n_tokens; ++i) {
+            const uint64_t s = sum[i];
+            if (s > 0) {
+                div[i] = 1.0f/float(s);
+            }
+        }
+
+        for (int i = 0; i < n_tokens; ++i) {
+            const llama_seq_id seq_id = batch.seq_id[i][0];
+            data[seq_id*n_tokens + i] = div[seq_id];
+        }
+    }
+
+    if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_CLS) {
+        const int64_t n_tokens = batch.n_tokens;
+
+        GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer));
+        uint32_t * data = (uint32_t *) lctx.inp_cls->data;
+
+        for (int i = 0; i < n_tokens; ++i) {
+            const llama_seq_id seq_id = batch.seq_id[i][0];
+            const llama_pos pos = batch.pos[i];
+            if (pos == 0) {
+                data[seq_id] = i;
+            }
         }
     }
 }
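To make the two fill loops above concrete, here is a standalone C++ sketch that runs them on a made-up batch (the seq_id and pos arrays stand in for batch.seq_id[i][0] and batch.pos[i]; everything else mirrors the diff):

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    int main() {
        // made-up batch: tokens 0-2 belong to sequence 0, tokens 3-4 to sequence 1
        const int n_tokens = 5;
        const int seq_id[] = {0, 0, 0, 1, 1}; // stands in for batch.seq_id[i][0]
        const int pos   [] = {0, 1, 2, 0, 1}; // stands in for batch.pos[i]

        // MEAN: count tokens per sequence, then write 1/count into row seq_id
        std::vector<float>    mean(n_tokens * n_tokens, 0.0f);
        std::vector<uint64_t> sum (n_tokens, 0);
        for (int i = 0; i < n_tokens; ++i) {
            sum[seq_id[i]] += 1;
        }
        for (int i = 0; i < n_tokens; ++i) {
            mean[seq_id[i]*n_tokens + i] = 1.0f/float(sum[seq_id[i]]);
        }

        // CLS: record the batch index of each sequence's pos == 0 token
        std::vector<uint32_t> cls(n_tokens, 0);
        for (int i = 0; i < n_tokens; ++i) {
            if (pos[i] == 0) {
                cls[seq_id[i]] = i;
            }
        }

        printf("mean row 0: %.2f %.2f %.2f %.2f %.2f\n",
               mean[0], mean[1], mean[2], mean[3], mean[4]); // 0.33 0.33 0.33 0.00 0.00
        printf("cls: seq 0 -> %u, seq 1 -> %u\n", cls[0], cls[1]); // 0 and 3
        return 0;
    }

Row 0 of the mean matrix averages the three tokens of sequence 0, row 1 (indices 3-4, value 0.5) averages sequence 1, and the CLS indices point at each sequence's first token.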
@@ -11417,14 +11443,16 @@ struct llama_context * llama_new_context_with_model(
        ctx->inp_pos      = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);
        ctx->inp_KQ_mask  = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx, cparams.n_batch);
        ctx->inp_K_shift  = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_ctx);
-       ctx->inp_sum      = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_batch, cparams.n_batch);
+       ctx->inp_mean     = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_batch, cparams.n_batch);
+       ctx->inp_cls      = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch);

        ggml_set_name(ctx->inp_tokens,  "inp_tokens");
        ggml_set_name(ctx->inp_embd,    "inp_embd");
        ggml_set_name(ctx->inp_pos,     "inp_pos");
        ggml_set_name(ctx->inp_KQ_mask, "inp_KQ_mask");
        ggml_set_name(ctx->inp_K_shift, "inp_K_shift");
-       ggml_set_name(ctx->inp_sum,     "inp_sum");
+       ggml_set_name(ctx->inp_mean,    "inp_mean");
+       ggml_set_name(ctx->inp_cls,     "inp_cls");

        ctx->buf_input = ggml_backend_alloc_ctx_tensors_from_buft(ctx->ctx_input, llama_default_buffer_type_cpu(true));
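Finally, a minimal caller-side sketch. It assumes the llama.h API of this vintage, in particular that llama_context_params exposes the embedding and do_pooling flags used above and that llama_get_embeddings() returns the pooled vectors after a decode; treat the field names as assumptions rather than something this diff confirms:

    // hedged sketch: context-params field names are assumptions, not shown in this diff
    llama_context_params cparams = llama_context_default_params();
    cparams.embedding  = true; // request embedding output
    cparams.do_pooling = true; // apply the model's pooling_type (MEAN or CLS)

    llama_context * ctx = llama_new_context_with_model(model, cparams);

    // ... build a batch, call llama_decode(ctx, batch) ...

    // with pooling enabled, one n_embd-sized vector per sequence
    const float * embd = llama_get_embeddings(ctx);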