@@ -4416,6 +4416,19 @@ const ggml_tensor * llama_model::get_tensor(const char * name) const {
     return it->second;
 }
 
+ggml_tensor * llama_model::get_rope_factors(uint32_t n_ctx_per_seq, int il) const {
+    // choose long/short freq factors based on the context size
+    if (layers[il].rope_freqs != nullptr) {
+        return layers[il].rope_freqs;
+    }
+
+    if (n_ctx_per_seq > hparams.n_ctx_orig_yarn) {
+        return layers[il].rope_long;
+    }
+
+    return layers[il].rope_short;
+}
+
 struct llm_build_llama : public llm_graph_context {
     llm_build_llama(const llama_model & model, const llm_graph_params & params, ggml_cgraph * gf) : llm_graph_context(params) {
         const int64_t n_embd_head = hparams.n_embd_head_v;
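With this hunk, the long/short RoPE frequency factors become a plain accessor on llama_model, so graph builders no longer reach into the KV cache's callback struct. Below is a minimal sketch of the intended builder-side call pattern, assuming the usual llm_graph_context scope (ctx0, Qcur, inp_pos, n_rot, rope_type, freq_base, and friends); it mirrors the call sites changed in the later hunks rather than adding anything new.

// Sketch only: builder-side usage of the new accessor. Everything except
// get_rope_factors() is assumed to already exist in the builder's scope.
// rope_factors may be nullptr for models without freq factors (e.g. llama2);
// ggml_rope_ext accepts a null factors tensor.
ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);

Qcur = ggml_rope_ext(
        ctx0, Qcur, inp_pos, rope_factors,
        n_rot, rope_type, n_ctx_orig, freq_base, freq_scale,
        ext_factor, attn_factor, beta_fast, beta_slow);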
@@ -4456,7 +4469,7 @@ struct llm_build_llama : public llm_graph_context {
             // self-attention
             {
                 // rope freq factors for llama3; may return nullptr for llama2 and other models
-                ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
+                ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
 
                 // compute Q and K and RoPE them
                 ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -4681,7 +4694,7 @@ struct llm_build_deci : public llm_graph_context {
             } else if (n_head > 0) {
                 // self-attention
                 // rope freq factors for llama3; may return nullptr for llama2 and other models
-                ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
+                ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
 
                 // compute Q and K and RoPE them
                 ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -7141,7 +7154,7 @@ struct llm_build_phi3 : public llm_graph_context {
             // self-attention
             {
                 // rope freq factors for 128k context
-                ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
+                ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
 
                 ggml_tensor* attn_norm_output = build_norm(inpL,
                         model.layers[il].attn_norm,
@@ -7893,7 +7906,7 @@ struct llm_build_minicpm3 : public llm_graph_context {
         for (int il = 0; il < n_layer; ++il) {
             ggml_tensor * inpSA = inpL;
 
-            ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
+            ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
 
             // norm
             cur = build_norm(inpL,
@@ -8961,7 +8974,7 @@ struct llm_build_cohere2 : public llm_graph_context {
             // self-attention
             {
                 // rope freq factors for 128k context
-                ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
+                ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
 
                 // compute Q and K and RoPE them
                 ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -9899,7 +9912,7 @@ struct llm_build_deepseek : public llm_graph_context {
             // self-attention
             {
                 // rope freq factors for llama3; may return nullptr for llama2 and other models
-                ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
+                ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
 
                 // compute Q and K and RoPE them
                 ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -11264,7 +11277,7 @@ struct llm_build_exaone : public llm_graph_context {
             // self-attention
             {
                 // rope freq factors for llama3; may return nullptr for llama2 and other models
-                ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
+                ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
 
                 // compute Q and K and RoPE them
                 ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -12645,7 +12658,7 @@ struct llm_build_bailingmoe : public llm_graph_context {
             // self-attention
             {
                 // rope freq factors for llama3; may return nullptr for llama2 and other models
-                ggml_tensor * rope_factors = static_cast<const llama_kv_cache_unified *>(memory)->cbs.get_rope_factors(n_ctx_per_seq, il);
+                ggml_tensor * rope_factors = model.get_rope_factors(n_ctx_per_seq, il);
 
                 // compute Q and K and RoPE them
                 ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
@@ -12768,28 +12781,6 @@ struct llm_build_bailingmoe : public llm_graph_context {
 llama_memory_i * llama_model::create_memory(const llama_memory_params & params, llama_cparams & cparams) const {
     llama_memory_i * res;
 
-    const bool offload = cparams.offload_kqv;
-
-    auto get_buft = [this, offload](int il) {
-        const char * dev_name = "CPU";
-
-        ggml_backend_buffer_type_t buft;
-        if (offload) {
-            auto * dev = dev_layer(il);
-            buft = ggml_backend_dev_buffer_type(dev);
-
-            dev_name = ggml_backend_dev_name(dev);
-        } else {
-            buft = ggml_backend_cpu_buffer_type();
-        }
-
-        LLAMA_LOG_DEBUG("layer %3d: dev = %s\n", il, dev_name);
-
-        return buft;
-    };
-
-    LLAMA_LOG_DEBUG("%s: n_ctx = %u\n", __func__, cparams.n_ctx);
-
     switch (arch) {
         case LLM_ARCH_MAMBA:
         case LLM_ARCH_RWKV6:
@@ -12798,13 +12789,10 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
         case LLM_ARCH_ARWKV7:
             {
                 res = new llama_kv_cache_recurrent(
-                        hparams,
-                        {
-                            /*.get_rope_factors =*/ nullptr,
-                            /*.get_buft =*/ get_buft,
-                        },
+                        *this,
                         GGML_TYPE_F32,
                         GGML_TYPE_F32,
+                        cparams.offload_kqv,
                         std::max((uint32_t) 1, cparams.n_seq_max));
             } break;
         default:
@@ -12816,25 +12804,11 @@ llama_memory_i * llama_model::create_memory(const llama_memory_params & params,
                 LLAMA_LOG_DEBUG("%s: n_ctx = %u (padded)\n", __func__, cparams.n_ctx);
 
                 res = new llama_kv_cache_unified(
-                        hparams,
-                        {
-                            /*.get_rope_factors =*/ [this](uint32_t n_ctx_per_seq, int il) {
-                                // choose long/short freq factors based on the context size
-                                if (layers[il].rope_freqs != nullptr) {
-                                    return layers[il].rope_freqs;
-                                }
-
-                                if (n_ctx_per_seq > hparams.n_ctx_orig_yarn) {
-                                    return layers[il].rope_long;
-                                }
-
-                                return layers[il].rope_short;
-                            },
-                            /*.get_buft =*/ get_buft,
-                        },
+                        *this,
                         params.type_k,
                         params.type_v,
                         !cparams.flash_attn,
+                        cparams.offload_kqv,
                         cparams.n_ctx,
                         padding);
             }
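Taken together, the last three hunks drop the callback struct from create_memory: both caches now receive the model itself plus cparams.offload_kqv, and the buffer-type selection that used to live in the deleted get_buft lambda presumably moves behind the cache constructors. The sketch below re-expresses that deleted lambda as a standalone helper to show the shape of the relocated logic; select_buft is a hypothetical name, and the real placement is inside the KV cache implementation, not in this file.

// Sketch, not part of this diff: the removed get_buft lambda rewritten as a
// helper the caches could call now that they hold the model and offload flag.
static ggml_backend_buffer_type_t select_buft(const llama_model & model, bool offload, int il) {
    const char * dev_name = "CPU";

    ggml_backend_buffer_type_t buft;
    if (offload) {
        // place this layer's K/V buffers on the backend device that hosts the layer
        auto * dev = model.dev_layer(il);
        buft = ggml_backend_dev_buffer_type(dev);

        dev_name = ggml_backend_dev_name(dev);
    } else {
        buft = ggml_backend_cpu_buffer_type();
    }

    LLAMA_LOG_DEBUG("layer %3d: dev = %s\n", il, dev_name);

    return buft;
}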