@@ -2960,6 +2960,7 @@ struct server_context {
                     new_tokens[i - n_discard] = new_tokens[i];
                 }
 
+                new_tokens.resize(slot.cache_tokens.size() - n_discard);
                 slot.cache_tokens.clear();
                 slot.cache_tokens.insert(new_tokens);
             }
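
Note on the first hunk: after the in-place left shift, new_tokens keeps its original length, so its last n_discard slots still hold stale duplicates of the shifted tail; without the added resize, those stale tokens would be re-inserted into slot.cache_tokens. A minimal standalone sketch of the shift-and-truncate pattern (the names n_keep and n_discard mirror the diff, but the vector is a stand-in, not the server's real token cache):

    // Standalone sketch of the shift-and-truncate pattern used in the hunk
    // above; illustrative only, not the server's actual types.
    #include <cstddef>
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    using llama_token = int32_t;

    int main() {
        std::vector<llama_token> tokens = {1, 2, 3, 4, 5, 6, 7, 8};
        const size_t n_keep    = 2; // tokens [0, n_keep) stay in place
        const size_t n_discard = 3; // tokens [n_keep, n_keep + n_discard) are dropped

        // shift the tail left over the discarded region
        for (size_t i = n_keep + n_discard; i < tokens.size(); i++) {
            tokens[i - n_discard] = tokens[i];
        }

        // without this, the last n_discard slots keep stale copies of the
        // shifted tail; this is the line the hunk above adds
        tokens.resize(tokens.size() - n_discard);

        for (llama_token t : tokens) {
            printf("%d ", t); // prints: 1 2 6 7 8
        }
        printf("\n");
        return 0;
    }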
@@ -3095,12 +3096,12 @@ struct server_context {
                         // we should never reach this
                         GGML_ABORT("not supported by multimodal");
                     }
-                    llama_tokens curr_tokens = slot.prompt_tokens.get_text_tokens(); // copy
                     const int n_left = slot.n_ctx - slot.params.n_keep;
 
                     const int n_block_size = n_left / 2;
                     const int erased_blocks = (slot.n_prompt_tokens - slot.params.n_keep - n_block_size) / n_block_size;
 
+                    const llama_tokens & curr_tokens = slot.prompt_tokens.get_text_tokens();
                     llama_tokens new_tokens(
                             curr_tokens.begin(),
                             curr_tokens.begin() + slot.params.n_keep);
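
Note on the second hunk: the declaration moves past the multimodal abort and binds a const llama_tokens & instead of materializing a by-value copy of the whole prompt. The sketch below assumes (as the reference binding in the diff implies) that get_text_tokens() returns a reference to internal storage rather than a temporary; server_tokens here is a hypothetical stand-in for the real class:

    // Minimal sketch of the copy-vs-reference change, under the assumption
    // stated above; not the server's real server_tokens implementation.
    #include <cstdint>
    #include <vector>

    using llama_token  = int32_t;
    using llama_tokens = std::vector<llama_token>;

    struct server_tokens {
        llama_tokens tokens;
        const llama_tokens & get_text_tokens() const { return tokens; }
    };

    int main() {
        server_tokens prompt;
        prompt.tokens.assign(100000, 0); // pretend this is a long prompt

        llama_tokens         copied = prompt.get_text_tokens(); // allocates and copies 100k tokens
        const llama_tokens & viewed = prompt.get_text_tokens(); // no allocation, no copy

        (void) copied;
        (void) viewed;
        return 0;
    }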
@@ -3208,10 +3209,9 @@ struct server_context {
                 // remove the non-common part from the cache
                 slot.cache_tokens.resize(slot.n_past);
 
-                llama_token cur_tok = slot.prompt_tokens[slot.n_past];
-
                 // check if we should process the image
-                if (cur_tok == LLAMA_TOKEN_NULL) {
+                if (slot.n_past < slot.n_prompt_tokens
+                        && slot.prompt_tokens[slot.n_past] == LLAMA_TOKEN_NULL) {
                     // process the image
                     int32_t new_n_past;
                     int32_t res = slot.prompt_tokens.process_chunk(ctx, mctx, slot.n_past, slot.id, new_n_past);
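
Note on the third hunk: the old code read slot.prompt_tokens[slot.n_past] before knowing whether n_past is still inside the prompt; when the whole prompt is already in cache (n_past == n_prompt_tokens) that subscript reads past the end. The new condition checks the bound first and relies on && short-circuiting. A small self-contained sketch (LLAMA_TOKEN_NULL is defined as -1 in llama.h; the function and vector are illustrative stand-ins for the slot's prompt tokens and the image-chunk marker convention):

    // Sketch of the bounds check added in the hunk above; illustrative only.
    #include <cstddef>
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    using llama_token = int32_t;
    constexpr llama_token LLAMA_TOKEN_NULL = -1; // same value as in llama.h

    static bool is_image_chunk(const std::vector<llama_token> & prompt_tokens, size_t n_past) {
        // bounds check first; && short-circuits, so the subscript is only
        // evaluated when n_past is a valid index
        return n_past < prompt_tokens.size()
            && prompt_tokens[n_past] == LLAMA_TOKEN_NULL;
    }

    int main() {
        std::vector<llama_token> prompt = {10, LLAMA_TOKEN_NULL, 42};

        printf("%d\n", is_image_chunk(prompt, 1)); // 1: marker token at n_past
        printf("%d\n", is_image_chunk(prompt, 3)); // 0: n_past is past the end, no out-of-bounds read
        return 0;
    }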