@@ -71,8 +71,9 @@ static int llama_sample_top_p_top_k(
         int top_k,
         float top_p,
         float temp,
-        float repeat_penalty) {
-    auto logits = llama_get_logits(ctx);
+        float repeat_penalty,
+        int32_t pos) {
+    auto logits = llama_get_logits_ith(ctx, pos);
     auto n_vocab = llama_n_vocab(llama_get_model(ctx));
     // Populate initial list of all candidates
     std::vector<llama_token_data> candidates;
@@ -274,26 +275,30 @@ LLModel::Token LLamaModel::sampleToken(PromptContext &promptCtx) const
     return llama_sample_top_p_top_k(d_ptr->ctx,
         promptCtx.tokens.data() + promptCtx.tokens.size() - n_prev_toks,
         n_prev_toks, promptCtx.top_k, promptCtx.top_p, promptCtx.temp,
-        promptCtx.repeat_penalty);
+        promptCtx.repeat_penalty, promptCtx.n_last_batch_tokens - 1);
 }
 
 bool LLamaModel::evalTokens(PromptContext &ctx, const std::vector<int32_t> &tokens) const
 {
     llama_batch batch = llama_batch_init(tokens.size(), 0, 1);
 
     batch.n_tokens = tokens.size();
+    ctx.n_last_batch_tokens = tokens.size();
 
     for (int32_t i = 0; i < batch.n_tokens; i++) {
-        batch.token [i] = tokens[i];
-        batch.pos   [i] = ctx.n_past + i;
-        batch.seq_id[i] = 0;
-        batch.logits[i] = false;
+        batch.token   [i] = tokens[i];
+        batch.pos     [i] = ctx.n_past + i;
+        batch.n_seq_id[i] = 1;
+        batch.seq_id  [i][0] = 0;
+        batch.logits  [i] = false;
     }
 
     // llama_decode will output logits only for the last token of the prompt
     batch.logits[batch.n_tokens - 1] = true;
 
-    return llama_decode(d_ptr->ctx, batch) == 0;
+    int res = llama_decode(d_ptr->ctx, batch);
+    llama_batch_free(batch);
+    return res == 0;
 }
 
 int32_t LLamaModel::contextLength() const
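
For readers unfamiliar with the llama.cpp batch API this diff migrates to, below is a minimal standalone sketch of the same fill/decode/free pattern. It only uses calls and fields that appear in the hunk above (llama_batch_init, llama_decode, llama_get_logits_ith, llama_batch_free and the llama_batch members); the helper name decode_tokens and its error handling are assumptions for illustration, not part of this change.

// Minimal sketch (assumed helper, not part of this PR): evaluate a span of
// tokens with the multi-sequence llama_batch API and return the logits of
// the last token. Mirrors the pattern used in LLamaModel::evalTokens above.
#include <vector>
#include "llama.h"

static const float *decode_tokens(llama_context *ctx,
                                  const std::vector<llama_token> &tokens,
                                  int32_t n_past) {
    // One sequence, no embedding input, room for tokens.size() tokens.
    llama_batch batch = llama_batch_init(tokens.size(), 0, 1);
    batch.n_tokens = tokens.size();

    for (int32_t i = 0; i < batch.n_tokens; i++) {
        batch.token   [i] = tokens[i];
        batch.pos     [i] = n_past + i;      // absolute position in the context
        batch.n_seq_id[i] = 1;               // each token belongs to one sequence
        batch.seq_id  [i][0] = 0;            // ... namely sequence 0
        batch.logits  [i] = false;           // no logits for prompt tokens
    }
    batch.logits[batch.n_tokens - 1] = true; // only the last token needs logits

    const float *out = nullptr;
    if (llama_decode(ctx, batch) == 0) {
        // Index is relative to this batch; the last token is n_tokens - 1.
        out = llama_get_logits_ith(ctx, batch.n_tokens - 1);
    }
    llama_batch_free(batch);                 // the batch owns its arrays; free them
    return out;                              // logits live in ctx, still valid here
}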