Commit 702810f
llamamodel: fix setting of n_threads
We weren't setting n_threads_batch, and setThreadCount was a no-op, because we're using llama_decode, which doesn't take an n_threads argument.
1 parent c3b8c95 commit 702810f

1 file changed: +5 -1

gpt4all-backend/llamamodel.cpp

Lines changed: 5 additions & 1 deletion
@@ -162,6 +162,10 @@ bool LLamaModel::loadModel(const std::string &modelPath)
     d_ptr->ctx_params.seed = params.seed;
     d_ptr->ctx_params.f16_kv = params.memory_f16;
 
+    d_ptr->n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
+    d_ptr->ctx_params.n_threads = d_ptr->n_threads;
+    d_ptr->ctx_params.n_threads_batch = d_ptr->n_threads;
+
 #ifdef GGML_USE_METAL
     if (llama_verbose()) {
         std::cerr << "llama.cpp: using Metal" << std::endl;
@@ -206,14 +210,14 @@ bool LLamaModel::loadModel(const std::string &modelPath)
     }
 #endif
 
-    d_ptr->n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
     d_ptr->modelLoaded = true;
     fflush(stderr);
     return true;
 }
 
 void LLamaModel::setThreadCount(int32_t n_threads) {
     d_ptr->n_threads = n_threads;
+    llama_set_n_threads(d_ptr->ctx, n_threads, n_threads);
 }
 
 int32_t LLamaModel::threadCount() const {
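Why the fix looks like this: llama_decode reads its thread counts from the llama_context, so they must be set on the context at creation time and, for later changes, pushed in with llama_set_n_threads. A minimal sketch of that flow, assuming a loaded `model` handle and illustrative thread counts (not the project's actual code):

    // Sketch: thread counts live on the llama_context, not on llama_decode().
    llama_context_params cparams = llama_context_default_params();
    cparams.n_threads       = 4;  // threads for single-token (generation) decode
    cparams.n_threads_batch = 4;  // threads for prompt/batch decode
    llama_context *ctx = llama_new_context_with_model(model, cparams);

    // llama_decode(ctx, batch) takes no n_threads argument, so updating a
    // cached member alone is a no-op; push new values into the live context:
    llama_set_n_threads(ctx, /*n_threads=*/8, /*n_threads_batch=*/8);

This mirrors the diff above: loadModel now seeds both n_threads and n_threads_batch in ctx_params, and setThreadCount forwards the new value to the live context instead of only caching it.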
