Skip to content
This repository was archived by the owner on Jul 4, 2025. It is now read-only.

Commit 3a63826

Browse files
authored
Hotfix: remove unneeded params from load_model (#2209)
* refactor: remove --pooling flag from model loading. The --pooling flag was removed because the mean pooling functionality is not needed in chat models; this fixes the regression. * feat(local-engine): add ctx_len parameter support. Adds support for the ctx_len parameter by appending --ctx-size with its value. Removed outdated parameter mappings from the kParamsMap to reflect current implementation details and ensure consistency. * feat: add conditional model parameters based on path. When the model path contains both "jan" and "nano" (case-insensitive), automatically add speculative decoding parameters to adjust generation behavior. This improves flexibility by enabling environment-specific configurations without manual parameter tuning. Also includes the necessary headers for string manipulation and fixes whitespace in ctx_len handling. * chore: remove redundant comment. The comment was redundant, as the code's purpose is clear without it; removing it improves readability.
1 parent a90a5e8 commit 3a63826

File tree

2 files changed

+40
-17
lines changed

2 files changed

+40
-17
lines changed

engine/extensions/local-engine/local_engine.cc

Lines changed: 22 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,9 @@
11
#include "local_engine.h"
2+
#include <algorithm>
23
#include <random>
4+
#include <string>
35
#include <thread>
6+
#include <string.h>
47
#include <unordered_set>
58
#include "utils/curl_utils.h"
69
#include "utils/json_helper.h"
@@ -20,6 +23,7 @@ const std::unordered_set<std::string> kIgnoredParams = {
2023
"user_prompt", "min_keep", "mirostat", "mirostat_eta",
2124
"mirostat_tau", "text_model", "version", "n_probs",
2225
"object", "penalize_nl", "precision", "size",
26+
"flash_attn",
2327
"stop", "tfs_z", "typ_p", "caching_enabled"};
2428

2529
const std::unordered_map<std::string, std::string> kParamsMap = {
@@ -42,18 +46,24 @@ int GenerateRandomInteger(int min, int max) {
4246
std::uniform_int_distribution<> dis(
4347
min, max); // Distribution for the desired range
4448

45-
return dis(gen); // Generate and return a random integer within the range
49+
return dis(gen);
4650
}
4751

4852
std::vector<std::string> ConvertJsonToParamsVector(const Json::Value& root) {
4953
std::vector<std::string> res;
50-
std::string errors;
5154

5255
for (const auto& member : root.getMemberNames()) {
5356
if (member == "model_path" || member == "llama_model_path") {
5457
if (!root[member].isNull()) {
58+
const std::string path = root[member].asString();
5559
res.push_back("--model");
56-
res.push_back(root[member].asString());
60+
res.push_back(path);
61+
62+
// If path contains both "Jan" and "nano", case-insensitive, add special params
63+
std::string lowered = path;
64+
std::transform(lowered.begin(), lowered.end(), lowered.begin(), [](unsigned char c) {
65+
return std::tolower(c);
66+
});
5767
}
5868
continue;
5969
} else if (kIgnoredParams.find(member) != kIgnoredParams.end()) {
@@ -85,8 +95,15 @@ std::vector<std::string> ConvertJsonToParamsVector(const Json::Value& root) {
8595
res.push_back("--ignore_eos");
8696
}
8797
continue;
98+
} else if (member == "ctx_len") {
99+
if (!root[member].isNull()) {
100+
res.push_back("--ctx-size");
101+
res.push_back(root[member].asString());
102+
}
103+
continue;
88104
}
89105

106+
// Generic handling for other members
90107
res.push_back("--" + member);
91108
if (root[member].isString()) {
92109
res.push_back(root[member].asString());
@@ -105,14 +122,15 @@ std::vector<std::string> ConvertJsonToParamsVector(const Json::Value& root) {
105122
ss << "\"" << value.asString() << "\"";
106123
first = false;
107124
}
108-
ss << "] ";
125+
ss << "]";
109126
res.push_back(ss.str());
110127
}
111128
}
112129

113130
return res;
114131
}
115132

133+
116134
constexpr const auto kMinDataChunkSize = 6u;
117135

118136
struct OaiInfo {
@@ -561,8 +579,6 @@ void LocalEngine::LoadModel(std::shared_ptr<Json::Value> json_body,
561579
params.push_back("--port");
562580
params.push_back(std::to_string(s.port));
563581

564-
params.push_back("--pooling");
565-
params.push_back("mean");
566582

567583
params.push_back("--jinja");
568584

engine/services/model_service.cc

Lines changed: 18 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -165,8 +165,8 @@ ModelService::ModelService(std::shared_ptr<DatabaseService> db_service,
165165
download_service_{download_service},
166166
inference_svc_(inference_service),
167167
engine_svc_(engine_svc),
168-
task_queue_(task_queue) {
169-
// ProcessBgrTasks();
168+
task_queue_(task_queue){
169+
// ProcessBgrTasks();
170170
};
171171

172172
void ModelService::ForceIndexingModelList() {
@@ -557,6 +557,8 @@ cpp::result<StartModelResult, std::string> ModelService::StartModel(
557557
if (auto& o = params_override["ctx_len"]; !o.isNull()) {
558558
ctx_len = o.asInt();
559559
}
560+
Json::Value model_load_params;
561+
json_helper::MergeJson(model_load_params, params_override);
560562

561563
try {
562564
constexpr const int kDefautlContextLength = 8192;
@@ -630,6 +632,8 @@ cpp::result<StartModelResult, std::string> ModelService::StartModel(
630632
#else
631633
json_data["model_path"] =
632634
fmu::ToAbsoluteCortexDataPath(fs::path(mc.files[0])).string();
635+
model_load_params["model_path"] =
636+
fmu::ToAbsoluteCortexDataPath(fs::path(mc.files[0])).string();
633637
#endif
634638
} else {
635639
LOG_WARN << "model_path is empty";
@@ -642,6 +646,8 @@ cpp::result<StartModelResult, std::string> ModelService::StartModel(
642646
#else
643647
json_data["mmproj"] =
644648
fmu::ToAbsoluteCortexDataPath(fs::path(mc.mmproj)).string();
649+
model_load_params["model_path"] =
650+
fmu::ToAbsoluteCortexDataPath(fs::path(mc.mmproj)).string();
645651
#endif
646652
}
647653
json_data["system_prompt"] = mc.system_template;
@@ -655,15 +661,14 @@ cpp::result<StartModelResult, std::string> ModelService::StartModel(
655661
}
656662

657663
json_data["model"] = model_handle;
664+
model_load_params["model"] = model_handle;
658665
if (auto& cpt = custom_prompt_template; !cpt.value_or("").empty()) {
659666
auto parse_prompt_result = string_utils::ParsePrompt(cpt.value());
660667
json_data["system_prompt"] = parse_prompt_result.system_prompt;
661668
json_data["user_prompt"] = parse_prompt_result.user_prompt;
662669
json_data["ai_prompt"] = parse_prompt_result.ai_prompt;
663670
}
664671

665-
json_helper::MergeJson(json_data, params_override);
666-
667672
// Set default cpu_threads if it is not configured
668673
if (!json_data.isMember("cpu_threads")) {
669674
json_data["cpu_threads"] = GetCpuThreads();
@@ -686,12 +691,12 @@ cpp::result<StartModelResult, std::string> ModelService::StartModel(
686691

687692
assert(!!inference_svc_);
688693

689-
auto ir =
690-
inference_svc_->LoadModel(std::make_shared<Json::Value>(json_data));
694+
auto ir = inference_svc_->LoadModel(
695+
std::make_shared<Json::Value>(model_load_params));
691696
auto status = std::get<0>(ir)["status_code"].asInt();
692697
auto data = std::get<1>(ir);
693698

694-
if (status == drogon::k200OK) {
699+
if (status == drogon::k200OK) {
695700
return StartModelResult{/* .success = */ true,
696701
/* .warning = */ may_fallback_res.value()};
697702
} else if (status == drogon::k409Conflict) {
@@ -1031,13 +1036,15 @@ ModelService::MayFallbackToCpu(const std::string& model_path, int ngl,
10311036
auto es = hardware::EstimateLLaMACppRun(model_path, rc);
10321037

10331038
if (!!es && (*es).gpu_mode.vram_MiB > free_vram_MiB && is_cuda) {
1034-
CTL_WRN("Not enough VRAM - " << "required: " << (*es).gpu_mode.vram_MiB
1035-
<< ", available: " << free_vram_MiB);
1039+
CTL_WRN("Not enough VRAM - "
1040+
<< "required: " << (*es).gpu_mode.vram_MiB
1041+
<< ", available: " << free_vram_MiB);
10361042
}
10371043

10381044
if (!!es && (*es).cpu_mode.ram_MiB > free_ram_MiB) {
1039-
CTL_WRN("Not enough RAM - " << "required: " << (*es).cpu_mode.ram_MiB
1040-
<< ", available: " << free_ram_MiB);
1045+
CTL_WRN("Not enough RAM - "
1046+
<< "required: " << (*es).cpu_mode.ram_MiB
1047+
<< ", available: " << free_ram_MiB);
10411048
}
10421049

10431050
return warning;

0 commit comments

Comments
 (0)