Commit 4064e7c

Merge branch 'ggerganov:master' into mistral.cpp

2 parents: 92bb200 + 15fa07a

117 files changed: +3096, -1229 lines


.devops/llama-server.Dockerfile
Lines changed: 2 additions & 2 deletions

@@ -3,7 +3,7 @@ ARG UBUNTU_VERSION=22.04
 FROM ubuntu:$UBUNTU_VERSION AS build

 RUN apt-get update && \
-    apt-get install -y build-essential git libcurl4-openssl-dev curl
+    apt-get install -y build-essential git libcurl4-openssl-dev

 WORKDIR /app

@@ -16,7 +16,7 @@ RUN make -j$(nproc) llama-server
 FROM ubuntu:$UBUNTU_VERSION AS runtime

 RUN apt-get update && \
-    apt-get install -y libcurl4-openssl-dev libgomp1
+    apt-get install -y libcurl4-openssl-dev libgomp1 curl

 COPY --from=build /app/llama-server /llama-server

.devops/nix/package.nix
Lines changed: 3 additions & 10 deletions

@@ -126,16 +126,9 @@ let
       ++ optionals useMetalKit [ MetalKit ];

     cudaBuildInputs = with cudaPackages; [
-      cuda_cccl.dev # <nv/target>
-
-      # A temporary hack for reducing the closure size, remove once cudaPackages
-      # have stopped using lndir: https://github.com/NixOS/nixpkgs/issues/271792
-      cuda_cudart.dev
-      cuda_cudart.lib
-      cuda_cudart.static
-      libcublas.dev
-      libcublas.lib
-      libcublas.static
+      cuda_cudart
+      cuda_cccl # <nv/target>
+      libcublas
     ];

     rocmBuildInputs = with rocmPackages; [

.github/workflows/build.yml
Lines changed: 2 additions & 1 deletion

@@ -860,7 +860,8 @@ jobs:
           mkdir build
           cd build
           cmake .. -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_CUDA=ON -DBUILD_SHARED_LIBS=ON
-          cmake --build . --config Release -j $((${env:NUMBER_OF_PROCESSORS} - 1))
+          cmake --build . --config Release -j $((${env:NUMBER_OF_PROCESSORS} - 1)) -t ggml
+          cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}

       - name: Determine tag name
         id: tag

CMakeLists.txt
Lines changed: 2 additions & 1 deletion

@@ -139,7 +139,8 @@ set(LLAMA_BIN_INSTALL_DIR ${CMAKE_INSTALL_BINDIR} CACHE PATH "Location o
 # determining _precisely_ which defines are necessary for the llama-config
 # package.
 #
-get_directory_property(GGML_DIR_DEFINES DIRECTORY ggml/src COMPILE_DEFINITIONS)
+get_target_property(GGML_DIRECTORY ggml SOURCE_DIR)
+get_directory_property(GGML_DIR_DEFINES DIRECTORY ${GGML_DIRECTORY} COMPILE_DEFINITIONS)
 get_target_property(GGML_TARGET_DEFINES ggml COMPILE_DEFINITIONS)
 set(GGML_TRANSIENT_DEFINES ${GGML_TARGET_DEFINES} ${GGML_DIR_DEFINES})
 get_target_property(GGML_LINK_LIBRARIES ggml LINK_LIBRARIES)

CONTRIBUTING.md
Lines changed: 1 addition & 0 deletions

@@ -5,6 +5,7 @@
 - Execute [the full CI locally on your machine](ci/README.md) before publishing
 - Please rate the complexity of your PR (i.e. `Review Complexity : Low`, `Review Complexity : Medium`, `Review Complexity : High`). This makes it easier for maintainers to triage the PRs.
 - The PR template has a series of review complexity checkboxes `[ ]` that [you can mark as](https://docs.github.com/en/get-started/writing-on-github/working-with-advanced-formatting/about-task-lists) `[X]` for your convenience
+- Consider allowing write access to your branch for faster review
 - If your PR becomes stale, don't hesitate to ping the maintainers in the comments

 # Pull requests (for collaborators)

Makefile
Lines changed: 24 additions & 24 deletions

@@ -890,15 +890,16 @@ ggml/src/ggml-metal-embed.o: \
 	ggml/src/ggml-common.h
 	@echo "Embedding Metal library"
 	@sed -e '/#include "ggml-common.h"/r ggml/src/ggml-common.h' -e '/#include "ggml-common.h"/d' < ggml/src/ggml-metal.metal > ggml/src/ggml-metal-embed.metal
-	$(eval TEMP_ASSEMBLY=$(shell mktemp))
-	@echo ".section __DATA, __ggml_metallib" > $(TEMP_ASSEMBLY)
-	@echo ".globl _ggml_metallib_start" >> $(TEMP_ASSEMBLY)
-	@echo "_ggml_metallib_start:" >> $(TEMP_ASSEMBLY)
-	@echo ".incbin \"ggml/src/ggml-metal-embed.metal\"" >> $(TEMP_ASSEMBLY)
-	@echo ".globl _ggml_metallib_end" >> $(TEMP_ASSEMBLY)
-	@echo "_ggml_metallib_end:" >> $(TEMP_ASSEMBLY)
-	@$(AS) $(TEMP_ASSEMBLY) -o $@
-	@rm -f ${TEMP_ASSEMBLY}
+	$(eval TEMP_ASSEMBLY=$(shell mktemp -d))
+	@echo ".section __DATA, __ggml_metallib" > $(TEMP_ASSEMBLY)/ggml-metal-embed.s
+	@echo ".globl _ggml_metallib_start" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
+	@echo "_ggml_metallib_start:" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
+	@echo ".incbin \"ggml/src/ggml-metal-embed.metal\"" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
+	@echo ".globl _ggml_metallib_end" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
+	@echo "_ggml_metallib_end:" >> $(TEMP_ASSEMBLY)/ggml-metal-embed.s
+	$(CC) $(CFLAGS) -c $(TEMP_ASSEMBLY)/ggml-metal-embed.s -o $@
+	@rm -f ${TEMP_ASSEMBLY}/ggml-metal-embed.s
+	@rmdir ${TEMP_ASSEMBLY}
 endif
 endif # GGML_METAL

@@ -1617,42 +1618,41 @@ llama-q8dot: pocs/vdot/q8dot.cpp ggml/src/ggml.o \
 # Mark legacy binary targets as .PHONY so that they are always checked.
 .PHONY: main quantize perplexity embedding server finetune

+# Define the object file target
+examples/deprecation-warning/deprecation-warning.o: examples/deprecation-warning/deprecation-warning.cpp
+	$(CXX) $(CXXFLAGS) -c $< -o $@
+
 # NOTE: We currently will always build the deprecation-warning `main` and `server` binaries to help users migrate.
 # Eventually we will want to remove these target from building all the time.
-main: examples/deprecation-warning/deprecation-warning.cpp
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+main: examples/deprecation-warning/deprecation-warning.o
+	$(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS)
 	@echo "NOTICE: The 'main' binary is deprecated. Please use 'llama-cli' instead."

-server: examples/deprecation-warning/deprecation-warning.cpp
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+server: examples/deprecation-warning/deprecation-warning.o
+	$(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS)
 	@echo "NOTICE: The 'server' binary is deprecated. Please use 'llama-server' instead."

-quantize: examples/deprecation-warning/deprecation-warning.cpp
+quantize: examples/deprecation-warning/deprecation-warning.o
 ifneq (,$(wildcard quantize))
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS)
 	@echo "#########"
 	@echo "WARNING: The 'quantize' binary is deprecated. Please use 'llama-quantize' instead."
 	@echo " Remove the 'quantize' binary to remove this warning."
 	@echo "#########"
 endif

-perplexity: examples/deprecation-warning/deprecation-warning.cpp
+perplexity: examples/deprecation-warning/deprecation-warning.o
 ifneq (,$(wildcard perplexity))
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS)
 	@echo "#########"
 	@echo "WARNING: The 'perplexity' binary is deprecated. Please use 'llama-perplexity' instead."
 	@echo " Remove the 'perplexity' binary to remove this warning."
 	@echo "#########"
 endif

-embedding: examples/deprecation-warning/deprecation-warning.cpp
+embedding: examples/deprecation-warning/deprecation-warning.o
 ifneq (,$(wildcard embedding))
-	$(CXX) $(CXXFLAGS) -c $< -o $(call GET_OBJ_FILE, $<)
-	$(CXX) $(CXXFLAGS) $(filter-out %.h $<,$^) $(call GET_OBJ_FILE, $<) -o $@ $(LDFLAGS)
+	$(CXX) $(CXXFLAGS) $< -o $@ $(LDFLAGS)
 	@echo "#########"
 	@echo "WARNING: The 'embedding' binary is deprecated. Please use 'llama-embedding' instead."
 	@echo " Remove the 'embedding' binary to remove this warning."

README.md
Lines changed: 9 additions & 0 deletions

@@ -95,8 +95,16 @@ Typically finetunes of the base models below are supported as well.
 - [x] [SEA-LION](https://huggingface.co/models?search=sea-lion)
 - [x] [GritLM-7B](https://huggingface.co/GritLM/GritLM-7B) + [GritLM-8x7B](https://huggingface.co/GritLM/GritLM-8x7B)
 - [x] [OLMo](https://allenai.org/olmo)
+- [x] [Granite models](https://huggingface.co/collections/ibm-granite/granite-code-models-6624c5cec322e4c148c8b330)
 - [x] [GPT-NeoX](https://github.com/EleutherAI/gpt-neox) + [Pythia](https://github.com/EleutherAI/pythia)
+- [x] [Snowflake-Arctic MoE](https://huggingface.co/collections/Snowflake/arctic-66290090abe542894a5ac520)
+- [x] [Smaug](https://huggingface.co/models?search=Smaug)
+- [x] [Poro 34B](https://huggingface.co/LumiOpen/Poro-34B)
+- [x] [Bitnet b1.58 models](https://huggingface.co/1bitLLM)
+- [x] [Flan T5](https://huggingface.co/models?search=flan-t5)
+- [x] [Open Elm models](https://huggingface.co/collections/apple/openelm-instruct-models-6619ad295d7ae9f868b759ca)
 - [x] [ChatGLM3-6b](https://huggingface.co/THUDM/chatglm3-6b) + [ChatGLM4-9b](https://huggingface.co/THUDM/glm-4-9b)
+- [x] [SmolLM](https://huggingface.co/collections/HuggingFaceTB/smollm-6695016cad7167254ce15966)

 (instructions for supporting more models: [HOWTO-add-model.md](./docs/development/HOWTO-add-model.md))

@@ -145,6 +153,7 @@ Unless otherwise noted these projects are open-source with permissive licensing:
 - [Faraday](https://faraday.dev/) (proprietary)
 - [LMStudio](https://lmstudio.ai/) (proprietary)
 - [Layla](https://play.google.com/store/apps/details?id=com.laylalite) (proprietary)
+- [ramalama](https://github.com/containers/ramalama) (MIT)
 - [LocalAI](https://github.com/mudler/LocalAI) (MIT)
 - [LostRuins/koboldcpp](https://github.com/LostRuins/koboldcpp) (AGPL)
 - [Mozilla-Ocho/llamafile](https://github.com/Mozilla-Ocho/llamafile)

common/common.cpp
Lines changed: 53 additions & 27 deletions

@@ -684,14 +684,24 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
     }
     if (arg == "--lora") {
         CHECK_ARG
-        params.lora_adapter.emplace_back(argv[i], 1.0f);
+        params.lora_adapters.push_back({
+            std::string(argv[i]),
+            1.0,
+        });
         return true;
     }
     if (arg == "--lora-scaled") {
         CHECK_ARG
-        const char* lora_adapter = argv[i];
+        std::string lora_adapter = argv[i];
         CHECK_ARG
-        params.lora_adapter.emplace_back(lora_adapter, std::stof(argv[i]));
+        params.lora_adapters.push_back({
+            lora_adapter,
+            std::stof(argv[i]),
+        });
+        return true;
+    }
+    if (arg == "--lora-init-without-apply") {
+        params.lora_init_without_apply = true;
         return true;
     }
     if (arg == "--control-vector") {
@@ -1634,7 +1644,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
     options.push_back({ "server", " --host HOST", "ip address to listen (default: %s)", params.hostname.c_str() });
     options.push_back({ "server", " --port PORT", "port to listen (default: %d)", params.port });
     options.push_back({ "server", " --path PATH", "path to serve static files from (default: %s)", params.public_path.c_str() });
-    options.push_back({ "server", " --embedding(s)", "enable embedding endpoint (default: %s)", params.embedding ? "enabled" : "disabled" });
+    options.push_back({ "server", " --embedding(s)", "restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled" });
     options.push_back({ "server", " --api-key KEY", "API key to use for authentication (default: none)" });
     options.push_back({ "server", " --api-key-file FNAME", "path to file containing API keys (default: none)" });
     options.push_back({ "server", " --ssl-key-file FNAME", "path to file a PEM-encoded SSL private key" });
@@ -1654,6 +1664,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
         "https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template" });
     options.push_back({ "server", "-sps, --slot-prompt-similarity SIMILARITY",
         "how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity });
+    options.push_back({ "server", " --lora-init-without-apply", "load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: %s)", params.lora_init_without_apply ? "enabled" : "disabled"});

 #ifndef LOG_DISABLE_LOGS
     options.push_back({ "logging" });
@@ -2039,8 +2050,8 @@ std::string fs_get_cache_file(const std::string & filename) {
 //
 // Model utils
 //
-
-std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params) {
+struct llama_init_result llama_init_from_gpt_params(gpt_params & params) {
+    llama_init_result iparams;
     auto mparams = llama_model_params_from_gpt_params(params);

     llama_model * model = nullptr;
@@ -2055,7 +2066,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par

     if (model == NULL) {
         fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
-        return std::make_tuple(nullptr, nullptr);
+        return iparams;
     }

     auto cparams = llama_context_params_from_gpt_params(params);
@@ -2064,7 +2075,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
     if (lctx == NULL) {
         fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());
         llama_free_model(model);
-        return std::make_tuple(nullptr, nullptr);
+        return iparams;
     }

     if (!params.control_vectors.empty()) {
@@ -2075,7 +2086,7 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
         if (cvec.n_embd == -1) {
             llama_free(lctx);
             llama_free_model(model);
-            return std::make_tuple(nullptr, nullptr);
+            return iparams;
         }

         int err = llama_control_vector_apply(lctx,
@@ -2087,21 +2098,26 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
         if (err) {
             llama_free(lctx);
             llama_free_model(model);
-            return std::make_tuple(nullptr, nullptr);
+            return iparams;
         }
     }

-    for (unsigned int i = 0; i < params.lora_adapter.size(); ++i) {
-        const std::string & lora_adapter = std::get<0>(params.lora_adapter[i]);
-        float lora_scale = std::get<1>(params.lora_adapter[i]);
-        auto adapter = llama_lora_adapter_init(model, lora_adapter.c_str());
-        if (adapter == nullptr) {
-            fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
+    // load and optionally apply lora adapters
+    for (auto & la : params.lora_adapters) {
+        llama_lora_adapter_container loaded_la;
+        loaded_la.path = la.path;
+        loaded_la.scale = la.scale;
+        loaded_la.adapter = llama_lora_adapter_init(model, la.path.c_str());
+        if (loaded_la.adapter == nullptr) {
+            fprintf(stderr, "%s: error: failed to apply lora adapter '%s'\n", __func__, la.path.c_str());
             llama_free(lctx);
             llama_free_model(model);
-            return std::make_tuple(nullptr, nullptr);
+            return iparams;
         }
-        llama_lora_adapter_set(lctx, adapter, lora_scale);
+        iparams.lora_adapters.push_back(loaded_la); // copy to list of loaded adapters
+    }
+    if (!params.lora_init_without_apply) {
+        llama_lora_adapters_apply(lctx, iparams.lora_adapters);
     }

     if (params.ignore_eos) {
@@ -2135,7 +2151,18 @@ std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_par
         llama_reset_timings(lctx);
     }

-    return std::make_tuple(model, lctx);
+    iparams.model = model;
+    iparams.context = lctx;
+    return iparams;
+}
+
+void llama_lora_adapters_apply(struct llama_context * ctx, std::vector<llama_lora_adapter_container> & lora_adapters) {
+    llama_lora_adapter_clear(ctx);
+    for (auto & la : lora_adapters) {
+        if (la.scale != 0.0f) {
+            llama_lora_adapter_set(ctx, la.adapter, la.scale);
+        }
+    }
 }

 struct llama_model_params llama_model_params_from_gpt_params(const gpt_params & params) {
@@ -3160,19 +3187,18 @@ void yaml_dump_non_result_info(FILE * stream, const gpt_params & params, const l
     }

     fprintf(stream, "lora:\n");
-    for (std::tuple<std::string, float> la : params.lora_adapter) {
-        if (std::get<1>(la) != 1.0f) {
-            continue;
+    for (auto & la : params.lora_adapters) {
+        if (la.scale == 1.0f) {
+            fprintf(stream, " - %s\n", la.path.c_str());
         }
-        fprintf(stream, " - %s\n", std::get<0>(la).c_str());
     }
     fprintf(stream, "lora_scaled:\n");
-    for (std::tuple<std::string, float> la : params.lora_adapter) {
-        if (std::get<1>(la) == 1.0f) {
-            continue;
+    for (auto & la : params.lora_adapters) {
+        if (la.scale != 1.0f) {
+            fprintf(stream, " - %s: %f\n", la.path.c_str(), la.scale);
         }
-        fprintf(stream, " - %s: %f\n", std::get<0>(la).c_str(), std::get<1>(la));
     }
+    fprintf(stream, "lora_init_without_apply: %s # default: false\n", params.lora_init_without_apply ? "true" : "false");
     fprintf(stream, "main_gpu: %d # default: 0\n", params.main_gpu);
     fprintf(stream, "min_keep: %d # default: 0 (disabled)\n", sparams.min_keep);
     fprintf(stream, "mirostat: %d # default: 0 (disabled)\n", sparams.mirostat);

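The common.cpp hunks above replace the std::tuple returned by llama_init_from_gpt_params with a llama_init_result struct and add a --lora-init-without-apply path where adapters are loaded but not yet applied. As a rough illustration only, a minimal hypothetical caller against this new API might look like the sketch below; parameter setup and error handling are reduced to the bare minimum, and nothing here beyond the declared functions and fields is taken from the commit itself.

    #include "common.h"

    int main(int argc, char ** argv) {
        gpt_params params;
        if (!gpt_params_parse(argc, argv, params)) { // parses --lora, --lora-scaled, --lora-init-without-apply
            return 1;
        }

        // old API: std::tie(model, ctx) = llama_init_from_gpt_params(params);
        // new API: one struct carries the model, the context and the loaded adapters
        llama_init_result init = llama_init_from_gpt_params(params);
        if (init.model == nullptr || init.context == nullptr) {
            return 1;
        }

        // with --lora-init-without-apply the adapters are only loaded; they can be
        // applied (or re-applied with different scales) later on
        if (params.lora_init_without_apply && !init.lora_adapters.empty()) {
            init.lora_adapters[0].scale = 0.5f; // illustrative scale change
            llama_lora_adapters_apply(init.context, init.lora_adapters);
        }

        // ... run inference with init.context ...

        llama_free(init.context);
        llama_free_model(init.model);
        return 0;
    }

When --lora-init-without-apply is not set, llama_init_from_gpt_params applies the loaded adapters itself via llama_lora_adapters_apply, as shown in the last common.cpp hunk above.
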
common/common.h
Lines changed: 21 additions & 4 deletions

@@ -33,6 +33,15 @@

 #define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"

+struct llama_lora_adapter_info {
+    std::string path;
+    float scale;
+};
+
+struct llama_lora_adapter_container : llama_lora_adapter_info {
+    struct llama_lora_adapter * adapter;
+};
+
 // build info
 extern int LLAMA_BUILD_NUMBER;
 extern char const * LLAMA_COMMIT;
@@ -126,8 +135,8 @@ struct gpt_params {
     std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
     std::vector<llama_model_kv_override> kv_overrides;

-    // TODO: avoid tuple, use struct
-    std::vector<std::tuple<std::string, float>> lora_adapter; // lora adapter path with user defined scale
+    bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_lora_adapter_apply)
+    std::vector<llama_lora_adapter_info> lora_adapters; // lora adapter path with user defined scale

     std::vector<llama_control_vector_load_info> control_vectors; // control vector with user defined scale

@@ -308,15 +317,23 @@ std::string fs_get_cache_file(const std::string & filename);
 // Model utils
 //

-// TODO: avoid tuplue, use struct
-std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(gpt_params & params);
+struct llama_init_result {
+    struct llama_model * model = nullptr;
+    struct llama_context * context = nullptr;
+    std::vector<llama_lora_adapter_container> lora_adapters;
+};
+
+struct llama_init_result llama_init_from_gpt_params(gpt_params & params);

 struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params);
 struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);

 struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
 struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);

+// clear LoRA adapters from context, then apply new list of adapters
+void llama_lora_adapters_apply(struct llama_context * ctx, std::vector<llama_lora_adapter_container> & lora_adapters);
+
 // Batch utils

 void llama_batch_clear(struct llama_batch & batch);

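To show how the new header-level pieces compose, here is another small hypothetical sketch of toggling adapters at runtime. The helper function and its "enable only the first adapter" policy are illustrative and not part of the commit; it only uses the types and functions declared above.

    #include "common.h"

    // Assumes `init` came from llama_init_from_gpt_params() and holds at least one
    // loaded adapter (e.g. the program was run with --lora-init-without-apply).
    static void enable_only_first_adapter(llama_init_result & init) {
        for (size_t i = 0; i < init.lora_adapters.size(); ++i) {
            // llama_lora_adapters_apply() skips adapters whose scale is 0.0f,
            // so zeroing the scale effectively disables an adapter
            init.lora_adapters[i].scale = (i == 0) ? 1.0f : 0.0f;
        }
        // clears previously applied adapters, then applies the non-zero ones
        llama_lora_adapters_apply(init.context, init.lora_adapters);
    }

This deferred-application pattern is what the new --lora-init-without-apply option refers to when its help text mentions applying adapters later via POST /lora-adapters.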