Skip to content

Commit 02dc83e

Browse files
authored
feat(llm): adds several settings for llamacpp and ollama (#1703)
1 parent 410bf7a commit 02dc83e

File tree

10 files changed

+91
-8
lines changed

10 files changed

+91
-8
lines changed

private_gpt/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
"""private-gpt."""
2+
23
import logging
34
import os
45

private_gpt/components/llm/llm_component.py

Lines changed: 25 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -39,16 +39,23 @@ def __init__(self, settings: Settings) -> None:
3939
) from e
4040

4141
prompt_style = get_prompt_style(settings.llamacpp.prompt_style)
42-
42+
settings_kwargs = {
43+
"tfs_z": settings.llamacpp.tfs_z, # ollama and llama-cpp
44+
"top_k": settings.llamacpp.top_k, # ollama and llama-cpp
45+
"top_p": settings.llamacpp.top_p, # ollama and llama-cpp
46+
"repeat_penalty": settings.llamacpp.repeat_penalty, # ollama llama-cpp
47+
"n_gpu_layers": -1,
48+
"offload_kqv": True,
49+
}
4350
self.llm = LlamaCPP(
4451
model_path=str(models_path / settings.llamacpp.llm_hf_model_file),
45-
temperature=0.1,
52+
temperature=settings.llm.temperature,
4653
max_new_tokens=settings.llm.max_new_tokens,
4754
context_window=settings.llm.context_window,
4855
generate_kwargs={},
4956
callback_manager=LlamaIndexSettings.callback_manager,
5057
# All to GPU
51-
model_kwargs={"n_gpu_layers": -1, "offload_kqv": True},
58+
model_kwargs=settings_kwargs,
5259
# transform inputs into Llama2 format
5360
messages_to_prompt=prompt_style.messages_to_prompt,
5461
completion_to_prompt=prompt_style.completion_to_prompt,
@@ -108,8 +115,22 @@ def __init__(self, settings: Settings) -> None:
108115
) from e
109116

110117
ollama_settings = settings.ollama
118+
119+
settings_kwargs = {
120+
"tfs_z": ollama_settings.tfs_z, # ollama and llama-cpp
121+
"num_predict": ollama_settings.num_predict, # ollama only
122+
"top_k": ollama_settings.top_k, # ollama and llama-cpp
123+
"top_p": ollama_settings.top_p, # ollama and llama-cpp
124+
"repeat_last_n": ollama_settings.repeat_last_n, # ollama
125+
"repeat_penalty": ollama_settings.repeat_penalty, # ollama llama-cpp
126+
}
127+
111128
self.llm = Ollama(
112-
model=ollama_settings.llm_model, base_url=ollama_settings.api_base
129+
model=ollama_settings.llm_model,
130+
base_url=ollama_settings.api_base,
131+
temperature=settings.llm.temperature,
132+
context_window=settings.llm.context_window,
133+
additional_kwargs=settings_kwargs,
113134
)
114135
case "mock":
115136
self.llm = MockLLM()

private_gpt/components/vector_store/vector_store_component.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -137,9 +137,11 @@ def get_retriever(
137137
index=index,
138138
similarity_top_k=similarity_top_k,
139139
doc_ids=context_filter.docs_ids if context_filter else None,
140-
filters=_doc_id_metadata_filter(context_filter)
141-
if self.settings.vectorstore.database != "qdrant"
142-
else None,
140+
filters=(
141+
_doc_id_metadata_filter(context_filter)
142+
if self.settings.vectorstore.database != "qdrant"
143+
else None
144+
),
143145
)
144146

145147
def close(self) -> None:

private_gpt/launcher.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
"""FastAPI app creation, logger configuration and main API routes."""
2+
23
import logging
34

45
from fastapi import Depends, FastAPI, Request

private_gpt/server/utils/auth.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
* https://fastapi.tiangolo.com/tutorial/security/
1313
* https://fastapi.tiangolo.com/tutorial/dependencies/dependencies-in-path-operation-decorators/
1414
"""
15+
1516
# mypy: ignore-errors
1617
# Disabled mypy error: All conditional function variants must have identical signatures
1718
# We are changing the implementation of the authenticated method, based on

private_gpt/settings/settings.py

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,10 @@ class LLMSettings(BaseModel):
9898
"like `HuggingFaceH4/zephyr-7b-beta`. If not set, will load a tokenizer matching "
9999
"gpt-3.5-turbo LLM.",
100100
)
101+
temperature: float = Field(
102+
0.1,
103+
description="The temperature of the model. Increasing the temperature will make the model answer more creatively. A value of 0.1 would be more factual.",
104+
)
101105

102106

103107
class VectorstoreSettings(BaseModel):
@@ -119,6 +123,23 @@ class LlamaCPPSettings(BaseModel):
119123
),
120124
)
121125

126+
tfs_z: float = Field(
127+
1.0,
128+
description="Tail free sampling is used to reduce the impact of less probable tokens from the output. A higher value (e.g., 2.0) will reduce the impact more, while a value of 1.0 disables this setting.",
129+
)
130+
top_k: int = Field(
131+
40,
132+
description="Reduces the probability of generating nonsense. A higher value (e.g. 100) will give more diverse answers, while a lower value (e.g. 10) will be more conservative. (Default: 40)",
133+
)
134+
top_p: float = Field(
135+
0.9,
136+
description="Works together with top-k. A higher value (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. (Default: 0.9)",
137+
)
138+
repeat_penalty: float = Field(
139+
1.1,
140+
description="Sets how strongly to penalize repetitions. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. (Default: 1.1)",
141+
)
142+
122143

123144
class HuggingFaceSettings(BaseModel):
124145
embedding_hf_model_name: str = Field(
@@ -184,6 +205,30 @@ class OllamaSettings(BaseModel):
184205
None,
185206
description="Model to use. Example: 'nomic-embed-text'.",
186207
)
208+
tfs_z: float = Field(
209+
1.0,
210+
description="Tail free sampling is used to reduce the impact of less probable tokens from the output. A higher value (e.g., 2.0) will reduce the impact more, while a value of 1.0 disables this setting.",
211+
)
212+
num_predict: int = Field(
213+
None,
214+
description="Maximum number of tokens to predict when generating text. (Default: 128, -1 = infinite generation, -2 = fill context)",
215+
)
216+
top_k: int = Field(
217+
40,
218+
description="Reduces the probability of generating nonsense. A higher value (e.g. 100) will give more diverse answers, while a lower value (e.g. 10) will be more conservative. (Default: 40)",
219+
)
220+
top_p: float = Field(
221+
0.9,
222+
description="Works together with top-k. A higher value (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. (Default: 0.9)",
223+
)
224+
repeat_last_n: int = Field(
225+
64,
226+
description="Sets how far back for the model to look back to prevent repetition. (Default: 64, 0 = disabled, -1 = num_ctx)",
227+
)
228+
repeat_penalty: float = Field(
229+
1.1,
230+
description="Sets how strongly to penalize repetitions. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. (Default: 1.1)",
231+
)
187232

188233

189234
class UISettings(BaseModel):

private_gpt/ui/ui.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
"""This file should be imported only and only if you want to run the UI locally."""
2+
23
import itertools
34
import logging
45
import time

settings-ollama.yaml

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ llm:
55
mode: ollama
66
max_new_tokens: 512
77
context_window: 3900
8+
temperature: 0.1 # The temperature of the model. Increasing the temperature will make the model answer more creatively. A value of 0.1 would be more factual. (Default: 0.1)
89

910
embedding:
1011
mode: ollama
@@ -13,10 +14,14 @@ ollama:
1314
llm_model: mistral
1415
embedding_model: nomic-embed-text
1516
api_base: http://localhost:11434
17+
tfs_z: 1.0 # Tail free sampling is used to reduce the impact of less probable tokens from the output. A higher value (e.g., 2.0) will reduce the impact more, while a value of 1.0 disables this setting.
18+
top_k: 40 # Reduces the probability of generating nonsense. A higher value (e.g. 100) will give more diverse answers, while a lower value (e.g. 10) will be more conservative. (Default: 40)
19+
top_p: 0.9 # Works together with top-k. A higher value (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. (Default: 0.9)
20+
repeat_last_n: 64 # Sets how far back the model looks to prevent repetition. (Default: 64, 0 = disabled, -1 = num_ctx)
21+
repeat_penalty: 1.2 # Sets how strongly to penalize repetitions. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. (Default: 1.1)
1622

1723
vectorstore:
1824
database: qdrant
1925

2026
qdrant:
2127
path: local_data/private_gpt/qdrant
22-

settings.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,11 +39,16 @@ llm:
3939
# Should be matching the selected model
4040
max_new_tokens: 512
4141
context_window: 3900
42+
temperature: 0.1 # The temperature of the model. Increasing the temperature will make the model answer more creatively. A value of 0.1 would be more factual. (Default: 0.1)
4243

4344
llamacpp:
4445
prompt_style: "mistral"
4546
llm_hf_repo_id: TheBloke/Mistral-7B-Instruct-v0.2-GGUF
4647
llm_hf_model_file: mistral-7b-instruct-v0.2.Q4_K_M.gguf
48+
tfs_z: 1.0 # Tail free sampling is used to reduce the impact of less probable tokens from the output. A higher value (e.g., 2.0) will reduce the impact more, while a value of 1.0 disables this setting.
49+
top_k: 40 # Reduces the probability of generating nonsense. A higher value (e.g. 100) will give more diverse answers, while a lower value (e.g. 10) will be more conservative. (Default: 40)
50+
top_p: 1.0 # Works together with top-k. A higher value (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. (Default: 0.9)
51+
repeat_penalty: 1.1 # Sets how strongly to penalize repetitions. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient. (Default: 1.1)
4752

4853
embedding:
4954
# Should be matching the value above in most cases

tests/server/utils/test_simple_auth.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
is currently architected (it is hard to patch the `settings` and the app while
66
the tests are directly importing them).
77
"""
8+
89
from typing import Annotated
910

1011
import pytest

0 commit comments

Comments
 (0)