Commit 665cc11

add lowvram parameter

1 parent 222cbbb

4 files changed: 8 additions, 3 deletions


Makefile (+2 -2)

@@ -160,8 +160,8 @@ ifdef LLAMA_HIPBLAS
     CC := $(ROCM_PATH)/llvm/bin/clang
     CXX := $(ROCM_PATH)/llvm/bin/clang++
     GPU_TARGETS = gfx900 gfx906 gfx908 gfx90a gfx1030
-    LLAMA_CUDA_DMMV_X ?= 128
-    LLAMA_CUDA_DMMV_Y ?= 4
+    LLAMA_CUDA_DMMV_X ?= 64
+    LLAMA_CUDA_DMMV_Y ?= 2
     CFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C)
     CXXFLAGS += -DGGML_USE_HIPBLAS -DGGML_USE_CUBLAS $(shell $(ROCM_PATH)/bin/hipconfig -C)
     LDFLAGS += -L/opt/rocm/lib -Wl,-rpath=$(ROCM_PATH)/lib -lhipblas -lamdhip64
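Since these assignments use `?=`, the new, smaller defaults (64/2 instead of 128/4) apply only when the variables are not already defined, so the previous values can still be requested per build. A minimal sketch of such an override (assuming the HIPBLAS build is enabled by defining LLAMA_HIPBLAS, as the `ifdef` in this hunk suggests):

    # restore the old DMMV tuning for this one build
    make LLAMA_HIPBLAS=1 LLAMA_CUDA_DMMV_X=128 LLAMA_CUDA_DMMV_Y=4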

expose.h (+1)

@@ -8,6 +8,7 @@ struct load_model_inputs
     const int max_context_length;
     const int batch_size;
     const bool f16_kv;
+    const bool low_vram;
     const char * executable_path;
     const char * model_filename;
     const char * lora_filename;

gpttype_adapter.cpp (+1)

@@ -371,6 +371,7 @@ ModelLoadResult gpttype_load_model(const load_model_inputs inputs, FileFormat in
     //llama_ctx_paran_parts = -1;
     llama_ctx_params.seed = -1;
     llama_ctx_params.f16_kv = inputs.f16_kv;
+    llama_ctx_params.low_vram = inputs.low_vram;
     llama_ctx_params.logits_all = false;
     llama_ctx_params.use_mmap = inputs.use_mmap;
     llama_ctx_params.use_mlock = inputs.use_mlock;

koboldcpp.py (+4 -1)

@@ -16,6 +16,7 @@ class load_model_inputs(ctypes.Structure):
                 ("max_context_length", ctypes.c_int),
                 ("batch_size", ctypes.c_int),
                 ("f16_kv", ctypes.c_bool),
+                ("low_vram", ctypes.c_bool),
                 ("executable_path", ctypes.c_char_p),
                 ("model_filename", ctypes.c_char_p),
                 ("lora_filename", ctypes.c_char_p),
@@ -150,6 +151,7 @@ def load_model(model_filename):
     inputs.batch_size = 8
     inputs.max_context_length = maxctx #initial value to use for ctx, can be overwritten
     inputs.threads = args.threads
+    inputs.low_vram = args.lowvram
     inputs.blasthreads = args.blasthreads
     inputs.f16_kv = True
     inputs.use_mmap = (not args.nommap)
@@ -646,7 +648,7 @@ def onDropdownChange(event):
     #load all the vars
     args.threads = int(threads_var.get())
     args.gpulayers = int(gpu_layers_var.get())
-
+
     args.stream = (stream.get()==1)
     args.smartcontext = (smartcontext.get()==1)
     args.launch = (launchbrowser.get()==1)
@@ -861,6 +863,7 @@ def main(args):
     parser.add_argument("--hordeconfig", help="Sets the display model name to something else, for easy use on AI Horde. Optional additional parameters set the horde max genlength and max ctxlen.",metavar=('[hordename]', '[hordelength] [hordectx]'), nargs='+')
     compatgroup = parser.add_mutually_exclusive_group()
     compatgroup.add_argument("--noblas", help="Do not use OpenBLAS for accelerated prompt ingestion", action='store_true')
+    parser.add_argument("--lowvram", help="Do not keep scratch memory in VRAM for CUDA", action='store_true')
     compatgroup.add_argument("--useclblast", help="Use CLBlast instead of OpenBLAS for prompt ingestion. Must specify exactly 2 arguments, platform ID and device ID (e.g. --useclblast 1 0).", type=int, choices=range(0,9), nargs=2)
     parser.add_argument("--gpulayers", help="Set number of layers to offload to GPU when using CLBlast. Requires CLBlast.",metavar=('[GPU layers]'), type=int, default=0)
     args = parser.parse_args()
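Because `--lowvram` is declared with action='store_true', args.lowvram is False unless the flag is given, so default behavior is unchanged; `inputs.low_vram = args.lowvram` in load_model then carries the choice into the C++ side. A usage sketch (the model-file argument is an assumption, since the rest of the CLI is not shown in this diff):

    python koboldcpp.py model.bin --lowvram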
