
Commit 0cb8a9e

Merge remote-tracking branch 'Johannes/cuda-scratch-size-adjust' into concedo_experimental
# Conflicts:
#	llama.cpp

2 parents: 67cb0b2 + 600bf6d

File changed: llama.cpp (+35 −3 lines)
@@ -67,6 +67,7 @@ enum e_model {
     MODEL_65B,
 };
 
+static const size_t kB = 1024;
 static const size_t MB = 1024*1024;
 
 // computed for n_ctx == 2048
@@ -130,6 +131,34 @@ static const std::map<e_model, size_t> & MEM_REQ_EVAL()
     return k_sizes;
 }
 
+// amount of VRAM needed per batch size to hold temporary results
+// the values for 3b and 65b are not derived from testing but instead chosen conservatively
+static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_BASE()
+{
+    static std::map<e_model, size_t> k_sizes = {
+        { MODEL_3B,   512ull * kB },
+        { MODEL_7B,   512ull * kB },
+        { MODEL_13B,  640ull * kB },
+        { MODEL_30B,  768ull * kB },
+        { MODEL_65B, 1536ull * kB },
+    };
+    return k_sizes;
+}
+
+// amount of VRAM needed per batch size and context to hold temporary results
+// the values for 3b and 65b are not derived from testing but instead chosen conservatively
+static const std::map<e_model, size_t> & VRAM_REQ_SCRATCH_PER_CONTEXT()
+{
+    static std::map<e_model, size_t> k_sizes = {
+        { MODEL_3B,  128ull },
+        { MODEL_7B,  128ull },
+        { MODEL_13B, 160ull },
+        { MODEL_30B, 208ull },
+        { MODEL_65B, 416ull },
+    };
+    return k_sizes;
+}
+
 // default hparams (LLaMA 7B)
 struct llama_hparams {
     uint32_t n_vocab = 32000;
@@ -1114,11 +1143,14 @@ static void llama_model_load_internal(
         fprintf(stderr, "%s: not allocating a VRAM scratch buffer due to low VRAM option\n", __func__);
         ggml_cuda_set_scratch_size(0); // disable scratch
     } else {
-        vram_scratch = n_batch * MB * bigctxmul;
+        const size_t vram_scratch_base = VRAM_REQ_SCRATCH_BASE().at(model.type);
+        const size_t vram_scratch_per_context = VRAM_REQ_SCRATCH_PER_CONTEXT().at(model.type);
+        vram_scratch = n_batch * (vram_scratch_base + n_ctx * vram_scratch_per_context);
         ggml_cuda_set_scratch_size(vram_scratch);
         if (n_gpu_layers > 0) {
-            fprintf(stderr, "%s: allocating batch_size x 1 MB = %zd MB VRAM for the scratch buffer\n",
-                    __func__, vram_scratch / MB);
+            fprintf(stderr, "%s: allocating batch_size x (%zd kB + n_ctx x %zd B) = %zd MB VRAM for the scratch buffer\n",
+                    __func__, vram_scratch_base / kB, vram_scratch_per_context,
+                    (vram_scratch + MB - 1) / MB); // round up
         }
     }
 #endif // GGML_USE_CUBLAS
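
For a rough sense of the numbers: with this change the scratch allocation scales with both batch size and context length instead of a flat batch_size x 1 MB. The sketch below is not part of the commit; it simply plugs the MODEL_7B table entries into the new formula. The per-model constants are copied from the tables above, while the n_batch = 512 and n_ctx = 2048 defaults are illustrative assumptions.

// Minimal sketch (not from the commit): evaluating the new scratch-size
// formula for a 7B model with assumed n_batch/n_ctx defaults.
#include <cstdio>
#include <cstddef>

int main() {
    const size_t kB = 1024;
    const size_t MB = 1024 * 1024;

    const size_t n_batch = 512;   // assumed batch size
    const size_t n_ctx   = 2048;  // assumed context length

    const size_t vram_scratch_base        = 512 * kB;  // MODEL_7B entry of VRAM_REQ_SCRATCH_BASE()
    const size_t vram_scratch_per_context = 128;       // MODEL_7B entry of VRAM_REQ_SCRATCH_PER_CONTEXT()

    // same expression as the patched llama_model_load_internal()
    const size_t vram_scratch = n_batch * (vram_scratch_base + n_ctx * vram_scratch_per_context);

    // 512 * (512 kB + 2048 * 128 B) = 512 * 768 kB = 384 MB
    printf("VRAM scratch buffer: %zu MB\n", (vram_scratch + MB - 1) / MB);
    return 0;
}

Under those assumed defaults the buffer works out to 384 MB for 7B; larger models pay proportionally more through the bigger base and per-context constants in the tables.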
