@@ -67,6 +67,7 @@ enum e_model {
67
67
MODEL_65B,
68
68
};
69
69
70
// unit constants for memory-size bookkeeping (bytes)
static const size_t kB = 1024;
static const size_t MB = 1024*1024;
71
72
72
73
// computed for n_ctx == 2048
@@ -130,6 +131,34 @@ static const std::map<e_model, size_t> & MEM_REQ_EVAL()
130
131
return k_sizes;
131
132
}
132
133
134
+ // amount of VRAM needed per batch size to hold temporary results
135
+ // the values for 3b and 65b are not derived from testing but instead chosen conservatively
136
+ static const std::map<e_model, size_t > & VRAM_REQ_SCRATCH_BASE ()
137
+ {
138
+ static std::map<e_model, size_t > k_sizes = {
139
+ { MODEL_3B, 512ull * kB },
140
+ { MODEL_7B, 512ull * kB },
141
+ { MODEL_13B, 640ull * kB },
142
+ { MODEL_30B, 768ull * kB },
143
+ { MODEL_65B, 1536ull * kB },
144
+ };
145
+ return k_sizes;
146
+ }
147
+
148
+ // amount of VRAM needed per batch size and context to hold temporary results
149
+ // the values for 3b and 65b are not derived from testing but instead chosen conservatively
150
+ static const std::map<e_model, size_t > & VRAM_REQ_SCRATCH_PER_CONTEXT ()
151
+ {
152
+ static std::map<e_model, size_t > k_sizes = {
153
+ { MODEL_3B, 128ull },
154
+ { MODEL_7B, 128ull },
155
+ { MODEL_13B, 160ull },
156
+ { MODEL_30B, 208ull },
157
+ { MODEL_65B, 416ull },
158
+ };
159
+ return k_sizes;
160
+ }
161
+
133
162
// default hparams (LLaMA 7B)
134
163
struct llama_hparams {
135
164
uint32_t n_vocab = 32000 ;
@@ -1114,11 +1143,14 @@ static void llama_model_load_internal(
1114
1143
fprintf (stderr, " %s: not allocating a VRAM scratch buffer due to low VRAM option\n " , __func__);
1115
1144
ggml_cuda_set_scratch_size (0 ); // disable scratch
1116
1145
} else {
1117
- vram_scratch = n_batch * MB * bigctxmul;
1146
+ const size_t vram_scratch_base = VRAM_REQ_SCRATCH_BASE ().at (model.type );
1147
+ const size_t vram_scratch_per_context = VRAM_REQ_SCRATCH_PER_CONTEXT ().at (model.type );
1148
+ vram_scratch = n_batch * (vram_scratch_base + n_ctx * vram_scratch_per_context);
1118
1149
ggml_cuda_set_scratch_size (vram_scratch);
1119
1150
if (n_gpu_layers > 0 ) {
1120
- fprintf (stderr, " %s: allocating batch_size x 1 MB = %zd MB VRAM for the scratch buffer\n " ,
1121
- __func__, vram_scratch / MB);
1151
+ fprintf (stderr, " %s: allocating batch_size x (%zd kB + n_ctx x %zd B) = %zd MB VRAM for the scratch buffer\n " ,
1152
+ __func__, vram_scratch_base / kB , vram_scratch_per_context,
1153
+ (vram_scratch + MB - 1 ) / MB); // round up
1122
1154
}
1123
1155
}
1124
1156
#endif // GGML_USE_CUBLAS
0 commit comments