@@ -1156,6 +1156,7 @@ static void llama_model_load_internal(
             }
         }
 #endif // GGML_USE_CUBLAS
+
 #if defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
         const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
 
@@ -1164,6 +1165,10 @@ static void llama_model_load_internal(
             fprintf(stderr, "%s: offloading non-repeating layers to GPU\n", __func__);
         }
         size_t vram_kv_cache = 0;
+
+#ifdef GGML_USE_CUBLAS
+        const int max_backend_supported_layers = hparams.n_layer + 3;
+        const int max_offloadable_layers = low_vram ? hparams.n_layer + 1 : hparams.n_layer + 3;
         if (n_gpu_layers > (int) hparams.n_layer + 1) {
             if (low_vram) {
                 fprintf(stderr, "%s: cannot offload v cache to GPU due to low VRAM option\n", __func__);
@@ -1180,14 +1185,18 @@ static void llama_model_load_internal(
                 vram_kv_cache += MEM_REQ_KV_SELF().at(model.type) / 2;
             }
         }
-        const int max_offloadable_layers = low_vram ? hparams.n_layer + 1 : hparams.n_layer + 3;
+#elif defined(GGML_USE_CLBLAST)
+        const int max_backend_supported_layers = hparams.n_layer + 1;
+        const int max_offloadable_layers = hparams.n_layer + 1;
+#endif // GGML_USE_CUBLAS
+
         fprintf(stderr, "%s: offloaded %d/%d layers to GPU\n",
-                __func__, std::min(n_gpu_layers, max_offloadable_layers), hparams.n_layer + 3);
+                __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
         fprintf(stderr, "%s: total VRAM used: %zu MB\n",
                 __func__, (vram_weights + vram_scratch + vram_kv_cache + MB - 1) / MB); // round up
 #else
         (void) n_gpu_layers;
-#endif
+#endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
     }
 
     // populate `tensors_by_name`