
Commit 00a415d

llama : limit max batch size to n_batch
1 parent 937966d commit 00a415d
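
In practice this means a caller can no longer submit an entire long prompt to llama_decode in a single call sized up to n_ctx; the prompt must be fed in chunks of at most n_batch tokens. A minimal caller-side sketch, assuming the llama.cpp C API of this era (in particular the four-argument llama_batch_get_one(tokens, n_tokens, pos_0, seq_id)); the helper name decode_prompt is hypothetical:

    #include <stdio.h>
    #include "llama.h"

    // Evaluate a prompt in chunks of at most n_batch tokens. After this
    // commit, passing more than n_batch tokens to a single llama_decode
    // call trips GGML_ASSERT(n_tokens_all <= cparams.n_batch).
    static int decode_prompt(struct llama_context * ctx, llama_token * tokens,
                             int n_prompt, int n_batch) {
        for (int i = 0; i < n_prompt; i += n_batch) {
            const int n_eval = (n_prompt - i < n_batch) ? (n_prompt - i) : n_batch;
            // batch of n_eval tokens starting at position i, sequence 0
            llama_batch batch = llama_batch_get_one(tokens + i, n_eval, i, 0);
            if (llama_decode(ctx, batch) != 0) {
                fprintf(stderr, "llama_decode failed at token %d\n", i);
                return -1;
            }
        }
        return 0;
    }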

File tree

2 files changed: +3 additions, -7 deletions


ggml-backend.c

Lines changed: 0 additions & 3 deletions
@@ -1609,20 +1609,17 @@ static enum ggml_status ggml_backend_sched_compute_splits(ggml_backend_sched_t s
                 if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
                     ggml_backend_event_synchronize(sched->events[split_backend_id][sched->cur_copy]);
                 } else {
-                    //printf("%s: sync %s\n", __func__, ggml_backend_name(split_backend));
                     ggml_backend_synchronize(split_backend);
                 }
                 ggml_backend_tensor_copy(input, input_cpy);
             } else {
                 if (sched->events[split_backend_id][sched->cur_copy] != NULL) {
                     ggml_backend_event_wait(split_backend, sched->events[split_backend_id][sched->cur_copy]);
                 } else {
-                    //printf("%s: sync %s %s\n", __func__, ggml_backend_name(split_backend), ggml_backend_name(input_backend));
                     ggml_backend_synchronize(split_backend);
                     ggml_backend_synchronize(input_backend);
                 }
 
-                // split_backend waits on input_backend and then copies the data
                 ggml_backend_tensor_copy_async(input_backend, split_backend, input, input_cpy);
             }
         }

llama.cpp

Lines changed: 3 additions & 4 deletions
@@ -8770,9 +8770,8 @@ static int llama_decode_internal(
 
     GGML_ASSERT((!batch_all.token && batch_all.embd) || (batch_all.token && !batch_all.embd)); // NOLINT
 
-    GGML_ASSERT(n_tokens_all <= cparams.n_ctx);
+    GGML_ASSERT(n_tokens_all <= cparams.n_batch);
 
-    //const int64_t t_start_us = ggml_time_us();
     if (lctx.t_compute_start_us == 0) {
         lctx.t_compute_start_us = ggml_time_us();
     }
@@ -12959,8 +12958,8 @@ struct llama_context * llama_new_context_with_model(
        // graph outputs buffer
        {
            // resized during inference, reserve maximum
-           ctx->logits_size = hparams.n_vocab*cparams.n_ctx;
-           ctx->embd_size   = params.embeddings ? hparams.n_embd*cparams.n_ctx : 0;
+           ctx->logits_size = hparams.n_vocab*cparams.n_batch;
+           ctx->embd_size   = params.embeddings ? hparams.n_embd*cparams.n_batch : 0;
 
            const size_t buf_output_size = (ctx->logits_size + ctx->embd_size)*sizeof(float);
 