use fprintf for diagnostic output, keep printf only for printing model output #48
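
The idea behind the change: every diagnostic and progress message goes through fprintf(stderr, ...), while printf (i.e. stdout) is reserved for the text the model actually generates, so stdout can be piped or redirected without picking up log noise. A minimal sketch of that convention (illustrative only, not code from this diff; token_text is a made-up placeholder):

#include <cstdio>

int main() {
    // Diagnostics go to stderr: they stay visible on the terminal but are
    // kept out of any pipe or file that captures stdout.
    fprintf(stderr, "%s: starting up, please wait ...\n", __func__);

    // Model output goes to stdout only, so a redirect such as
    // ./main ... > out.txt (or a pipe into another tool) receives
    // nothing but the generated text.
    const char * token_text = "Hello";  // made-up stand-in for a decoded token
    printf("%s", token_text);
    fflush(stdout);

    return 0;
}

With the streams separated, redirecting stderr (for example adding 2> /dev/null or 2> llama.log to the command line) leaves only the generated text on stdout.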

Merged 1 commit on Mar 13, 2023
main.cpp: 92 changes (46 additions & 46 deletions)
@@ -85,7 +85,7 @@ struct llama_model {

// load the model's weights from a file
bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab & vocab, int n_ctx) {
- printf("%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());
+ fprintf(stderr, "%s: loading model from '%s' - please wait ...\n", __func__, fname.c_str());

auto fin = std::ifstream(fname, std::ios::binary);
if (!fin) {
@@ -124,16 +124,16 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
n_ff = ((2*(4*hparams.n_embd)/3 + hparams.n_mult - 1)/hparams.n_mult)*hparams.n_mult;
n_parts = LLAMA_N_PARTS.at(hparams.n_embd);

- printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
- printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
- printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
- printf("%s: n_mult = %d\n", __func__, hparams.n_mult);
- printf("%s: n_head = %d\n", __func__, hparams.n_head);
- printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
- printf("%s: n_rot = %d\n", __func__, hparams.n_rot);
- printf("%s: f16 = %d\n", __func__, hparams.f16);
- printf("%s: n_ff = %d\n", __func__, n_ff);
- printf("%s: n_parts = %d\n", __func__, n_parts);
+ fprintf(stderr, "%s: n_vocab = %d\n", __func__, hparams.n_vocab);
+ fprintf(stderr, "%s: n_ctx = %d\n", __func__, hparams.n_ctx);
+ fprintf(stderr, "%s: n_embd = %d\n", __func__, hparams.n_embd);
+ fprintf(stderr, "%s: n_mult = %d\n", __func__, hparams.n_mult);
+ fprintf(stderr, "%s: n_head = %d\n", __func__, hparams.n_head);
+ fprintf(stderr, "%s: n_layer = %d\n", __func__, hparams.n_layer);
+ fprintf(stderr, "%s: n_rot = %d\n", __func__, hparams.n_rot);
+ fprintf(stderr, "%s: f16 = %d\n", __func__, hparams.f16);
+ fprintf(stderr, "%s: n_ff = %d\n", __func__, n_ff);
+ fprintf(stderr, "%s: n_parts = %d\n", __func__, n_parts);
}

// load vocab
@@ -158,7 +158,7 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
vocab.id_to_token[i] = word;

//if (i < 30000) {
- // printf("%s: vocab[%d] = '%s'\n", __func__, i, word.c_str());
+ // fprintf(stderr, "%s: vocab[%d] = '%s'\n", __func__, i, word.c_str());
//}
}
}
@@ -217,7 +217,7 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab

ctx_size += (5 + 10*n_layer)*256; // object overhead

- printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
+ fprintf(stderr, "%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
}

// create the ggml context
@@ -304,7 +304,7 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab

const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v);

- printf("%s: memory_size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem);
+ fprintf(stderr, "%s: memory_size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem);
}

const size_t file_offset = fin.tellg();
@@ -322,7 +322,7 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
fname_part += "." + std::to_string(i);
}

- printf("%s: loading model part %d/%d from '%s'\n", __func__, i+1, n_parts, fname_part.c_str());
+ fprintf(stderr, "%s: loading model part %d/%d from '%s'\n", __func__, i+1, n_parts, fname_part.c_str());

fin = std::ifstream(fname_part, std::ios::binary);
fin.seekg(file_offset);
@@ -332,7 +332,7 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
int n_tensors = 0;
size_t total_size = 0;

- printf("%s: ", __func__);
+ fprintf(stderr, "%s: ", __func__);

while (true) {
int32_t n_dims;
@@ -432,7 +432,7 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab

if (0) {
static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
- printf("%24s - [%5d, %5d], type = %6s, split = %d\n", name.data(), ne[0], ne[1], ftype_str[ftype], split_type);
+ fprintf(stderr, "%24s - [%5d, %5d], type = %6s, split = %d\n", name.data(), ne[0], ne[1], ftype_str[ftype], split_type);
}

size_t bpe = 0;
@@ -495,16 +495,16 @@ bool llama_model_load(const std::string & fname, llama_model & model, gpt_vocab
total_size += ggml_nbytes(tensor)/n_parts;
}

- //printf("%42s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0);
+ //fprintf(stderr, "%42s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0);
if (++n_tensors % 8 == 0) {
- printf(".");
- fflush(stdout);
+ fprintf(stderr, ".");
+ fflush(stderr);
}
}

- printf(" done\n");
+ fprintf(stderr, " done\n");

- printf("%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, n_tensors);
+ fprintf(stderr, "%s: model size = %8.2f MB / num tensors = %d\n", __func__, total_size/1024.0/1024.0, n_tensors);
}

fin.close();
@@ -548,7 +548,7 @@ bool llama_eval(

if (mem_per_token > 0 && mem_per_token*N > buf_size) {
const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead
- //printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
+ //fprintf(stderr, "\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);

// reallocate
buf_size = buf_size_new;
@@ -740,7 +740,7 @@ bool llama_eval(
if (mem_per_token == 0) {
mem_per_token = ggml_used_mem(ctx0)/N;
}
- //printf("used_mem = %zu\n", ggml_used_mem(ctx0));
+ //fprintf(stderr, "used_mem = %zu\n", ggml_used_mem(ctx0));

ggml_free(ctx0);

@@ -776,7 +776,7 @@ int main(int argc, char ** argv) {
params.seed = time(NULL);
}

- printf("%s: seed = %d\n", __func__, params.seed);
+ fprintf(stderr, "%s: seed = %d\n", __func__, params.seed);

std::mt19937 rng(params.seed);
if (params.prompt.empty()) {
@@ -818,13 +818,13 @@ int main(int argc, char ** argv) {
// tokenize the reverse prompt
std::vector<gpt_vocab::id> antiprompt_inp = ::llama_tokenize(vocab, params.antiprompt, false);

- printf("\n");
- printf("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
- printf("%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
+ fprintf(stderr, "\n");
+ fprintf(stderr, "%s: prompt: '%s'\n", __func__, params.prompt.c_str());
+ fprintf(stderr, "%s: number of tokens in prompt = %zu\n", __func__, embd_inp.size());
for (int i = 0; i < (int) embd_inp.size(); i++) {
- printf("%6d -> '%s'\n", embd_inp[i], vocab.id_to_token.at(embd_inp[i]).c_str());
+ fprintf(stderr, "%6d -> '%s'\n", embd_inp[i], vocab.id_to_token.at(embd_inp[i]).c_str());
}
- printf("\n");
+ fprintf(stderr, "\n");
if (params.interactive) {
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
struct sigaction sigint_action;
@@ -834,19 +834,19 @@ int main(int argc, char ** argv) {
sigaction(SIGINT, &sigint_action, NULL);
#endif

- printf("%s: interactive mode on.\n", __func__);
+ fprintf(stderr, "%s: interactive mode on.\n", __func__);

if(antiprompt_inp.size()) {
- printf("%s: reverse prompt: '%s'\n", __func__, params.antiprompt.c_str());
- printf("%s: number of tokens in reverse prompt = %zu\n", __func__, antiprompt_inp.size());
+ fprintf(stderr, "%s: reverse prompt: '%s'\n", __func__, params.antiprompt.c_str());
+ fprintf(stderr, "%s: number of tokens in reverse prompt = %zu\n", __func__, antiprompt_inp.size());
for (int i = 0; i < (int) antiprompt_inp.size(); i++) {
- printf("%6d -> '%s'\n", antiprompt_inp[i], vocab.id_to_token.at(antiprompt_inp[i]).c_str());
+ fprintf(stderr, "%6d -> '%s'\n", antiprompt_inp[i], vocab.id_to_token.at(antiprompt_inp[i]).c_str());
}
- printf("\n");
+ fprintf(stderr, "\n");
}
}
- printf("sampling parameters: temp = %f, top_k = %d, top_p = %f, repeat_last_n = %i, repeat_penalty = %f\n", params.temp, params.top_k, params.top_p, params.repeat_last_n, params.repeat_penalty);
- printf("\n\n");
+ fprintf(stderr, "sampling parameters: temp = %f, top_k = %d, top_p = %f, repeat_last_n = %i, repeat_penalty = %f\n", params.temp, params.top_k, params.top_p, params.repeat_last_n, params.repeat_penalty);
+ fprintf(stderr, "\n\n");

std::vector<gpt_vocab::id> embd;

@@ -860,7 +860,7 @@ int main(int argc, char ** argv) {


if (params.interactive) {
- printf("== Running in interactive mode. ==\n"
+ fprintf(stderr, "== Running in interactive mode. ==\n"
#if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__))
" - Press Ctrl+C to interject at any time.\n"
#endif
@@ -888,7 +888,7 @@ int main(int argc, char ** argv) {
const int64_t t_start_us = ggml_time_us();

if (!llama_eval(model, params.n_threads, n_past, embd, logits, mem_per_token)) {
- printf("Failed to predict\n");
+ fprintf(stderr, "Failed to predict\n");
return 1;
}

@@ -1000,7 +1000,7 @@ int main(int argc, char ** argv) {

// end of text token
if (embd.back() == 2) {
- printf(" [end of text]\n");
+ fprintf(stderr, " [end of text]\n");
break;
}
}
@@ -1010,12 +1010,12 @@ int main(int argc, char ** argv) {
{
const int64_t t_main_end_us = ggml_time_us();

- printf("\n\n");
- printf("%s: mem per token = %8zu bytes\n", __func__, mem_per_token);
- printf("%s: load time = %8.2f ms\n", __func__, t_load_us/1000.0f);
- printf("%s: sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f);
- printf("%s: predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/n_past);
- printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
+ fprintf(stderr, "\n\n");
+ fprintf(stderr, "%s: mem per token = %8zu bytes\n", __func__, mem_per_token);
+ fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, t_load_us/1000.0f);
+ fprintf(stderr, "%s: sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f);
+ fprintf(stderr, "%s: predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/n_past);
+ fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
}

ggml_free(model.ctx);