Commit 213701b

Detokenizer fixes (ggml-org#8039)
* Add llama_detokenize():
  - Update header files location
  - UNKNOWN and CONTROL are 'special pieces'
  - Remove space after UNKNOWN and CONTROL
  - Refactor llama_token_to_piece()
  - Add flag: clean_up_tokenization_spaces
  - Symmetric params for llama_tokenize() and llama_detokenize()

* Update and fix tokenizer tests:
  - Using llama_detokenize()
  - Unexpected vocab type as test fail instead of error
    - Useful when automating tests:
      - If you don't know in advance the vocab type
      - Differentiate other loading errors
  - Skip unicode surrogates and undefined
  - Gracefully exit threads
    - Using exit() is throwing random exceptions
  - Clean old known problematic codepoints
  - Minor: confusing hexadecimal codepoint

* Update bruteforce random tests:
  - Add detokenizer checks
  - New generator: ascii_lr_strip
  - New generator: apostrophe
  - Add more vocab files
  - Detokenize special tokens
  - Replace errors with '\uFFFD' when detokenizing to 'utf-8'
  - More edge cases
  - Better detokenization results check

* Fix add_space_prefix, set false by default

* Better leading space removal

* Do not remove space when decoding special tokens

* Bugfix: custom regexes split undefined unicode codepoints

* 'viking' detokenizer clean spaces
1 parent be20e7f commit 213701b

File tree

11 files changed: +501 −267 lines

common/common.cpp

+21 −37

@@ -2592,51 +2592,35 @@ std::vector<llama_token> llama_tokenize(
 }
 
 std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token, bool special) {
-    std::vector<char> result(8, 0);
-    const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
-    if (n_tokens < 0) {
-        result.resize(-n_tokens);
-        int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size(), special);
-        GGML_ASSERT(check == -n_tokens);
-    } else {
-        result.resize(n_tokens);
-    }
-
-    return std::string(result.data(), result.size());
-}
-
-std::string llama_detokenize_spm(llama_context * ctx, const std::vector<llama_token> & tokens) {
-    const llama_token bos_id = llama_token_bos(llama_get_model(ctx));
-
     std::string piece;
-    std::string result;
-
-    for (size_t i = 0; i < tokens.size(); ++i) {
-        piece = llama_token_to_piece(ctx, tokens[i]);
-
-        // remove the leading space of the first non-BOS token
-        if (((tokens[0] == bos_id && i == 1) || (tokens[0] != bos_id && i == 0)) && piece[0] == ' ') {
-            piece = piece.substr(1);
-        }
-
-        result += piece;
+    piece.resize(piece.capacity());  // using string internal cache, 15 bytes + '\n'
+    const int n_chars = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), 0, special);
+    if (n_chars < 0) {
+        piece.resize(-n_chars);
+        int check = llama_token_to_piece(llama_get_model(ctx), token, &piece[0], piece.size(), 0, special);
+        GGML_ASSERT(check == -n_chars);
+    }
+    else {
+        piece.resize(n_chars);
     }
 
-    return result;
+    return piece;
 }
 
-std::string llama_detokenize_bpe(llama_context * ctx, const std::vector<llama_token> & tokens) {
-    std::string piece;
-    std::string result;
-
-    for (size_t i = 0; i < tokens.size(); ++i) {
-        piece = llama_token_to_piece(ctx, tokens[i]);
-
-        result += piece;
+std::string llama_detokenize(llama_context * ctx, const std::vector<llama_token> & tokens, bool special) {
+    std::string text;
+    text.resize(std::max(text.capacity(), tokens.size()));
+    int32_t n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
+    if (n_chars < 0) {
+        text.resize(-n_chars);
+        n_chars = llama_detokenize(llama_get_model(ctx), tokens.data(), (int32_t)tokens.size(), &text[0], (int32_t)text.size(), false, special);
+        GGML_ASSERT(n_chars <= (int32_t)text.size());  // whitespace trimming is performed after per-token detokenization
     }
 
+    text.resize(n_chars);
+
     // NOTE: the original tokenizer decodes bytes after collecting the pieces.
-    return result;
+    return text;
 }
 
 bool llama_should_add_bos_token(const llama_model * model) {
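For orientation, a minimal usage sketch of the refactored common helpers follows. It is not part of the diff; the context pointer and token vector are assumed to come from the caller, and the function name is hypothetical.

// Illustrative fragment only (not from the commit); assumes <cstdio>, <string>, <vector>
// and "common.h" are available, and that 'ctx' and 'generated' come from the caller.
static void print_and_collect(llama_context * ctx, const std::vector<llama_token> & generated) {
    for (llama_token tok : generated) {
        // stream one piece per token; special/control tokens are rendered when 'special' is true
        printf("%s", llama_token_to_piece(ctx, tok, true).c_str());
    }
    // rebuild the whole text in one call; space-prefix handling now happens inside the library
    const std::string full = llama_detokenize(ctx, generated, /*special*/ false);
    printf("\nfull text: %s\n", full.c_str());
}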

common/common.h

+4 −12

@@ -350,21 +350,13 @@ std::string llama_token_to_piece(
         llama_token   token,
         bool          special = true);
 
-// TODO: these should be moved in llama.h C-style API under single `llama_detokenize` function
-// that takes into account the tokenizer type and decides how to handle the leading space
-//
-// detokenizes a vector of tokens into a string
-// should work similar to Python's `tokenizer.decode`
-// removes the leading space from the first non-BOS token
-std::string llama_detokenize_spm(
-        llama_context * ctx,
-        const std::vector<llama_token> & tokens);
-
 // detokenizes a vector of tokens into a string
 // should work similar to Python's `tokenizer.decode`
-std::string llama_detokenize_bpe(
+// optionally renders special/control tokens
+std::string llama_detokenize(
         llama_context * ctx,
-        const std::vector<llama_token> & tokens);
+        const std::vector<llama_token> & tokens,
+        bool special = true);
 
 // Uses the value from the model metadata if possible, otherwise
 // defaults to true when model type is SPM, otherwise false.
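A hedged round-trip sketch of the now symmetric pair at the common layer; the prompt, flag values, and helper name are illustrative and not taken from the commit.

// Illustrative only; assumes "common.h" is included and 'ctx' holds a loaded model.
static bool roundtrip_matches(llama_context * ctx, const std::string & prompt) {
    // tokenize with BOS/EOS handling, without parsing special tokens in the text
    const std::vector<llama_token> toks = llama_tokenize(ctx, prompt, /*add_special*/ true, /*parse_special*/ false);
    // detokenize without rendering special tokens
    const std::string back = llama_detokenize(ctx, toks, /*special*/ false);
    // 'back' may still differ from 'prompt' by a leading space or cleaned-up tokenization spaces
    return back == prompt;
}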

examples/batched.swift/Sources/main.swift

+2 −1

@@ -229,7 +229,7 @@ private func tokenize(text: String, add_bos: Bool) -> [llama_token] {
 
 private func token_to_piece(token: llama_token, buffer: inout [CChar]) -> String? {
     var result = [CChar](repeating: 0, count: 8)
-    let nTokens = llama_token_to_piece(model, token, &result, Int32(result.count), false)
+    let nTokens = llama_token_to_piece(model, token, &result, Int32(result.count), 0, false)
     if nTokens < 0 {
         let actualTokensCount = -Int(nTokens)
         result = .init(repeating: 0, count: actualTokensCount)
@@ -238,6 +238,7 @@ private func token_to_piece(token: llama_token, buffer: inout [CChar]) -> String
             token,
             &result,
             Int32(result.count),
+            0,
             false
         )
         assert(check == actualTokensCount)

examples/llama.swiftui/llama.cpp.swift/LibLlama.swift

+2 −2

@@ -322,15 +322,15 @@ actor LlamaContext {
         defer {
             result.deallocate()
         }
-        let nTokens = llama_token_to_piece(model, token, result, 8, false)
+        let nTokens = llama_token_to_piece(model, token, result, 8, 0, false)
 
         if nTokens < 0 {
             let newResult = UnsafeMutablePointer<Int8>.allocate(capacity: Int(-nTokens))
             newResult.initialize(repeating: Int8(0), count: Int(-nTokens))
             defer {
                 newResult.deallocate()
             }
-            let nNewTokens = llama_token_to_piece(model, token, newResult, -nTokens, false)
+            let nNewTokens = llama_token_to_piece(model, token, newResult, -nTokens, 0, false)
             let bufferPointer = UnsafeBufferPointer(start: newResult, count: Int(nNewTokens))
             return Array(bufferPointer)
         } else {

include/llama.h

+18 −1

@@ -904,6 +904,7 @@ extern "C" {
     /// @param tokens The tokens pointer must be large enough to hold the resulting tokens.
     /// @return Returns the number of tokens on success, no more than n_tokens_max
     /// @return Returns a negative number on failure - the number of tokens that would have been returned
+    /// @param add_special Allow to add BOS and EOS tokens if model is configured to do so.
     /// @param parse_special Allow tokenizing special and/or control tokens which otherwise are not exposed and treated
     ///                      as plaintext. Does not insert a leading space.
     LLAMA_API int32_t llama_tokenize(
@@ -918,15 +919,31 @@ extern "C" {
     // Token Id -> Piece.
     // Uses the vocabulary in the provided context.
     // Does not write null terminator to the buffer.
-    // User code is responsible to remove the leading whitespace of the first non-BOS token when decoding multiple tokens.
+    // User can skip up to 'lstrip' leading spaces before copying (useful when encoding/decoding multiple tokens with 'add_space_prefix')
     // @param special If true, special tokens are rendered in the output.
     LLAMA_API int32_t llama_token_to_piece(
         const struct llama_model * model,
                        llama_token token,
                             char * buf,
                            int32_t length,
+                           int32_t lstrip,
                               bool special);
 
+    /// @details Convert the provided tokens into text (inverse of llama_tokenize()).
+    /// @param text The char pointer must be large enough to hold the resulting text.
+    /// @return Returns the number of chars/bytes on success, no more than text_len_max.
+    /// @return Returns a negative number on failure - the number of chars/bytes that would have been returned.
+    /// @param remove_special Allow to remove BOS and EOS tokens if model is configured to do so.
+    /// @param unparse_special If true, special tokens are rendered in the output.
+    LLAMA_API int32_t llama_detokenize(
+        const struct llama_model * model,
+               const llama_token * tokens,
+                           int32_t n_tokens,
+                            char * text,
+                           int32_t text_len_max,
+                              bool remove_special,
+                              bool unparse_special);
+
     /// Apply chat template. Inspired by hf apply_chat_template() on python.
     /// Both "model" and "custom_template" are optional, but at least one is required. "custom_template" has higher precedence than "model"
     /// NOTE: This function does not use a jinja parser. It only support a pre-defined list of template. See more: https://github.com/ggerganov/llama.cpp/wiki/Templates-supported-by-llama_chat_apply_template
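To make the buffer convention concrete, here is a minimal sketch of driving the new C entry point directly. The model pointer and token vector are assumed to come from the caller; it mirrors the resize-and-retry pattern used in common.cpp above rather than prescribing the only correct usage.

// Illustrative only; assumes <algorithm>, <string>, <vector> and "llama.h" are included.
static std::string detokenize_with_retry(const llama_model * model, const std::vector<llama_token> & toks) {
    std::vector<char> buf(64);
    int32_t n = llama_detokenize(model, toks.data(), (int32_t) toks.size(),
                                 buf.data(), (int32_t) buf.size(),
                                 /*remove_special*/ false, /*unparse_special*/ true);
    if (n < 0) {
        // a negative return value is the required size; grow the buffer and call again
        buf.resize((size_t) -n);
        n = llama_detokenize(model, toks.data(), (int32_t) toks.size(),
                             buf.data(), (int32_t) buf.size(),
                             /*remove_special*/ false, /*unparse_special*/ true);
    }
    return std::string(buf.data(), (size_t) std::max<int32_t>(n, 0));
}

The new lstrip argument of llama_token_to_piece plays a related role per token: the Swift examples above simply pass 0 to keep the previous behavior, while a positive value lets callers skip up to that many leading spaces.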
