Commit 00e3e5a

mtmd : add qwen2vl and qwen2.5vl (#13141)
* llava : add clip_n_output_tokens, deprecate clip_n_patches
* mtmd : add qwen2vl and qwen2.5vl
* decode_embd_batch::set_position_...
* working version
* deprecate llama-qwen2vl-cli
* correct order W, H of clip_embd_nbytes_by_img
* edit existing line in hot topics
1 parent e98b369 commit 00e3e5a

File tree: 10 files changed (+196 -79 lines)


README.md (+1 -1)

```diff
@@ -17,7 +17,7 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others)
 ## Hot topics
 
 - **GGML developer experience survey (organized and reviewed by NVIDIA):** [link](https://forms.gle/Gasw3cRgyhNEnrwK9)
-- A new binary `llama-mtmd-cli` is introduced to replace `llava-cli`, `minicpmv-cli` and `gemma3-cli` https://github.com/ggml-org/llama.cpp/pull/13012, `libllava` will be deprecated
+- A new binary `llama-mtmd-cli` is introduced to replace `llava-cli`, `minicpmv-cli`, `gemma3-cli` ([#13012](https://github.com/ggml-org/llama.cpp/pull/13012)) and `qwen2vl-cli` ([#13141](https://github.com/ggml-org/llama.cpp/pull/13141)), `libllava` will be deprecated
 - VS Code extension for FIM completions: https://github.com/ggml-org/llama.vscode
 - Universal [tool call support](./docs/function-calling.md) in `llama-server` https://github.com/ggml-org/llama.cpp/pull/9639
 - Vim/Neovim plugin for FIM completions: https://github.com/ggml-org/llama.vim
```

examples/llava/CMakeLists.txt (+1 -7)

```diff
@@ -64,13 +64,7 @@ endif()
 add_executable(llama-llava-cli deprecation-warning.cpp)
 add_executable(llama-gemma3-cli deprecation-warning.cpp)
 add_executable(llama-minicpmv-cli deprecation-warning.cpp)
-
-set(TARGET llama-qwen2vl-cli)
-add_executable(${TARGET} qwen2vl-cli.cpp)
-set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME llama-qwen2vl-cli)
-install(TARGETS ${TARGET} RUNTIME)
-target_link_libraries(${TARGET} PRIVATE common llava ${CMAKE_THREAD_LIBS_INIT})
-target_compile_features(${TARGET} PRIVATE cxx_std_17)
+add_executable(llama-qwen2vl-cli deprecation-warning.cpp)
 
 set(TARGET llama-mtmd-cli)
 add_executable(${TARGET} mtmd-cli.cpp)
```
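The build change turns `llama-qwen2vl-cli` into the same deprecation stub already used for `llava-cli`, `gemma3-cli` and `minicpmv-cli`. `deprecation-warning.cpp` is not shown in this diff; a plausible sketch of such a stub (wording assumed, not taken from the repo):

```cpp
#include <cstdio>

int main(int /*argc*/, char ** argv) {
    // report which binary was invoked and point the user at the replacement
    std::fprintf(stderr,
        "WARNING: %s is deprecated and has been replaced by llama-mtmd-cli.\n",
        argv[0]);
    return 1;
}
```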

examples/llava/clip.cpp (+30 -4)

```diff
@@ -2825,15 +2825,18 @@ void clip_free(clip_ctx * ctx) {
     delete ctx;
 }
 
+// deprecated
 size_t clip_embd_nbytes(const struct clip_ctx * ctx) {
-    return clip_n_patches(ctx) * clip_n_mmproj_embd(ctx) * sizeof(float);
+    const int32_t nx = ctx->vision_model.hparams.image_size;
+    const int32_t ny = ctx->vision_model.hparams.image_size;
+    return clip_embd_nbytes_by_img(ctx, nx, ny);
 }
 
-size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w) {
+size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_w, int img_h) {
     clip_image_f32 img;
     img.nx = img_w;
     img.ny = img_h;
-    return clip_n_patches_by_img(ctx, &img) * clip_n_mmproj_embd(ctx) * sizeof(float);
+    return clip_n_output_tokens(ctx, &img) * clip_n_mmproj_embd(ctx) * sizeof(float);
 }
 
 int32_t clip_get_image_size(const struct clip_ctx * ctx) {
@@ -2863,14 +2866,37 @@ size_t get_clip_image_grid_size(const struct clip_ctx * ctx) {
     return ctx->vision_model.hparams.image_grid_pinpoints.size();
 }
 
+// deprecated
 int clip_n_patches(const struct clip_ctx * ctx) {
     clip_image_f32 img;
     img.nx = ctx->vision_model.hparams.image_size;
     img.ny = ctx->vision_model.hparams.image_size;
-    return clip_n_patches_by_img(ctx, &img);
+    return clip_n_output_tokens(ctx, &img);
 }
 
+// deprecated
 int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
+    return clip_n_output_tokens(ctx, img);
+}
+
+int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
+    const auto & params = ctx->vision_model.hparams;
+    const int n_total = clip_n_output_tokens(ctx, img);
+    if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL) {
+        return img->nx / (params.patch_size * 2) + (int)(img->nx % params.patch_size > 0);
+    }
+    return n_total;
+}
+
+int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
+    const auto & params = ctx->vision_model.hparams;
+    if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL) {
+        return img->ny / (params.patch_size * 2) + (int)(img->ny % params.patch_size > 0);
+    }
+    return 1;
+}
+
+int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
     const auto & params = ctx->vision_model.hparams;
 
     int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size);
```
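The per-axis helpers make the Qwen2-VL token geometry easy to check by hand: the projector merges a 2x2 grid of patches into one output token, hence the `patch_size * 2` divisor. A standalone sketch of the arithmetic above, with an assumed `patch_size` of 14 and an example 448x336 preprocessed image (both values are illustrative; at runtime they come from the vision `hparams` and the resized input):

```cpp
#include <cstdio>

int main() {
    const int patch_size = 14;    // assumed; read from vision hparams at runtime
    const int nx = 448, ny = 336; // example preprocessed image dimensions

    // same formula as clip_n_output_tokens_x/_y for the Qwen2-VL projectors:
    // one token per 2x2 patch block, rounding up on a partial patch
    const int tokens_x = nx / (patch_size * 2) + (int)(nx % patch_size > 0);
    const int tokens_y = ny / (patch_size * 2) + (int)(ny % patch_size > 0);

    std::printf("x=%d y=%d total=%d\n", tokens_x, tokens_y, tokens_x * tokens_y);
    // 448/28 = 16, 336/28 = 12 -> 192 output tokens
    return 0;
}
```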

examples/llava/clip.h (+15 -4)

```diff
@@ -47,7 +47,7 @@ CLIP_API struct clip_ctx * clip_init(const char * fname, struct clip_context_par
 CLIP_API void clip_free(struct clip_ctx * ctx);
 
 CLIP_API size_t clip_embd_nbytes(const struct clip_ctx * ctx);
-CLIP_API size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w);
+CLIP_API size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_w, int img_h);
 
 CLIP_API int32_t clip_get_image_size (const struct clip_ctx * ctx);
 CLIP_API int32_t clip_get_patch_size (const struct clip_ctx * ctx);
@@ -59,9 +59,20 @@ CLIP_API const char * clip_patch_merge_type(const struct clip_ctx * ctx);
 CLIP_API const int32_t * clip_image_grid(const struct clip_ctx * ctx);
 CLIP_API size_t get_clip_image_grid_size(const struct clip_ctx * ctx);
 
-CLIP_API int clip_n_patches        (const struct clip_ctx * ctx);
-CLIP_API int clip_n_patches_by_img (const struct clip_ctx * ctx, struct clip_image_f32 * img);
-CLIP_API int clip_n_mmproj_embd    (const struct clip_ctx * ctx);
+GGML_DEPRECATED(CLIP_API int clip_n_patches(const struct clip_ctx * ctx),
+    "use clip_n_output_tokens instead");
+GGML_DEPRECATED(CLIP_API int clip_n_patches_by_img(const struct clip_ctx * ctx, struct clip_image_f32 * img),
+    "use clip_n_output_tokens instead");
+
+CLIP_API int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img);
+
+// for M-RoPE, this will be the number of token positions in X and Y directions
+// for other models, X will be the total number of tokens and Y will be 1
+CLIP_API int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 * img);
+CLIP_API int clip_n_output_tokens_y(const struct clip_ctx * ctx, struct clip_image_f32 * img);
+
+// this should be equal to the embedding dimension of the text model
+CLIP_API int clip_n_mmproj_embd(const struct clip_ctx * ctx);
 
 CLIP_API int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip);
 CLIP_API void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size);
```
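For downstream callers the migration is mechanical: `clip_n_patches` and `clip_n_patches_by_img` now emit a compile-time deprecation warning and forward to `clip_n_output_tokens`, which takes the actual image. A minimal sketch of an updated call site (the helper `image_embd_size` is hypothetical; `ctx` and `img` are assumed to come from `clip_init()` and the usual preprocessing):

```cpp
#include "clip.h"

// hypothetical helper: bytes needed for one image's output embeddings
static size_t image_embd_size(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
    // old (still compiles, but warns):
    //   int n_tokens = clip_n_patches_by_img(ctx, img);
    int n_tokens = clip_n_output_tokens(ctx, img); // same count, image-aware name
    return (size_t) n_tokens * clip_n_mmproj_embd(ctx) * sizeof(float);
}
```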

examples/llava/llava.cpp (+8 -7)

```diff
@@ -112,7 +112,7 @@ static struct clip_image_grid_shape get_anyres_image_grid_shape(const std::pair<
 }
 
 // Take the image segments in a grid configuration and return the embeddings and the number of embeddings into preallocated memory (image_embd_out)
-static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *> & image_embd_v, struct clip_image_grid_shape grid_shape, float * image_embd_out, int * n_img_pos_out) {
+static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *> & image_embd_v, struct clip_image_grid_shape grid_shape, float * image_embd_out, int * n_img_pos_out, clip_image_f32 * img_input) {
     struct {
         struct ggml_context * ctx;
     } model;
@@ -175,7 +175,7 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
 
     model.ctx = ggml_init(params);
 
-    struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_n_patches(ctx_clip), num_images - 1); // example: 4096 x 576 x 4
+    struct ggml_tensor * image_features = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, clip_n_mmproj_embd(ctx_clip), clip_n_output_tokens(ctx_clip, img_input), num_images - 1); // example: 4096 x 576 x 4
     // ggml_tensor_printf(image_features,"image_features",__LINE__,false,false);
     // fill it with the image embeddings, ignoring the base
     for (size_t i = 1; i < num_images; i++) {
@@ -214,8 +214,8 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
 
     memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as global context
     // append without newline tokens (default behavior in llava_arch when not using unpad ):
-    memcpy(image_embd_out + clip_n_patches(ctx_clip) * clip_n_mmproj_embd(ctx_clip), (float*)result->data, clip_embd_nbytes(ctx_clip) * (num_images-1)); // grid patches
-    *n_img_pos_out = static_cast<int>(result->ne[1]+clip_n_patches(ctx_clip));
+    memcpy(image_embd_out + clip_n_output_tokens(ctx_clip, img_input) * clip_n_mmproj_embd(ctx_clip), (float*)result->data, clip_embd_nbytes(ctx_clip) * (num_images-1)); // grid patches
+    *n_img_pos_out = static_cast<int>(result->ne[1]+clip_n_output_tokens(ctx_clip, img_input));
 
     // Debug: Test single segments
     // Current findings: sending base image, sending a segment embedding all works similar to python
@@ -313,7 +313,7 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
                 image_embd + n_img_pos_out * clip_n_mmproj_embd(ctx_clip),
                 image_embd_v[i],
                 clip_embd_nbytes_by_img(ctx_clip, nx, ny));
-            n_img_pos_out += clip_n_patches_by_img(ctx_clip, img_res);
+            n_img_pos_out += clip_n_output_tokens(ctx_clip, img_res);
         }
         *n_img_pos = n_img_pos_out;
         for (size_t i = 0; i < image_embd_v.size(); i++) {
@@ -342,8 +342,8 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
     }
     else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) {
         // flat / default llava-1.5 type embedding
-        *n_img_pos = clip_n_patches(ctx_clip);
         clip_image_f32 * img_res = clip_image_f32_get_img(img_res_v.get(), 0);
+        *n_img_pos = clip_n_output_tokens(ctx_clip, img_res);
         bool encoded = clip_image_encode(ctx_clip, n_threads, img_res, image_embd); // image_embd shape is 576 x 4096
         if (!encoded) {
             LOG_ERR("Unable to encode image\n");
@@ -381,7 +381,8 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
         struct clip_image_grid_shape grid_shape = get_anyres_image_grid_shape({img->nx,img->ny}, grid_pinpoints, image_size);
 
         int n_img_pos_out;
-        clip_llava_handle_patches(ctx_clip, image_embd_v, grid_shape, image_embd, &n_img_pos_out);
+        clip_image_f32 * img_input = clip_image_f32_get_img(img_res_v.get(), 0);
+        clip_llava_handle_patches(ctx_clip, image_embd_v, grid_shape, image_embd, &n_img_pos_out, img_input);
         *n_img_pos = n_img_pos_out;
 
         for (size_t i = 0; i < image_embd_v.size(); i++) {
```
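The common thread in these hunks is that the token count must be derived from the actual preprocessed image instead of the square `hparams.image_size` default baked into the old `clip_n_patches(ctx)`. A toy comparison of the two counts under plain patch-grid arithmetic (all numbers assumed for illustration: `image_size` 336, `patch_size` 14, a 448x336 input):

```cpp
#include <cstdio>

int main() {
    const int image_size = 336, patch_size = 14; // assumed hparams
    // old size-agnostic count: always (336/14)^2
    const int n_fixed = (image_size / patch_size) * (image_size / patch_size); // 576
    // image-aware count for a 448x336 input under a plain patch grid
    const int nx = 448, ny = 336;
    const int n_by_img = (nx / patch_size) * (ny / patch_size); // 32 * 24 = 768
    std::printf("fixed: %d, by image: %d\n", n_fixed, n_by_img);
    return 0;
}
```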

examples/llava/mtmd-cli.cpp (+2 -34)

```diff
@@ -136,39 +136,6 @@ struct mtmd_cli_context {
     }
 };
 
-struct decode_embd_batch {
-    std::vector<llama_pos>      pos;
-    std::vector<int32_t>        n_seq_id;
-    std::vector<llama_seq_id>   seq_id_0;
-    std::vector<llama_seq_id *> seq_ids;
-    std::vector<int8_t>         logits;
-    llama_batch batch;
-    decode_embd_batch(float * embd, int32_t n_tokens, llama_pos pos_0, llama_seq_id seq_id) {
-        pos     .resize(n_tokens);
-        n_seq_id.resize(n_tokens);
-        seq_ids .resize(n_tokens + 1);
-        logits  .resize(n_tokens);
-        seq_id_0.resize(1);
-        seq_id_0[0] = seq_id;
-        seq_ids [n_tokens] = nullptr;
-        batch = {
-            /*n_tokens =*/ n_tokens,
-            /*tokens   =*/ nullptr,
-            /*embd     =*/ embd,
-            /*pos      =*/ pos.data(),
-            /*n_seq_id =*/ n_seq_id.data(),
-            /*seq_id   =*/ seq_ids.data(),
-            /*logits   =*/ logits.data(),
-        };
-        for (int i = 0; i < n_tokens; i++) {
-            batch.pos     [i] = pos_0 + i;
-            batch.n_seq_id[i] = 1;
-            batch.seq_id  [i] = seq_id_0.data();
-            batch.logits  [i] = false;
-        }
-    }
-};
-
 static int generate_response(mtmd_cli_context & ctx, common_sampler * smpl, int n_predict) {
     llama_tokens generated_tokens;
     for (int i = 0; i < n_predict; i++) {
@@ -243,7 +210,7 @@ static int eval_message(mtmd_cli_context & ctx, common_chat_msg & msg, std::vect
         return 1;
     }
 
-    ctx.n_past += mtmd_helper_get_n_tokens(chunks);
+    ctx.n_past += mtmd_helper_get_n_pos(chunks);
 
     return 0;
 }
@@ -371,6 +338,7 @@
         }
     }
     if (g_is_interrupted) LOG("\nInterrupted by user\n");
+    LOG("\n\n");
    llama_perf_context_print(ctx.lctx);
    return g_is_interrupted ? 130 : 0;
}
```
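The `n_past` fix swaps token counting for position counting: both helpers appear in this diff, and for ordinary models they agree, but for the M-RoPE models added here (Qwen2-VL / Qwen2.5-VL) an image chunk's position extent differs from its token count. Illustrative arithmetic only, with an assumed M-RoPE convention that a whole image spans a single temporal position; the real numbers come from `mtmd_helper_get_n_pos()`:

```cpp
#include <cstdio>

int main() {
    const int tx = 16, ty = 12;        // example image token grid (see clip.cpp sketch)
    const int n_tokens = tx * ty;      // 192 embeddings are decoded for the image
    const int n_pos_mrope  = 1;        // assumed: one temporal position under M-RoPE
    const int n_pos_normal = n_tokens; // 1-D RoPE: one position per token
    std::printf("tokens=%d  n_past advance: mrope=%d  normal=%d\n",
                n_tokens, n_pos_mrope, n_pos_normal);
    return 0;
}
```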
