@@ -842,15 +842,15 @@ static ggml_cgraph * clip_image_build_graph_llama4(clip_ctx * ctx, const clip_im
     ggml_set_name(inp_raw, "inp_raw");
     ggml_set_input(inp_raw);
 
-    // create patches
-    ggml_tensor * patch_embd_view = ggml_view_4d(ctx0, model.patch_embeddings_0,
-        patch_size, patch_size, 3, hidden_size,
-        ggml_row_size(model.patch_embeddings_0->type, patch_size),
-        ggml_row_size(model.patch_embeddings_0->type, patch_size * patch_size),
-        ggml_row_size(model.patch_embeddings_0->type, patch_size * patch_size * 3), 0);
-    ggml_tensor * inp = ggml_conv_2d(ctx0, patch_embd_view, inp_raw, patch_size, patch_size, 0, 0, 1, 1);
-    inp = ggml_reshape_2d(ctx0, inp, num_patches, hidden_size);
-    inp = ggml_cont(ctx0, ggml_transpose(ctx0, inp));
+    // Llama4UnfoldConvolution
+    ggml_tensor * inp;
+    {
+        ggml_tensor * kernel = ggml_reshape_4d(ctx0, model.patch_embeddings_0,
+                                               patch_size, patch_size, 3, hidden_size);
+        inp = ggml_im2col(ctx0, kernel, inp_raw, patch_size, patch_size, 0, 0, 1, 1, true, inp_raw->type);
+        inp = ggml_mul_mat(ctx0, model.patch_embeddings_0, inp);
+        inp = ggml_reshape_2d(ctx0, inp, hidden_size, num_patches);
+    }
 
     // add CLS
     inp = ggml_concat(ctx0, inp, model.class_embedding, 1);
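The new path mirrors HF's `Llama4UnfoldConvolution`: since the patch embedding uses stride equal to the kernel size with no padding, the conv2d over non-overlapping patches is exactly im2col (unfold) followed by a single matrix multiply with the flattened kernel. A minimal standalone sketch of that equivalence in plain C++ (no ggml; all sizes are made-up toy values, not the model's):

```cpp
// Sketch: conv2d with stride == kernel size and no padding is equivalent to
// im2col + one matmul. Toy dimensions, deterministic pseudo-random data.
#include <cassert>
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
    const int C = 3, H = 8, W = 8;   // input channels / height / width (toy)
    const int P = 4;                 // patch_size (kernel size == stride)
    const int D = 5;                 // hidden_size (output channels, toy)
    const int n_patches = (H / P) * (W / P);

    std::vector<float> img(C * H * W), kernel(D * C * P * P);
    for (size_t i = 0; i < img.size(); i++)    img[i]    = std::sin(0.1f * i);
    for (size_t i = 0; i < kernel.size(); i++) kernel[i] = std::cos(0.2f * i);

    // (a) direct conv2d, stride = P, no padding: out_conv[d][p]
    std::vector<float> out_conv(D * n_patches, 0.0f);
    for (int d = 0; d < D; d++) {
        int p = 0;
        for (int y = 0; y + P <= H; y += P) {
            for (int x = 0; x + P <= W; x += P, p++) {
                float acc = 0.0f;
                for (int c = 0; c < C; c++)
                    for (int ky = 0; ky < P; ky++)
                        for (int kx = 0; kx < P; kx++)
                            acc += img[c*H*W + (y+ky)*W + (x+kx)]
                                 * kernel[d*C*P*P + c*P*P + ky*P + kx];
                out_conv[d*n_patches + p] = acc;
            }
        }
    }

    // (b) im2col: flatten each patch into a column of length K = C*P*P ...
    const int K = C * P * P;
    std::vector<float> cols(n_patches * K);
    {
        int p = 0;
        for (int y = 0; y + P <= H; y += P)
            for (int x = 0; x + P <= W; x += P, p++)
                for (int c = 0; c < C; c++)
                    for (int ky = 0; ky < P; ky++)
                        for (int kx = 0; kx < P; kx++)
                            cols[p*K + c*P*P + ky*P + kx] =
                                img[c*H*W + (y+ky)*W + (x+kx)];
    }

    // ... then one matmul against the kernel viewed as a D x K matrix
    std::vector<float> out_mm(D * n_patches, 0.0f);
    for (int d = 0; d < D; d++)
        for (int p = 0; p < n_patches; p++)
            for (int k = 0; k < K; k++)
                out_mm[d*n_patches + p] += kernel[d*K + k] * cols[p*K + k];

    for (size_t i = 0; i < out_conv.size(); i++)
        assert(std::fabs(out_conv[i] - out_mm[i]) < 1e-3f);
    std::printf("conv2d and im2col+matmul agree on %d patches\n", n_patches);
    return 0;
}
```

This is also why the old `ggml_view_4d` strides and the trailing transpose/cont disappear: the im2col output is already laid out so that one `ggml_mul_mat` against `model.patch_embeddings_0` yields `hidden_size x num_patches` directly.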
@@ -3578,12 +3578,12 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
                 // last pos is always kept 0, it's for CLS
                 // dimension H
                 for (int i = 0; i < num_patches; i++) {
-                    pos_data[i] = i / n_patches_per_col;
+                    pos_data[i] = (i / n_patches_per_col) + 1;
                 }
                 set_input_i32("pos_h", pos_data);
                 // dimension W
                 for (int i = 0; i < num_patches; i++) {
-                    pos_data[i] = i % n_patches_per_col;
+                    pos_data[i] = (i % n_patches_per_col) + 1;
                 }
                 set_input_i32("pos_w", pos_data);
             } break;
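The +1 shift makes the H/W position indices 1-based so that index 0 stays reserved for the CLS token, matching the "last pos is always kept 0" comment above the loops. A small sketch of the resulting layout (plain C++; the 3x3 grid and the local `pos_h`/`pos_w` buffers are hypothetical, only `n_patches_per_col`/`num_patches` come from the diff):

```cpp
// Sketch: 1-based row/column indices per patch, with the final slot left
// at 0 for the CLS token.
#include <cstdio>
#include <vector>

int main() {
    const int n_patches_per_col = 3;   // hypothetical 3x3 patch grid
    const int num_patches = n_patches_per_col * n_patches_per_col;

    // one extra slot at the end, value-initialized to 0 for CLS
    std::vector<int> pos_h(num_patches + 1, 0), pos_w(num_patches + 1, 0);
    for (int i = 0; i < num_patches; i++) {
        pos_h[i] = (i / n_patches_per_col) + 1;  // 1-based row index
        pos_w[i] = (i % n_patches_per_col) + 1;  // 1-based column index
    }

    for (int i = 0; i <= num_patches; i++)
        std::printf("pos %d -> (h=%d, w=%d)%s\n",
                    i, pos_h[i], pos_w[i], i == num_patches ? "  // CLS" : "");
    return 0;
}
```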