@@ -965,23 +965,30 @@ static ggml_cgraph * clip_image_build_graph_llama4(clip_ctx * ctx, const clip_im
             ggml_row_size(cur->type, hidden_size),
             ggml_row_size(cur->type, hidden_size * num_patches), 0);

-        cur = ggml_reshape_3d(ctx0, cur,
+        cur = ggml_reshape_4d(ctx0, cur,
             hidden_size * scale_factor,
-            num_patches / scale_factor,
+            px / scale_factor,
+            py,
             batch_size);
         cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);

         cur = ggml_reshape_4d(ctx0, ggml_cont(ctx0, cur),
             hidden_size * scale_factor * scale_factor,
-            py / scale_factor,
             px / scale_factor,
+            py / scale_factor,
             batch_size);
         cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);

+        cur = ggml_reshape_3d(ctx0, ggml_cont(ctx0, cur),
+            hidden_size * scale_factor * scale_factor,
+            num_patches / scale_factor / scale_factor,
+            batch_size);
+
         // based on Llama4VisionMLP2 (always uses GELU activation, no bias)
         cur = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w, cur);
         cur = ggml_gelu(ctx0, cur);
         cur = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, cur);
+        cur = ggml_gelu(ctx0, cur);
         embeddings = cur;
     }

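For reference, the reshape/permute chain above performs the space-to-depth ("pixel shuffle") step of the Llama 4 vision projector: each `scale_factor x scale_factor` block of patch embeddings is folded into a single token whose channel count grows by `scale_factor^2`, and the final `ggml_reshape_3d` flattens the downsampled grid back to `num_patches / scale_factor^2` tokens before the projector MLP. The sketch below is not ggml code; it only reproduces the same index bookkeeping with plain loops, assuming the patch grid is stored row-major (x varying fastest) with `hidden_size` channels contiguous per patch. The helper name `pixel_shuffle_ref` is hypothetical.

```cpp
#include <cstdio>
#include <vector>

// in  : [py][px][hidden]           row-major grid of patch embeddings
// out : [py/r][px/r][hidden*r*r]   merged tokens, r = scale_factor
static std::vector<float> pixel_shuffle_ref(const std::vector<float> & in,
                                            int px, int py, int hidden, int r) {
    std::vector<float> out(in.size());
    for (int yb = 0; yb < py / r; ++yb) {         // block row
        for (int xb = 0; xb < px / r; ++xb) {     // block column
            for (int k = 0; k < r; ++k) {         // row inside the r x r block
                for (int j = 0; j < r; ++j) {     // column inside the block
                    for (int h = 0; h < hidden; ++h) {
                        const int src = ((yb * r + k) * px + (xb * r + j)) * hidden + h;
                        const int dst = (yb * (px / r) + xb) * hidden * r * r
                                      + (k * r + j) * hidden + h;
                        out[dst] = in[src];
                    }
                }
            }
        }
    }
    return out;
}

int main() {
    // tiny smoke test: 4x4 grid of 1-channel "patches", scale_factor = 2
    const int px = 4, py = 4, hidden = 1, r = 2;
    std::vector<float> in(px * py * hidden);
    for (size_t i = 0; i < in.size(); ++i) in[i] = (float) i;
    const std::vector<float> out = pixel_shuffle_ref(in, px, py, hidden, r);
    // the first merged token gathers patches (0,0) (0,1) (1,0) (1,1): 0 1 4 5
    for (int c = 0; c < hidden * r * r; ++c) printf("%g ", out[c]);
    printf("\n");
}
```

The exact channel interleaving within a merged token depends on the original patch ordering in the tensor; the point of the sketch is only that the shape arithmetic in the patch matches a straightforward r x r block merge.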