@@ -27,7 +27,8 @@ use crate::pipeline::isq::IsqModelLoader;
27
27
use crate :: pipeline:: loaders:: AutoDeviceMapParams ;
28
28
use crate :: pipeline:: text_models_inputs_processor:: { FlashParams , PagedAttentionInputMetadata } ;
29
29
use crate :: pipeline:: {
30
- EitherCache , IsqModel , MultimodalPromptPrefixer , Processor , ProcessorCreator ,
30
+ EitherCache , IsqModel , Modalities , MultimodalPromptPrefixer , Processor , ProcessorCreator ,
31
+ SupportedModality ,
31
32
} ;
32
33
use crate :: utils:: varbuilder_utils:: DeviceForLoadTensor ;
33
34
use crate :: vision_models:: clip:: ClipConfig ;
@@ -104,6 +105,7 @@ pub trait VisionModelLoader: IsqModelLoader + Send + Sync + DeviceMappedModelLoa
104
105
// Default is false, specific model must override.
105
106
false
106
107
}
108
+ fn modalities ( & self , config : & str ) -> Result < Modalities > ;
107
109
/// Returns the shared prompt prefixer used for this model's multimodal prompts.
///
/// NOTE(review): `config` is presumably the model's serialized configuration;
/// confirm against callers.
fn prefixer(&self, config: &str) -> Arc<dyn MultimodalPromptPrefixer>;
108
110
fn get_device_for_tensor (
109
111
& self ,
@@ -311,6 +313,10 @@ impl VisionModelLoader for AutoVisionLoader {
311
313
. supports_paged_attention ( config)
312
314
}
313
315
316
+ fn modalities ( & self , config : & str ) -> Result < Modalities > {
317
+ Self :: get_loader ( config) ?. modalities ( config)
318
+ }
319
+
314
320
fn supports_prefix_cacher ( & self , config : & str ) -> bool {
315
321
Self :: get_loader ( config)
316
322
. expect ( "AutoVisionLoader" )
@@ -499,6 +505,12 @@ impl VisionModelLoader for Phi3VLoader {
499
505
fn prefixer ( & self , _config : & str ) -> Arc < dyn MultimodalPromptPrefixer > {
500
506
Arc :: new ( Phi3VPrefixer )
501
507
}
508
+ fn modalities ( & self , _config : & str ) -> Result < Modalities > {
509
+ Ok ( Modalities {
510
+ input : vec ! [ SupportedModality :: Text , SupportedModality :: Vision ] ,
511
+ output : vec ! [ SupportedModality :: Text ] ,
512
+ } )
513
+ }
502
514
}
503
515
504
516
impl IsqModelLoader for Phi3VLoader {
@@ -771,6 +783,12 @@ impl VisionModelLoader for Idefics2Loader {
771
783
fn prefixer ( & self , _config : & str ) -> Arc < dyn MultimodalPromptPrefixer > {
772
784
Arc :: new ( Idefics2Prefixer )
773
785
}
786
+ fn modalities ( & self , _config : & str ) -> Result < Modalities > {
787
+ Ok ( Modalities {
788
+ input : vec ! [ SupportedModality :: Text , SupportedModality :: Vision ] ,
789
+ output : vec ! [ SupportedModality :: Text ] ,
790
+ } )
791
+ }
774
792
}
775
793
776
794
impl IsqModelLoader for Idefics2Loader {
@@ -1109,6 +1127,12 @@ impl VisionModelLoader for LLaVANextLoader {
1109
1127
fn prefixer ( & self , _config : & str ) -> Arc < dyn MultimodalPromptPrefixer > {
1110
1128
Arc :: new ( LLaVANextPrefixer )
1111
1129
}
1130
+ fn modalities ( & self , _config : & str ) -> Result < Modalities > {
1131
+ Ok ( Modalities {
1132
+ input : vec ! [ SupportedModality :: Text , SupportedModality :: Vision ] ,
1133
+ output : vec ! [ SupportedModality :: Text ] ,
1134
+ } )
1135
+ }
1112
1136
}
1113
1137
1114
1138
impl IsqModelLoader for LLaVANextLoader {
@@ -1371,6 +1395,12 @@ impl VisionModelLoader for LLaVALoader {
1371
1395
fn prefixer ( & self , _config : & str ) -> Arc < dyn MultimodalPromptPrefixer > {
1372
1396
Arc :: new ( LLaVAPrefixer )
1373
1397
}
1398
+ fn modalities ( & self , _config : & str ) -> Result < Modalities > {
1399
+ Ok ( Modalities {
1400
+ input : vec ! [ SupportedModality :: Text , SupportedModality :: Vision ] ,
1401
+ output : vec ! [ SupportedModality :: Text ] ,
1402
+ } )
1403
+ }
1374
1404
}
1375
1405
1376
1406
impl IsqModelLoader for LLaVALoader {
@@ -1625,6 +1655,12 @@ impl VisionModelLoader for VLlamaLoader {
1625
1655
fn prefixer ( & self , _config : & str ) -> Arc < dyn MultimodalPromptPrefixer > {
1626
1656
Arc :: new ( VLlamaPrefixer )
1627
1657
}
1658
+ fn modalities ( & self , _config : & str ) -> Result < Modalities > {
1659
+ Ok ( Modalities {
1660
+ input : vec ! [ SupportedModality :: Text , SupportedModality :: Vision ] ,
1661
+ output : vec ! [ SupportedModality :: Text ] ,
1662
+ } )
1663
+ }
1628
1664
}
1629
1665
1630
1666
impl IsqModelLoader for VLlamaLoader {
@@ -2009,6 +2045,12 @@ impl VisionModelLoader for Qwen2VLLoader {
2009
2045
fn prefixer ( & self , _config : & str ) -> Arc < dyn MultimodalPromptPrefixer > {
2010
2046
Arc :: new ( Qwen2VLPrefixer )
2011
2047
}
2048
+ fn modalities ( & self , _config : & str ) -> Result < Modalities > {
2049
+ Ok ( Modalities {
2050
+ input : vec ! [ SupportedModality :: Text , SupportedModality :: Vision ] ,
2051
+ output : vec ! [ SupportedModality :: Text ] ,
2052
+ } )
2053
+ }
2012
2054
}
2013
2055
2014
2056
impl IsqModelLoader for Qwen2VLLoader {
@@ -2297,6 +2339,12 @@ impl VisionModelLoader for Idefics3Loader {
2297
2339
fn prefixer ( & self , _config : & str ) -> Arc < dyn MultimodalPromptPrefixer > {
2298
2340
Arc :: new ( Idefics3Prefixer )
2299
2341
}
2342
+ fn modalities ( & self , _config : & str ) -> Result < Modalities > {
2343
+ Ok ( Modalities {
2344
+ input : vec ! [ SupportedModality :: Text , SupportedModality :: Vision ] ,
2345
+ output : vec ! [ SupportedModality :: Text ] ,
2346
+ } )
2347
+ }
2300
2348
}
2301
2349
2302
2350
impl IsqModelLoader for Idefics3Loader {
@@ -2606,6 +2654,12 @@ impl VisionModelLoader for MiniCpmOLoader {
2606
2654
fn prefixer ( & self , _config : & str ) -> Arc < dyn MultimodalPromptPrefixer > {
2607
2655
Arc :: new ( MiniCpmOPrefixer )
2608
2656
}
2657
+ fn modalities ( & self , _config : & str ) -> Result < Modalities > {
2658
+ Ok ( Modalities {
2659
+ input : vec ! [ SupportedModality :: Text , SupportedModality :: Vision ] ,
2660
+ output : vec ! [ SupportedModality :: Text ] ,
2661
+ } )
2662
+ }
2609
2663
}
2610
2664
2611
2665
impl IsqModelLoader for MiniCpmOLoader {
@@ -2892,6 +2946,16 @@ impl VisionModelLoader for Phi4MMLoader {
2892
2946
fn prefixer ( & self , _config : & str ) -> Arc < dyn MultimodalPromptPrefixer > {
2893
2947
Arc :: new ( Phi4MMPrefixer )
2894
2948
}
2949
+ fn modalities ( & self , _config : & str ) -> Result < Modalities > {
2950
+ Ok ( Modalities {
2951
+ input : vec ! [
2952
+ SupportedModality :: Text ,
2953
+ SupportedModality :: Vision ,
2954
+ SupportedModality :: Audio ,
2955
+ ] ,
2956
+ output : vec ! [ SupportedModality :: Text ] ,
2957
+ } )
2958
+ }
2895
2959
}
2896
2960
2897
2961
impl IsqModelLoader for Phi4MMLoader {
@@ -3213,6 +3277,12 @@ impl VisionModelLoader for Qwen2_5VLLoader {
3213
3277
fn prefixer ( & self , _config : & str ) -> Arc < dyn MultimodalPromptPrefixer > {
3214
3278
Arc :: new ( Qwen2_5VLPrefixer )
3215
3279
}
3280
+ fn modalities ( & self , _config : & str ) -> Result < Modalities > {
3281
+ Ok ( Modalities {
3282
+ input : vec ! [ SupportedModality :: Text , SupportedModality :: Vision ] ,
3283
+ output : vec ! [ SupportedModality :: Text ] ,
3284
+ } )
3285
+ }
3216
3286
}
3217
3287
3218
3288
impl IsqModelLoader for Qwen2_5VLLoader {
@@ -3500,6 +3570,12 @@ impl VisionModelLoader for Gemma3Loader {
3500
3570
fn prefixer ( & self , _config : & str ) -> Arc < dyn MultimodalPromptPrefixer > {
3501
3571
Arc :: new ( Gemma3Prefixer )
3502
3572
}
3573
+ fn modalities ( & self , _config : & str ) -> Result < Modalities > {
3574
+ Ok ( Modalities {
3575
+ input : vec ! [ SupportedModality :: Text , SupportedModality :: Vision ] ,
3576
+ output : vec ! [ SupportedModality :: Text ] ,
3577
+ } )
3578
+ }
3503
3579
}
3504
3580
3505
3581
impl IsqModelLoader for Gemma3Loader {
@@ -3827,6 +3903,12 @@ impl VisionModelLoader for Mistral3Loader {
3827
3903
fn prefixer ( & self , _config : & str ) -> Arc < dyn MultimodalPromptPrefixer > {
3828
3904
Arc :: new ( Mistral3Prefixer )
3829
3905
}
3906
+ fn modalities ( & self , _config : & str ) -> Result < Modalities > {
3907
+ Ok ( Modalities {
3908
+ input : vec ! [ SupportedModality :: Text , SupportedModality :: Vision ] ,
3909
+ output : vec ! [ SupportedModality :: Text ] ,
3910
+ } )
3911
+ }
3830
3912
}
3831
3913
3832
3914
impl IsqModelLoader for Mistral3Loader {
@@ -4143,6 +4225,12 @@ impl VisionModelLoader for VLlama4Loader {
4143
4225
fn prefixer ( & self , _config : & str ) -> Arc < dyn MultimodalPromptPrefixer > {
4144
4226
Arc :: new ( VLlama4Prefixer )
4145
4227
}
4228
+ fn modalities ( & self , _config : & str ) -> Result < Modalities > {
4229
+ Ok ( Modalities {
4230
+ input : vec ! [ SupportedModality :: Text , SupportedModality :: Vision ] ,
4231
+ output : vec ! [ SupportedModality :: Text ] ,
4232
+ } )
4233
+ }
4146
4234
}
4147
4235
4148
4236
impl IsqModelLoader for VLlama4Loader {
0 commit comments