diff --git a/docs/source/openvino/optimization.mdx b/docs/source/openvino/optimization.mdx
index 741c295ffb..6930c65e2c 100644
--- a/docs/source/openvino/optimization.mdx
+++ b/docs/source/openvino/optimization.mdx
@@ -373,6 +373,59 @@ Click on a ✅ to copy the command/code for the corresponding optimization case.
+
zero-shot-image-classification (OVModelForZeroShotImageClassification) |
diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py
index 4fa7f8901a..d4525af037 100644
--- a/optimum/commands/export/openvino.py
+++ b/optimum/commands/export/openvino.py
@@ -482,6 +482,7 @@ def run(self):
and (
task in ["fill-mask", "zero-shot-image-classification"]
or task.startswith("text-generation")
+ or task.startswith("text2text-generation")
or task.startswith("automatic-speech-recognition")
or task.startswith("feature-extraction")
)
@@ -491,6 +492,10 @@ def run(self):
from optimum.intel import OVModelForCausalLM
model_cls = OVModelForCausalLM
+ elif task.startswith("text2text-generation"):
+ from optimum.intel import OVModelForSeq2SeqLM
+
+ model_cls = OVModelForSeq2SeqLM
elif task == "image-text-to-text":
from optimum.intel import OVModelForVisualCausalLM
diff --git a/optimum/intel/openvino/configuration.py b/optimum/intel/openvino/configuration.py
index fa014e500e..5ca7e2dfdc 100644
--- a/optimum/intel/openvino/configuration.py
+++ b/optimum/intel/openvino/configuration.py
@@ -361,6 +361,24 @@ class OVQuantizationMethod(str, Enum):
# Default configs for int8 full quantization
_DEFAULT_INT8_FQ_CONFIGS = {
+ "google-t5/t5-small": {
+ "dtype": "int8",
+ "dataset": "wikitext2",
+ "num_samples": 300,
+ "smooth_quant_alpha": -1,
+ },
+ "google-t5/t5-large": {
+ "dtype": "int8",
+ "dataset": "wikitext2",
+ "num_samples": 300,
+ "smooth_quant_alpha": -1,
+ },
+ "google-t5/t5-3b": {
+ "dtype": "int8",
+ "dataset": "wikitext2",
+ "num_samples": 300,
+ "smooth_quant_alpha": -1,
+ },
"FacebookAI/roberta-large": {
"dtype": "int8",
"dataset": "wikitext2",
diff --git a/optimum/intel/openvino/modeling_seq2seq.py b/optimum/intel/openvino/modeling_seq2seq.py
index 78caf1144f..9ce4fe0208 100644
--- a/optimum/intel/openvino/modeling_seq2seq.py
+++ b/optimum/intel/openvino/modeling_seq2seq.py
@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-import copy
import logging
import os
from pathlib import Path
@@ -42,9 +41,9 @@
from ...exporters.openvino import main_export
from ...exporters.openvino.stateful import model_has_state
-from .. import OVConfig, OVQuantizer
+from .. import OVConfig
from ..utils import is_transformers_version
-from .configuration import OVQuantizationConfigBase, OVWeightQuantizationConfig
+from .configuration import OVWeightQuantizationConfig
from .modeling_base import OVBaseModel
from .utils import (
ONNX_DECODER_NAME,
@@ -477,7 +476,6 @@ def _from_pretrained(
decoder_with_past_file_name = decoder_with_past_file_name or default_decoder_with_past_file_name
decoder_with_past = None
- quantization_config = cls._prepare_quantization_config(quantization_config, load_in_8bit)
compile_only = kwargs.pop("compile_only", False)
device = kwargs.pop("device", "CPU")
ov_config = kwargs.pop("ov_config", None)
@@ -521,10 +519,10 @@ def _from_pretrained(
"decoder_with_past": model_save_dir / decoder_with_past_file_name,
}
if not compile_only:
- encoder = cls.load_model(file_names["encoder"], quantization_config)
- decoder = cls.load_model(file_names["decoder"], quantization_config)
+ encoder = cls.load_model(file_names["encoder"])
+ decoder = cls.load_model(file_names["decoder"])
if use_cache and not model_has_state(decoder) and os.path.exists(file_names["decoder_with_past"]):
- decoder_with_past = cls.load_model(file_names["decoder_with_past"], quantization_config)
+ decoder_with_past = cls.load_model(file_names["decoder_with_past"])
else:
model_kwargs = {"device": device, "ov_config": ov_config, "model_save_dir": model_save_dir}
encoder = cls._compile_model(file_names["encoder"], **model_kwargs)
@@ -551,7 +549,8 @@ def _from_pretrained(
"Generation config file not found, using a generation config created from the model config."
)
- return cls(
+ quantization_config = cls._prepare_quantization_config(quantization_config, load_in_8bit)
+ model = cls(
encoder=encoder,
decoder=decoder,
decoder_with_past=decoder_with_past,
@@ -565,6 +564,17 @@ def _from_pretrained(
**kwargs,
)
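+ # Quantization is deferred until the model is fully constructed so that data-aware calibration can run the complete encoder-decoder pipeline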
+ if quantization_config is not None:
+ from optimum.intel import OVQuantizer
+
+ quantizer = OVQuantizer(model)
+ quantization_config_copy = quantization_config.clone()
+ quantization_config_copy.tokenizer = quantization_config.tokenizer or model_id
+ quantization_config_copy.processor = quantization_config.processor or model_id
+ quantizer.quantize(ov_config=OVConfig(quantization_config=quantization_config_copy))
+
+ return model
+
@classmethod
def _export(
cls,
@@ -657,12 +667,17 @@ def forward(
encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None,
past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
cache_position: Optional[torch.LongTensor] = None,
+ labels: Optional[torch.LongTensor] = None,
**kwargs,
) -> Seq2SeqLMOutput:
# Encode if needed : first prediction pass
if encoder_outputs is None:
encoder_outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)
+ if labels is not None and decoder_input_ids is None:
+ # Get decoder inputs by shifting the labels to the right
+ decoder_input_ids = self._shift_right(labels)
+
# Decode
if past_key_values is None or self.decoder_with_past is None:
decoder_outputs = self.decoder(
@@ -786,6 +801,28 @@ def compile(self):
for submodel_name in self._ov_submodel_names:
getattr(self, submodel_name)._compile()
+ def _shift_right(self, input_ids):
+ # Builds decoder_input_ids from labels. Adapted from https://github.com/huggingface/transformers/blob/v4.53.1/src/transformers/models/t5/modeling_tf_t5.py#L957
+ decoder_start_token_id = self.config.decoder_start_token_id
+ pad_token_id = self.config.pad_token_id
+
+ if decoder_start_token_id is None:
+ raise ValueError(
+ "self.model.config.decoder_start_token_id has to be defined. In T5 it is usually set to the pad_token_id. "
+ "See T5 docs for more information."
+ )
+
+ shifted_input_ids = input_ids.new_zeros(input_ids.shape)
+ shifted_input_ids[..., 1:] = input_ids[..., :-1].clone()
+ shifted_input_ids[..., 0] = decoder_start_token_id
+
+ if pad_token_id is None:
+ raise ValueError("self.model.config.pad_token_id has to be defined.")
+ # replace possible -100 values in labels by `pad_token_id`
+ shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)
+
+ return shifted_input_ids
+
class OVEncoder:
"""
@@ -1345,27 +1382,9 @@ def _from_pretrained(
cls,
model_id: Union[str, Path],
config: "PretrainedConfig",
- load_in_8bit: bool = False,
- quantization_config: Union[dict, OVQuantizationConfigBase] = None,
**kwargs,
):
- compile_only = kwargs.get("compile_only", False)
-
- quantization_config = cls._prepare_quantization_config(quantization_config, load_in_8bit)
- is_data_aware_quantization = quantization_config is not None and quantization_config.dataset is not None
- if not compile_only and is_data_aware_quantization:
- model = super(OVModelForSpeechSeq2Seq, cls)._from_pretrained(
- model_id, config, load_in_8bit=False, **kwargs
- )
- quantization_config_copy = copy.deepcopy(quantization_config)
- quantization_config_copy.processor = quantization_config.processor or model_id
- OVQuantizer(model).quantize(ov_config=OVConfig(quantization_config=quantization_config_copy))
- else:
- model = super(OVModelForSpeechSeq2Seq, cls)._from_pretrained(
- model_id, config, load_in_8bit=load_in_8bit, quantization_config=quantization_config, **kwargs
- )
-
- return model
+ return super(OVModelForSpeechSeq2Seq, cls)._from_pretrained(model_id, config, **kwargs)
class DummyWhisperModel:
def __init__(self):
diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py
index 1f33584da4..a8eb22c11c 100644
--- a/optimum/intel/openvino/quantization.py
+++ b/optimum/intel/openvino/quantization.py
@@ -270,6 +270,7 @@ def build_from_quantization_config(self, config: OVQuantizationConfigBase) -> OV
OVModelForCausalLM,
OVModelForFeatureExtraction,
OVModelForMaskedLM,
+ OVModelForSeq2SeqLM,
OVModelForVisualCausalLM,
OVModelForZeroShotImageClassification,
OVSentenceTransformer,
@@ -344,7 +345,9 @@ def build_from_quantization_config(self, config: OVQuantizationConfigBase) -> OV
)
return self.build_from_dataset(config, dataset)
- elif isinstance(self.model, (OVModelForFeatureExtraction, OVSentenceTransformer, OVModelForMaskedLM)):
+ elif isinstance(
+ self.model, (OVModelForFeatureExtraction, OVSentenceTransformer, OVModelForMaskedLM, OVModelForSeq2SeqLM)
+ ):
if isinstance(config.dataset, str):
dataset_metadata = PREDEFINED_LANGUAGE_DATASETS[config.dataset]
dataset = self.load_dataset(
@@ -467,6 +470,7 @@ def build_from_dataset(
from optimum.intel import (
OVModelForFeatureExtraction,
OVModelForMaskedLM,
+ OVModelForSeq2SeqLM,
OVModelForVisualCausalLM,
OVModelForZeroShotImageClassification,
OVSentenceTransformer,
@@ -492,6 +496,7 @@ def build_from_dataset(
OVModelForMaskedLM,
OVModelForZeroShotImageClassification,
OVSentenceTransformer,
+ OVModelForSeq2SeqLM,
),
) or (is_diffusers_available() and isinstance(self.model, OVDiffusionPipeline)):
# Prepare from raw dataset avoiding dataloader creation
@@ -504,6 +509,8 @@ def build_from_dataset(
return self._prepare_visual_causal_lm_calibration_data(quantization_config, dataset)
elif isinstance(self.model, _OVModelForWhisper):
return self._prepare_speech_to_text_calibration_data(quantization_config, dataset)
+ elif isinstance(self.model, OVModelForSeq2SeqLM):
+ return self._prepare_text_to_text_calibration_data(quantization_config, dataset)
elif is_diffusers_available() and isinstance(self.model, OVDiffusionPipeline):
return self._prepare_diffusion_calibration_data(quantization_config, dataset)
elif isinstance(self.model, (OVModelForFeatureExtraction, OVSentenceTransformer, OVModelForMaskedLM)):
@@ -770,6 +777,56 @@ def _prepare_speech_to_text_calibration_data(
return OVCalibrationDataset(collected_inputs)
+ def _prepare_text_to_text_calibration_data(
+ self,
+ config: OVQuantizationConfigBase,
+ dataset: "Dataset",
+ seq_len: int = 128,
+ ) -> OVCalibrationDataset:
+ """
+ Prepares calibration data for text-to-text pipelines by running generation over a dataset and collecting the inputs passed to the encoder and decoder submodels.
+ """
+ from optimum.intel.openvino.modeling_seq2seq import OVDecoder, OVEncoder
+
+ models: Dict[str, Union[OVEncoder, OVDecoder]] = {}
+ collected_inputs: Dict[str, List[Dict[str, Any]]] = {}
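+ # Wrap each submodel's inference request so that the inputs it receives during generation are recorded for calibration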
+ for submodel_name in self.model._ov_submodel_names:
+ ov_component: Union[OVEncoder, OVDecoder] = getattr(self.model, submodel_name)
+ models[submodel_name] = ov_component
+ collected_inputs[submodel_name] = []
+ ov_component._compile()
+ ov_component.request = InferRequestWrapper(
+ ov_component.request, collected_inputs[submodel_name], apply_caching=True
+ )
+ try:
+
+ def get_tokenizer():
+ if config.tokenizer is None:
+ raise ValueError("Please provide tokenizer for calibration via quantization_config.tokenizer.")
+ return AutoTokenizer.from_pretrained(config.tokenizer, trust_remote_code=config.trust_remote_code)
+
+ num_samples = config.num_samples or 128
+ dataset = list(tqdm(dataset.take(num_samples), desc="Downloading dataset", total=num_samples))
+
+ tokenizer = None
+ for item in tqdm(dataset, desc="Collecting calibration data"):
+ if "input_ids" in item:
+ # Assume the dataset already contains tokenized samples
+ inputs = self._wrap_sample_as_array(item, add_batch_dim=True)
+ else:
+ tokenizer = tokenizer or get_tokenizer()
+ inputs = tokenizer(item["text"], truncation=True, max_length=seq_len, return_tensors="pt")
+
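+ # Generation exercises the encoder and decoder; the wrapped requests capture their inputs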
+ self.model.generate(**inputs, max_new_tokens=seq_len)
+ finally:
+ for model in models.values():
+ model.request = model.request.request
+
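+ # Convert the per-submodel collected inputs into nncf.Dataset objects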
+ for model_name in collected_inputs:
+ collected_inputs[model_name] = nncf.Dataset(collected_inputs[model_name])
+
+ return OVCalibrationDataset(collected_inputs)
+
def _prepare_diffusion_calibration_data(
self, config: OVQuantizationConfigBase, dataset: Union[List, "Dataset"]
) -> OVCalibrationDataset:
@@ -1202,10 +1259,7 @@ def _quantize_ovbasemodel(
#
# Regular (non-hybrid) weight-only quantization
#
- if is_diffusers_available() and isinstance(self.model, OVDiffusionPipeline):
- for submodel_name in self.model.ov_submodels:
- quantization_configs[submodel_name] = quantization_config
- elif isinstance(self.model, OVModelForVisualCausalLM):
+ if isinstance(self.model, OVModelForVisualCausalLM):
for submodel_name in self.model.ov_submodels:
quantization_configs[submodel_name] = (
quantization_config
@@ -1213,7 +1267,8 @@ def _quantize_ovbasemodel(
else OVWeightQuantizationConfig(bits=8, sym=True)
)
else:
- quantization_configs["model"] = quantization_config
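+ # Apply the config to every OV submodel (a single "model" entry for most classes; encoder/decoder/decoder_with_past for seq2seq)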
+ for submodel_name in self.model.ov_submodels:
+ quantization_configs[submodel_name] = quantization_config
else:
#
# Hybrid/Full/Mixed quantization
@@ -1274,7 +1329,8 @@ def _quantize_ovbasemodel(
else OVWeightQuantizationConfig(bits=8, sym=True)
)
else:
- quantization_configs["model"] = quantization_config
+ for submodel_name in self.model.ov_submodels:
+ quantization_configs[submodel_name] = quantization_config
elif isinstance(quantization_config, OVMixedQuantizationConfig):
#
# Mixed quantization
@@ -1282,7 +1338,8 @@ def _quantize_ovbasemodel(
if is_diffusers_available() and isinstance(self.model, OVDiffusionPipeline):
raise NotImplementedError("Mixed precision quantization isn't supported for diffusers.")
- quantization_configs["model"] = quantization_config
+ for submodel_name in self.model.ov_submodels:
+ quantization_configs[submodel_name] = quantization_config
else:
raise ValueError(f"Unsupported type of quantization config: {type(quantization_config)}")
diff --git a/optimum/intel/openvino/utils.py b/optimum/intel/openvino/utils.py
index 6f650b89bb..d80f866cfa 100644
--- a/optimum/intel/openvino/utils.py
+++ b/optimum/intel/openvino/utils.py
@@ -124,6 +124,7 @@
"fill-mask": "OVModelForMaskedLM",
"text-generation": "OVModelForCausalLM",
"text2text-generation": "OVModelForSeq2SeqLM",
+ "text2text-generation-with-past": "OVModelForSeq2SeqLM",
"text-classification": "OVModelForSequenceClassification",
"token-classification": "OVModelForTokenClassification",
"question-answering": "OVModelForQuestionAnswering",
diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py
index 83b5b1e80d..ea53e9ff1d 100644
--- a/tests/openvino/test_exporters_cli.py
+++ b/tests/openvino/test_exporters_cli.py
@@ -400,6 +400,20 @@ class OVCLIExportTestCase(unittest.TestCase):
"model": {"int8": 65},
},
),
+ (
+ "text2text-generation-with-past",
+ "t5",
+ "int8",
+ "--dataset c4 --num-samples 1",
+ {"encoder": 30, "decoder": 52, "decoder_with_past": 61}
+ if is_transformers_version("<=", "4.36.0")
+ else {"encoder": 30, "decoder": 62},
+ (
+ {"encoder": {"int8": 32}, "decoder": {"int8": 52}, "decoder_with_past": {"int8": 42}}
+ if is_transformers_version("<=", "4.36.0")
+ else {"encoder": {"int8": 32}, "decoder": {"int8": 52}}
+ ),
+ ),
]
TEST_4BIT_CONFIGURATIONS = [
@@ -937,7 +951,6 @@ def test_exporters_cli_hybrid_quantization(
def test_exporters_cli_4bit(
self, task: str, model_type: str, option: str, expected_num_weight_nodes_per_model: Dict[str, Dict[str, int]]
):
- self.maxDiff = 100000
with TemporaryDirectory() as tmpdir:
result = subprocess.run(
f"optimum-cli export openvino --model {MODEL_NAMES[model_type]} --task {task} --weight-format {option} {tmpdir}",
@@ -989,9 +1002,11 @@ def test_exporters_cli_full_quantization(
)
model = model_cls.from_pretrained(tmpdir)
- if "automatic-speech-recognition" in task and model.decoder_with_past is None:
- del expected_num_weight_nodes_per_model["decoder_with_past"]
- del expected_fake_nodes_per_model["decoder_with_past"]
+ if (
+ "automatic-speech-recognition" in task or "text2text-generation" in task
+ ) and model.decoder_with_past is None:
+ expected_num_weight_nodes_per_model.pop("decoder_with_past", None)
+ expected_fake_nodes_per_model.pop("decoder_with_past", None)
check_compression_state_per_model(
self,
diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py
index 26c6283b85..27e99c59bd 100644
--- a/tests/openvino/test_quantization.py
+++ b/tests/openvino/test_quantization.py
@@ -127,7 +127,7 @@ class OVQuantizerTest(unittest.TestCase):
(OVModelForSequenceClassification, "bert", 32, 35),
(OVModelForCausalLM, "gpt2", 31, 22),
)
- # TODO (nikita-savelyevv): Extend for OVModelForSpeechSeq2Seq and OVStableDiffusionPipeline
+ # TODO (nikita-savelyevv): Extend for OVModelForSpeechSeq2Seq, OVStableDiffusionPipeline and OVModelForSeq2SeqLM
SUPPORTED_ARCHITECTURES_OV_MODEL = (
(OVModelForSequenceClassification, "bert", 32, 35),
(OVModelForCausalLM, "gpt2", 31, 22),
@@ -356,6 +356,23 @@ class OVQuantizerTest(unittest.TestCase):
"model": {"int8": 65},
},
),
+ (
+ OVModelForSeq2SeqLM,
+ "t5",
+ OVQuantizationConfig(
+ dtype="int8",
+ dataset="wikitext2",
+ num_samples=1,
+ ),
+ {"encoder": 30, "decoder": 52, "decoder_with_past": 61}
+ if is_transformers_version("<=", "4.36.0")
+ else {"encoder": 30, "decoder": 62},
+ (
+ {"encoder": {"int8": 32}, "decoder": {"int8": 52}, "decoder_with_past": {"int8": 42}}
+ if is_transformers_version("<=", "4.36.0")
+ else {"encoder": {"int8": 32}, "decoder": {"int8": 52}}
+ ),
+ ),
]
if is_transformers_version(">=", "4.45.0"):
@@ -572,13 +589,18 @@ def test_ov_model_static_quantization_with_auto_dataset(
ov_model = model_cls.from_pretrained(model_id, quantization_config=quantization_config)
ov_model.save_pretrained(tmp_dir)
- if model_cls == OVModelForSpeechSeq2Seq:
+ if model_cls in [OVModelForSpeechSeq2Seq, OVModelForSeq2SeqLM]:
if ov_model.decoder_with_past is None:
- del expected_fake_nodes_per_model["decoder_with_past"]
- del expected_num_weight_nodes_per_model["decoder_with_past"]
+ expected_fake_nodes_per_model.pop("decoder_with_past", None)
+ expected_num_weight_nodes_per_model.pop("decoder_with_past", None)
- input_features = torch.randn((1, ov_model.config.num_mel_bins, 3000), dtype=torch.float32)
- ov_model.generate(input_features)
+ if model_cls == OVModelForSpeechSeq2Seq:
+ input_features = torch.randn((1, ov_model.config.num_mel_bins, 3000), dtype=torch.float32)
+ ov_model.generate(input_features)
+ else:
+ tokenizer = AutoTokenizer.from_pretrained(model_id)
+ inputs = tokenizer("This is a sample ", return_tensors="pt")
+ ov_model.generate(**inputs)
elif model_cls in (OVModelForCausalLM, OVModelForFeatureExtraction, OVModelForMaskedLM):
tokenizer = AutoTokenizer.from_pretrained(model_id)
if tokenizer.pad_token is None:
@@ -1303,7 +1325,6 @@ def test_ovmodel_4bit_auto_compression_with_config(
pass
submodels = model.ov_submodels
- self.maxDiff = 1000000
check_compression_state_per_model(self, submodels, expected_num_weight_nodes_per_model)
model.save_pretrained(tmp_dir)
|