diff --git a/docs/source/openvino/optimization.mdx b/docs/source/openvino/optimization.mdx
index 741c295ffb..6930c65e2c 100644
--- a/docs/source/openvino/optimization.mdx
+++ b/docs/source/openvino/optimization.mdx
@@ -373,6 +373,59 @@ Click on a ✅ to copy the command/code for the corresponding optimization case.
+      text2text-generation<br>(OVModelForSeq2SeqLM)
+      –
+      -
       zero-shot-image-classification<br>(OVModelForZeroShotImageClassification)
diff --git a/optimum/commands/export/openvino.py b/optimum/commands/export/openvino.py
index 4fa7f8901a..d4525af037 100644
--- a/optimum/commands/export/openvino.py
+++ b/optimum/commands/export/openvino.py
@@ -482,6 +482,7 @@ def run(self):
             and (
                 task in ["fill-mask", "zero-shot-image-classification"]
                 or task.startswith("text-generation")
+                or task.startswith("text2text-generation")
                 or task.startswith("automatic-speech-recognition")
                 or task.startswith("feature-extraction")
             )
@@ -491,6 +492,10 @@ def run(self):
                 from optimum.intel import OVModelForCausalLM

                 model_cls = OVModelForCausalLM
+            elif task.startswith("text2text-generation"):
+                from optimum.intel import OVModelForSeq2SeqLM
+
+                model_cls = OVModelForSeq2SeqLM
             elif task == "image-text-to-text":
                 from optimum.intel import OVModelForVisualCausalLM
diff --git a/optimum/intel/openvino/configuration.py b/optimum/intel/openvino/configuration.py
index fa014e500e..5ca7e2dfdc 100644
--- a/optimum/intel/openvino/configuration.py
+++ b/optimum/intel/openvino/configuration.py
@@ -361,6 +361,24 @@ class OVQuantizationMethod(str, Enum):

 # Default configs for int8 full quantization
 _DEFAULT_INT8_FQ_CONFIGS = {
+    "google-t5/t5-small": {
+        "dtype": "int8",
+        "dataset": "wikitext2",
+        "num_samples": 300,
+        "smooth_quant_alpha": -1,
+    },
+    "google-t5/t5-large": {
+        "dtype": "int8",
+        "dataset": "wikitext2",
+        "num_samples": 300,
+        "smooth_quant_alpha": -1,
+    },
+    "google-t5/t5-3b": {
+        "dtype": "int8",
+        "dataset": "wikitext2",
+        "num_samples": 300,
+        "smooth_quant_alpha": -1,
+    },
     "FacebookAI/roberta-large": {
         "dtype": "int8",
         "dataset": "wikitext2",
diff --git a/optimum/intel/openvino/modeling_seq2seq.py b/optimum/intel/openvino/modeling_seq2seq.py
index 78caf1144f..9ce4fe0208 100644
--- a/optimum/intel/openvino/modeling_seq2seq.py
+++ b/optimum/intel/openvino/modeling_seq2seq.py
@@ -11,7 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import copy
 import logging
 import os
 from pathlib import Path
@@ -42,9 +41,9 @@

 from ...exporters.openvino import main_export
 from ...exporters.openvino.stateful import model_has_state
-from .. import OVConfig, OVQuantizer
+from .. import OVConfig
 from ..utils import is_transformers_version
-from .configuration import OVQuantizationConfigBase, OVWeightQuantizationConfig
+from .configuration import OVWeightQuantizationConfig
 from .modeling_base import OVBaseModel
 from .utils import (
     ONNX_DECODER_NAME,
@@ -477,7 +476,6 @@ def _from_pretrained(
         decoder_with_past_file_name = decoder_with_past_file_name or default_decoder_with_past_file_name
         decoder_with_past = None

-        quantization_config = cls._prepare_quantization_config(quantization_config, load_in_8bit)
         compile_only = kwargs.pop("compile_only", False)
         device = kwargs.pop("device", "CPU")
         ov_config = kwargs.pop("ov_config", None)
@@ -521,10 +519,10 @@ def _from_pretrained(
             "decoder_with_past": model_save_dir / decoder_with_past_file_name,
         }
         if not compile_only:
-            encoder = cls.load_model(file_names["encoder"], quantization_config)
-            decoder = cls.load_model(file_names["decoder"], quantization_config)
+            encoder = cls.load_model(file_names["encoder"])
+            decoder = cls.load_model(file_names["decoder"])
             if use_cache and not model_has_state(decoder) and os.path.exists(file_names["decoder_with_past"]):
-                decoder_with_past = cls.load_model(file_names["decoder_with_past"], quantization_config)
+                decoder_with_past = cls.load_model(file_names["decoder_with_past"])
         else:
             model_kwargs = {"device": device, "ov_config": ov_config, "model_save_dir": model_save_dir}
             encoder = cls._compile_model(file_names["encoder"], **model_kwargs)
@@ -551,7 +549,8 @@ def _from_pretrained(
                     "Generation config file not found, using a generation config created from the model config."
                 )

-        return cls(
+        quantization_config = cls._prepare_quantization_config(quantization_config, load_in_8bit)
+        model = cls(
             encoder=encoder,
             decoder=decoder,
             decoder_with_past=decoder_with_past,
@@ -565,6 +564,17 @@ def _from_pretrained(
             **kwargs,
         )

+        if quantization_config is not None:
+            from optimum.intel import OVQuantizer
+
+            quantizer = OVQuantizer(model)
+            quantization_config_copy = quantization_config.clone()
+            quantization_config_copy.tokenizer = quantization_config.tokenizer or model_id
+            quantization_config_copy.processor = quantization_config.processor or model_id
+            quantizer.quantize(ov_config=OVConfig(quantization_config=quantization_config_copy))
+
+        return model
+
     @classmethod
     def _export(
         cls,
@@ -657,12 +667,17 @@ def forward(
         encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None,
         past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
         cache_position: Optional[torch.LongTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
         **kwargs,
     ) -> Seq2SeqLMOutput:
         # Encode if needed : first prediction pass
         if encoder_outputs is None:
             encoder_outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)

+        if labels is not None and decoder_input_ids is None:
+            # get decoder inputs from shifting lm labels to the right
+            decoder_input_ids = self._shift_right(labels)
+
         # Decode
         if past_key_values is None or self.decoder_with_past is None:
             decoder_outputs = self.decoder(
@@ -786,6 +801,28 @@ def compile(self):
         for submodel_name in self._ov_submodel_names:
             getattr(self, submodel_name)._compile()

+    def _shift_right(self, input_ids):
+        # Adopted from https://github.com/huggingface/transformers/blob/v4.53.1/src/transformers/models/t5/modeling_tf_t5.py#L957
+        decoder_start_token_id = self.config.decoder_start_token_id
+        pad_token_id = self.config.pad_token_id
+
+        if decoder_start_token_id is None:
+            raise ValueError(
+                "self.model.config.decoder_start_token_id has to be defined. In T5 it is usually set to the pad_token_id. "
+                "See T5 docs for more information."
+            )
+
+        shifted_input_ids = input_ids.new_zeros(input_ids.shape)
+        shifted_input_ids[..., 1:] = input_ids[..., :-1].clone()
+        shifted_input_ids[..., 0] = decoder_start_token_id
+
+        if pad_token_id is None:
+            raise ValueError("self.model.config.pad_token_id has to be defined.")
+        # replace possible -100 values in labels by `pad_token_id`
+        shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)
+
+        return shifted_input_ids
+

 class OVEncoder:
     """
@@ -1345,27 +1382,9 @@ def _from_pretrained(
         cls,
         model_id: Union[str, Path],
         config: "PretrainedConfig",
-        load_in_8bit: bool = False,
-        quantization_config: Union[dict, OVQuantizationConfigBase] = None,
         **kwargs,
     ):
-        compile_only = kwargs.get("compile_only", False)
-
-        quantization_config = cls._prepare_quantization_config(quantization_config, load_in_8bit)
-        is_data_aware_quantization = quantization_config is not None and quantization_config.dataset is not None
-        if not compile_only and is_data_aware_quantization:
-            model = super(OVModelForSpeechSeq2Seq, cls)._from_pretrained(
-                model_id, config, load_in_8bit=False, **kwargs
-            )
-            quantization_config_copy = copy.deepcopy(quantization_config)
-            quantization_config_copy.processor = quantization_config.processor or model_id
-            OVQuantizer(model).quantize(ov_config=OVConfig(quantization_config=quantization_config_copy))
-        else:
-            model = super(OVModelForSpeechSeq2Seq, cls)._from_pretrained(
-                model_id, config, load_in_8bit=load_in_8bit, quantization_config=quantization_config, **kwargs
-            )
-
-        return model
+        return super(OVModelForSpeechSeq2Seq, cls)._from_pretrained(model_id, config, **kwargs)

     class DummyWhisperModel:
         def __init__(self):
diff --git a/optimum/intel/openvino/quantization.py b/optimum/intel/openvino/quantization.py
index 1f33584da4..a8eb22c11c 100644
--- a/optimum/intel/openvino/quantization.py
+++ b/optimum/intel/openvino/quantization.py
@@ -270,6 +270,7 @@ def build_from_quantization_config(self, config: OVQuantizationConfigBase) -> OV
             OVModelForCausalLM,
             OVModelForFeatureExtraction,
             OVModelForMaskedLM,
+            OVModelForSeq2SeqLM,
             OVModelForVisualCausalLM,
             OVModelForZeroShotImageClassification,
             OVSentenceTransformer,
@@ -344,7 +345,9 @@ def build_from_quantization_config(self, config: OVQuantizationConfigBase) -> OV
             )
             return self.build_from_dataset(config, dataset)
-        elif isinstance(self.model, (OVModelForFeatureExtraction, OVSentenceTransformer, OVModelForMaskedLM)):
+        elif isinstance(
+            self.model, (OVModelForFeatureExtraction, OVSentenceTransformer, OVModelForMaskedLM, OVModelForSeq2SeqLM)
+        ):
             if isinstance(config.dataset, str):
                 dataset_metadata = PREDEFINED_LANGUAGE_DATASETS[config.dataset]
                 dataset = self.load_dataset(
@@ -467,6 +470,7 @@ def build_from_dataset(
         from optimum.intel import (
             OVModelForFeatureExtraction,
             OVModelForMaskedLM,
+            OVModelForSeq2SeqLM,
             OVModelForVisualCausalLM,
             OVModelForZeroShotImageClassification,
             OVSentenceTransformer,
@@ -492,6 +496,7 @@ def build_from_dataset(
                 OVModelForMaskedLM,
                 OVModelForZeroShotImageClassification,
                 OVSentenceTransformer,
+                OVModelForSeq2SeqLM,
             ),
         ) or (is_diffusers_available() and isinstance(self.model, OVDiffusionPipeline)):
             # Prepare from raw dataset avoiding dataloader creation
@@ -504,6 +509,8 @@ def build_from_dataset(
                 return self._prepare_visual_causal_lm_calibration_data(quantization_config, dataset)
             elif isinstance(self.model, _OVModelForWhisper):
                 return self._prepare_speech_to_text_calibration_data(quantization_config, dataset)
+            elif isinstance(self.model, OVModelForSeq2SeqLM):
+                return self._prepare_text_to_text_calibration_data(quantization_config, dataset)
             elif is_diffusers_available() and isinstance(self.model, OVDiffusionPipeline):
                 return self._prepare_diffusion_calibration_data(quantization_config, dataset)
             elif isinstance(self.model, (OVModelForFeatureExtraction, OVSentenceTransformer, OVModelForMaskedLM)):
@@ -770,6 +777,56 @@ def _prepare_speech_to_text_calibration_data(

         return OVCalibrationDataset(collected_inputs)

+    def _prepare_text_to_text_calibration_data(
+        self,
+        config: OVQuantizationConfigBase,
+        dataset: "Dataset",
+        seq_len: int = 128,
+    ) -> OVCalibrationDataset:
+        """
+        Prepares calibration data for text-to-text pipelines by inferring it on a dataset and collecting incurred inputs.
+        """
+        from optimum.intel.openvino.modeling_seq2seq import OVDecoder, OVEncoder
+
+        models: Dict[str, Union[OVEncoder, OVDecoder]] = {}
+        collected_inputs: Dict[str, List[Dict[str, Any]]] = {}
+        for submodel_name in self.model._ov_submodel_names:
+            ov_component: Union[OVEncoder, OVDecoder] = getattr(self.model, submodel_name)
+            models[submodel_name] = ov_component
+            collected_inputs[submodel_name] = []
+            ov_component._compile()
+            ov_component.request = InferRequestWrapper(
+                ov_component.request, collected_inputs[submodel_name], apply_caching=True
+            )
+        try:
+
+            def get_tokenizer():
+                if config.tokenizer is None:
+                    raise ValueError("Please provide tokenizer for calibration via quantization_config.tokenizer.")
+                return AutoTokenizer.from_pretrained(config.tokenizer, trust_remote_code=config.trust_remote_code)
+
+            num_samples = config.num_samples or 128
+            dataset = list(tqdm(dataset.take(num_samples), desc="Downloading dataset", total=num_samples))
+
+            tokenizer = None
+            for item in tqdm(dataset, desc="Collecting calibration data"):
+                if "input_ids" in item:
+                    # Assuming that dataset contains already preprocessed text
+                    inputs = self._wrap_sample_as_array(item, add_batch_dim=True)
+                else:
+                    tokenizer = tokenizer or get_tokenizer()
+                    inputs = tokenizer(item["text"], truncation=True, max_length=seq_len, return_tensors="pt")
+
+                self.model.generate(**inputs, max_new_tokens=seq_len)
+        finally:
+            for model in models.values():
+                model.request = model.request.request
+
+        for model_name in collected_inputs:
+            collected_inputs[model_name] = nncf.Dataset(collected_inputs[model_name])
+
+        return OVCalibrationDataset(collected_inputs)
+
     def _prepare_diffusion_calibration_data(
         self, config: OVQuantizationConfigBase, dataset: Union[List, "Dataset"]
     ) -> OVCalibrationDataset:
@@ -1202,10 +1259,7 @@ def _quantize_ovbasemodel(
                 #
                 # Regular (non-hybrid) weight-only quantization
                 #
-                if is_diffusers_available() and isinstance(self.model, OVDiffusionPipeline):
-                    for submodel_name in self.model.ov_submodels:
-                        quantization_configs[submodel_name] = quantization_config
-                elif isinstance(self.model, OVModelForVisualCausalLM):
+                if isinstance(self.model, OVModelForVisualCausalLM):
                     for submodel_name in self.model.ov_submodels:
                         quantization_configs[submodel_name] = (
                             quantization_config
@@ -1213,7 +1267,8 @@ def _quantize_ovbasemodel(
                             else OVWeightQuantizationConfig(bits=8, sym=True)
                         )
                 else:
-                    quantization_configs["model"] = quantization_config
+                    for submodel_name in self.model.ov_submodels:
+                        quantization_configs[submodel_name] = quantization_config
             else:
                 #
                 # Hybrid/Full/Mixed quantization
@@ -1274,7 +1329,8 @@ def _quantize_ovbasemodel(
                             else OVWeightQuantizationConfig(bits=8, sym=True)
                         )
                 else:
-                    quantization_configs["model"] = quantization_config
+                    for submodel_name in self.model.ov_submodels:
+                        quantization_configs[submodel_name] = quantization_config
             elif isinstance(quantization_config, OVMixedQuantizationConfig):
                 #
                 # Mixed quantization
@@ -1282,7 +1338,8 @@ def _quantize_ovbasemodel(
                 if is_diffusers_available() and isinstance(self.model, OVDiffusionPipeline):
                     raise NotImplementedError("Mixed precision quantization isn't supported for diffusers.")

-                quantization_configs["model"] = quantization_config
+                for submodel_name in self.model.ov_submodels:
+                    quantization_configs[submodel_name] = quantization_config
             else:
                 raise ValueError(f"Unsupported type of quantization config: {type(quantization_config)}")
diff --git a/optimum/intel/openvino/utils.py b/optimum/intel/openvino/utils.py
index 6f650b89bb..d80f866cfa 100644
--- a/optimum/intel/openvino/utils.py
+++ b/optimum/intel/openvino/utils.py
@@ -124,6 +124,7 @@
     "fill-mask": "OVModelForMaskedLM",
     "text-generation": "OVModelForCausalLM",
     "text2text-generation": "OVModelForSeq2SeqLM",
+    "text2text-generation-with-past": "OVModelForSeq2SeqLM",
     "text-classification": "OVModelForSequenceClassification",
     "token-classification": "OVModelForTokenClassification",
     "question-answering": "OVModelForQuestionAnswering",
diff --git a/tests/openvino/test_exporters_cli.py b/tests/openvino/test_exporters_cli.py
index 83b5b1e80d..ea53e9ff1d 100644
--- a/tests/openvino/test_exporters_cli.py
+++ b/tests/openvino/test_exporters_cli.py
@@ -400,6 +400,20 @@ class OVCLIExportTestCase(unittest.TestCase):
                 "model": {"int8": 65},
             },
         ),
+        (
+            "text2text-generation-with-past",
+            "t5",
+            "int8",
+            "--dataset c4 --num-samples 1",
+            {"encoder": 30, "decoder": 52, "decoder_with_past": 61}
+            if is_transformers_version("<=", "4.36.0")
+            else {"encoder": 30, "decoder": 62},
+            (
+                {"encoder": {"int8": 32}, "decoder": {"int8": 52}, "decoder_with_past": {"int8": 42}}
+                if is_transformers_version("<=", "4.36.0")
+                else {"encoder": {"int8": 32}, "decoder": {"int8": 52}}
+            ),
+        ),
     ]

     TEST_4BIT_CONFIGURATIONS = [
@@ -937,7 +951,6 @@ def test_exporters_cli_hybrid_quantization(
     def test_exporters_cli_4bit(
         self, task: str, model_type: str, option: str, expected_num_weight_nodes_per_model: Dict[str, Dict[str, int]]
    ):
-        self.maxDiff = 100000
         with TemporaryDirectory() as tmpdir:
             result = subprocess.run(
                 f"optimum-cli export openvino --model {MODEL_NAMES[model_type]} --task {task} --weight-format {option} {tmpdir}",
@@ -989,9 +1002,11 @@ def test_exporters_cli_full_quantization(
         )
         model = model_cls.from_pretrained(tmpdir)

-        if "automatic-speech-recognition" in task and model.decoder_with_past is None:
-            del expected_num_weight_nodes_per_model["decoder_with_past"]
-            del expected_fake_nodes_per_model["decoder_with_past"]
+        if (
+            "automatic-speech-recognition" in task or "text2text-generation" in task
+        ) and model.decoder_with_past is None:
+            expected_num_weight_nodes_per_model.pop("decoder_with_past", None)
+            expected_fake_nodes_per_model.pop("decoder_with_past", None)

         check_compression_state_per_model(
             self,
diff --git a/tests/openvino/test_quantization.py b/tests/openvino/test_quantization.py
index 26c6283b85..27e99c59bd 100644
--- a/tests/openvino/test_quantization.py
+++ b/tests/openvino/test_quantization.py
@@ -127,7 +127,7 @@ class OVQuantizerTest(unittest.TestCase):
         (OVModelForSequenceClassification, "bert", 32, 35),
         (OVModelForCausalLM, "gpt2", 31, 22),
     )
-    # TODO (nikita-savelyevv): Extend for OVModelForSpeechSeq2Seq and OVStableDiffusionPipeline
+    # TODO (nikita-savelyevv): Extend for OVModelForSpeechSeq2Seq, OVStableDiffusionPipeline and OVModelForSeq2SeqLM
     SUPPORTED_ARCHITECTURES_OV_MODEL = (
         (OVModelForSequenceClassification, "bert", 32, 35),
         (OVModelForCausalLM, "gpt2", 31, 22),
@@ -356,6 +356,23 @@ class OVQuantizerTest(unittest.TestCase):
                 "model": {"int8": 65},
             },
         ),
+        (
+            OVModelForSeq2SeqLM,
+            "t5",
+            OVQuantizationConfig(
+                dtype="int8",
+                dataset="wikitext2",
+                num_samples=1,
+            ),
+            {"encoder": 30, "decoder": 52, "decoder_with_past": 61}
+            if is_transformers_version("<=", "4.36.0")
+            else {"encoder": 30, "decoder": 62},
+            (
+                {"encoder": {"int8": 32}, "decoder": {"int8": 52}, "decoder_with_past": {"int8": 42}}
+                if is_transformers_version("<=", "4.36.0")
+                else {"encoder": {"int8": 32}, "decoder": {"int8": 52}}
+            ),
+        ),
     ]

     if is_transformers_version(">=", "4.45.0"):
@@ -572,13 +589,18 @@ def test_ov_model_static_quantization_with_auto_dataset(
             ov_model = model_cls.from_pretrained(model_id, quantization_config=quantization_config)
             ov_model.save_pretrained(tmp_dir)

-            if model_cls == OVModelForSpeechSeq2Seq:
+            if model_cls in [OVModelForSpeechSeq2Seq, OVModelForSeq2SeqLM]:
                 if ov_model.decoder_with_past is None:
-                    del expected_fake_nodes_per_model["decoder_with_past"]
-                    del expected_num_weight_nodes_per_model["decoder_with_past"]
+                    expected_fake_nodes_per_model.pop("decoder_with_past", None)
+                    expected_num_weight_nodes_per_model.pop("decoder_with_past", None)

-                input_features = torch.randn((1, ov_model.config.num_mel_bins, 3000), dtype=torch.float32)
-                ov_model.generate(input_features)
+                if model_cls == OVModelForSpeechSeq2Seq:
+                    input_features = torch.randn((1, ov_model.config.num_mel_bins, 3000), dtype=torch.float32)
+                    ov_model.generate(input_features)
+                else:
+                    tokenizer = AutoTokenizer.from_pretrained(model_id)
+                    inputs = tokenizer("This is a sample ", return_tensors="pt")
+                    ov_model.generate(**inputs)
             elif model_cls in (OVModelForCausalLM, OVModelForFeatureExtraction, OVModelForMaskedLM):
                 tokenizer = AutoTokenizer.from_pretrained(model_id)
                 if tokenizer.pad_token is None:
@@ -1303,7 +1325,6 @@ def test_ovmodel_4bit_auto_compression_with_config(
                 pass

             submodels = model.ov_submodels
-            self.maxDiff = 1000000
             check_compression_state_per_model(self, submodels, expected_num_weight_nodes_per_model)

             model.save_pretrained(tmp_dir)
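
Usage sketch (not part of the diff): the snippet below exercises the new `OVModelForSeq2SeqLM` quantization path the same way `test_ov_model_static_quantization_with_auto_dataset` does, using the int8 defaults this change registers for `google-t5/t5-small` in `_DEFAULT_INT8_FQ_CONFIGS`. The prompt text and the output directory are illustrative placeholders.

```python
from transformers import AutoTokenizer

from optimum.intel import OVModelForSeq2SeqLM, OVQuantizationConfig

model_id = "google-t5/t5-small"  # one of the checkpoints given a default int8 config in configuration.py

# Data-aware full int8 quantization; values mirror the defaults added to _DEFAULT_INT8_FQ_CONFIGS.
# If `tokenizer` is not set, the new _from_pretrained logic falls back to the model id when
# building the calibration dataset.
quantization_config = OVQuantizationConfig(
    dtype="int8",
    dataset="wikitext2",
    num_samples=300,
    smooth_quant_alpha=-1,
)

# Exports the seq2seq model to OpenVINO and quantizes its submodels (encoder, decoder and,
# depending on the transformers version, decoder_with_past) via OVQuantizer.
model = OVModelForSeq2SeqLM.from_pretrained(model_id, quantization_config=quantization_config)

tokenizer = AutoTokenizer.from_pretrained(model_id)
inputs = tokenizer("translate English to German: The house is wonderful.", return_tensors="pt")
print(tokenizer.decode(model.generate(**inputs)[0], skip_special_tokens=True))

model.save_pretrained("t5-small-int8-ov")  # hypothetical output path
```

The same flow should be reachable from the CLI through the `text2text-generation-with-past` task added to the export command and the task-to-model mapping in `utils.py`, together with the `--dataset`/`--num-samples` options used in `test_exporters_cli_full_quantization`.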