
[OV] Add quantization for text2text-generation models #1359


Merged
53 changes: 53 additions & 0 deletions docs/source/openvino/optimization.mdx
@@ -373,6 +373,59 @@ Click on a ✅ to copy the command/code for the corresponding optimization case.
</button>
</td>
</tr>
<tr>
<td style="text-align: center; vertical-align: middle;">text2text-generation<br>(OVModelForSeq2SeqLM)</td>
<td style="text-align: center; vertical-align: middle;">
<button
onclick="navigator.clipboard.writeText('optimum-cli export openvino -m google-t5/t5-small --weight-format int8 ./save_dir')">
</button>
</td>
<td style="text-align: center; vertical-align: middle;">
<button
onclick="navigator.clipboard.writeText('OVModelForSeq2SeqLM.from_pretrained(\'google-t5/t5-small\', quantization_config=OVWeightQuantizationConfig(bits=8)).save_pretrained(\'save_dir\')')">
</button>
</td>
<td style="text-align: center; vertical-align: middle;">
<button
onclick="navigator.clipboard.writeText('optimum-cli export openvino -m google-t5/t5-small --weight-format int4 --dataset wikitext2 ./save_dir')">
</button>
</td>
<td style="text-align: center; vertical-align: middle;">
<button
onclick="navigator.clipboard.writeText('OVModelForSeq2SeqLM.from_pretrained(\'google-t5/t5-small\', quantization_config=OVWeightQuantizationConfig(bits=4, dataset=\'wikitext2\')).save_pretrained(\'save_dir\')')">
</button>
</td>
<td style="text-align: center; vertical-align: middle;">–</td>
                    <td style="text-align: center; vertical-align: middle;">–</td>
<td style="text-align: center; vertical-align: middle;">
<button
onclick="navigator.clipboard.writeText('optimum-cli export openvino -m google-t5/t5-small --quant-mode int8 --dataset wikitext2 --smooth-quant-alpha -1 ./save_dir')">
</button>
</td>
<td style="text-align: center; vertical-align: middle;">
<button
onclick="navigator.clipboard.writeText('OVModelForSeq2SeqLM.from_pretrained(\'google-t5/t5-small\', quantization_config=OVQuantizationConfig(bits=8, dataset=\'wikitext2\', smooth_quant_alpha=-1)).save_pretrained(\'save_dir\')')">
</button>
</td>
<td style="text-align: center; vertical-align: middle;">
<button
onclick="navigator.clipboard.writeText('optimum-cli export openvino -m google-t5/t5-small --quant-mode nf4_f8e4m3 --dataset wikitext2 --smooth-quant-alpha -1 ./save_dir')">
</button>
</td>
<td style="text-align: center; vertical-align: middle;">
<button
onclick="navigator.clipboard.writeText('OVModelForSeq2SeqLM.from_pretrained(\'google-t5/t5-small\', quantization_config=OVMixedQuantizationConfig(OVWeightQuantizationConfig(bits=4, dtype=\'nf4\'), OVQuantizationConfig(dtype=\'f8e4m3\', dataset=\'wikitext2\', smooth_quant_alpha=-1))).save_pretrained(\'save_dir\')')">
</button>
</td>
</tr>
<tr>
<td style="text-align: center; vertical-align: middle;">zero-shot-image-classification<br>(OVModelForZeroShotImageClassification)</td>
<td style="text-align: center; vertical-align: middle;">
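The table entries above map to Python like the following. A minimal sketch of the int8 weight-only case, assembled from the commands embedded in the table (the generation lines at the end are illustrative usage, not part of the docs):

from transformers import AutoTokenizer
from optimum.intel import OVModelForSeq2SeqLM, OVWeightQuantizationConfig

# int8 weight-only quantization, as documented in the table above
model = OVModelForSeq2SeqLM.from_pretrained(
    "google-t5/t5-small",
    quantization_config=OVWeightQuantizationConfig(bits=8),
)
model.save_pretrained("save_dir")

# The quantized model remains a drop-in replacement for generation
tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small")
inputs = tokenizer("translate English to German: Hello", return_tensors="pt")
print(tokenizer.decode(model.generate(**inputs)[0], skip_special_tokens=True))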
5 changes: 5 additions & 0 deletions optimum/commands/export/openvino.py
@@ -482,6 +482,7 @@ def run(self):
and (
task in ["fill-mask", "zero-shot-image-classification"]
or task.startswith("text-generation")
or task.startswith("text2text-generation")
or task.startswith("automatic-speech-recognition")
or task.startswith("feature-extraction")
)
@@ -491,6 +492,10 @@
from optimum.intel import OVModelForCausalLM

model_cls = OVModelForCausalLM
elif task.startswith("text2text-generation"):
from optimum.intel import OVModelForSeq2SeqLM

model_cls = OVModelForSeq2SeqLM
elif task == "image-text-to-text":
from optimum.intel import OVModelForVisualCausalLM

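With this dispatch in place, data-aware CLI runs for seq2seq tasks load the model through OVModelForSeq2SeqLM before calibration. For example, the full-quantization command from the docs table above:

optimum-cli export openvino -m google-t5/t5-small --quant-mode int8 --dataset wikitext2 --smooth-quant-alpha -1 ./save_dir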
18 changes: 18 additions & 0 deletions optimum/intel/openvino/configuration.py
@@ -361,6 +361,24 @@ class OVQuantizationMethod(str, Enum):

# Default configs for int8 full quantization
_DEFAULT_INT8_FQ_CONFIGS = {
"google-t5/t5-small": {
"dtype": "int8",
"dataset": "wikitext2",
"num_samples": 300,
"smooth_quant_alpha": -1,
},
"google-t5/t5-large": {
"dtype": "int8",
"dataset": "wikitext2",
"num_samples": 300,
"smooth_quant_alpha": -1,
},
"google-t5/t5-3b": {
"dtype": "int8",
"dataset": "wikitext2",
"num_samples": 300,
"smooth_quant_alpha": -1,
},
"FacebookAI/roberta-large": {
"dtype": "int8",
"dataset": "wikitext2",
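Each entry above is a predefined full-quantization recipe keyed by model id. A hedged sketch of the equivalent explicit configuration for the t5-small entry (the lookup mechanism itself is internal to optimum-intel):

from optimum.intel import OVQuantizationConfig

# Explicit form of the "google-t5/t5-small" default recipe above;
# smooth_quant_alpha=-1 disables SmoothQuant for these models.
config = OVQuantizationConfig(
    dtype="int8",
    dataset="wikitext2",
    num_samples=300,
    smooth_quant_alpha=-1,
)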
72 changes: 45 additions & 27 deletions optimum/intel/openvino/modeling_seq2seq.py
@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import copy
import logging
import os
from pathlib import Path
@@ -42,9 +41,9 @@

from ...exporters.openvino import main_export
from ...exporters.openvino.stateful import model_has_state
from .. import OVConfig, OVQuantizer
from .. import OVConfig
from ..utils import is_transformers_version
from .configuration import OVQuantizationConfigBase, OVWeightQuantizationConfig
from .configuration import OVWeightQuantizationConfig
from .modeling_base import OVBaseModel
from .utils import (
ONNX_DECODER_NAME,
@@ -477,7 +476,6 @@ def _from_pretrained(
decoder_with_past_file_name = decoder_with_past_file_name or default_decoder_with_past_file_name
decoder_with_past = None

quantization_config = cls._prepare_quantization_config(quantization_config, load_in_8bit)
compile_only = kwargs.pop("compile_only", False)
device = kwargs.pop("device", "CPU")
ov_config = kwargs.pop("ov_config", None)
@@ -521,10 +519,10 @@
"decoder_with_past": model_save_dir / decoder_with_past_file_name,
}
if not compile_only:
encoder = cls.load_model(file_names["encoder"], quantization_config)
decoder = cls.load_model(file_names["decoder"], quantization_config)
encoder = cls.load_model(file_names["encoder"])
decoder = cls.load_model(file_names["decoder"])
if use_cache and not model_has_state(decoder) and os.path.exists(file_names["decoder_with_past"]):
decoder_with_past = cls.load_model(file_names["decoder_with_past"], quantization_config)
decoder_with_past = cls.load_model(file_names["decoder_with_past"])
else:
model_kwargs = {"device": device, "ov_config": ov_config, "model_save_dir": model_save_dir}
encoder = cls._compile_model(file_names["encoder"], **model_kwargs)
@@ -551,7 +549,8 @@
"Generation config file not found, using a generation config created from the model config."
)

return cls(
quantization_config = cls._prepare_quantization_config(quantization_config, load_in_8bit)
model = cls(
encoder=encoder,
decoder=decoder,
decoder_with_past=decoder_with_past,
@@ -565,6 +564,17 @@
**kwargs,
)

if quantization_config is not None:
from optimum.intel import OVQuantizer

quantizer = OVQuantizer(model)
quantization_config_copy = quantization_config.clone()
quantization_config_copy.tokenizer = quantization_config.tokenizer or model_id
quantization_config_copy.processor = quantization_config.processor or model_id
quantizer.quantize(ov_config=OVConfig(quantization_config=quantization_config_copy))

return model

@classmethod
def _export(
cls,
@@ -657,12 +667,17 @@ def forward(
encoder_outputs: Optional[Tuple[Tuple[torch.Tensor]]] = None,
past_key_values: Optional[Tuple[Tuple[torch.Tensor]]] = None,
cache_position: Optional[torch.LongTensor] = None,
labels: Optional[torch.LongTensor] = None,
**kwargs,
) -> Seq2SeqLMOutput:
        # Encode if needed: first prediction pass
if encoder_outputs is None:
encoder_outputs = self.encoder(input_ids=input_ids, attention_mask=attention_mask)

if labels is not None and decoder_input_ids is None:
# get decoder inputs from shifting lm labels to the right
decoder_input_ids = self._shift_right(labels)

# Decode
if past_key_values is None or self.decoder_with_past is None:
decoder_outputs = self.decoder(
@@ -786,6 +801,27 @@ def compile(self):
for submodel_name in self._ov_submodel_names:
getattr(self, submodel_name)._compile()

def _shift_right(self, input_ids):
decoder_start_token_id = self.config.decoder_start_token_id
pad_token_id = self.config.pad_token_id

if decoder_start_token_id is None:
raise ValueError(
"self.model.config.decoder_start_token_id has to be defined. In T5 it is usually set to the pad_token_id. "
"See T5 docs for more information."
)

shifted_input_ids = input_ids.new_zeros(input_ids.shape)
shifted_input_ids[..., 1:] = input_ids[..., :-1].clone()
shifted_input_ids[..., 0] = decoder_start_token_id

if pad_token_id is None:
raise ValueError("self.model.config.pad_token_id has to be defined.")
        # replace possible -100 values in labels with `pad_token_id`
shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)

return shifted_input_ids


class OVEncoder:
"""
@@ -1345,27 +1381,9 @@ def _from_pretrained(
cls,
model_id: Union[str, Path],
config: "PretrainedConfig",
load_in_8bit: bool = False,
quantization_config: Union[dict, OVQuantizationConfigBase] = None,
**kwargs,
):
compile_only = kwargs.get("compile_only", False)

quantization_config = cls._prepare_quantization_config(quantization_config, load_in_8bit)
is_data_aware_quantization = quantization_config is not None and quantization_config.dataset is not None
if not compile_only and is_data_aware_quantization:
model = super(OVModelForSpeechSeq2Seq, cls)._from_pretrained(
model_id, config, load_in_8bit=False, **kwargs
)
quantization_config_copy = copy.deepcopy(quantization_config)
quantization_config_copy.processor = quantization_config.processor or model_id
OVQuantizer(model).quantize(ov_config=OVConfig(quantization_config=quantization_config_copy))
else:
model = super(OVModelForSpeechSeq2Seq, cls)._from_pretrained(
model_id, config, load_in_8bit=load_in_8bit, quantization_config=quantization_config, **kwargs
)

return model
return super(OVModelForSpeechSeq2Seq, cls)._from_pretrained(model_id, config, **kwargs)

class DummyWhisperModel:
def __init__(self):
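Beyond moving quantization to a post-load OVQuantizer pass, this file adds a labels path that mirrors transformers' T5: when labels are given without decoder_input_ids, _shift_right prepends the start token, shifts the sequence one position right, and maps -100 padding back to pad_token_id. A minimal sketch (T5 uses pad_token_id = 0 and decoder_start_token_id = 0; token values are illustrative):

import torch
from transformers import AutoTokenizer
from optimum.intel import OVModelForSeq2SeqLM

model = OVModelForSeq2SeqLM.from_pretrained("google-t5/t5-small", export=True)
tokenizer = AutoTokenizer.from_pretrained("google-t5/t5-small")

inputs = tokenizer("translate English to German: Hello", return_tensors="pt")
labels = torch.tensor([[8774, -100, -100]])  # -100 marks ignored positions
outputs = model(**inputs, labels=labels)  # decoder_input_ids derived internally

# Internally, _shift_right turns those labels into [[0, 8774, 0]]:
# start token prepended, sequence shifted right, -100 -> pad_token_id.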
73 changes: 65 additions & 8 deletions optimum/intel/openvino/quantization.py
@@ -270,6 +270,7 @@ def build_from_quantization_config(self, config: OVQuantizationConfigBase) -> OV
OVModelForCausalLM,
OVModelForFeatureExtraction,
OVModelForMaskedLM,
OVModelForSeq2SeqLM,
OVModelForVisualCausalLM,
OVModelForZeroShotImageClassification,
OVSentenceTransformer,
@@ -344,7 +345,9 @@
)

return self.build_from_dataset(config, dataset)
elif isinstance(self.model, (OVModelForFeatureExtraction, OVSentenceTransformer, OVModelForMaskedLM)):
elif isinstance(
self.model, (OVModelForFeatureExtraction, OVSentenceTransformer, OVModelForMaskedLM, OVModelForSeq2SeqLM)
):
if isinstance(config.dataset, str):
dataset_metadata = PREDEFINED_LANGUAGE_DATASETS[config.dataset]
dataset = self.load_dataset(
@@ -467,6 +470,7 @@ def build_from_dataset(
from optimum.intel import (
OVModelForFeatureExtraction,
OVModelForMaskedLM,
OVModelForSeq2SeqLM,
OVModelForVisualCausalLM,
OVModelForZeroShotImageClassification,
OVSentenceTransformer,
@@ -492,6 +496,7 @@
OVModelForMaskedLM,
OVModelForZeroShotImageClassification,
OVSentenceTransformer,
OVModelForSeq2SeqLM,
),
) or (is_diffusers_available() and isinstance(self.model, OVDiffusionPipeline)):
# Prepare from raw dataset avoiding dataloader creation
Expand All @@ -504,6 +509,8 @@ def build_from_dataset(
return self._prepare_visual_causal_lm_calibration_data(quantization_config, dataset)
elif isinstance(self.model, _OVModelForWhisper):
return self._prepare_speech_to_text_calibration_data(quantization_config, dataset)
elif isinstance(self.model, OVModelForSeq2SeqLM):
return self._prepare_text_to_text_calibration_data(quantization_config, dataset)
elif is_diffusers_available() and isinstance(self.model, OVDiffusionPipeline):
return self._prepare_diffusion_calibration_data(quantization_config, dataset)
elif isinstance(self.model, (OVModelForFeatureExtraction, OVSentenceTransformer, OVModelForMaskedLM)):
@@ -770,6 +777,56 @@ def _prepare_speech_to_text_calibration_data(

return OVCalibrationDataset(collected_inputs)

def _prepare_text_to_text_calibration_data(
self,
config: OVQuantizationConfigBase,
dataset: "Dataset",
seq_len: int = 128,
) -> OVCalibrationDataset:
"""
        Prepares calibration data for text-to-text pipelines by running inference on a dataset and collecting the inputs passed to each submodel.
"""
from optimum.intel.openvino.modeling_seq2seq import OVDecoder, OVEncoder

models: Dict[str, Union[OVEncoder, OVDecoder]] = {}
collected_inputs: Dict[str, List[Dict[str, Any]]] = {}
for submodel_name in self.model._ov_submodel_names:
ov_component: Union[OVEncoder, OVDecoder] = getattr(self.model, submodel_name)
models[submodel_name] = ov_component
collected_inputs[submodel_name] = []
ov_component._compile()
ov_component.request = InferRequestWrapper(
ov_component.request, collected_inputs[submodel_name], apply_caching=True
)
try:

def get_tokenizer():
if config.tokenizer is None:
raise ValueError("Please provide tokenizer for calibration via quantization_config.tokenizer.")
return AutoTokenizer.from_pretrained(config.tokenizer, trust_remote_code=config.trust_remote_code)

num_samples = config.num_samples or 128
dataset = list(tqdm(dataset.take(num_samples), desc="Downloading dataset", total=num_samples))

tokenizer = None
for item in tqdm(dataset, desc="Collecting calibration data"):
if "input_ids" in item:
                    # Assuming the dataset already contains preprocessed text
inputs = self._wrap_sample_as_array(item, add_batch_dim=True)
else:
tokenizer = tokenizer or get_tokenizer()
inputs = tokenizer(item["text"], truncation=True, max_length=seq_len, return_tensors="pt")

self.model.generate(**inputs, max_new_tokens=seq_len)
finally:
for model in models.values():
model.request = model.request.request

for model_name in collected_inputs:
collected_inputs[model_name] = nncf.Dataset(collected_inputs[model_name])

return OVCalibrationDataset(collected_inputs)

def _prepare_diffusion_calibration_data(
self, config: OVQuantizationConfigBase, dataset: Union[List, "Dataset"]
) -> OVCalibrationDataset:
@@ -1202,18 +1259,16 @@ def _quantize_ovbasemodel(
#
# Regular (non-hybrid) weight-only quantization
#
if is_diffusers_available() and isinstance(self.model, OVDiffusionPipeline):
for submodel_name in self.model.ov_submodels:
quantization_configs[submodel_name] = quantization_config
elif isinstance(self.model, OVModelForVisualCausalLM):
if isinstance(self.model, OVModelForVisualCausalLM):
for submodel_name in self.model.ov_submodels:
quantization_configs[submodel_name] = (
quantization_config
if submodel_name == "lm_model"
else OVWeightQuantizationConfig(bits=8, sym=True)
)
else:
quantization_configs["model"] = quantization_config
for submodel_name in self.model.ov_submodels:
quantization_configs[submodel_name] = quantization_config
else:
#
# Hybrid/Full/Mixed quantization
@@ -1274,15 +1329,17 @@
else OVWeightQuantizationConfig(bits=8, sym=True)
)
else:
quantization_configs["model"] = quantization_config
for submodel_name in self.model.ov_submodels:
quantization_configs[submodel_name] = quantization_config
elif isinstance(quantization_config, OVMixedQuantizationConfig):
#
# Mixed quantization
#
if is_diffusers_available() and isinstance(self.model, OVDiffusionPipeline):
raise NotImplementedError("Mixed precision quantization isn't supported for diffusers.")

quantization_configs["model"] = quantization_config
for submodel_name in self.model.ov_submodels:
quantization_configs[submodel_name] = quantization_config
else:
raise ValueError(f"Unsupported type of quantization config: {type(quantization_config)}")

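The collection step in _prepare_text_to_text_calibration_data works by temporarily swapping each submodel's infer request for a recording wrapper and restoring it in the finally block. A minimal sketch of that interception pattern, with illustrative names (the real InferRequestWrapper also applies input caching):

from typing import Any, Callable, Dict, List

class RecordingWrapper:
    """Records every set of inputs passed through, then delegates."""

    def __init__(self, request: Callable, collected: List[Dict[str, Any]]):
        self.request = request      # original request, restored after calibration
        self.collected = collected  # shared list, later wrapped in nncf.Dataset

    def __call__(self, inputs: Dict[str, Any], *args, **kwargs):
        self.collected.append(dict(inputs))  # keep a shallow copy of the inputs
        return self.request(inputs, *args, **kwargs)

# Usage mirrors the diff: wrap, run generate() to populate the list, unwrap.
#   component.request = RecordingWrapper(component.request, collected)
#   ...
#   component.request = component.request.request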
1 change: 1 addition & 0 deletions optimum/intel/openvino/utils.py
@@ -124,6 +124,7 @@
"fill-mask": "OVModelForMaskedLM",
"text-generation": "OVModelForCausalLM",
"text2text-generation": "OVModelForSeq2SeqLM",
"text2text-generation-with-past": "OVModelForSeq2SeqLM",
"text-classification": "OVModelForSequenceClassification",
"token-classification": "OVModelForTokenClassification",
"question-answering": "OVModelForQuestionAnswering",
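Both task aliases now resolve to the same class, which matters because the exporter appends "-with-past" to the task name when a stateful decoder is requested. A hedged check (assuming the mapping shown in this hunk is the module's task-to-class dict, e.g. _HEAD_TO_AUTOMODELS):

from optimum.intel.openvino.utils import _HEAD_TO_AUTOMODELS

assert _HEAD_TO_AUTOMODELS["text2text-generation"] == "OVModelForSeq2SeqLM"
assert _HEAD_TO_AUTOMODELS["text2text-generation-with-past"] == "OVModelForSeq2SeqLM"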