[OV] Add quantization for text2text-generation models #1359


Merged
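In short, this change lets seq2seq (`text2text-generation`) models be statically quantized both through `optimum-cli` and through `from_pretrained`. A minimal sketch of the Python flow, mirroring the updated tests (the model ID, dataset, and sample count are illustrative):

```python
from optimum.intel import OVModelForSeq2SeqLM, OVQuantizationConfig

# Illustrative checkpoint; any text2text-generation model should work the same way.
quantization_config = OVQuantizationConfig(dtype="int8", dataset="wikitext2", num_samples=32)
model = OVModelForSeq2SeqLM.from_pretrained("t5-small", quantization_config=quantization_config)
model.save_pretrained("t5_int8_ov")
```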
5 changes: 5 additions & 0 deletions optimum/commands/export/openvino.py
@@ -482,6 +482,7 @@ def run(self):
and (
task in ["fill-mask", "zero-shot-image-classification"]
or task.startswith("text-generation")
or task.startswith("text2text-generation")
or task.startswith("automatic-speech-recognition")
or task.startswith("feature-extraction")
)
@@ -491,6 +492,10 @@ def run(self):
from optimum.intel import OVModelForCausalLM

model_cls = OVModelForCausalLM
elif task.startswith("text2text-generation"):
from optimum.intel import OVModelForSeq2SeqLM

model_cls = OVModelForSeq2SeqLM
elif task == "image-text-to-text":
from optimum.intel import OVModelForVisualCausalLM

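For reference, the new CLI branch can be exercised the way the updated exporter test does, via `subprocess`. A sketch (`--quant-mode int8` and the calibration flags are assumed from the test configuration further below; the model ID and output directory are illustrative):

```python
import subprocess

# Full int8 quantization of a seq2seq model via the CLI (flags assumed from the test).
subprocess.run(
    "optimum-cli export openvino --task text2text-generation-with-past "
    "--model t5-small --quant-mode int8 --dataset c4 --num-samples 32 t5_int8_ov",
    shell=True,
    check=True,
)
```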
46 changes: 19 additions & 27 deletions optimum/intel/openvino/modeling_seq2seq.py
@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import copy
import logging
import os
from pathlib import Path
@@ -42,9 +41,9 @@

from ...exporters.openvino import main_export
from ...exporters.openvino.stateful import model_has_state
from .. import OVConfig, OVQuantizer
from .. import OVConfig
from ..utils import is_transformers_version
from .configuration import OVQuantizationConfigBase, OVWeightQuantizationConfig
from .configuration import OVWeightQuantizationConfig
from .modeling_base import OVBaseModel
from .utils import (
ONNX_DECODER_NAME,
@@ -477,7 +476,6 @@ def _from_pretrained(
decoder_with_past_file_name = decoder_with_past_file_name or default_decoder_with_past_file_name
decoder_with_past = None

quantization_config = cls._prepare_quantization_config(quantization_config, load_in_8bit)
compile_only = kwargs.pop("compile_only", False)
device = kwargs.pop("device", "CPU")
ov_config = kwargs.pop("ov_config", None)
@@ -521,10 +519,10 @@
"decoder_with_past": model_save_dir / decoder_with_past_file_name,
}
if not compile_only:
encoder = cls.load_model(file_names["encoder"], quantization_config)
decoder = cls.load_model(file_names["decoder"], quantization_config)
encoder = cls.load_model(file_names["encoder"])
decoder = cls.load_model(file_names["decoder"])
if use_cache and not model_has_state(decoder) and os.path.exists(file_names["decoder_with_past"]):
decoder_with_past = cls.load_model(file_names["decoder_with_past"], quantization_config)
decoder_with_past = cls.load_model(file_names["decoder_with_past"])
else:
model_kwargs = {"device": device, "ov_config": ov_config, "model_save_dir": model_save_dir}
encoder = cls._compile_model(file_names["encoder"], **model_kwargs)
@@ -551,7 +549,8 @@
"Generation config file not found, using a generation config created from the model config."
)

return cls(
quantization_config = cls._prepare_quantization_config(quantization_config, load_in_8bit)
model = cls(
encoder=encoder,
decoder=decoder,
decoder_with_past=decoder_with_past,
@@ -565,6 +564,17 @@
**kwargs,
)

if quantization_config is not None:
from optimum.intel import OVQuantizer

quantizer = OVQuantizer(model)
quantization_config_copy = quantization_config.clone()
quantization_config_copy.tokenizer = quantization_config.tokenizer or model_id
quantization_config_copy.processor = quantization_config.processor or model_id
quantizer.quantize(ov_config=OVConfig(quantization_config=quantization_config_copy))

return model

@classmethod
def _export(
cls,
@@ -1345,27 +1355,9 @@ def _from_pretrained(
cls,
model_id: Union[str, Path],
config: "PretrainedConfig",
load_in_8bit: bool = False,
quantization_config: Union[dict, OVQuantizationConfigBase] = None,
**kwargs,
):
compile_only = kwargs.get("compile_only", False)

quantization_config = cls._prepare_quantization_config(quantization_config, load_in_8bit)
is_data_aware_quantization = quantization_config is not None and quantization_config.dataset is not None
if not compile_only and is_data_aware_quantization:
model = super(OVModelForSpeechSeq2Seq, cls)._from_pretrained(
model_id, config, load_in_8bit=False, **kwargs
)
quantization_config_copy = copy.deepcopy(quantization_config)
quantization_config_copy.processor = quantization_config.processor or model_id
OVQuantizer(model).quantize(ov_config=OVConfig(quantization_config=quantization_config_copy))
else:
model = super(OVModelForSpeechSeq2Seq, cls)._from_pretrained(
model_id, config, load_in_8bit=load_in_8bit, quantization_config=quantization_config, **kwargs
)

return model
return super(OVModelForSpeechSeq2Seq, cls)._from_pretrained(model_id, config, **kwargs)

class DummyWhisperModel:
def __init__(self):
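The net effect of the changes in this file: static quantization no longer happens inside `load_model` with a config passed per submodel file; instead `_from_pretrained` builds the model first and then runs `OVQuantizer` over all submodels, which also lets `OVModelForSpeechSeq2Seq` drop its bespoke override and defer to the base class. A condensed sketch of the new post-load step, assuming a config object with `clone()`, `tokenizer`, and `processor` attributes as in the diff:

```python
from optimum.intel import OVConfig, OVQuantizer

def quantize_after_load(model, quantization_config, model_id):
    # Clone so the caller's config is not mutated, then default the tokenizer
    # and processor to the model ID so calibration can load them on demand.
    config = quantization_config.clone()
    config.tokenizer = quantization_config.tokenizer or model_id
    config.processor = quantization_config.processor or model_id
    OVQuantizer(model).quantize(ov_config=OVConfig(quantization_config=config))
    return model
```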
73 changes: 65 additions & 8 deletions optimum/intel/openvino/quantization.py
@@ -270,6 +270,7 @@ def build_from_quantization_config(self, config: OVQuantizationConfigBase) -> OV
OVModelForCausalLM,
OVModelForFeatureExtraction,
OVModelForMaskedLM,
OVModelForSeq2SeqLM,
OVModelForVisualCausalLM,
OVModelForZeroShotImageClassification,
OVSentenceTransformer,
@@ -344,7 +345,9 @@ def build_from_quantization_config(self, config: OVQuantizationConfigBase) -> OV
)

return self.build_from_dataset(config, dataset)
elif isinstance(self.model, (OVModelForFeatureExtraction, OVSentenceTransformer, OVModelForMaskedLM)):
elif isinstance(
self.model, (OVModelForFeatureExtraction, OVSentenceTransformer, OVModelForMaskedLM, OVModelForSeq2SeqLM)
):
if isinstance(config.dataset, str):
dataset_metadata = PREDEFINED_LANGUAGE_DATASETS[config.dataset]
dataset = self.load_dataset(
@@ -467,6 +470,7 @@ def build_from_dataset(
from optimum.intel import (
OVModelForFeatureExtraction,
OVModelForMaskedLM,
OVModelForSeq2SeqLM,
OVModelForVisualCausalLM,
OVModelForZeroShotImageClassification,
OVSentenceTransformer,
@@ -492,6 +496,7 @@ def build_from_dataset(
OVModelForMaskedLM,
OVModelForZeroShotImageClassification,
OVSentenceTransformer,
OVModelForSeq2SeqLM,
),
) or (is_diffusers_available() and isinstance(self.model, OVDiffusionPipeline)):
# Prepare from raw dataset avoiding dataloader creation
@@ -504,6 +509,8 @@
return self._prepare_visual_causal_lm_calibration_data(quantization_config, dataset)
elif isinstance(self.model, _OVModelForWhisper):
return self._prepare_speech_to_text_calibration_data(quantization_config, dataset)
elif isinstance(self.model, OVModelForSeq2SeqLM):
return self._prepare_text_to_text_calibration_data(quantization_config, dataset)
elif is_diffusers_available() and isinstance(self.model, OVDiffusionPipeline):
return self._prepare_diffusion_calibration_data(quantization_config, dataset)
elif isinstance(self.model, (OVModelForFeatureExtraction, OVSentenceTransformer, OVModelForMaskedLM)):
@@ -770,6 +777,56 @@ def _prepare_speech_to_text_calibration_data(

return OVCalibrationDataset(collected_inputs)

def _prepare_text_to_text_calibration_data(
self,
config: OVQuantizationConfigBase,
dataset: "Dataset",
seq_len: int = 128,
) -> OVCalibrationDataset:
"""
Prepares calibration data for text-to-text pipelines by running model inference on a dataset and collecting the incurred submodel inputs.
"""
from optimum.intel.openvino.modeling_seq2seq import OVDecoder, OVEncoder

models: Dict[str, Union[OVEncoder, OVDecoder]] = {}
collected_inputs: Dict[str, List[Dict[str, Any]]] = {}
for submodel_name in self.model._ov_submodel_names:
ov_component: Union[OVEncoder, OVDecoder] = getattr(self.model, submodel_name)
models[submodel_name] = ov_component
collected_inputs[submodel_name] = []
ov_component._compile()
ov_component.request = InferRequestWrapper(
ov_component.request, collected_inputs[submodel_name], apply_caching=True
)
try:

def get_tokenizer():
if config.tokenizer is None:
raise ValueError("Please provide tokenizer for calibration via quantization_config.tokenizer.")
return AutoTokenizer.from_pretrained(config.tokenizer, trust_remote_code=config.trust_remote_code)

num_samples = config.num_samples or 128
dataset = list(tqdm(dataset.take(num_samples), desc="Downloading dataset", total=num_samples))

tokenizer = None
for item in tqdm(dataset, desc="Collecting calibration data"):
if "input_ids" in item:
# Assume the dataset already contains preprocessed text
inputs = self._wrap_sample_as_array(item, add_batch_dim=True)
else:
tokenizer = tokenizer or get_tokenizer()
inputs = tokenizer(item["text"], truncation=True, max_length=seq_len, return_tensors="pt")

self.model.generate(**inputs, max_new_tokens=seq_len)
finally:
for model in models.values():
model.request = model.request.request

for model_name in collected_inputs:
collected_inputs[model_name] = nncf.Dataset(collected_inputs[model_name])

return OVCalibrationDataset(collected_inputs)

def _prepare_diffusion_calibration_data(
self, config: OVQuantizationConfigBase, dataset: Union[List, "Dataset"]
) -> OVCalibrationDataset:
@@ -1202,18 +1259,16 @@ def _quantize_ovbasemodel(
#
# Regular (non-hybrid) weight-only quantization
#
if is_diffusers_available() and isinstance(self.model, OVDiffusionPipeline):
for submodel_name in self.model.ov_submodels:
quantization_configs[submodel_name] = quantization_config
elif isinstance(self.model, OVModelForVisualCausalLM):
if isinstance(self.model, OVModelForVisualCausalLM):
for submodel_name in self.model.ov_submodels:
quantization_configs[submodel_name] = (
quantization_config
if submodel_name == "lm_model"
else OVWeightQuantizationConfig(bits=8, sym=True)
)
else:
quantization_configs["model"] = quantization_config
for submodel_name in self.model.ov_submodels:
quantization_configs[submodel_name] = quantization_config
else:
#
# Hybrid/Full/Mixed quantization
Expand Down Expand Up @@ -1274,15 +1329,17 @@ def _quantize_ovbasemodel(
else OVWeightQuantizationConfig(bits=8, sym=True)
)
else:
quantization_configs["model"] = quantization_config
for submodel_name in self.model.ov_submodels:
quantization_configs[submodel_name] = quantization_config
elif isinstance(quantization_config, OVMixedQuantizationConfig):
#
# Mixed quantization
#
if is_diffusers_available() and isinstance(self.model, OVDiffusionPipeline):
raise NotImplementedError("Mixed precision quantization isn't supported for diffusers.")

quantization_configs["model"] = quantization_config
for submodel_name in self.model.ov_submodels:
quantization_configs[submodel_name] = quantization_config
else:
raise ValueError(f"Unsupported type of quantization config: {type(quantization_config)}")

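The calibration collection in `_prepare_text_to_text_calibration_data` works by temporarily swapping each submodel's inference request for a recording proxy, running `generate()`, and restoring the originals in the `finally` block. A simplified stand-in for `InferRequestWrapper` to illustrate the pattern (the real class also handles input caching and async inference):

```python
class RecordingRequest:
    """Simplified sketch of the request-wrapping pattern used for calibration."""

    def __init__(self, request, collected):
        self.request = request      # the original OpenVINO infer request
        self.collected = collected  # shared list read by the calibration builder

    def __call__(self, inputs, *args, **kwargs):
        # Record a copy of the inputs, then delegate to the real request.
        self.collected.append(dict(inputs))
        return self.request(inputs, *args, **kwargs)
```

During generation, every encoder and decoder call lands in `collected`, so each submodel ends up with its own `nncf.Dataset` of real inputs.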
1 change: 1 addition & 0 deletions optimum/intel/openvino/utils.py
@@ -124,6 +124,7 @@
"fill-mask": "OVModelForMaskedLM",
"text-generation": "OVModelForCausalLM",
"text2text-generation": "OVModelForSeq2SeqLM",
"text2text-generation-with-past": "OVModelForSeq2SeqLM",
"text-classification": "OVModelForSequenceClassification",
"token-classification": "OVModelForTokenClassification",
"question-answering": "OVModelForQuestionAnswering",
23 changes: 20 additions & 3 deletions tests/openvino/test_exporters_cli.py
@@ -400,6 +400,20 @@ class OVCLIExportTestCase(unittest.TestCase):
"model": {"int8": 65},
},
),
(
"text2text-generation-with-past",
"t5",
"int8",
"--dataset c4 --num-samples 1",
{"encoder": 30, "decoder": 52, "decoder_with_past": 61}
if is_transformers_version("<=", "4.36.0")
else {"encoder": 30, "decoder": 62},
(
{"encoder": {"int8": 32}, "decoder": {"int8": 52}, "decoder_with_past": {"int8": 42}}
if is_transformers_version("<=", "4.36.0")
else {"encoder": {"int8": 32}, "decoder": {"int8": 52}}
),
),
]

TEST_4BIT_CONFIGURATIONS = [
@@ -957,6 +971,7 @@ def test_exporters_cli_full_quantization(
expected_fake_nodes_per_model: Dict[str, int],
expected_num_weight_nodes_per_model: Dict[str, Dict[str, int]],
):
self.maxDiff = 100000
with TemporaryDirectory() as tmpdir:
subprocess.run(
f"optimum-cli export openvino --task {task} --model {MODEL_NAMES[model_type]} "
@@ -971,9 +986,11 @@
)
model = model_cls.from_pretrained(tmpdir)

if "automatic-speech-recognition" in task and model.decoder_with_past is None:
del expected_num_weight_nodes_per_model["decoder_with_past"]
del expected_fake_nodes_per_model["decoder_with_past"]
if (
"automatic-speech-recognition" in task or "text2text-generation" in task
) and model.decoder_with_past is None:
expected_num_weight_nodes_per_model.pop("decoder_with_past", None)
expected_fake_nodes_per_model.pop("decoder_with_past", None)

check_compression_state_per_model(
self,
34 changes: 28 additions & 6 deletions tests/openvino/test_quantization.py
@@ -127,7 +127,7 @@ class OVQuantizerTest(unittest.TestCase):
(OVModelForSequenceClassification, "bert", 32, 35),
(OVModelForCausalLM, "gpt2", 31, 22),
)
# TODO (nikita-savelyevv): Extend for OVModelForSpeechSeq2Seq and OVStableDiffusionPipeline
# TODO (nikita-savelyevv): Extend for OVModelForSpeechSeq2Seq, OVStableDiffusionPipeline and OVModelForSeq2SeqLM
SUPPORTED_ARCHITECTURES_OV_MODEL = (
(OVModelForSequenceClassification, "bert", 32, 35),
(OVModelForCausalLM, "gpt2", 31, 22),
@@ -356,6 +356,23 @@ class OVQuantizerTest(unittest.TestCase):
"model": {"int8": 65},
},
),
(
OVModelForSeq2SeqLM,
"t5",
OVQuantizationConfig(
dtype="int8",
dataset="wikitext2",
num_samples=1,
),
{"encoder": 30, "decoder": 52, "decoder_with_past": 61}
if is_transformers_version("<=", "4.36.0")
else {"encoder": 30, "decoder": 62},
(
{"encoder": {"int8": 32}, "decoder": {"int8": 52}, "decoder_with_past": {"int8": 42}}
if is_transformers_version("<=", "4.36.0")
else {"encoder": {"int8": 32}, "decoder": {"int8": 52}}
),
),
]

if is_transformers_version(">=", "4.45.0"):
@@ -572,13 +589,18 @@ def test_ov_model_static_quantization_with_auto_dataset(
ov_model = model_cls.from_pretrained(model_id, quantization_config=quantization_config)
ov_model.save_pretrained(tmp_dir)

if model_cls == OVModelForSpeechSeq2Seq:
if model_cls in [OVModelForSpeechSeq2Seq, OVModelForSeq2SeqLM]:
if ov_model.decoder_with_past is None:
del expected_fake_nodes_per_model["decoder_with_past"]
del expected_num_weight_nodes_per_model["decoder_with_past"]
expected_fake_nodes_per_model.pop("decoder_with_past", None)
expected_num_weight_nodes_per_model.pop("decoder_with_past", None)

input_features = torch.randn((1, ov_model.config.num_mel_bins, 3000), dtype=torch.float32)
ov_model.generate(input_features)
if model_cls == OVModelForSpeechSeq2Seq:
input_features = torch.randn((1, ov_model.config.num_mel_bins, 3000), dtype=torch.float32)
ov_model.generate(input_features)
else:
tokenizer = AutoTokenizer.from_pretrained(model_id)
inputs = tokenizer("This is a sample <mask>", return_tensors="pt")
ov_model.generate(**inputs)
elif model_cls in (OVModelForCausalLM, OVModelForFeatureExtraction, OVModelForMaskedLM):
tokenizer = AutoTokenizer.from_pretrained(model_id)
if tokenizer.pad_token is None:
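End to end, a quantized seq2seq export behaves like any other `OVModelForSeq2SeqLM`. A smoke test in the spirit of the updated unit test (the output directory comes from the earlier sketches, the prompt is illustrative, and the tokenizer is assumed to have been saved alongside the export):

```python
from transformers import AutoTokenizer
from optimum.intel import OVModelForSeq2SeqLM

model = OVModelForSeq2SeqLM.from_pretrained("t5_int8_ov")
tokenizer = AutoTokenizer.from_pretrained("t5_int8_ov")
inputs = tokenizer("translate English to German: Hello, world!", return_tensors="pt")
output_ids = model.generate(**inputs, max_new_tokens=16)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))
```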