 """Common utility functions for model conversion."""

+import enum
 import os
 import pathlib
 from typing import Optional, Union
@@ -42,6 +43,27 @@ def forward(self, *export_args, **export_kwargs):
     return self.module(*export_args, **full_kwargs)


+class QuantizationName(str, enum.Enum):
+  """Strings for all supported quantization recipes.
+
+  none: No quantization.
+  dynamic_int8: Dynamic range quantization with int8 weights.
+  weight_only_int8: Weight only quantization with int8 weights.
+  fp16: Float16 quantization.
+  dynamic_int4_block32: Dynamic range quantization with int4 weights and block
+    size of 32, better model quality but slower inference.
+  dynamic_int4_block128: Dynamic range quantization with int4 weights and block
+    size of 128, faster inference but worse model quality.
+  """
+
+  NONE = 'none'
+  DYNAMIC_INT8 = 'dynamic_int8'
+  WEIGHT_ONLY_INT8 = 'weight_only_int8'
+  FP16 = 'fp16'
+  DYNAMIC_INT4_BLOCK32 = 'dynamic_int4_block32'
+  DYNAMIC_INT4_BLOCK128 = 'dynamic_int4_block128'
+
+
 def define_conversion_flags(
     model_name: str,
     default_mask_as_input: bool = False,
@@ -74,10 +96,10 @@ def define_conversion_flags(
       1280,
       'The maximum size of KV cache buffer, including both prefill and decode.',
   )
-  flags.DEFINE_bool(
+  flags.DEFINE_string(
       'quantize',
-      True,
-      'Whether the model should be quantized.',
+      'dynamic_int8',
+      'How the model should be quantized.',
   )
   flags.DEFINE_multi_integer(
       'lora_ranks',
@@ -99,6 +121,66 @@ def define_conversion_flags(
   return flags


+def get_quant_recipe_from_flag(
+    quantize: str,
+) -> Optional[quant_recipes.QuantizationRecipe]:
+  """Processes the quantization flag and returns the corresponding recipe.
+
+  Args:
+    quantize: The quantization type.
+
+  Returns:
+    The quantization recipe, or None if no quantization is needed.
+
+  Raises:
+    ValueError: If the quantization type is not supported.
+  """
+  match quantize:
+    case QuantizationName.NONE:
+      return None
+    case QuantizationName.DYNAMIC_INT8:
+      return quant_recipes.full_int8_dynamic_recipe()
+    case QuantizationName.WEIGHT_ONLY_INT8:
+      return quant_recipes.full_int8_weight_only_recipe()
+    case QuantizationName.FP16:
+      return quant_recipes.full_fp16_recipe()
+    case QuantizationName.DYNAMIC_INT4_BLOCK32:
+      return quant_recipes.full_int4_dynamic_block_recipe(32)
+    case QuantizationName.DYNAMIC_INT4_BLOCK128:
+      return quant_recipes.full_int4_dynamic_block_recipe(128)
+    case _:
+      raise ValueError(f'Unsupported quantization flag: {quantize}')
+
+
+def create_quantize_suffix(quantize: str) -> str:
+  """Creates a suffix for the output file name based on the quantization type.
+
+  Args:
+    quantize: The quantization type.
+
+  Returns:
+    A string representing the quantization suffix.
+
+  Raises:
+    ValueError: If the quantization type is not supported.
+  """
+  match quantize:
+    case QuantizationName.NONE:
+      return 'f32'
+    case QuantizationName.DYNAMIC_INT8:
+      return 'q8'
+    case QuantizationName.WEIGHT_ONLY_INT8:
+      return 'q8_wo'
+    case QuantizationName.FP16:
+      return 'fp16'
+    case QuantizationName.DYNAMIC_INT4_BLOCK32:
+      return 'q4_block32'
+    case QuantizationName.DYNAMIC_INT4_BLOCK128:
+      return 'q4_block128'
+    case _:
+      raise ValueError(f'Unsupported quantization flag: {quantize}')
+
+
 def _build_mask(mask_len, kv_cache_max_len, causal_mask_value) -> torch.Tensor:
   if isinstance(mask_len, list):
     return [
@@ -118,7 +200,7 @@ def convert_to_tflite(
     prefill_seq_len: Union[int, list[int]],
     pixel_values_size: torch.Size = None,
     pixel_seq_len: int = 0,
-    quantize: bool = True,
+    quantize: str = 'dynamic_int8',
     config: cfg.ModelConfig = None,
     lora_ranks: Optional[list[int]] = None,
     export_config: ExportConfig = None,
@@ -164,8 +246,8 @@ def convert_to_tflite(
       embeddings generated by the image encoder with pixel values. The actual
       length of prefill_seq_len will be added by pixel_seq_len when pixel
       values are passed.
-    quantize (bool, optional): Whether the model should be quanized. Defaults
-      to True.
+    quantize (str, optional): The quantization type. Defaults to
+      'dynamic_int8'.
     config (cfg.ModelConfig, optional): The model config used to configure KV
       cache. If None, it uses the config of the pytorch_model.
     lora_ranks (list[int], optional): The ranks of the LORA layers. If None,
@@ -186,7 +268,7 @@ def convert_to_tflite(
       lora = lora_utils.LoRA.zeros(rank, config)
       loras.append(lora)

-  quant_suffix = 'q8' if quantize else 'f32'
+  quant_suffix = create_quantize_suffix(quantize)
   kv_size = config.kv_cache_max_len
   lora_suffix = (
       '' if not lora_ranks else f'_lora{",".join(map(str, lora_ranks))}'
@@ -220,7 +302,7 @@ def _export_helper(
    prefill_seq_lens: list[int],
    pixel_values_size: torch.Size,
    pixel_seq_len: int,
-    quantize: bool,
+    quantize: str,
    config: cfg.ModelConfig,
    loras: list[None | lora_utils.LoRA],
    export_config: ExportConfig,
@@ -269,7 +351,7 @@ def _export_helper(
       kv_layout=export_config.kvcache_layout,
   )

-  quant_config = quant_recipes.full_int8_dynamic_recipe() if quantize else None
+  quant_config = get_quant_recipe_from_flag(quantize)
   quant_config._model_config = config

   # For export, we create a module that captures any non-exportable,
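
For orientation only, here is a minimal sketch (not part of this change) of how the new string-valued quantize flag flows through the two helpers added above. The import path is an assumption about where this module lives, and the loop is purely illustrative.

# Illustrative sketch only: exercises the helpers added in this change.
# The import path below is an assumption about the module's location.
from ai_edge_torch.generative.utilities import converter

for name in converter.QuantizationName:
  # Because QuantizationName subclasses str, members compare equal to their
  # plain flag strings, so either the enum member or the raw value works.
  recipe = converter.get_quant_recipe_from_flag(name.value)
  suffix = converter.create_quantize_suffix(name.value)
  print(f'{name.value}: suffix={suffix}, recipe={type(recipe).__name__}')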