Skip to content

Commit efbcb54

Browse files
happy-qiao authored and copybara-github committed
feat!: Tokenization - Deprecated ComputeTokenResult.token_info_list in favor of ComputeTokenResult.tokens_info
PiperOrigin-RevId: 669468222
1 parent 6bf771f commit efbcb54

File tree

2 files changed

+46
-6
lines changed

2 files changed

+46
-6
lines changed

vertexai/preview/tokenization.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
# We just want to re-export certain classes
1717
# pylint: disable=g-multiple-import,g-importing-member
1818
from vertexai.tokenization._tokenizers import (
19-
get_tokenizer_for_model,
19+
_get_tokenizer_for_model_preview as get_tokenizer_for_model,
2020
)
2121

2222

vertexai/tokenization/_tokenizers.py

+45-5
Original file line numberDiff line numberDiff line change
@@ -53,17 +53,31 @@ class TokensInfo:
5353
role: str = None
5454

5555

56+
@dataclasses.dataclass(frozen=True)
class ComputeTokensResult:
    """Immutable result of a compute-tokens call.

    NOTE(review): a second, documented ``ComputeTokensResult`` class is
    defined later in this module and shadows this name at module scope.
    This class stays reachable only as the base of
    ``PreviewComputeTokensResult`` (bound at class-creation time) —
    consider unifying the two definitions to avoid confusion.
    """

    # One TokensInfo entry per string instance in the input contents.
    tokens_info: Sequence[TokensInfo]
59+
60+
61+
class PreviewComputeTokensResult(ComputeTokensResult):
    """``ComputeTokensResult`` with a deprecated ``token_info_list`` alias.

    The legacy result type exposed ``token_info_list`` as an attribute
    (plain attribute access, not a call), so the alias is provided as a
    read-only property rather than a method: ``result.token_info_list``
    keeps returning the sequence while emitting a ``DeprecationWarning``.
    """

    @property
    def token_info_list(self) -> Sequence[TokensInfo]:
        """Deprecated alias for ``tokens_info``; warns on every access."""
        # Local import keeps the module's import surface unchanged.
        import warnings

        message = "PreviewComputeTokensResult.token_info_list is deprecated. Use ComputeTokensResult.tokens_info instead."
        warnings.warn(message, DeprecationWarning, stacklevel=2)
        return self.tokens_info
68+
69+
5670
@dataclasses.dataclass(frozen=True)
5771
class ComputeTokensResult:
5872
"""Represents token string pieces and ids output in compute_tokens function.
5973
6074
Attributes:
6175
tokens_info: Lists of tokens_info from the input.
62-
The input `contents: ContentsType` could have
63-
multiple string instances and each tokens_info
64-
item represents each string instance. Each token
65-
info consists tokens list, token_ids list and
66-
a role.
76+
The input `contents: ContentsType` could have
77+
multiple string instances and each tokens_info
78+
item represents each string instance. Each token
79+
info consists tokens list, token_ids list and
80+
a role.
6781
token_info_list: the value in this field equal to tokens_info.
6882
"""
6983

@@ -523,6 +537,32 @@ def compute_tokens(self, contents: ContentsType) -> ComputeTokensResult:
523537
)
524538

525539

540+
class PreviewTokenizer(Tokenizer):
    """Tokenizer variant returning the preview result type with the deprecated alias."""

    def compute_tokens(self, contents: ContentsType) -> PreviewComputeTokensResult:
        """Computes tokens for *contents*, wrapped in a PreviewComputeTokensResult.

        Bug fix: the previous code passed the whole ComputeTokensResult
        object as ``tokens_info``; we must unwrap ``.tokens_info`` so the
        field holds the Sequence[TokensInfo] callers expect.
        """
        return PreviewComputeTokensResult(
            tokens_info=super().compute_tokens(contents).tokens_info
        )
544+
545+
def _get_tokenizer_for_model_preview(model_name: str) -> PreviewTokenizer:
546+
"""Returns a tokenizer for the given tokenizer name.
547+
548+
Usage:
549+
```
550+
tokenizer = get_tokenizer_for_model("gemini-1.5-pro-001")
551+
print(tokenizer.count_tokens("Hello world!"))
552+
```
553+
554+
Supported models can be found at
555+
https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models.
556+
557+
Args:
558+
model_name: Specify the tokenizer is from which model.
559+
"""
560+
if not model_name:
561+
raise ValueError("model_name must not be empty.")
562+
563+
return PreviewTokenizer(get_tokenizer_name(model_name))
564+
565+
526566
def get_tokenizer_for_model(model_name: str) -> Tokenizer:
527567
"""Returns a tokenizer for the given tokenizer name.
528568

0 commit comments

Comments
 (0)