Skip to content

Commit efbcb54

Browse files
happy-qiao authored and copybara-github committed
feat!: Tokenization - Deprecated ComputeTokenResult.token_info_list in favor of ComputeTokenResult.tokens_info
PiperOrigin-RevId: 669468222
1 parent 6bf771f commit efbcb54

File tree

2 files changed

+46
-6
lines changed

2 files changed

+46
-6
lines changed

vertexai/preview/tokenization.py

+1-1
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
# We just want to re-export certain classes
1717
# pylint: disable=g-multiple-import,g-importing-member
1818
from vertexai.tokenization._tokenizers import (
19-
get_tokenizer_for_model,
19+
_get_tokenizer_for_model_preview as get_tokenizer_for_model,
2020
)
2121

2222

vertexai/tokenization/_tokenizers.py

+45-5
Original file line numberDiff line numberDiff line change
@@ -53,17 +53,31 @@ class TokensInfo:
5353
role: str = None
5454

5555

56+
@dataclasses.dataclass(frozen=True)
class ComputeTokensResult:
    """Immutable result of a compute-tokens call.

    NOTE(review): a second, documented ``ComputeTokensResult`` class is
    defined later in this module and shadows this name at module scope.
    This class stays reachable only as the base of
    ``PreviewComputeTokensResult`` (bound at class-creation time) —
    consider unifying the two definitions to avoid confusion.
    """

    # One TokensInfo entry per string instance in the input contents.
    tokens_info: Sequence[TokensInfo]
59+
60+
61+
class PreviewComputeTokensResult(ComputeTokensResult):
    """``ComputeTokensResult`` with a deprecated ``token_info_list`` alias.

    The legacy result type exposed ``token_info_list`` as an attribute
    (plain attribute access, not a call), so the alias is provided as a
    read-only property rather than a method: ``result.token_info_list``
    keeps returning the sequence while emitting a ``DeprecationWarning``.
    """

    @property
    def token_info_list(self) -> Sequence[TokensInfo]:
        """Deprecated alias for ``tokens_info``; warns on every access."""
        # Local import keeps the module's import surface unchanged.
        import warnings

        message = "PreviewComputeTokensResult.token_info_list is deprecated. Use ComputeTokensResult.tokens_info instead."
        warnings.warn(message, DeprecationWarning, stacklevel=2)
        return self.tokens_info
68+
69+
5670
@dataclasses.dataclass(frozen=True)
5771
class ComputeTokensResult:
5872
"""Represents token string pieces and ids output in compute_tokens function.
5973
6074
Attributes:
6175
tokens_info: Lists of tokens_info from the input.
62-
The input `contents: ContentsType` could have
63-
multiple string instances and each tokens_info
64-
item represents each string instance. Each token
65-
info consists tokens list, token_ids list and
66-
a role.
76+
The input `contents: ContentsType` could have
77+
multiple string instances and each tokens_info
78+
item represents each string instance. Each token
79+
info consists tokens list, token_ids list and
80+
a role.
6781
token_info_list: the value in this field equal to tokens_info.
6882
"""
6983

@@ -523,6 +537,32 @@ def compute_tokens(self, contents: ContentsType) -> ComputeTokensResult:
523537
)
524538

525539

540+
class PreviewTokenizer(Tokenizer):
    """Tokenizer variant returning the preview result type with the deprecated alias."""

    def compute_tokens(self, contents: ContentsType) -> PreviewComputeTokensResult:
        """Computes tokens for *contents*, wrapped in a PreviewComputeTokensResult.

        Bug fix: the previous code passed the whole ComputeTokensResult
        object as ``tokens_info``; we must unwrap ``.tokens_info`` so the
        field holds the Sequence[TokensInfo] callers expect.
        """
        return PreviewComputeTokensResult(
            tokens_info=super().compute_tokens(contents).tokens_info
        )
544+
545+
def _get_tokenizer_for_model_preview(model_name: str) -> PreviewTokenizer:
546+
"""Returns a tokenizer for the given tokenizer name.
547+
548+
Usage:
549+
```
550+
tokenizer = get_tokenizer_for_model("gemini-1.5-pro-001")
551+
print(tokenizer.count_tokens("Hello world!"))
552+
```
553+
554+
Supported models can be found at
555+
https://cloud.google.com/vertex-ai/generative-ai/docs/learn/models.
556+
557+
Args:
558+
model_name: Specify the tokenizer is from which model.
559+
"""
560+
if not model_name:
561+
raise ValueError("model_name must not be empty.")
562+
563+
return PreviewTokenizer(get_tokenizer_name(model_name))
564+
565+
526566
def get_tokenizer_for_model(model_name: str) -> Tokenizer:
527567
"""Returns a tokenizer for the given tokenizer name.
528568

0 commit comments

Comments
 (0)