
Create Vocabulary from both pretrained transformers and instances #5368

Merged
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -28,6 +28,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Added Tango components, to be explored in detail in a later post
- Added `ScaledDotProductMatrixAttention`, and converted the transformer toolkit to use it
- Added tests to ensure that all `Attention` and `MatrixAttention` implementations are interchangeable
- Added `from_pretrained_transformer_and_instances` constructor to `Vocabulary`

### Fixed

80 changes: 80 additions & 0 deletions allennlp/data/vocabulary.py
@@ -423,6 +423,82 @@ def from_files_and_instances(
)
return vocab

@classmethod
def from_pretrained_transformer_and_instances(
cls,
instances: Iterable["adi.Instance"],
transformers: Dict[str, Union[str, List[str]]],
min_count: Dict[str, int] = None,
max_vocab_size: Union[int, Dict[str, int]] = None,
non_padded_namespaces: Iterable[str] = DEFAULT_NON_PADDED_NAMESPACES,
pretrained_files: Optional[Dict[str, str]] = None,
only_include_pretrained_words: bool = False,
tokens_to_add: Dict[str, List[str]] = None,
min_pretrained_embeddings: Dict[str, int] = None,
padding_token: Optional[str] = DEFAULT_PADDING_TOKEN,
oov_token: Optional[str] = DEFAULT_OOV_TOKEN,
) -> "Vocabulary":
"""
Construct a vocabulary from a collection of `Instance`s and some parameters, then extend
it with the vocabularies generated from pretrained transformers.

The vocabulary from the instances is constructed by passing the parameters to
:func:`from_instances`, and is then updated by merging in the vocabularies from
:func:`from_pretrained_transformer`. See those methods for full descriptions of what the
other parameters do.

The `instances` parameter does not get an entry in a typical AllenNLP configuration file;
the other parameters do (if you want non-default values).

# Parameters

transformers : `Dict[str, Union[str, List[str]]]`
Dictionary mapping vocab namespaces (keys) to transformer model names (values).
If desired, multiple model names can be provided for a single namespace as a list of
strings. Namespaces not listed are left untouched.

# Examples

You can use this constructor in your training configuration by adapting the following
example.

```jsonnet
{
vocabulary: {
type: 'from_pretrained_transformer_and_instances',
transformers: {
'namespace1': 'bert-base-cased',
'namespace2': ['bert-base-cased', 'roberta-base'],
Member

What is supposed to happen when you put two transformers into the same namespace?

Contributor Author

If two models are put into the same namespace, that namespace is extended by the tokens from both models. I don't know why someone might want to do this, but there might be a research reason for it.

This is tested by both `test_with_single_namespace_and_multiple_models` and `test_with_multiple_models_across_multiple_namespaces`.

Member

I think the result will be wrong if you do that. Each transformer expects a word piece to map to a certain integer. If a word piece maps to a different integer, the embeddings won't work. You'll probably get an "index out of bounds" exception (if you're lucky). Since we can't map two word pieces to the same integer (and we certainly can't map the same word piece to two different integers), I think we have to disallow taking two transformer vocabs into the same namespace.

Contributor Author

That makes sense to me! I've updated the code to reflect those changes.
},
}
}
```
"""
vocab = cls.from_instances(
instances=instances,
min_count=min_count,
max_vocab_size=max_vocab_size,
non_padded_namespaces=non_padded_namespaces,
pretrained_files=pretrained_files,
only_include_pretrained_words=only_include_pretrained_words,
tokens_to_add=tokens_to_add,
min_pretrained_embeddings=min_pretrained_embeddings,
padding_token=padding_token,
oov_token=oov_token,
)

for namespace, model_name_list in transformers.items():
if isinstance(model_name_list, str):
model_name_list = [model_name_list]

for model_name in model_name_list:
transformer_vocab = cls.from_pretrained_transformer(
model_name=model_name, namespace=namespace
)
vocab.extend_from_vocab(transformer_vocab)

return vocab
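
To make the reviewer's concern above concrete: each pretrained model's embedding matrix fixes the integer id of every word piece, so re-numbering tokens in a merged namespace breaks the lookup. A minimal sketch in plain Python (hypothetical ids, not AllenNLP's actual merge logic):

```python
# Ids each embedding matrix expects (hypothetical values for illustration).
bert_vocab = {"[CLS]": 101, "hello": 19082}
roberta_vocab = {"<s>": 0, "hello": 20760}

merged = {}
for token in [*bert_vocab, *roberta_vocab]:
    if token not in merged:
        merged[token] = len(merged)  # ids get reassigned in insertion order

# "hello" now has one new id, so at most one of the two embedding matrices
# can index it correctly; the other reads a wrong (or out-of-range) row.
print(merged)  # {'[CLS]': 0, 'hello': 1, '<s>': 2}
```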

@classmethod
def empty(cls) -> "Vocabulary":
"""
@@ -810,6 +886,10 @@ def print_statistics(self) -> None:
Vocabulary.register("from_pretrained_transformer", constructor="from_pretrained_transformer")(
Vocabulary
)
Vocabulary.register(
"from_pretrained_transformer_and_instances",
constructor="from_pretrained_transformer_and_instances",
)(Vocabulary)
Vocabulary.register("from_instances", constructor="from_instances")(Vocabulary)
Vocabulary.register("from_files", constructor="from_files")(Vocabulary)
Vocabulary.register("extend", constructor="from_files_and_instances")(Vocabulary)
135 changes: 135 additions & 0 deletions tests/data/vocabulary_test.py
@@ -905,3 +905,138 @@ def test_from_pretrained_transformer(self, model_name):

vocab1 = Vocabulary.from_files(self.TEST_DIR / "vocab")
assert vocab1._token_to_index[namespace] == tokenizer.get_vocab()


class TestVocabularyFromPretrainedTransformerAndInstances(AllenNlpTestCase):
def setup_method(self):
super().setup_method()

# Create dataset with single namespace
token_indexer_1 = SingleIdTokenIndexer("namespace_1")
text_field_1 = TextField(
[Token(t) for t in ["a", "a", "a", "a", "b", "b", "c", "c", "c"]],
{"namespace_1": token_indexer_1},
)
single_field_instance = Instance({"text": text_field_1})
self.single_namespace_dataset = Batch([single_field_instance])

# Create dataset with multiple namespaces
token_indexer_2 = SingleIdTokenIndexer("namespace_2")
text_field_2 = TextField(
[Token(t) for t in ["d", "d", "d", "d", "e", "e", "f", "f", "f"]],
{"namespace_2": token_indexer_2},
)
multiple_field_instance = Instance(
{"first_text": text_field_1, "second_text": text_field_2}
)
self.multiple_namespace_dataset = Batch([multiple_field_instance])

@staticmethod
def _get_expected_vocab(dataset, namespace, model_name):
vocab_from_instances = Vocabulary.from_instances(dataset)
instance_tokens = set(vocab_from_instances._token_to_index[namespace].keys())
transformer_tokens = set(
Vocabulary.from_pretrained_transformer(model_name, namespace)
._token_to_index[namespace]
.keys()
)
return instance_tokens.union(transformer_tokens)

def _get_expected_vocab_size(self, dataset, namespace, model_name):
return len(self._get_expected_vocab(dataset, namespace, model_name))

@pytest.mark.parametrize("model_name", ["bert-base-cased", "roberta-base"])
def test_with_single_namespace_and_single_model(self, model_name):
dataset = self.single_namespace_dataset
namespace = "namespace_1"

expected_vocab_size = self._get_expected_vocab_size(dataset, namespace, model_name)

vocab = Vocabulary.from_pretrained_transformer_and_instances(
dataset, {namespace: model_name}
)

assert vocab.get_vocab_size(namespace) == expected_vocab_size

@pytest.mark.parametrize("model1_name", ["bert-base-cased", "roberta-base"])
@pytest.mark.parametrize("model2_name", ["bert-base-cased", "roberta-base"])
def test_with_single_namespace_and_multiple_models(self, model1_name, model2_name):
dataset = self.single_namespace_dataset
namespace = "namespace_1"

model1_tokens = self._get_expected_vocab(dataset, namespace, model1_name)
model2_tokens = self._get_expected_vocab(dataset, namespace, model2_name)
expected_vocab_size = len(model1_tokens.union(model2_tokens))

vocab = Vocabulary.from_pretrained_transformer_and_instances(
dataset, {namespace: [model1_name, model2_name]}
)

assert vocab.get_vocab_size(namespace) == expected_vocab_size

@pytest.mark.parametrize("model_name", ["bert-base-cased", "roberta-base"])
def test_only_updates_single_namespace_when_multiple_present(self, model_name):
dataset = self.multiple_namespace_dataset
namespace1 = "namespace_1"
namespace2 = "namespace_2"

namespace1_vocab_size = self._get_expected_vocab_size(dataset, namespace1, model_name)
namespace2_vocab_size = Vocabulary.from_instances(dataset).get_vocab_size("namespace_2")

vocab = Vocabulary.from_pretrained_transformer_and_instances(
dataset, {namespace1: model_name}
)

# Make sure only the desired namespace is extended
assert vocab.get_vocab_size(namespace1) == namespace1_vocab_size
assert vocab.get_vocab_size(namespace2) == namespace2_vocab_size

@pytest.mark.parametrize("namespace1_model_name", ["bert-base-cased", "roberta-base"])
@pytest.mark.parametrize("namespace2_model_name", ["bert-base-cased", "roberta-base"])
def test_with_different_models_per_namespace(
self, namespace1_model_name, namespace2_model_name
):
dataset = self.multiple_namespace_dataset
namespace1 = "namespace_1"
namespace2 = "namespace_2"

namespace1_vocab_size = self._get_expected_vocab_size(
dataset, namespace1, namespace1_model_name
)
namespace2_vocab_size = self._get_expected_vocab_size(
dataset, namespace2, namespace2_model_name
)

vocab = Vocabulary.from_pretrained_transformer_and_instances(
dataset, {namespace1: namespace1_model_name, namespace2: namespace2_model_name}
)

assert vocab.get_vocab_size(namespace1) == namespace1_vocab_size
assert vocab.get_vocab_size(namespace2) == namespace2_vocab_size

@pytest.mark.parametrize("namespace1_list", [["bert-base-cased", "distilbert-base-cased"]])
@pytest.mark.parametrize("namespace2_list", [["roberta-base", "distilroberta-base"]])
def test_with_multiple_models_across_multiple_namespaces(
self, namespace1_list, namespace2_list
):
dataset = self.multiple_namespace_dataset
namespace1 = "namespace_1"
namespace2 = "namespace_2"

namespace1_vocab = set()
namespace2_vocab = set()

for model1_name in namespace1_list:
namespace1_model_vocab = self._get_expected_vocab(dataset, namespace1, model1_name)
namespace1_vocab.update(namespace1_model_vocab)

for model2_name in namespace2_list:
namespace2_model_vocab = self._get_expected_vocab(dataset, namespace2, model2_name)
namespace2_vocab.update(namespace2_model_vocab)

vocab = Vocabulary.from_pretrained_transformer_and_instances(
dataset, {namespace1: namespace1_list, namespace2: namespace2_list}
)

assert vocab.get_vocab_size(namespace1) == len(namespace1_vocab)
assert vocab.get_vocab_size(namespace2) == len(namespace2_vocab)