
Create Vocabulary from both pretrained transformers and instances #5368

Merged
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -28,6 +28,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Added Tango components, to be explored in detail in a later post
- Added `ScaledDotProductMatrixAttention`, and converted the transformer toolkit to use it
- Added tests to ensure that all `Attention` and `MatrixAttention` implementations are interchangeable
- Added `from_pretrained_transformer_and_instances` constructor to `Vocabulary`

### Fixed

80 changes: 80 additions & 0 deletions allennlp/data/vocabulary.py
@@ -423,6 +423,82 @@ def from_files_and_instances(
)
return vocab

@classmethod
def from_pretrained_transformer_and_instances(
cls,
instances: Iterable["adi.Instance"],
transformers: Dict[str, Union[str, List[str]]],
min_count: Dict[str, int] = None,
max_vocab_size: Union[int, Dict[str, int]] = None,
non_padded_namespaces: Iterable[str] = DEFAULT_NON_PADDED_NAMESPACES,
pretrained_files: Optional[Dict[str, str]] = None,
only_include_pretrained_words: bool = False,
tokens_to_add: Dict[str, List[str]] = None,
min_pretrained_embeddings: Dict[str, int] = None,
padding_token: Optional[str] = DEFAULT_PADDING_TOKEN,
oov_token: Optional[str] = DEFAULT_OOV_TOKEN,
) -> "Vocabulary":
"""
Construct a vocabulary from a collection of `Instance`s and some parameters, then extend
it with the vocabularies generated from pretrained transformers.

The vocabulary from the instances is constructed by passing the parameters to
:func:`from_instances`, and is then updated by merging in the vocabularies from
:func:`from_pretrained_transformer`. See those methods for full descriptions of what the
other parameters do.

The `instances` parameter does not get an entry in a typical AllenNLP configuration file;
the other parameters do (if you want non-default values).

# Parameters

transformers : `Dict[str, Union[str, List[str]]]`
Dictionary mapping vocab namespaces (keys) to transformer model names (values).
If desired, multiple model names can be provided for a single namespace as a list of
strings. Namespaces not listed are left untouched.

# Examples

You can use this constructor in your training configuration by adapting the following
example.

```jsonnet
{
vocabulary: {
type: 'from_pretrained_transformer_and_instances',
transformers: {
'namespace1': 'bert-base-cased',
'namespace2': ['bert-base-cased', 'roberta-base'],
Member

What is supposed to happen when you put two transformers into the same namespace?

Contributor Author

If two models are put into the same namespace, that namespace is extended by the tokens from both models. I don't know why someone might want to do this, but there might be a research reason for it.

This is tested by both `test_with_single_namespace_and_multiple_models` and `test_with_multiple_models_across_multiple_namespaces`.

Member

I think the result will be wrong if you do that. Each transformer expects a word piece to map to a certain integer. If a word piece maps to a different integer, the embeddings won't work. You'll probably get an "index out of bounds" exception (if you're lucky). Since we can't map two word pieces to the same integer (and we certainly can't map the same word piece to two different integers), I think we have to disallow taking two transformer vocabs into the same namespace.

Contributor Author

That makes sense to me! I've updated the code to reflect those changes.
},
}
}
```
"""
vocab = cls.from_instances(
instances=instances,
min_count=min_count,
max_vocab_size=max_vocab_size,
non_padded_namespaces=non_padded_namespaces,
pretrained_files=pretrained_files,
only_include_pretrained_words=only_include_pretrained_words,
tokens_to_add=tokens_to_add,
min_pretrained_embeddings=min_pretrained_embeddings,
padding_token=padding_token,
oov_token=oov_token,
)

for namespace, model_name_list in transformers.items():
if isinstance(model_name_list, str):
model_name_list = [model_name_list]

for model_name in model_name_list:
transformer_vocab = cls.from_pretrained_transformer(
model_name=model_name, namespace=namespace
)
vocab.extend_from_vocab(transformer_vocab)

return vocab
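
To make the reviewer's concern above concrete: each pretrained model's embedding matrix fixes the integer id of every word piece, so re-numbering tokens in a merged namespace breaks the lookup. A minimal sketch in plain Python (hypothetical ids, not AllenNLP's actual merge logic):

```python
# Ids each embedding matrix expects (hypothetical values for illustration).
bert_vocab = {"[CLS]": 101, "hello": 19082}
roberta_vocab = {"<s>": 0, "hello": 20760}

merged = {}
for token in [*bert_vocab, *roberta_vocab]:
    if token not in merged:
        merged[token] = len(merged)  # ids get reassigned in insertion order

# "hello" now has one new id, so at most one of the two embedding matrices
# can index it correctly; the other reads a wrong (or out-of-range) row.
print(merged)  # {'[CLS]': 0, 'hello': 1, '<s>': 2}
```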

@classmethod
def empty(cls) -> "Vocabulary":
"""
@@ -810,6 +886,10 @@ def print_statistics(self) -> None:
Vocabulary.register("from_pretrained_transformer", constructor="from_pretrained_transformer")(
Vocabulary
)
Vocabulary.register(
"from_pretrained_transformer_and_instances",
constructor="from_pretrained_transformer_and_instances",
)(Vocabulary)
Vocabulary.register("from_instances", constructor="from_instances")(Vocabulary)
Vocabulary.register("from_files", constructor="from_files")(Vocabulary)
Vocabulary.register("extend", constructor="from_files_and_instances")(Vocabulary)
135 changes: 135 additions & 0 deletions tests/data/vocabulary_test.py
@@ -905,3 +905,138 @@ def test_from_pretrained_transformer(self, model_name):

vocab1 = Vocabulary.from_files(self.TEST_DIR / "vocab")
assert vocab1._token_to_index[namespace] == tokenizer.get_vocab()


class TestVocabularyFromPretrainedTransformerAndInstances(AllenNlpTestCase):
def setup_method(self):
super().setup_method()

# Create dataset with single namespace
token_indexer_1 = SingleIdTokenIndexer("namespace_1")
text_field_1 = TextField(
[Token(t) for t in ["a", "a", "a", "a", "b", "b", "c", "c", "c"]],
{"namespace_1": token_indexer_1},
)
single_field_instance = Instance({"text": text_field_1})
self.single_namespace_dataset = Batch([single_field_instance])

# Create dataset with multiple namespaces
token_indexer_2 = SingleIdTokenIndexer("namespace_2")
text_field_2 = TextField(
[Token(t) for t in ["d", "d", "d", "d", "e", "e", "f", "f", "f"]],
{"namespace_2": token_indexer_2},
)
multiple_field_instance = Instance(
{"first_text": text_field_1, "second_text": text_field_2}
)
self.multiple_namespace_dataset = Batch([multiple_field_instance])

@staticmethod
def _get_expected_vocab(dataset, namespace, model_name):
vocab_from_instances = Vocabulary.from_instances(dataset)
instance_tokens = set(vocab_from_instances._token_to_index[namespace].keys())
transformer_tokens = set(
Vocabulary.from_pretrained_transformer(model_name, namespace)
._token_to_index[namespace]
.keys()
)
return instance_tokens.union(transformer_tokens)

def _get_expected_vocab_size(self, dataset, namespace, model_name):
return len(self._get_expected_vocab(dataset, namespace, model_name))

@pytest.mark.parametrize("model_name", ["bert-base-cased", "roberta-base"])
def test_with_single_namespace_and_single_model(self, model_name):
dataset = self.single_namespace_dataset
namespace = "namespace_1"

expected_vocab_size = self._get_expected_vocab_size(dataset, namespace, model_name)

vocab = Vocabulary.from_pretrained_transformer_and_instances(
dataset, {namespace: model_name}
)

assert vocab.get_vocab_size(namespace) == expected_vocab_size

@pytest.mark.parametrize("model1_name", ["bert-base-cased", "roberta-base"])
@pytest.mark.parametrize("model2_name", ["bert-base-cased", "roberta-base"])
def test_with_single_namespace_and_multiple_models(self, model1_name, model2_name):
dataset = self.single_namespace_dataset
namespace = "namespace_1"

model1_tokens = self._get_expected_vocab(dataset, namespace, model1_name)
model2_tokens = self._get_expected_vocab(dataset, namespace, model2_name)
expected_vocab_size = len(model1_tokens.union(model2_tokens))

vocab = Vocabulary.from_pretrained_transformer_and_instances(
dataset, {namespace: [model1_name, model2_name]}
)

assert vocab.get_vocab_size(namespace) == expected_vocab_size

@pytest.mark.parametrize("model_name", ["bert-base-cased", "roberta-base"])
def test_only_updates_single_namespace_when_multiple_present(self, model_name):
dataset = self.multiple_namespace_dataset
namespace1 = "namespace_1"
namespace2 = "namespace_2"

namespace1_vocab_size = self._get_expected_vocab_size(dataset, namespace1, model_name)
namespace2_vocab_size = Vocabulary.from_instances(dataset).get_vocab_size("namespace_2")

vocab = Vocabulary.from_pretrained_transformer_and_instances(
dataset, {namespace1: model_name}
)

# Make sure only the desired namespace is extended
assert vocab.get_vocab_size(namespace1) == namespace1_vocab_size
assert vocab.get_vocab_size(namespace2) == namespace2_vocab_size

@pytest.mark.parametrize("namespace1_model_name", ["bert-base-cased", "roberta-base"])
@pytest.mark.parametrize("namespace2_model_name", ["bert-base-cased", "roberta-base"])
def test_with_different_models_per_namespace(
self, namespace1_model_name, namespace2_model_name
):
dataset = self.multiple_namespace_dataset
namespace1 = "namespace_1"
namespace2 = "namespace_2"

namespace1_vocab_size = self._get_expected_vocab_size(
dataset, namespace1, namespace1_model_name
)
namespace2_vocab_size = self._get_expected_vocab_size(
dataset, namespace2, namespace2_model_name
)

vocab = Vocabulary.from_pretrained_transformer_and_instances(
dataset, {namespace1: namespace1_model_name, namespace2: namespace2_model_name}
)

assert vocab.get_vocab_size(namespace1) == namespace1_vocab_size
assert vocab.get_vocab_size(namespace2) == namespace2_vocab_size

@pytest.mark.parametrize("namespace1_list", [["bert-base-cased", "distilbert-base-cased"]])
@pytest.mark.parametrize("namespace2_list", [["roberta-base", "distilroberta-base"]])
def test_with_multiple_models_across_multiple_namespaces(
self, namespace1_list, namespace2_list
):
dataset = self.multiple_namespace_dataset
namespace1 = "namespace_1"
namespace2 = "namespace_2"

namespace1_vocab = set()
namespace2_vocab = set()

for model1_name in namespace1_list:
namespace1_model_vocab = self._get_expected_vocab(dataset, namespace1, model1_name)
namespace1_vocab.update(namespace1_model_vocab)

for model2_name in namespace2_list:
namespace2_model_vocab = self._get_expected_vocab(dataset, namespace2, model2_name)
namespace2_vocab.update(namespace2_model_vocab)

vocab = Vocabulary.from_pretrained_transformer_and_instances(
dataset, {namespace1: namespace1_list, namespace2: namespace2_list}
)

assert vocab.get_vocab_size(namespace1) == len(namespace1_vocab)
assert vocab.get_vocab_size(namespace2) == len(namespace2_vocab)