This repository was archived by the owner on Dec 16, 2022. It is now read-only.

Transformer toolkit updates #5270

Merged (13 commits) on Jun 21, 2021
3 changes: 3 additions & 0 deletions CHANGELOG.md
@@ -12,6 +12,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Added `on_backward` training callback which allows for control over backpropagation and gradient manipulation.
- Added `AdversarialBiasMitigator`, a Model wrapper to adversarially mitigate biases in predictions produced by a pretrained model for a downstream task.
- Added `which_loss` parameter to `ensure_model_can_train_save_and_load` in `ModelTestCase` to specify which loss to test.
- The activation layer in the transformer toolkit can now be queried for its output dimension.
- `TransformerEmbeddings` now takes, but ignores, an attention mask parameter. This is needed for compatibility with other modules that are called with the same arguments and do use the mask.
- `TransformerPooler` can now be instantiated from a pretrained transformer module, just like the other modules in the transformer toolkit.

### Fixed

3 changes: 3 additions & 0 deletions allennlp/modules/transformer/activation_layer.py
@@ -24,6 +24,9 @@ def __init__(
self.act_fn = activation
self.pool = pool

def get_output_dim(self) -> int:
return self.dense.out_features

def forward(self, hidden_states):
if self.pool:
hidden_states = hidden_states[:, 0]
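For reference, a minimal sketch of how the new accessor might be used. The positional constructor arguments (hidden size, intermediate size, activation, pool) are inferred from the `TransformerPooler` code later in this diff, so treat the construction as an assumption rather than the documented API.

```python
import torch
from allennlp.modules.transformer.activation_layer import ActivationLayer

# Assumed constructor: ActivationLayer(hidden_size, intermediate_size, activation, pool),
# inferred from the TransformerPooler.__init__ call further down in this PR.
layer = ActivationLayer(768, 3072, torch.nn.ReLU(), pool=False)

# The new accessor exposes the out_features of the internal dense projection,
# so callers no longer need to reach into `layer.dense` themselves.
assert layer.get_output_dim() == 3072

hidden_states = torch.randn(2, 16, 768)  # (batch_size, seq_len, hidden_size)
projected = layer(hidden_states)
assert projected.shape[-1] == layer.get_output_dim()
```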
4 changes: 3 additions & 1 deletion allennlp/modules/transformer/transformer_embeddings.py
@@ -113,7 +113,6 @@ class TransformerEmbeddings(Embeddings):
# Albert is a special case. A linear projection is applied to the embeddings,
# but that linear transformation lives in the encoder.
"albert.embeddings.LayerNorm": "layer_norm",
"albert.embeddings.LayerNorm": "layer_norm",
"albert.embeddings.word_embeddings": "embeddings.word_embeddings",
"albert.embeddings.position_embeddings": "embeddings.position_embeddings",
"albert.embeddings.token_type_embeddings": "embeddings.token_type_embeddings",
@@ -161,6 +160,7 @@ def __init__(
def forward( # type: ignore
self,
input_ids: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
token_type_ids: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
) -> torch.Tensor:
@@ -169,6 +169,8 @@ def forward( # type: ignore
# Parameters
input_ids : `torch.Tensor`
Shape `batch_size x seq_len`
attention_mask : `torch.Tensor`, optional
Shape `batch_size x seq_len`. This parameter is ignored, but it is here for compatibility.
token_type_ids : `torch.Tensor`, optional
Shape `batch_size x seq_len`
position_ids : `torch.Tensor`, optional
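A hedged sketch of the widened `forward` signature. Loading the module with `from_pretrained_module("bert-base-uncased")` is an assumption about the toolkit's usual loading helper and may differ between versions; the point is that the mask can now be passed and is silently ignored.

```python
import torch
from allennlp.modules.transformer.transformer_embeddings import TransformerEmbeddings

# Assumption: from_pretrained_module accepts a Hugging Face model name here;
# the exact loading arguments may differ by toolkit version.
embeddings = TransformerEmbeddings.from_pretrained_module("bert-base-uncased")

input_ids = torch.tensor([[101, 7592, 2088, 102]])  # (batch_size, seq_len)
attention_mask = torch.ones_like(input_ids)
token_type_ids = torch.zeros_like(input_ids)

# Because the mask is accepted (and ignored), the same keyword arguments used
# for mask-consuming modules can be passed here without special-casing the caller.
output = embeddings(
    input_ids,
    attention_mask=attention_mask,
    token_type_ids=token_type_ids,
)
print(output.shape)  # (1, 4, hidden_size)
```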
35 changes: 34 additions & 1 deletion allennlp/modules/transformer/transformer_pooler.py
@@ -1,11 +1,44 @@
from typing import Dict, Optional, Any, Union, TYPE_CHECKING

import torch

from allennlp.common import FromParams
from allennlp.modules.transformer.activation_layer import ActivationLayer

if TYPE_CHECKING:
from transformers.configuration_utils import PretrainedConfig


class TransformerPooler(ActivationLayer, FromParams):

_pretrained_relevant_module = ["pooler", "bert.pooler"]

def __init__(
self,
hidden_size: int,
intermediate_size: int,
activation: Union[str, torch.nn.Module] = "relu",
):
super().__init__(hidden_size, intermediate_size, "relu", pool=True)
super().__init__(hidden_size, intermediate_size, activation, pool=True)

@classmethod
def _get_input_arguments(
Contributor: We don't require this method any longer. from_config takes care of what we need.

Member Author: Removed!

cls,
pretrained_module: torch.nn.Module,
source: str = "huggingface",
mapping: Optional[Dict[str, str]] = None,
**kwargs,
) -> Dict[str, Any]:
final_kwargs = {}

final_kwargs["hidden_size"] = pretrained_module.dense.in_features
final_kwargs["intermediate_size"] = pretrained_module.dense.out_features
final_kwargs["activation"] = pretrained_module.activation

final_kwargs.update(kwargs)

return final_kwargs

@classmethod
def _from_config(cls, config: "PretrainedConfig", **kwargs):
return cls(config.hidden_size, config.hidden_size, "tanh") # BERT has this hardcoded
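To illustrate the new pooler API, a short sketch using the `__init__` signature shown above. The commented line about building the pooler from a pretrained transformer follows the changelog entry, but the exact classmethod and its arguments depend on the toolkit version, so treat it as an assumption.

```python
import torch
from allennlp.modules.transformer.transformer_pooler import TransformerPooler

# Direct construction, using the __init__ signature from this diff.
pooler = TransformerPooler(hidden_size=768, intermediate_size=768, activation="tanh")

hidden_states = torch.randn(2, 16, 768)  # (batch_size, seq_len, hidden_size)
pooled = pooler(hidden_states)           # pool=True: only the first ([CLS]) token is projected
print(pooled.shape)                      # expected: (2, 768)

# Per the changelog, the pooler can now also be instantiated from a pretrained
# transformer module like the other toolkit modules. The exact helper and
# arguments (e.g. from_pretrained_module / from_config) vary by version, so the
# line below is illustrative only:
#
#   pooler = TransformerPooler.from_pretrained_module("bert-base-uncased")
```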