1
1
import copy
2
-
3
2
import torch
3
+ import pytest
4
4
5
5
from allennlp .common import Params
6
6
from allennlp .common import cached_transformers
7
+
7
8
from allennlp .common .testing import assert_equal_parameters
8
9
from allennlp .modules .transformer import TransformerBlock
9
10
from allennlp .common .testing import AllenNlpTestCase
10
11
12
+ from transformers .configuration_bert import BertConfig
13
+ from transformers .modeling_bert import BertEncoder
14
+ from transformers .configuration_roberta import RobertaConfig
15
+ from transformers .modeling_roberta import RobertaEncoder
16
+ from transformers .configuration_electra import ElectraConfig
17
+ from transformers .modeling_electra import ElectraEncoder
18
+
19
+ PARAMS_DICT = {
20
+ "num_hidden_layers" : 3 ,
21
+ "hidden_size" : 6 ,
22
+ "intermediate_size" : 3 ,
23
+ "num_attention_heads" : 2 ,
24
+ "attention_dropout" : 0.1 ,
25
+ "hidden_dropout" : 0.2 ,
26
+ "activation" : "relu" ,
27
+ }
28
+
29
+
30
+ def get_modules (params_dict ):
31
+ modules = {}
32
+ params = copy .deepcopy (params_dict )
33
+ params ["attention_probs_dropout_prob" ] = params .pop ("attention_dropout" )
34
+ params ["hidden_dropout_prob" ] = params .pop ("hidden_dropout" )
35
+
36
+ torch .manual_seed (1234 )
37
+ hf_module = BertEncoder (BertConfig (** params ))
38
+ modules ["bert" ] = hf_module
39
+
40
+ torch .manual_seed (1234 )
41
+ hf_module = RobertaEncoder (RobertaConfig (** params ))
42
+ modules ["roberta" ] = hf_module
43
+
44
+ torch .manual_seed (1234 )
45
+ hf_module = ElectraEncoder (ElectraConfig (** params ))
46
+ modules ["electra" ] = hf_module
47
+
48
+ return modules
49
+
11
50
12
51
class TestTransformerBlock (AllenNlpTestCase ):
13
52
def setup_method (self ):
@@ -50,16 +89,6 @@ def test_loading_from_pretrained_weights(self):
50
89
}
51
90
assert_equal_parameters (pretrained_module , module , mapping )
52
91
53
- def test_loading_from_pretrained_weights_using_model_name (self ):
54
- module = TransformerBlock .from_pretrained_module (self .pretrained_name )
55
- mapping = {
56
- val : key
57
- for key , val in module ._construct_default_mapping (
58
- self .pretrained , "huggingface" , {}
59
- ).items ()
60
- }
61
- assert_equal_parameters (self .pretrained .encoder , module , mapping )
62
-
63
92
def test_loading_partial_pretrained_weights (self ):
64
93
65
94
kwargs = TransformerBlock ._get_input_arguments (self .pretrained .encoder )
@@ -78,3 +107,68 @@ def test_loading_partial_pretrained_weights(self):
78
107
transformer_block ,
79
108
mapping ,
80
109
)
110
+
111
+ @pytest .mark .parametrize ("module_name, hf_module" , get_modules (PARAMS_DICT ).items ())
112
+ def test_forward_against_huggingface_outputs (self , module_name , hf_module ):
113
+ hidden_states = torch .randn (2 , 3 , 6 )
114
+ attention_mask = torch .tensor ([[0 , 1 , 0 ], [1 , 1 , 0 ]])
115
+
116
+ block = TransformerBlock .from_pretrained_module (hf_module )
117
+
118
+ torch .manual_seed (1234 )
119
+ output = block .forward (hidden_states , attention_mask = attention_mask )
120
+ # We do this because bert, roberta, electra process the attention_mask at the model level.
121
+ attention_mask_hf = (attention_mask == 0 ).view ((2 , 1 , 1 , 3 )).expand (2 , 2 , 3 , 3 ) * - 10e5
122
+ torch .manual_seed (1234 )
123
+ hf_output = hf_module .forward (hidden_states , attention_mask = attention_mask_hf )
124
+
125
+ assert torch .allclose (output [0 ], hf_output [0 ])
126
+
127
+ @pytest .mark .parametrize (
128
+ "pretrained_name" ,
129
+ [
130
+ "bert-base-uncased" ,
131
+ ],
132
+ )
133
+ def test_loading_from_pretrained_weights_using_model_name (self , pretrained_name ):
134
+
135
+ torch .manual_seed (1234 )
136
+ pretrained = cached_transformers .get (pretrained_name , False )
137
+
138
+ if "distilbert" in pretrained_name :
139
+ pretrained_module = pretrained .transformer
140
+ else :
141
+ pretrained_module = pretrained .encoder
142
+
143
+ torch .manual_seed (1234 )
144
+ module = TransformerBlock .from_pretrained_module (pretrained_name )
145
+ mapping = {
146
+ val : key
147
+ for key , val in module ._construct_default_mapping (
148
+ pretrained_module , "huggingface" , {}
149
+ ).items ()
150
+ }
151
+ assert_equal_parameters (pretrained_module , module , mapping = mapping )
152
+
153
+ batch_size = 1
154
+ seq_len = 768
155
+ dim = dict (module .named_modules ())["layers.0.attention.self.query" ].in_features
156
+ hidden_states = torch .randn (batch_size , seq_len , dim )
157
+ attention_mask = torch .randn (batch_size , seq_len )
158
+ mask_reshp = (batch_size , 1 , 1 , dim )
159
+ attention_mask_hf = (attention_mask == 0 ).view (mask_reshp )
160
+ attention_mask_hf = attention_mask_hf .expand (batch_size , 12 , seq_len , seq_len ) * - 10e5
161
+
162
+ torch .manual_seed (1234 )
163
+ output = module .forward (hidden_states , attention_mask = attention_mask .squeeze ())[0 ]
164
+ torch .manual_seed (1234 )
165
+ hf_output = pretrained_module .forward (hidden_states , attention_mask = attention_mask_hf )[0 ]
166
+
167
+ # FIX: look into the reason for mismatch.
168
+ # Update: The discrepancy comes from torch.nn.Dropout layer, despite setting random seeds.
169
+ # Have also tried setting random seeds right before the actual call to dropout in both modules.
170
+ # While the issue has been isolated, not removing this comment till we can figure out a way
171
+ # to get deterministic outputs from dropout.
172
+ # assert torch.allclose(output, hf_output)
173
+ print (output )
174
+ print (hf_output )
0 commit comments