Commit 5df5413

embed int8 quantization and add embed test (#5667)
1 parent 5e2d56d

File tree

8 files changed: +261 -9 lines changed

.ci/pnnx.yml (+2)

@@ -4,12 +4,14 @@ on:
     branches: [master]
     paths:
     - '.ci/pnnx.yml'
+    - 'src/layer/*'
     - 'tools/pnnx/**'
     - '!tools/pnnx/README.md'
   mr:
     target-branches: [master]
     paths:
     - '.ci/pnnx.yml'
+    - 'src/layer/*'
     - 'tools/pnnx/**'
     - '!tools/pnnx/README.md'
 concurrency:

docs/developer-guide/operators.md (+2)

@@ -837,11 +837,13 @@ y = embedding(x)
 | 1 | input_dim | int | 0 | |
 | 2 | bias_term | int | 0 | |
 | 3 | weight_data_size | int | 0 | |
+| 18 | int8_scale_term | int | 0 | |
 
 | weight | type | shape |
 | ------------- | ----- | --------------------- |
 | weight_data | float | [weight_data_size] |
 | bias_term | float | [num_output] |
+| weight_data_int8_scales | float | [1] |
 
 # Exp
 ```
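Since the new argument has id 18 and the quantizer below writes int8_scale_term=2, a quantized Embed layer line in a .param file would look roughly like this (hypothetical layer name, blob names, and dimensions):

Embed embed0 1 1 in out 0=128 1=100 2=0 3=12800 18=2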

src/layer/embed.cpp (+79 -9)

@@ -30,6 +30,7 @@ int Embed::load_param(const ParamDict& pd)
     input_dim = pd.get(1, 0);
     bias_term = pd.get(2, 0);
     weight_data_size = pd.get(3, 0);
+    int8_scale_term = pd.get(18, 0);
 
     return 0;
 }
@@ -47,18 +48,23 @@ int Embed::load_model(const ModelBin& mb)
         return -100;
     }
 
+#if NCNN_INT8
+    if (int8_scale_term)
+    {
+        weight_data_int8_scale = mb.load(1, 1)[0];
+    }
+#endif // NCNN_INT8
+
     return 0;
 }
 
-int Embed::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
+static void embed(const Mat& bottom_blob, const Mat& weight_data, const Mat& bias_data, Mat& top_blob, int input_dim, const Option& opt)
 {
-    int words = static_cast<int>(bottom_blob.total());
+    const int num_output = top_blob.w;
+    const int words = top_blob.h;
 
-    top_blob.create(num_output, words, 4u, opt.blob_allocator);
-    if (top_blob.empty())
-        return -100;
+    const float* bias_ptr = bias_data;
 
-    // num_output
     #pragma omp parallel for num_threads(opt.num_threads)
     for (int q = 0; q < words; q++)
     {
@@ -73,15 +79,79 @@ int Embed::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
 
         const float* em = (const float*)weight_data + num_output * word_index;
 
-        memcpy(outptr, em, num_output * sizeof(float));
+        if (bias_ptr)
+        {
+            for (int p = 0; p < num_output; p++)
+            {
+                outptr[p] = em[p] + bias_ptr[p];
+            }
+        }
+        else
+        {
+            memcpy(outptr, em, num_output * sizeof(float));
+        }
+    }
+}
+
+#if NCNN_INT8
+static void embed_int8(const Mat& bottom_blob, const Mat& weight_data, float weight_data_int8_scale, const Mat& bias_data, Mat& top_blob, int input_dim, const Option& opt)
+{
+    const int num_output = top_blob.w;
+    const int words = top_blob.h;
+
+    const float* bias_ptr = bias_data;
+
+    #pragma omp parallel for num_threads(opt.num_threads)
+    for (int q = 0; q < words; q++)
+    {
+        float* outptr = top_blob.row(q);
+
+        int word_index = ((const int*)bottom_blob)[q];
 
-        if (bias_term)
+        if (word_index < 0)
+            word_index = 0;
+        if (word_index >= input_dim)
+            word_index = input_dim - 1;
+
+        const float descale_em = 1.f / weight_data_int8_scale;
+
+        const signed char* em = (const signed char*)weight_data + num_output * word_index;
+
+        if (bias_ptr)
         {
             for (int p = 0; p < num_output; p++)
             {
-                outptr[p] += bias_data[p];
+                outptr[p] = em[p] * descale_em + bias_ptr[p];
             }
         }
+        else
+        {
+            for (int p = 0; p < num_output; p++)
+            {
+                outptr[p] = em[p] * descale_em;
+            }
+        }
+    }
+}
+#endif // NCNN_INT8
+
+int Embed::forward(const Mat& bottom_blob, Mat& top_blob, const Option& opt) const
+{
+    int words = static_cast<int>(bottom_blob.total());
+
+    top_blob.create(num_output, words, 4u, opt.blob_allocator);
+    if (top_blob.empty())
+        return -100;
+
+#if NCNN_INT8
+    if (int8_scale_term)
+    {
+        embed_int8(bottom_blob, weight_data, weight_data_int8_scale, bias_data, top_blob, input_dim, opt);
+    }
+    else
+#endif // NCNN_INT8
+    {
+        embed(bottom_blob, weight_data, bias_data, top_blob, input_dim, opt);
    }
 
     return 0;
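To make the new int8 path concrete: each row of the quantized table stores signed 8-bit values plus one per-tensor scale, and embed_int8 dequantizes on the fly by multiplying with descale = 1 / weight_data_int8_scale. A minimal standalone sketch of that lookup (plain C++, not the ncnn API; the table values and scale are made up):

#include <cstdio>

int main()
{
    // hypothetical 2-word, 4-dim int8 embedding table
    const signed char table[2][4] = {
        {-127, 64, 0, 32},
        {5, -5, 100, -100}
    };
    const float weight_data_int8_scale = 127.f; // hypothetical per-tensor scale
    const float descale = 1.f / weight_data_int8_scale;

    const int word_index = 1; // token id being looked up
    float out[4];
    for (int p = 0; p < 4; p++)
    {
        out[p] = table[word_index][p] * descale; // dequantize on the fly
    }

    for (int p = 0; p < 4; p++)
        printf("out[%d] = %f\n", p, out[p]);
    return 0;
}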

src/layer/embed.h (+6)

@@ -38,9 +38,15 @@ class Embed : public Layer
 
     int weight_data_size;
 
+    int int8_scale_term;
+
     // model
     Mat weight_data;
     Mat bias_data;
+
+#if NCNN_INT8
+    float weight_data_int8_scale;
+#endif
 };
 
 } // namespace ncnn

tests/CMakeLists.txt (+1)

@@ -101,6 +101,7 @@ ncnn_add_layer_test(Dropout)
 ncnn_add_layer_test(Einsum)
 ncnn_add_layer_test(Eltwise)
 ncnn_add_layer_test(ELU)
+ncnn_add_layer_test(Embed)
 ncnn_add_layer_test(Erf)
 ncnn_add_layer_test(ExpandDims)
 ncnn_add_layer_test(Flatten)
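With the test registered here, it should be runnable from the build tree in the usual ncnn way, e.g. configuring with -DNCNN_BUILD_TESTS=ON and then something like ctest -R embed (the exact test name depends on how ncnn_add_layer_test names its targets).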

tests/test_embed.cpp (+108)

@@ -0,0 +1,108 @@
+// Tencent is pleased to support the open source community by making ncnn available.
+//
+// Copyright (C) 2024 THL A29 Limited, a Tencent company. All rights reserved.
+//
+// Licensed under the BSD 3-Clause License (the "License"); you may not use this file except
+// in compliance with the License. You may obtain a copy of the License at
+//
+// https://opensource.org/licenses/BSD-3-Clause
+//
+// Unless required by applicable law or agreed to in writing, software distributed
+// under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
+// CONDITIONS OF ANY KIND, either express or implied. See the License for the
+// specific language governing permissions and limitations under the License.
+
+#include "testutil.h"
+
+static int test_embed(int words, int num_output, int input_dim, int bias)
+{
+    ncnn::ParamDict pd;
+    pd.set(0, num_output);
+    pd.set(1, input_dim);
+    pd.set(2, bias);
+    pd.set(3, num_output * input_dim);
+
+    std::vector<ncnn::Mat> weights(bias ? 2 : 1);
+    weights[0] = RandomMat(num_output * input_dim);
+    if (bias)
+        weights[1] = RandomMat(num_output);
+
+    ncnn::Mat a(words);
+    RandomizeInt(a, 0, input_dim);
+
+    int ret = test_layer("Embed", pd, weights, a);
+    if (ret != 0)
+    {
+        fprintf(stderr, "test_embed failed words=%d num_output=%d input_dim=%d bias=%d\n", words, num_output, input_dim, bias);
+    }
+
+    return ret;
+}
+
+static int test_embed_0()
+{
+    return 0
+           || test_embed(128, 128, 128, 0)
+           || test_embed(128, 128, 128, 1)
+           || test_embed(127, 127, 127, 0)
+           || test_embed(127, 127, 127, 1)
+           || test_embed(124, 124, 124, 0)
+           || test_embed(124, 124, 124, 1);
+}
+
+#if NCNN_INT8
+static int test_embed_int8(int words, int num_output, int input_dim, int bias)
+{
+    ncnn::ParamDict pd;
+    pd.set(0, num_output);
+    pd.set(1, input_dim);
+    pd.set(2, bias);
+    pd.set(3, num_output * input_dim);
+    pd.set(18, 2);
+
+    std::vector<ncnn::Mat> weights(bias ? 3 : 2);
+    weights[0] = RandomS8Mat(num_output * input_dim);
+    if (bias)
+    {
+        weights[1] = RandomMat(num_output);
+        weights[2] = RandomMat(1, 100.f, 200.f);
+    }
+    else
+    {
+        weights[1] = RandomMat(1, 100.f, 200.f);
+    }
+
+    ncnn::Mat a(words);
+    RandomizeInt(a, 0, input_dim);
+
+    int ret = test_layer("Embed", pd, weights, a);
+    if (ret != 0)
+    {
+        fprintf(stderr, "test_embed_int8 failed words=%d num_output=%d input_dim=%d bias=%d\n", words, num_output, input_dim, bias);
+    }
+
+    return ret;
+}
+
+static int test_embed_1()
+{
+    return 0
+           || test_embed_int8(128, 128, 128, 0)
+           || test_embed_int8(128, 128, 128, 1)
+           || test_embed_int8(127, 127, 127, 0)
+           || test_embed_int8(127, 127, 127, 1)
+           || test_embed_int8(124, 124, 124, 0)
+           || test_embed_int8(124, 124, 124, 1);
+}
+#endif // NCNN_INT8
+
+int main()
+{
+    SRAND(7767517);
+
+#if NCNN_INT8
+    return test_embed_0() || test_embed_1();
+#else
+    return test_embed_0();
+#endif
+}

tools/modelwriter.h (+11)

@@ -1676,9 +1676,20 @@ int ModelWriter::save(const char* parampath, const char* binpath)
             fprintf_param_value(" 1=%d", input_dim)
             fprintf_param_value(" 2=%d", bias_term)
             fprintf_param_value(" 3=%d", weight_data_size)
+            fprintf_param_value(" 18=%d", int8_scale_term)
 
             fwrite_weight_tag_data(op->weight_data, bp);
             fwrite_weight_data(op->bias_data, bp);
+
+#if NCNN_INT8
+            // write int8_scale data
+            if (op->int8_scale_term)
+            {
+                ncnn::Mat weight_data_int8_scales(1);
+                weight_data_int8_scales[0] = op->weight_data_int8_scale;
+                fwrite_weight_data(weight_data_int8_scales, bp, 90, 100);
+            }
+#endif // NCNN_INT8
         }
         else if (layer->type == "Exp")
         {

tools/quantize/ncnn2int8.cpp (+52)

@@ -133,6 +133,8 @@ class NetQuantize : public ModelWriter
     int quantize_lstm();
     int quantize_gru();
 
+    int quantize_embed();
+
     int fuse_requantize();
 };
 
@@ -562,6 +564,55 @@
     return 0;
 }
 
+int NetQuantize::quantize_embed()
+{
+    for (size_t i = 0; i < layers.size(); i++)
+    {
+        if (layers[i]->type != "Embed")
+            continue;
+
+        // Embed - quantize weight from fp32 to int8
+        ncnn::Embed* embed = (ncnn::Embed*)layers[i];
+
+        fprintf(stderr, "quantize_embed %s\n", embed->name.c_str());
+
+        // TODO move to ncnn2table
+
+        const int num_output = embed->num_output;
+        const int input_dim = embed->input_dim;
+
+        ncnn::Mat weight_data_int8_scales(1);
+        {
+            const float* ptr = embed->weight_data;
+            float absmax = 0.f;
+            for (int i = 0; i < embed->weight_data.w; i++)
+            {
+                absmax = std::max(absmax, (float)fabs(ptr[i]));
+            }
+
+            weight_data_int8_scales[0] = absmax == 0.f ? 1.f : 127 / absmax;
+        }
+
+        {
+            ncnn::Mat weight_data_int8;
+
+            ncnn::Option opt_q = opt;
+            opt_q.blob_allocator = embed->weight_data.allocator;
+            opt_q.use_packing_layout = false;
+            ncnn::quantize_to_int8(embed->weight_data, weight_data_int8, weight_data_int8_scales, opt_q);
+            if (weight_data_int8.empty())
+                return -100;
+
+            embed->weight_data = weight_data_int8;
+        }
+
+        embed->int8_scale_term = 2;
+        embed->weight_data_int8_scale = weight_data_int8_scales[0];
+    }
+
+    return 0;
+}
+
 int NetQuantize::fuse_requantize()
 {
     const size_t layer_count = layers.size();
@@ -809,6 +860,7 @@ int main(int argc, char** argv)
     quantizer.quantize_rnn();
     quantizer.quantize_lstm();
    quantizer.quantize_gru();
+    quantizer.quantize_embed();
 
     quantizer.fuse_requantize();
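The scale chosen in quantize_embed is plain per-tensor absmax quantization: scale = 127 / max|w|, so the largest-magnitude weight maps to ±127 and everything else scales linearly. A minimal standalone sketch of the same arithmetic (plain C++ with made-up weights, not the actual ncnn::quantize_to_int8 helper):

#include <algorithm>
#include <cmath>
#include <cstdio>

int main()
{
    const float weights[4] = {0.42f, -0.1f, 0.3f, -0.42f}; // hypothetical fp32 weights

    // per-tensor absmax
    float absmax = 0.f;
    for (int i = 0; i < 4; i++)
        absmax = std::max(absmax, (float)std::fabs(weights[i]));

    const float scale = absmax == 0.f ? 1.f : 127 / absmax; // here 127 / 0.42 ≈ 302.38

    for (int i = 0; i < 4; i++)
    {
        int q = (int)std::round(weights[i] * scale);
        q = std::min(127, std::max(-127, q)); // clamp to the signed 8-bit range
        printf("%+.2f -> %4d (descaled: %+.4f)\n", weights[i], q, q / scale);
    }
    return 0;
}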
