
Commit 4e9665d

Added ADAM optimizer, unless git screwed it up, cos there is no diff
1 parent 2633fef commit 4e9665d

21 files changed: +386 -130 lines

arch/Makefile.am (+1 -1)

@@ -1,4 +1,4 @@
-AM_CPPFLAGS += -I$(top_srcdir)/ccutil -I$(top_srcdir)/viewer
+AM_CPPFLAGS += -I$(top_srcdir)/ccutil -I$(top_srcdir)/viewer -DUSE_STD_NAMESPACE
 AUTOMAKE_OPTIONS = subdir-objects
 SUBDIRS =
 AM_CXXFLAGS =

arch/simddetect.cpp (+14)

@@ -37,6 +37,9 @@ SIMDDetect SIMDDetect::detector;
 
 // If true, then AVX has been detected.
 bool SIMDDetect::avx_available_;
+bool SIMDDetect::avx2_available_;
+bool SIMDDetect::avx512F_available_;
+bool SIMDDetect::avx512BW_available_;
 // If true, then SSE4.1 has been detected.
 bool SIMDDetect::sse_available_;
 
@@ -50,8 +53,19 @@ SIMDDetect::SIMDDetect() {
 #if defined(__GNUC__)
   unsigned int eax, ebx, ecx, edx;
   if (__get_cpuid(1, &eax, &ebx, &ecx, &edx) != 0) {
+    // Note that these tests all use hex because the older compilers don't
+    // have the newer flags.
     sse_available_ = (ecx & 0x00080000) != 0;
     avx_available_ = (ecx & 0x10000000) != 0;
+    if (avx_available_) {
+      // There is supposed to be a __get_cpuid_count function, but this is
+      // all there is in my cpuid.h. It is a macro for an asm statement and
+      // cannot be used inside an if.
+      __cpuid_count(7, 0, eax, ebx, ecx, edx);
+      avx2_available_ = (ebx & 0x00000020) != 0;
+      avx512F_available_ = (ebx & 0x00010000) != 0;
+      avx512BW_available_ = (ebx & 0x40000000) != 0;
+    }
   }
 #elif defined(_WIN32)
   int cpuInfo[4];
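
For reference, the hex masks test documented CPUID feature bits: leaf 1 ECX bit 19 (SSE4.1) and bit 28 (AVX); leaf 7 subleaf 0 EBX bit 5 (AVX2), bit 16 (AVX512F), and bit 30 (AVX512BW). A minimal standalone sketch of the same probe, assuming a GCC/Clang <cpuid.h> that provides __get_cpuid and the __cpuid_count macro as in the patch:

#include <cpuid.h>
#include <cstdio>

int main() {
  unsigned int eax, ebx, ecx, edx;
  if (__get_cpuid(1, &eax, &ebx, &ecx, &edx) == 0) return 1;
  bool sse41 = (ecx & 0x00080000) != 0;  // leaf 1, ECX bit 19.
  bool avx = (ecx & 0x10000000) != 0;    // leaf 1, ECX bit 28.
  bool avx2 = false, avx512f = false, avx512bw = false;
  if (avx) {
    __cpuid_count(7, 0, eax, ebx, ecx, edx);  // leaf 7, subleaf 0.
    avx2 = (ebx & 0x00000020) != 0;      // EBX bit 5.
    avx512f = (ebx & 0x00010000) != 0;   // EBX bit 16.
    avx512bw = (ebx & 0x40000000) != 0;  // EBX bit 30.
  }
  printf("SSE4.1=%d AVX=%d AVX2=%d AVX512F=%d AVX512BW=%d\n",
         sse41, avx, avx2, avx512f, avx512bw);
  return 0;
}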

arch/simddetect.h (+13)

@@ -24,6 +24,16 @@ class SIMDDetect {
  public:
   // Returns true if AVX is available on this system.
   static inline bool IsAVXAvailable() { return detector.avx_available_; }
+  // Returns true if AVX2 (integer support) is available on this system.
+  static inline bool IsAVX2Available() { return detector.avx2_available_; }
+  // Returns true if AVX512 Foundation (float) is available on this system.
+  static inline bool IsAVX512FAvailable() {
+    return detector.avx512F_available_;
+  }
+  // Returns true if AVX512 integer is available on this system.
+  static inline bool IsAVX512BWAvailable() {
+    return detector.avx512BW_available_;
+  }
   // Returns true if SSE4.1 is available on this system.
   static inline bool IsSSEAvailable() { return detector.sse_available_; }
 
@@ -36,6 +46,9 @@ class SIMDDetect {
   static SIMDDetect detector;
   // If true, then AVX has been detected.
   static TESS_API bool avx_available_;
+  static TESS_API bool avx2_available_;
+  static TESS_API bool avx512F_available_;
+  static TESS_API bool avx512BW_available_;
   // If true, then SSE4.1 has been detected.
   static TESS_API bool sse_available_;
 };
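
A hypothetical caller-side sketch of how these accessors would gate a SIMD code path (DotProductAVX2, DotProductSSE, and DotProductGeneric are placeholder names, not functions added by this commit):

#include "simddetect.h"

// Placeholder kernels, assumed to be defined elsewhere.
double DotProductAVX2(const double* u, const double* v, int n);
double DotProductSSE(const double* u, const double* v, int n);
double DotProductGeneric(const double* u, const double* v, int n);

// Picks the best available implementation at run time.
double DotProduct(const double* u, const double* v, int n) {
  if (SIMDDetect::IsAVX2Available()) return DotProductAVX2(u, v, n);
  if (SIMDDetect::IsSSEAvailable()) return DotProductSSE(u, v, n);
  return DotProductGeneric(u, v, n);
}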

ccstruct/matrix.h (+9 -6)

@@ -360,19 +360,22 @@ class GENERIC_2D_ARRAY {
   }
 
   // Accumulates the element-wise sums of squares of src into *this.
-  void SumSquares(const GENERIC_2D_ARRAY<T>& src) {
+  void SumSquares(const GENERIC_2D_ARRAY<T>& src, T decay_factor) {
+    T update_factor = 1.0 - decay_factor;
     int size = num_elements();
     for (int i = 0; i < size; ++i) {
-      array_[i] += src.array_[i] * src.array_[i];
+      array_[i] = array_[i] * decay_factor +
+                  update_factor * src.array_[i] * src.array_[i];
     }
   }
 
-  // Scales each element using the ada-grad algorithm, ie array_[i] by
-  // sqrt(num_samples/max(1,sqsum[i])).
-  void AdaGradScaling(const GENERIC_2D_ARRAY<T>& sqsum, int num_samples) {
+  // Applies the adam update to each element, ie
+  // array_[i] += sum[i] / (sqrt(sqsum[i]) + epsilon).
+  void AdamUpdate(const GENERIC_2D_ARRAY<T>& sum,
+                  const GENERIC_2D_ARRAY<T>& sqsum, T epsilon) {
     int size = num_elements();
     for (int i = 0; i < size; ++i) {
-      array_[i] *= sqrt(num_samples / MAX(1.0, sqsum.array_[i]));
+      array_[i] += sum.array_[i] / (sqrt(sqsum.array_[i]) + epsilon);
     }
   }
 
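
Together the two methods form the core of an Adam-style step (Kingma & Ba): SumSquares maintains the decayed second moment v <- decay*v + (1-decay)*g^2, and AdamUpdate applies w <- w + m/(sqrt(v) + epsilon), where m is the accumulated, already-scaled gradient held in `sum`. A scalar sketch under those assumptions, with illustrative names:

#include <cmath>

// One Adam-style step for a single weight. `g` is the raw gradient,
// `grad_sum` the momentum- and learning-rate-scaled gradient accumulator
// (the `sum` argument of AdamUpdate), `sq_sum` the decayed sum of squares.
double AdamStep(double w, double grad_sum, double* sq_sum, double g,
                double decay, double epsilon) {
  // SumSquares: v = decay * v + (1 - decay) * g^2.
  *sq_sum = decay * *sq_sum + (1.0 - decay) * g * g;
  // AdamUpdate: w += m / (sqrt(v) + epsilon).
  return w + grad_sum / (std::sqrt(*sq_sum) + epsilon);
}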

lstm/convolve.cpp (+1 -1)

@@ -112,7 +112,7 @@ bool Convolve::Backward(bool debug, const NetworkIO& fwd_deltas,
       }
     }
   } while (src_index.Increment());
-  back_deltas->CopyWithNormalization(*delta_sum, fwd_deltas);
+  back_deltas->CopyAll(*delta_sum);
   return true;
 }
lstm/fullyconnected.cpp (+18 -7)

@@ -79,11 +79,24 @@ void FullyConnected::SetEnableTraining(TrainingState state) {
 // scale `range` picked according to the random number generator `randomizer`.
 int FullyConnected::InitWeights(float range, TRand* randomizer) {
   Network::SetRandomizer(randomizer);
-  num_weights_ = weights_.InitWeightsFloat(no_, ni_ + 1, TestFlag(NF_ADA_GRAD),
+  num_weights_ = weights_.InitWeightsFloat(no_, ni_ + 1, TestFlag(NF_ADAM),
                                            range, randomizer);
   return num_weights_;
 }
 
+// Changes the number of outputs to the size of the given code_map, copying
+// the old weight matrix entries for each output from code_map[output] where
+// non-negative, and uses the mean (over all outputs) of the existing weights
+// for all outputs with negative code_map entries. Returns the new number of
+// weights. Only operates on Softmax layers with old_no outputs.
+int FullyConnected::RemapOutputs(int old_no, const std::vector<int>& code_map) {
+  if (type_ == NT_SOFTMAX && no_ == old_no) {
+    num_weights_ = weights_.RemapOutputs(code_map);
+    no_ = code_map.size();
+  }
+  return num_weights_;
+}
+
 // Converts a float network to an int network.
 void FullyConnected::ConvertToInt() {
   weights_.ConvertToInt();
@@ -240,7 +253,6 @@ bool FullyConnected::Backward(bool debug, const NetworkIO& fwd_deltas,
   FinishBackward(*errors_t.get());
   if (needs_to_backprop_) {
     back_deltas->ZeroInvalidElements();
-    back_deltas->CopyWithNormalization(*back_deltas, fwd_deltas);
 #if DEBUG_DETAIL > 0
     tprintf("F Backprop:%s\n", name_.string());
     back_deltas->Print(10);
@@ -281,12 +293,11 @@ void FullyConnected::FinishBackward(const TransposedArray& errors_t) {
     weights_.SumOuterTransposed(errors_t, *external_source_, true);
 }
 
-// Updates the weights using the given learning rate and momentum.
-// num_samples is the quotient to be used in the adagrad computation iff
-// use_ada_grad_ is true.
+// Updates the weights using the given learning rate, momentum and adam_beta.
+// num_samples is used in the adam computation iff use_adam_ is true.
 void FullyConnected::Update(float learning_rate, float momentum,
-                            int num_samples) {
-  weights_.Update(learning_rate, momentum, num_samples);
+                            float adam_beta, int num_samples) {
+  weights_.Update(learning_rate, momentum, adam_beta, num_samples);
 }
 
 // Sums the products of weight updates in *this and other, splitting into
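
WeightMatrix::RemapOutputs itself is not shown in this diff; a sketch of the remapping the comment describes, on a plain row-major matrix with illustrative types:

#include <vector>

// Row i of the new matrix copies old row code_map[i] when code_map[i] >= 0,
// otherwise the mean of all old rows.
std::vector<std::vector<double>> RemapRows(
    const std::vector<std::vector<double>>& old_w,
    const std::vector<int>& code_map) {
  int num_in = old_w[0].size();
  std::vector<double> mean(num_in, 0.0);
  for (const auto& row : old_w)
    for (int j = 0; j < num_in; ++j) mean[j] += row[j] / old_w.size();
  std::vector<std::vector<double>> new_w;
  for (int c : code_map) new_w.push_back(c >= 0 ? old_w[c] : mean);
  return new_w;
}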

lstm/fullyconnected.h (+10 -4)

@@ -68,6 +68,12 @@ class FullyConnected : public Network {
   // Sets up the network for training. Initializes weights using weights of
   // scale `range` picked according to the random number generator `randomizer`.
   virtual int InitWeights(float range, TRand* randomizer);
+  // Changes the number of outputs to the size of the given code_map, copying
+  // the old weight matrix entries for each output from code_map[output] where
+  // non-negative, and uses the mean (over all outputs) of the existing weights
+  // for all outputs with negative code_map entries. Returns the new number of
+  // weights. Only operates on Softmax layers with old_no outputs.
+  int RemapOutputs(int old_no, const std::vector<int>& code_map) override;
 
   // Converts a float network to an int network.
   virtual void ConvertToInt();
@@ -101,10 +107,10 @@ class FullyConnected : public Network {
                     TransposedArray* errors_t, double* backprop);
   void FinishBackward(const TransposedArray& errors_t);
 
-  // Updates the weights using the given learning rate and momentum.
-  // num_samples is the quotient to be used in the adagrad computation iff
-  // use_ada_grad_ is true.
-  virtual void Update(float learning_rate, float momentum, int num_samples);
+  // Updates the weights using the given learning rate, momentum and adam_beta.
+  // num_samples is used in the adam computation iff use_adam_ is true.
+  void Update(float learning_rate, float momentum, float adam_beta,
+              int num_samples) override;
   // Sums the products of weight updates in *this and other, splitting into
   // positive (same direction) in *same and negative (different direction) in
   // *changed.

lstm/lstm.cpp (+21 -13)

@@ -132,14 +132,27 @@ int LSTM::InitWeights(float range, TRand* randomizer) {
   for (int w = 0; w < WT_COUNT; ++w) {
     if (w == GFS && !Is2D()) continue;
     num_weights_ += gate_weights_[w].InitWeightsFloat(
-        ns_, na_ + 1, TestFlag(NF_ADA_GRAD), range, randomizer);
+        ns_, na_ + 1, TestFlag(NF_ADAM), range, randomizer);
   }
   if (softmax_ != NULL) {
     num_weights_ += softmax_->InitWeights(range, randomizer);
   }
   return num_weights_;
 }
 
+// Changes the number of outputs to the size of the given code_map, copying
+// the old weight matrix entries for each output from code_map[output] where
+// non-negative, and uses the mean (over all outputs) of the existing weights
+// for all outputs with negative code_map entries. Returns the new number of
+// weights. Only operates on Softmax layers with old_no outputs.
+int LSTM::RemapOutputs(int old_no, const std::vector<int>& code_map) {
+  if (softmax_ != NULL) {
+    num_weights_ -= softmax_->num_weights();
+    num_weights_ += softmax_->RemapOutputs(old_no, code_map);
+  }
+  return num_weights_;
+}
+
 // Converts a float network to an int network.
 void LSTM::ConvertToInt() {
   for (int w = 0; w < WT_COUNT; ++w) {
@@ -618,27 +631,22 @@ bool LSTM::Backward(bool debug, const NetworkIO& fwd_deltas,
   if (softmax_ != NULL) {
     softmax_->FinishBackward(*softmax_errors_t);
   }
-  if (needs_to_backprop_) {
-    // Normalize the inputerr in back_deltas.
-    back_deltas->CopyWithNormalization(*back_deltas, fwd_deltas);
-    return true;
-  }
-  return false;
+  return needs_to_backprop_;
 }
 
-// Updates the weights using the given learning rate and momentum.
-// num_samples is the quotient to be used in the adagrad computation iff
-// use_ada_grad_ is true.
-void LSTM::Update(float learning_rate, float momentum, int num_samples) {
+// Updates the weights using the given learning rate, momentum and adam_beta.
+// num_samples is used in the adam computation iff use_adam_ is true.
+void LSTM::Update(float learning_rate, float momentum, float adam_beta,
+                  int num_samples) {
 #if DEBUG_DETAIL > 3
   PrintW();
 #endif
   for (int w = 0; w < WT_COUNT; ++w) {
     if (w == GFS && !Is2D()) continue;
-    gate_weights_[w].Update(learning_rate, momentum, num_samples);
+    gate_weights_[w].Update(learning_rate, momentum, adam_beta, num_samples);
   }
   if (softmax_ != NULL) {
-    softmax_->Update(learning_rate, momentum, num_samples);
+    softmax_->Update(learning_rate, momentum, adam_beta, num_samples);
   }
 #if DEBUG_DETAIL > 3
   PrintDW();

lstm/lstm.h (+10 -4)

@@ -76,6 +76,12 @@ class LSTM : public Network {
   // Sets up the network for training. Initializes weights using weights of
   // scale `range` picked according to the random number generator `randomizer`.
   virtual int InitWeights(float range, TRand* randomizer);
+  // Changes the number of outputs to the size of the given code_map, copying
+  // the old weight matrix entries for each output from code_map[output] where
+  // non-negative, and uses the mean (over all outputs) of the existing weights
+  // for all outputs with negative code_map entries. Returns the new number of
+  // weights. Only operates on Softmax layers with old_no outputs.
+  int RemapOutputs(int old_no, const std::vector<int>& code_map) override;
 
   // Converts a float network to an int network.
   virtual void ConvertToInt();
@@ -99,10 +105,10 @@ class LSTM : public Network {
   virtual bool Backward(bool debug, const NetworkIO& fwd_deltas,
                         NetworkScratch* scratch,
                         NetworkIO* back_deltas);
-  // Updates the weights using the given learning rate and momentum.
-  // num_samples is the quotient to be used in the adagrad computation iff
-  // use_ada_grad_ is true.
-  virtual void Update(float learning_rate, float momentum, int num_samples);
+  // Updates the weights using the given learning rate, momentum and adam_beta.
+  // num_samples is used in the adam computation iff use_adam_ is true.
+  void Update(float learning_rate, float momentum, float adam_beta,
+              int num_samples) override;
   // Sums the products of weight updates in *this and other, splitting into
   // positive (same direction) in *same and negative (different direction) in
   // *changed.

lstm/lstmrecognizer.cpp (+15 -8)

@@ -55,9 +55,9 @@ LSTMRecognizer::LSTMRecognizer()
       training_iteration_(0),
       sample_iteration_(0),
       null_char_(UNICHAR_BROKEN),
-      weight_range_(0.0f),
       learning_rate_(0.0f),
       momentum_(0.0f),
+      adam_beta_(0.0f),
       dict_(NULL),
       search_(NULL),
       debug_win_(NULL) {}
@@ -94,7 +94,7 @@ bool LSTMRecognizer::Serialize(const TessdataManager* mgr, TFile* fp) const {
   if (fp->FWrite(&sample_iteration_, sizeof(sample_iteration_), 1) != 1)
     return false;
   if (fp->FWrite(&null_char_, sizeof(null_char_), 1) != 1) return false;
-  if (fp->FWrite(&weight_range_, sizeof(weight_range_), 1) != 1) return false;
+  if (fp->FWrite(&adam_beta_, sizeof(adam_beta_), 1) != 1) return false;
   if (fp->FWrite(&learning_rate_, sizeof(learning_rate_), 1) != 1) return false;
   if (fp->FWrite(&momentum_, sizeof(momentum_), 1) != 1) return false;
   if (include_charsets && IsRecoding() && !recoder_.Serialize(fp)) return false;
@@ -120,8 +120,7 @@ bool LSTMRecognizer::DeSerialize(const TessdataManager* mgr, TFile* fp) {
   if (fp->FReadEndian(&sample_iteration_, sizeof(sample_iteration_), 1) != 1)
     return false;
   if (fp->FReadEndian(&null_char_, sizeof(null_char_), 1) != 1) return false;
-  if (fp->FReadEndian(&weight_range_, sizeof(weight_range_), 1) != 1)
-    return false;
+  if (fp->FReadEndian(&adam_beta_, sizeof(adam_beta_), 1) != 1) return false;
   if (fp->FReadEndian(&learning_rate_, sizeof(learning_rate_), 1) != 1)
     return false;
   if (fp->FReadEndian(&momentum_, sizeof(momentum_), 1) != 1) return false;
@@ -207,14 +206,22 @@ void LSTMRecognizer::OutputStats(const NetworkIO& outputs, float* min_output,
   STATS stats(0, kOutputScale + 1);
   for (int t = 0; t < outputs.Width(); ++t) {
     int best_label = outputs.BestLabel(t, NULL);
-    if (best_label != null_char_ || t == 0) {
+    if (best_label != null_char_) {
       float best_output = outputs.f(t)[best_label];
       stats.add(static_cast<int>(kOutputScale * best_output), 1);
     }
   }
-  *min_output = static_cast<float>(stats.min_bucket()) / kOutputScale;
-  *mean_output = stats.mean() / kOutputScale;
-  *sd = stats.sd() / kOutputScale;
+  // If the output is all nulls it could be that the photometric interpretation
+  // is wrong, so make it look bad, so the other way can win, even if not great.
+  if (stats.get_total() == 0) {
+    *min_output = 0.0f;
+    *mean_output = 0.0f;
+    *sd = 1.0f;
+  } else {
+    *min_output = static_cast<float>(stats.min_bucket()) / kOutputScale;
+    *mean_output = stats.mean() / kOutputScale;
+    *sd = stats.sd() / kOutputScale;
+  }
 }
 
 // Recognizes the image_data, returning the labels,
