Skip to content

Commit 91c7504

Browse files
committed
Added a feature to enrich the hOCR output with glyph confidences
By passing the parameter -c glyph_confidences=true, the user can enrich the hOCR output with additional information: for every timestep of the LSTM, Tesseract additionally lists all glyphs that were considered at that timestep together with their confidences. The format of the hOCR output changes slightly: there is now a line break after every word for better human readability. Signed-off-by: Noah Metzger <[email protected]>
1 parent 607e8fd commit 91c7504

11 files changed

+138
-16
lines changed

src/api/baseapi.cpp

+46-3
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,8 @@
4949
#include <fstream> // for size_t
5050
#include <iostream> // for std::cin
5151
#include <memory> // for std::unique_ptr
52+
#include <set> // for std::pair
53+
#include <vector> // for std::vector
5254
#include "allheaders.h" // for pixDestroy, boxCreate, boxaAddBox, box...
5355
#include "blobclass.h" // for ExtractFontName
5456
#include "boxword.h" // for BoxWord
@@ -398,6 +400,7 @@ int TessBaseAPI::Init(const char* data, int data_size, const char* language,
398400
return -1;
399401
}
400402
}
403+
401404
PERF_COUNT_SUB("update tesseract_")
402405
// Update datapath and language requested for the last valid initialization.
403406
if (datapath_ == nullptr)
@@ -1389,6 +1392,17 @@ static void AddIdTohOCR(STRING* hocr_str, const std::string base, int num1,
13891392
*hocr_str += "'";
13901393
}
13911394

1395+
// Appends an hOCR id attribute of the form " id='<base>_<num1>_<num2>_<num3>'"
// to *hocr_str.
//   hocr_str: output string that receives the attribute text.
//   base:     prefix of the id (callers in this file pass e.g. "timestep"
//             or "glyph").
//   num1..3:  integers joined to the prefix with underscores.
// The id text is formatted into a fixed 64-byte stack buffer; anything that
// does not fit is truncated by snprintf.
static void AddIdTohOCR(STRING* hocr_str, const std::string base, int num1,
                        int num2, int num3) {
  const size_t kIdBufferSize = 64;
  char id_text[kIdBufferSize];
  // snprintf below is limited to the first kIdBufferSize - 1 bytes, so the
  // final byte is never written by it; terminate it explicitly up front.
  id_text[kIdBufferSize - 1] = '\0';
  snprintf(id_text, kIdBufferSize - 1, "%s_%d_%d_%d", base.c_str(), num1,
           num2, num3);
  STRING& out = *hocr_str;
  out += " id='";
  out += id_text;
  out += "'";
}
1405+
13921406
static void AddBoxTohOCR(const ResultIterator* it, PageIteratorLevel level,
13931407
STRING* hocr_str) {
13941408
int left, top, right, bottom;
@@ -1449,7 +1463,7 @@ char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) {
14491463
if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(monitor) < 0))
14501464
return nullptr;
14511465

1452-
int lcnt = 1, bcnt = 1, pcnt = 1, wcnt = 1;
1466+
int lcnt = 1, bcnt = 1, pcnt = 1, wcnt = 1, tcnt = 1, gcnt = 1;
14531467
int page_id = page_number + 1; // hOCR uses 1-based page numbers.
14541468
bool para_is_ltr = true; // Default direction is LTR
14551469
const char* paragraph_lang = nullptr;
@@ -1529,7 +1543,11 @@ char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) {
15291543
}
15301544

15311545
// Now, process the word...
1532-
hocr_str += "<span class='ocrx_word'";
1546+
std::vector<std::vector<std::pair<const char*, float>>>* confidencemap = nullptr;
1547+
if (tesseract_->glyph_confidences) {
1548+
confidencemap = res_it->GetGlyphConfidences();
1549+
}
1550+
hocr_str += "\n <span class='ocrx_word'";
15331551
AddIdTohOCR(&hocr_str, "word", page_id, wcnt);
15341552
int left, top, right, bottom;
15351553
bool bold, italic, underlined, monospace, serif, smallcaps;
@@ -1587,7 +1605,32 @@ char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) {
15871605
} while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
15881606
if (italic) hocr_str += "</em>";
15891607
if (bold) hocr_str += "</strong>";
1590-
hocr_str += "</span> ";
1608+
// If glyph confidence is required it is added here
1609+
if (tesseract_->glyph_confidences && confidencemap != nullptr) {
1610+
for (size_t i = 0; i < confidencemap->size(); i++) {
1611+
hocr_str += "\n <span class='ocrx_cinfo'";
1612+
AddIdTohOCR(&hocr_str, "timestep", page_id, wcnt, tcnt);
1613+
hocr_str += ">";
1614+
//*
1615+
std::vector<std::pair<const char*, float>> timestep = (*confidencemap)[i];
1616+
for (std::pair<const char*, float> conf : timestep) {
1617+
hocr_str += "<span class='ocr_glyph'";
1618+
AddIdTohOCR(&hocr_str, "glyph", page_id, wcnt, gcnt);
1619+
hocr_str.add_str_int(" title='x_confs ", int(conf.second * 100));
1620+
hocr_str += "'";
1621+
hocr_str += ">";
1622+
hocr_str += conf.first;
1623+
hocr_str += "</span>";
1624+
gcnt++;
1625+
}
1626+
//*/
1627+
hocr_str += "</span>";
1628+
tcnt++;
1629+
}
1630+
}
1631+
hocr_str += "</span>";
1632+
tcnt = 1;
1633+
gcnt = 1;
15911634
wcnt++;
15921635
// Close any ending block/paragraph/textline.
15931636
if (last_word_in_line) {

src/ccmain/linerec.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -239,7 +239,7 @@ void Tesseract::LSTMRecognizeWord(const BLOCK& block, ROW *row, WERD_RES *word,
239239
if (im_data == nullptr) return;
240240
lstm_recognizer_->RecognizeLine(*im_data, true, classify_debug_level > 0,
241241
kWorstDictCertainty / kCertaintyScale,
242-
word_box, words);
242+
word_box, words, glyph_confidences);
243243
delete im_data;
244244
SearchWords(words);
245245
}

src/ccmain/resultiterator.cpp

+10
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,8 @@
2727
#include "tesseractclass.h"
2828
#include "unicharset.h"
2929
#include "unicodes.h"
30+
#include <set>
31+
#include <vector>
3032

3133
namespace tesseract {
3234

@@ -602,6 +604,14 @@ char* ResultIterator::GetUTF8Text(PageIteratorLevel level) const {
602604
return result;
603605
}
604606

607+
std::vector<std::vector<std::pair<const char*, float>>>* ResultIterator::GetGlyphConfidences() const {
608+
if (it_->word() != nullptr) {
609+
return &it_->word()->timesteps;
610+
} else {
611+
return nullptr;
612+
}
613+
}
614+
605615
void ResultIterator::AppendUTF8WordText(STRING *text) const {
606616
if (!it_->word()) return;
607617
ASSERT_HOST(it_->word()->best_choice != nullptr);

src/ccmain/resultiterator.h

+7
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@
2222
#ifndef TESSERACT_CCMAIN_RESULT_ITERATOR_H_
2323
#define TESSERACT_CCMAIN_RESULT_ITERATOR_H_
2424

25+
#include <set> // for std::pair
26+
#include <vector> // for std::vector
2527
#include "ltrresultiterator.h" // for LTRResultIterator
2628
#include "platform.h" // for TESS_API, TESS_LOCAL
2729
#include "publictypes.h" // for PageIteratorLevel
@@ -95,6 +97,11 @@ class TESS_API ResultIterator : public LTRResultIterator {
9597
*/
9698
virtual char* GetUTF8Text(PageIteratorLevel level) const;
9799

100+
/**
101+
* Returns the glyph confidences for every LSTM timestep for the current Word
102+
*/
103+
virtual std::vector<std::vector<std::pair<const char*, float>>>* GetGlyphConfidences() const;
104+
98105
/**
99106
* Return whether the current paragraph's dominant reading direction
100107
* is left-to-right (as opposed to right-to-left).

src/ccmain/tesseractclass.cpp

+3
Original file line numberDiff line numberDiff line change
@@ -508,6 +508,9 @@ Tesseract::Tesseract()
508508
STRING_MEMBER(page_separator, "\f",
509509
"Page separator (default is form feed control character)",
510510
this->params()),
511+
BOOL_MEMBER(glyph_confidences, false,
512+
"Allows to include glyph confidences in the hOCR output",
513+
this->params()),
511514

512515
backup_config_file_(nullptr),
513516
pix_binary_(nullptr),

src/ccmain/tesseractclass.h

+1
Original file line numberDiff line numberDiff line change
@@ -1114,6 +1114,7 @@ class Tesseract : public Wordrec {
11141114
"Preserve multiple interword spaces");
11151115
STRING_VAR_H(page_separator, "\f",
11161116
"Page separator (default is form feed control character)");
1117+
BOOL_VAR_H(glyph_confidences, false, "Allows to include glyph confidences in the hOCR output");
11171118

11181119
//// ambigsrecog.cpp /////////////////////////////////////////////////////////
11191120
FILE *init_recog_training(const STRING &fname);

src/ccstruct/pageres.h

+4
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,8 @@
2121
#define PAGERES_H
2222

2323
#include <cstdint> // for int32_t, int16_t
24+
#include <set> // for std::pair
25+
#include <vector> // for std::vector
2426
#include <sys/types.h> // for int8_t
2527
#include "blamer.h" // for BlamerBundle (ptr only), IRR_NUM_REASONS
2628
#include "clst.h" // for CLIST_ITERATOR, CLISTIZEH
@@ -218,6 +220,8 @@ class WERD_RES : public ELIST_LINK {
218220
// Gaps between blobs in chopped_word. blob_gaps[i] is the gap between
219221
// blob i and blob i+1.
220222
GenericVector<int> blob_gaps;
223+
// Stores the glyph confidences of every timestep of the lstm
224+
std::vector<std::vector<std::pair<const char*, float>>> timesteps;
221225
// Ratings matrix contains classifier choices for each classified combination
222226
// of blobs. The dimension is the same as the number of blobs in chopped_word
223227
// and the leading diagonal corresponds to classifier results of the blobs

src/lstm/lstmrecognizer.cpp

+5-3
Original file line numberDiff line numberDiff line change
@@ -172,7 +172,7 @@ bool LSTMRecognizer::LoadDictionary(const char* lang, TessdataManager* mgr) {
172172
void LSTMRecognizer::RecognizeLine(const ImageData& image_data, bool invert,
173173
bool debug, double worst_dict_cert,
174174
const TBOX& line_box,
175-
PointerVector<WERD_RES>* words) {
175+
PointerVector<WERD_RES>* words, bool glyph_confidences) {
176176
NetworkIO outputs;
177177
float scale_factor;
178178
NetworkIO inputs;
@@ -183,9 +183,11 @@ void LSTMRecognizer::RecognizeLine(const ImageData& image_data, bool invert,
183183
search_ =
184184
new RecodeBeamSearch(recoder_, null_char_, SimpleTextOutput(), dict_);
185185
}
186-
search_->Decode(outputs, kDictRatio, kCertOffset, worst_dict_cert, nullptr);
186+
search_->Decode(outputs, kDictRatio, kCertOffset, worst_dict_cert,
187+
&GetUnicharset(), glyph_confidences);
187188
search_->ExtractBestPathAsWords(line_box, scale_factor, debug,
188-
&GetUnicharset(), words);
189+
&GetUnicharset(), words,
190+
glyph_confidences);
189191
}
190192

191193
// Helper computes min and mean best results in the output.

src/lstm/lstmrecognizer.h

+2-1
Original file line numberDiff line numberDiff line change
@@ -184,7 +184,8 @@ class LSTMRecognizer {
184184
// will be used in a dictionary word.
185185
void RecognizeLine(const ImageData& image_data, bool invert, bool debug,
186186
double worst_dict_cert, const TBOX& line_box,
187-
PointerVector<WERD_RES>* words);
187+
PointerVector<WERD_RES>* words,
188+
bool glyph_confidences = false);
188189

189190
// Helper computes min and mean best results in the output.
190191
void OutputStats(const NetworkIO& outputs,

src/lstm/recodebeam.cpp

+48-4
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@
2222
#include "networkio.h"
2323
#include "pageres.h"
2424
#include "unicharcompress.h"
25+
#include <set>
26+
#include <vector>
2527

2628
#include <algorithm>
2729

@@ -77,13 +79,18 @@ RecodeBeamSearch::RecodeBeamSearch(const UnicharCompress& recoder,
7779
// Decodes the set of network outputs, storing the lattice internally.
7880
void RecodeBeamSearch::Decode(const NetworkIO& output, double dict_ratio,
7981
double cert_offset, double worst_dict_cert,
80-
const UNICHARSET* charset) {
82+
const UNICHARSET* charset, bool glyph_confidence) {
8183
beam_size_ = 0;
8284
int width = output.Width();
85+
if (glyph_confidence)
86+
timesteps.clear();
8387
for (int t = 0; t < width; ++t) {
8488
ComputeTopN(output.f(t), output.NumFeatures(), kBeamWidths[0]);
8589
DecodeStep(output.f(t), t, dict_ratio, cert_offset, worst_dict_cert,
8690
charset);
91+
if (glyph_confidence) {
92+
SaveMostCertainGlyphs(output.f(t), output.NumFeatures(), charset, t);
93+
}
8794
}
8895
}
8996
void RecodeBeamSearch::Decode(const GENERIC_2D_ARRAY<float>& output,
@@ -98,6 +105,35 @@ void RecodeBeamSearch::Decode(const GENERIC_2D_ARRAY<float>& output,
98105
}
99106
}
100107

108+
void RecodeBeamSearch::SaveMostCertainGlyphs(const float* outputs,
109+
int num_outputs,
110+
const UNICHARSET* charset,
111+
int xCoord) {
112+
std::vector<std::pair<const char*, float>> glyphs;
113+
int pos = 0;
114+
for (int i = 0; i < num_outputs; ++i) {
115+
if (outputs[i] >= 0.01f) {
116+
const char* charakter;
117+
if (i + 2 >= num_outputs) {
118+
charakter = "";
119+
} else if (i > 0) {
120+
charakter = charset->id_to_unichar_ext(i + 2);
121+
} else {
122+
charakter = charset->id_to_unichar_ext(i);
123+
}
124+
pos = 0;
125+
//order the possible glyphs within one timestep
126+
//beginning with the most likely
127+
while (glyphs.size() > pos && glyphs[pos].second > outputs[i]) {
128+
pos++;
129+
}
130+
glyphs.insert(glyphs.begin() + pos,
131+
std::pair<const char*, float>(charakter, outputs[i]));
132+
}
133+
}
134+
timesteps.push_back(glyphs);
135+
}
136+
101137
// Returns the best path as labels/scores/xcoords similar to simple CTC.
102138
void RecodeBeamSearch::ExtractBestPathAsLabels(
103139
GenericVector<int>* labels, GenericVector<int>* xcoords) const {
@@ -140,7 +176,8 @@ void RecodeBeamSearch::ExtractBestPathAsUnicharIds(
140176
void RecodeBeamSearch::ExtractBestPathAsWords(const TBOX& line_box,
141177
float scale_factor, bool debug,
142178
const UNICHARSET* unicharset,
143-
PointerVector<WERD_RES>* words) {
179+
PointerVector<WERD_RES>* words,
180+
bool glyph_confidence) {
144181
words->truncate(0);
145182
GenericVector<int> unichar_ids;
146183
GenericVector<float> certs;
@@ -165,6 +202,7 @@ void RecodeBeamSearch::ExtractBestPathAsWords(const TBOX& line_box,
165202
}
166203
// Convert labels to unichar-ids.
167204
int word_end = 0;
205+
int timestepEnd = 0;
168206
float prev_space_cert = 0.0f;
169207
for (int word_start = 0; word_start < num_ids; word_start = word_end) {
170208
for (word_end = word_start + 1; word_end < num_ids; ++word_end) {
@@ -188,6 +226,12 @@ void RecodeBeamSearch::ExtractBestPathAsWords(const TBOX& line_box,
188226
WERD_RES* word_res = InitializeWord(
189227
leading_space, line_box, word_start, word_end,
190228
std::min(space_cert, prev_space_cert), unicharset, xcoords, scale_factor);
229+
if (glyph_confidence) {
230+
for (size_t i = timestepEnd; i < xcoords[word_end]; i++) {
231+
word_res->timesteps.push_back(timesteps[i]);
232+
}
233+
timestepEnd = xcoords[word_end];
234+
}
191235
for (int i = word_start; i < word_end; ++i) {
192236
BLOB_CHOICE_LIST* choices = new BLOB_CHOICE_LIST;
193237
BLOB_CHOICE_IT bc_it(choices);
@@ -381,7 +425,7 @@ void RecodeBeamSearch::ComputeTopN(const float* outputs, int num_outputs,
381425
void RecodeBeamSearch::DecodeStep(const float* outputs, int t,
382426
double dict_ratio, double cert_offset,
383427
double worst_dict_cert,
384-
const UNICHARSET* charset) {
428+
const UNICHARSET* charset, bool debug) {
385429
if (t == beam_.size()) beam_.push_back(new RecodeBeam);
386430
RecodeBeam* step = beam_[t];
387431
beam_size_ = t + 1;
@@ -396,7 +440,7 @@ void RecodeBeamSearch::DecodeStep(const float* outputs, int t,
396440
}
397441
} else {
398442
RecodeBeam* prev = beam_[t - 1];
399-
if (charset != nullptr) {
443+
if (debug) {
400444
int beam_index = BeamIndex(true, NC_ANYTHING, 0);
401445
for (int i = prev->beams_[beam_index].size() - 1; i >= 0; --i) {
402446
GenericVector<const RecodeNode*> path;

src/lstm/recodebeam.h

+11-4
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,8 @@
2828
#include "networkio.h"
2929
#include "ratngs.h"
3030
#include "unicharcompress.h"
31+
#include <set>
32+
#include <vector>
3133

3234
namespace tesseract {
3335

@@ -182,7 +184,8 @@ class RecodeBeamSearch {
182184
// Decodes the set of network outputs, storing the lattice internally.
183185
// If charset is not null, it enables detailed debugging of the beam search.
184186
void Decode(const NetworkIO& output, double dict_ratio, double cert_offset,
185-
double worst_dict_cert, const UNICHARSET* charset);
187+
double worst_dict_cert, const UNICHARSET* charset,
188+
bool glyph_confidence = false);
186189
void Decode(const GENERIC_2D_ARRAY<float>& output, double dict_ratio,
187190
double cert_offset, double worst_dict_cert,
188191
const UNICHARSET* charset);
@@ -201,11 +204,12 @@ class RecodeBeamSearch {
201204
// Returns the best path as a set of WERD_RES.
202205
void ExtractBestPathAsWords(const TBOX& line_box, float scale_factor,
203206
bool debug, const UNICHARSET* unicharset,
204-
PointerVector<WERD_RES>* words);
207+
PointerVector<WERD_RES>* words, bool glyph_confidence);
205208

206209
// Generates debug output of the content of the beams after a Decode.
207210
void DebugBeams(const UNICHARSET& unicharset) const;
208-
211+
212+
std::vector< std::vector<std::pair<const char*, float>>> timesteps;
209213
// Clipping value for certainty inside Tesseract. Reflects the minimum value
210214
// of certainty that will be returned by ExtractBestPathAsUnicharIds.
211215
// Supposedly on a uniform scale that can be compared across languages and
@@ -291,7 +295,10 @@ class RecodeBeamSearch {
291295
// for the current timestep.
292296
void DecodeStep(const float* outputs, int t, double dict_ratio,
293297
double cert_offset, double worst_dict_cert,
294-
const UNICHARSET* charset);
298+
const UNICHARSET* charset, bool debug = false);
299+
300+
//Saves the most certain glyphs for the current time-step
301+
void SaveMostCertainGlyphs(const float* outputs, int num_outputs, const UNICHARSET* charset, int xCoord);
295302

296303
// Adds to the appropriate beams the legal (according to recoder)
297304
// continuations of context prev, which is from the given index to beams_,

0 commit comments

Comments
 (0)