Skip to content

Commit f3362a4

Browse files
committed
Add renderer to create WordStr box files from images
1 parent 7ca27bb commit f3362a4

File tree

9 files changed

+141
-1
lines changed

9 files changed

+141
-1
lines changed

CMakeLists.txt

+1
Original file line numberDiff line numberDiff line change
@@ -254,6 +254,7 @@ set(tesseract_src ${tesseract_src}
254254
src/api/hocrrenderer.cpp
255255
src/api/lstmboxrenderer.cpp
256256
src/api/pdfrenderer.cpp
257+
src/api/wordstrboxrenderer.cpp
257258
)
258259

259260
if (WIN32)

src/api/Makefile.am

+1
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ libtesseract_api_la_SOURCES += altorenderer.cpp
3737
libtesseract_api_la_SOURCES += hocrrenderer.cpp
3838
libtesseract_api_la_SOURCES += lstmboxrenderer.cpp
3939
libtesseract_api_la_SOURCES += pdfrenderer.cpp
40+
libtesseract_api_la_SOURCES += wordstrboxrenderer.cpp
4041
libtesseract_api_la_SOURCES += renderer.cpp
4142

4243
lib_LTLIBRARIES += libtesseract.la

src/api/baseapi.h

+9-1
Original file line numberDiff line numberDiff line change
@@ -630,7 +630,15 @@ class TESS_API TessBaseAPI {
630630
* Returned string must be freed with the delete [] operator.
631631
*/
632632
char* GetBoxText(int page_number);
633-
633+
634+
/**
635+
* The recognized text is returned as a char* which is coded in the same
636+
* format as a WordStr box file used in training.
637+
* page_number is a 0-based page index that will appear in the box file.
638+
* Returned string must be freed with the delete [] operator.
639+
*/
640+
char* GetWordStrBoxText(int page_number);
641+
634642
/**
635643
* The recognized text is returned as a char* which is coded
636644
* as UNLV format Latin-1 with specific reject and suspect codes.

src/api/renderer.h

+11
Original file line numberDiff line numberDiff line change
@@ -269,6 +269,17 @@ class TESS_API TessBoxTextRenderer : public TessResultRenderer {
269269
virtual bool AddImageHandler(TessBaseAPI* api);
270270
};
271271

272+
/**
273+
* Renders tesseract output into a plain UTF-8 text string in WordStr format
* (the box file variant that carries one "WordStr" row per text line).
* The per-page text comes from TessBaseAPI::GetWordStrBoxText(); the member
* definitions live in wordstrboxrenderer.cpp.
274+
*/
275+
class TESS_API TessWordStrBoxRenderer : public TessResultRenderer {
276+
public:
277+
// outputbase is forwarded to TessResultRenderer together with the "box"
// suffix (presumably the output file extension -- confirm against
// TessResultRenderer).
explicit TessWordStrBoxRenderer(const char* outputbase);
278+
279+
protected:
280+
// Appends the WordStr box text of the current image to the output.
// Returns false when the API yields no text (nullptr), true otherwise.
virtual bool AddImageHandler(TessBaseAPI* api);
281+
};
282+
272283
#ifndef DISABLED_LEGACY_ENGINE
273284

274285
/**

src/api/tesseractmain.cpp

+14
Original file line numberDiff line numberDiff line change
@@ -524,6 +524,20 @@ static void PreloadRenderers(
524524
}
525525
}
526526

527+
api->GetBoolVariable("tessedit_create_wordstrbox", &b);
528+
if (b) {
529+
tesseract::TessWordStrBoxRenderer* renderer =
530+
new tesseract::TessWordStrBoxRenderer(outputbase);
531+
if (renderer->happy()) {
532+
renderers->push_back(renderer);
533+
} else {
534+
delete renderer;
535+
tprintf("Error, could not create WordStr BOX output file: %s\n",
536+
strerror(errno));
537+
error = true;
538+
}
539+
}
540+
527541
api->GetBoolVariable("tessedit_create_txt", &b);
528542
if (b || (!error && renderers->empty())) {
529543
tesseract::TessTextRenderer* renderer =

src/api/wordstrboxrenderer.cpp

+101
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
/**********************************************************************
2+
* File: wordstrboxrenderer.cpp
3+
* Description: Renderer for creating box file with WordStr strings.
4+
* based on the tsv renderer.
5+
*
6+
* (C) Copyright 2006, Google Inc.
7+
** Licensed under the Apache License, Version 2.0 (the "License");
8+
** you may not use this file except in compliance with the License.
9+
** You may obtain a copy of the License at
10+
** http://www.apache.org/licenses/LICENSE-2.0
11+
** Unless required by applicable law or agreed to in writing, software
12+
** distributed under the License is distributed on an "AS IS" BASIS,
13+
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
** See the License for the specific language governing permissions and
15+
** limitations under the License.
16+
*
17+
**********************************************************************/
18+
19+
#include "baseapi.h" // for TessBaseAPI
20+
#include "renderer.h"
21+
#include "tesseractclass.h" // for Tesseract
22+
23+
namespace tesseract {
24+
25+
/**
26+
* Create a UTF8 box file with WordStr strings from the internal data structures.
27+
* page_number is a 0-base page index that will appear in the box file.
28+
* Returned string must be freed with the delete [] operator.
29+
*/
30+
31+
char* TessBaseAPI::GetWordStrBoxText(int page_number) {
32+
if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(nullptr) < 0))
33+
return nullptr;
34+
35+
STRING wordstr_box_str("");
36+
int left, top, right, bottom;
37+
int page_num = page_number;
38+
bool first_line = true;
39+
40+
LTRResultIterator* res_it = GetLTRIterator();
41+
while (!res_it->Empty(RIL_BLOCK)) {
42+
if (res_it->Empty(RIL_WORD)) {
43+
res_it->Next(RIL_WORD);
44+
continue;
45+
}
46+
47+
if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
48+
if (!first_line) {
49+
wordstr_box_str.add_str_int("\n\t ", right + 1);
50+
wordstr_box_str.add_str_int(" ", image_height_ - bottom);
51+
wordstr_box_str.add_str_int(" ", right + 5);
52+
wordstr_box_str.add_str_int(" ", image_height_ - top);
53+
wordstr_box_str.add_str_int(" ", page_num); // row for tab for EOL
54+
wordstr_box_str += "\n";
55+
} else {
56+
first_line = false;
57+
}
58+
// Use bounding box for whole line for WordStr
59+
res_it->BoundingBox(RIL_TEXTLINE, &left, &top, &right, &bottom);
60+
wordstr_box_str.add_str_int("WordStr ", left);
61+
wordstr_box_str.add_str_int(" ", image_height_ - bottom);
62+
wordstr_box_str.add_str_int(" ", right);
63+
wordstr_box_str.add_str_int(" ", image_height_ - top);
64+
wordstr_box_str.add_str_int(" ", page_num); // word
65+
wordstr_box_str += " #";
66+
}
67+
do { wordstr_box_str +=
68+
std::unique_ptr<const char[]>(res_it->GetUTF8Text(RIL_WORD)).get();
69+
wordstr_box_str += " ";
70+
res_it->Next(RIL_WORD);
71+
} while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
72+
}
73+
wordstr_box_str.add_str_int("\n\t ", right + 1);
74+
wordstr_box_str.add_str_int(" ", image_height_ - bottom);
75+
wordstr_box_str.add_str_int(" ", right + 5);
76+
wordstr_box_str.add_str_int(" ", image_height_ - top);
77+
wordstr_box_str.add_str_int(" ", page_num); // row for tab for EOL
78+
wordstr_box_str += "\n";
79+
char* ret = new char[wordstr_box_str.length() + 1];
80+
strcpy(ret, wordstr_box_str.string());
81+
delete res_it;
82+
return ret;
83+
}
84+
85+
/**********************************************************************
86+
* WordStrBox Renderer interface implementation
87+
**********************************************************************/
88+
// Forwards outputbase and the "box" suffix to the TessResultRenderer base;
// the renderer itself keeps no additional state.
TessWordStrBoxRenderer::TessWordStrBoxRenderer(const char* outputbase)
    : TessResultRenderer(outputbase, "box") {}
91+
92+
// Fetches the WordStr box text for the current image from the API and
// appends it to the output. Returns false if the API returned no text.
bool TessWordStrBoxRenderer::AddImageHandler(TessBaseAPI* api) {
  // The API hands back a new[]-allocated buffer; the array unique_ptr
  // releases it with delete[] when this function returns.
  const std::unique_ptr<const char[]> page_text(
      api->GetWordStrBoxText(imagenum()));
  if (!page_text) {
    return false;
  }

  AppendString(page_text.get());
  return true;
}
100+
101+
} // namespace tesseract.

src/ccmain/tesseractclass.cpp

+2
Original file line numberDiff line numberDiff line change
@@ -395,6 +395,8 @@ Tesseract::Tesseract()
395395
this->params()),
396396
BOOL_MEMBER(tessedit_create_tsv, false, "Write .tsv output file",
397397
this->params()),
398+
BOOL_MEMBER(tessedit_create_wordstrbox, false, "Write WordStr format .box output file",
399+
this->params()),
398400
BOOL_MEMBER(tessedit_create_pdf, false, "Write .pdf output file",
399401
this->params()),
400402
BOOL_MEMBER(textonly_pdf, false,

src/ccmain/tesseractclass.h

+1
Original file line numberDiff line numberDiff line change
@@ -1042,6 +1042,7 @@ class Tesseract : public Wordrec {
10421042
BOOL_VAR_H(tessedit_create_alto, false, "Write .xml ALTO output file");
10431043
BOOL_VAR_H(tessedit_create_lstmbox, false, "Write .box file for LSTM training");
10441044
BOOL_VAR_H(tessedit_create_tsv, false, "Write .tsv output file");
1045+
BOOL_VAR_H(tessedit_create_wordstrbox, false, "Write WordStr format .box output file");
10451046
BOOL_VAR_H(tessedit_create_pdf, false, "Write .pdf output file");
10461047
BOOL_VAR_H(textonly_pdf, false,
10471048
"Create PDF with only one invisible text layer");

tessdata/configs/wordstrbox

+1
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
tessedit_create_wordstrbox 1

0 commit comments

Comments
 (0)