Skip to content

Commit 0e868ef

Browse files
committed
Major change to improve layout analysis for heavily diacritic languages:
Tha, Vie, Kan, Tel, etc. There is a new overlap detector that detects when diacritics cause a big increase in textline overlap. In such cases, diacritics from overlap regions are kept completely separate from layout analysis, allowing textline formation to happen without them. The diacritics are then assigned to 0, 1 or 2 nearby words at the end of layout analysis, using and modifying an old noise-detection data path. During recognition, the stored diacritics are used or discarded depending on whether the character classifier favors them.
1 parent b6d0184 commit 0e868ef

34 files changed

+1856
-744
lines changed

ccmain/control.cpp

+370-8
Large diffs are not rendered by default.

ccmain/fixspace.cpp

+1-2
Original file line numberDiff line numberDiff line change
@@ -205,8 +205,7 @@ void Tesseract::match_current_words(WERD_RES_LIST &words, ROW *row,
205205
if ((!word->part_of_combo) && (word->box_word == NULL)) {
206206
WordData word_data(block, row, word);
207207
SetupWordPassN(2, &word_data);
208-
classify_word_and_language(&Tesseract::classify_word_pass2, NULL,
209-
&word_data);
208+
classify_word_and_language(2, NULL, &word_data);
210209
}
211210
prev_word_best_choice_ = word->best_choice;
212211
}

ccmain/pageiterator.cpp

+72-60
Original file line numberDiff line numberDiff line change
@@ -26,15 +26,23 @@
2626

2727
namespace tesseract {
2828

29-
PageIterator::PageIterator(PAGE_RES* page_res, Tesseract* tesseract,
30-
int scale, int scaled_yres,
31-
int rect_left, int rect_top,
29+
PageIterator::PageIterator(PAGE_RES* page_res, Tesseract* tesseract, int scale,
30+
int scaled_yres, int rect_left, int rect_top,
3231
int rect_width, int rect_height)
33-
: page_res_(page_res), tesseract_(tesseract),
34-
word_(NULL), word_length_(0), blob_index_(0), cblob_it_(NULL),
35-
scale_(scale), scaled_yres_(scaled_yres),
36-
rect_left_(rect_left), rect_top_(rect_top),
37-
rect_width_(rect_width), rect_height_(rect_height) {
32+
: page_res_(page_res),
33+
tesseract_(tesseract),
34+
word_(NULL),
35+
word_length_(0),
36+
blob_index_(0),
37+
cblob_it_(NULL),
38+
include_upper_dots_(false),
39+
include_lower_dots_(false),
40+
scale_(scale),
41+
scaled_yres_(scaled_yres),
42+
rect_left_(rect_left),
43+
rect_top_(rect_top),
44+
rect_width_(rect_width),
45+
rect_height_(rect_height) {
3846
it_ = new PAGE_RES_IT(page_res);
3947
PageIterator::Begin();
4048
}
@@ -50,19 +58,29 @@ PageIterator::~PageIterator() {
5058
* objects at a higher level.
5159
*/
5260
PageIterator::PageIterator(const PageIterator& src)
53-
: page_res_(src.page_res_), tesseract_(src.tesseract_),
54-
word_(NULL), word_length_(src.word_length_),
55-
blob_index_(src.blob_index_), cblob_it_(NULL),
56-
scale_(src.scale_), scaled_yres_(src.scaled_yres_),
57-
rect_left_(src.rect_left_), rect_top_(src.rect_top_),
58-
rect_width_(src.rect_width_), rect_height_(src.rect_height_) {
61+
: page_res_(src.page_res_),
62+
tesseract_(src.tesseract_),
63+
word_(NULL),
64+
word_length_(src.word_length_),
65+
blob_index_(src.blob_index_),
66+
cblob_it_(NULL),
67+
include_upper_dots_(src.include_upper_dots_),
68+
include_lower_dots_(src.include_lower_dots_),
69+
scale_(src.scale_),
70+
scaled_yres_(src.scaled_yres_),
71+
rect_left_(src.rect_left_),
72+
rect_top_(src.rect_top_),
73+
rect_width_(src.rect_width_),
74+
rect_height_(src.rect_height_) {
5975
it_ = new PAGE_RES_IT(*src.it_);
6076
BeginWord(src.blob_index_);
6177
}
6278

6379
const PageIterator& PageIterator::operator=(const PageIterator& src) {
6480
page_res_ = src.page_res_;
6581
tesseract_ = src.tesseract_;
82+
include_upper_dots_ = src.include_upper_dots_;
83+
include_lower_dots_ = src.include_lower_dots_;
6684
scale_ = src.scale_;
6785
scaled_yres_ = src.scaled_yres_;
6886
rect_left_ = src.rect_left_;
@@ -252,16 +270,19 @@ bool PageIterator::BoundingBoxInternal(PageIteratorLevel level,
252270
PARA *para = NULL;
253271
switch (level) {
254272
case RIL_BLOCK:
255-
box = it_->block()->block->bounding_box();
273+
box = it_->block()->block->restricted_bounding_box(include_upper_dots_,
274+
include_lower_dots_);
256275
break;
257276
case RIL_PARA:
258277
para = it_->row()->row->para();
259278
// explicit fall-through.
260279
case RIL_TEXTLINE:
261-
box = it_->row()->row->bounding_box();
280+
box = it_->row()->row->restricted_bounding_box(include_upper_dots_,
281+
include_lower_dots_);
262282
break;
263283
case RIL_WORD:
264-
box = it_->word()->word->bounding_box();
284+
box = it_->word()->word->restricted_bounding_box(include_upper_dots_,
285+
include_lower_dots_);
265286
break;
266287
case RIL_SYMBOL:
267288
if (cblob_it_ == NULL)
@@ -387,39 +408,23 @@ Pix* PageIterator::GetBinaryImage(PageIteratorLevel level) const {
387408
int left, top, right, bottom;
388409
if (!BoundingBoxInternal(level, &left, &top, &right, &bottom))
389410
return NULL;
390-
Pix* pix = NULL;
391-
switch (level) {
392-
case RIL_BLOCK:
393-
case RIL_PARA:
394-
int bleft, btop, bright, bbottom;
395-
BoundingBoxInternal(RIL_BLOCK, &bleft, &btop, &bright, &bbottom);
396-
pix = it_->block()->block->render_mask();
397-
// AND the mask and the image.
398-
pixRasterop(pix, 0, 0, pixGetWidth(pix), pixGetHeight(pix),
399-
PIX_SRC & PIX_DST, tesseract_->pix_binary(),
400-
bleft, btop);
401-
if (level == RIL_PARA) {
402-
// RIL_PARA needs further attention:
403-
// clip the paragraph from the block mask.
404-
Box* box = boxCreate(left - bleft, top - btop,
405-
right - left, bottom - top);
406-
Pix* pix2 = pixClipRectangle(pix, box, NULL);
407-
boxDestroy(&box);
408-
pixDestroy(&pix);
409-
pix = pix2;
410-
}
411-
break;
412-
case RIL_TEXTLINE:
413-
case RIL_WORD:
414-
case RIL_SYMBOL:
415-
if (level == RIL_SYMBOL && cblob_it_ != NULL &&
416-
cblob_it_->data()->area() != 0)
417-
return cblob_it_->data()->render();
418-
// Just clip from the bounding box.
419-
Box* box = boxCreate(left, top, right - left, bottom - top);
420-
pix = pixClipRectangle(tesseract_->pix_binary(), box, NULL);
421-
boxDestroy(&box);
422-
break;
411+
if (level == RIL_SYMBOL && cblob_it_ != NULL &&
412+
cblob_it_->data()->area() != 0)
413+
return cblob_it_->data()->render();
414+
Box* box = boxCreate(left, top, right - left, bottom - top);
415+
Pix* pix = pixClipRectangle(tesseract_->pix_binary(), box, NULL);
416+
boxDestroy(&box);
417+
if (level == RIL_BLOCK || level == RIL_PARA) {
418+
// Clip to the block polygon as well.
419+
TBOX mask_box;
420+
Pix* mask = it_->block()->block->render_mask(&mask_box);
421+
int mask_x = left - mask_box.left();
422+
int mask_y = top - (tesseract_->ImageHeight() - mask_box.top());
423+
// AND the mask and pix, putting the result in pix.
424+
pixRasterop(pix, MAX(0, -mask_x), MAX(0, -mask_y), pixGetWidth(pix),
425+
pixGetHeight(pix), PIX_SRC & PIX_DST, mask, MAX(0, mask_x),
426+
MAX(0, mask_y));
427+
pixDestroy(&mask);
423428
}
424429
return pix;
425430
}
@@ -452,17 +457,24 @@ Pix* PageIterator::GetImage(PageIteratorLevel level, int padding,
452457
Box* box = boxCreate(*left, *top, right - *left, bottom - *top);
453458
Pix* grey_pix = pixClipRectangle(original_img, box, NULL);
454459
boxDestroy(&box);
455-
if (level == RIL_BLOCK) {
456-
Pix* mask = it_->block()->block->render_mask();
457-
Pix* expanded_mask = pixCreate(right - *left, bottom - *top, 1);
458-
pixRasterop(expanded_mask, padding, padding,
459-
pixGetWidth(mask), pixGetHeight(mask),
460-
PIX_SRC, mask, 0, 0);
460+
if (level == RIL_BLOCK || level == RIL_PARA) {
461+
// Clip to the block polygon as well.
462+
TBOX mask_box;
463+
Pix* mask = it_->block()->block->render_mask(&mask_box);
464+
// Copy the mask registered correctly into an image the size of grey_pix.
465+
int mask_x = *left - mask_box.left();
466+
int mask_y = *top - (pixGetHeight(original_img) - mask_box.top());
467+
int width = pixGetWidth(grey_pix);
468+
int height = pixGetHeight(grey_pix);
469+
Pix* resized_mask = pixCreate(width, height, 1);
470+
pixRasterop(resized_mask, MAX(0, -mask_x), MAX(0, -mask_y), width, height,
471+
PIX_SRC, mask, MAX(0, mask_x), MAX(0, mask_y));
461472
pixDestroy(&mask);
462-
pixDilateBrick(expanded_mask, expanded_mask, 2*padding + 1, 2*padding + 1);
463-
pixInvert(expanded_mask, expanded_mask);
464-
pixSetMasked(grey_pix, expanded_mask, MAX_UINT32);
465-
pixDestroy(&expanded_mask);
473+
pixDilateBrick(resized_mask, resized_mask, 2 * padding + 1,
474+
2 * padding + 1);
475+
pixInvert(resized_mask, resized_mask);
476+
pixSetMasked(grey_pix, resized_mask, MAX_UINT32);
477+
pixDestroy(&resized_mask);
466478
}
467479
return grey_pix;
468480
}

ccmain/pageiterator.h

+18
Original file line numberDiff line numberDiff line change
@@ -179,6 +179,21 @@ class TESS_API PageIterator {
179179
// If an image rectangle has been set in the API, then returned coordinates
180180
// relate to the original (full) image, rather than the rectangle.
181181

182+
/**
183+
* Controls what to include in a bounding box. Bounding boxes of all levels
184+
* between RIL_WORD and RIL_BLOCK can include or exclude potential diacritics.
185+
* Between layout analysis and recognition, it isn't known where all
186+
* diacritics belong, so this control is used to include or exclude some
187+
* diacritics that are above or below the main body of the word. In most cases
188+
* where the placement is obvious, and after recognition, it doesn't make as
189+
* much difference, as the diacritics will already be included in the word.
190+
*/
191+
void SetBoundingBoxComponents(bool include_upper_dots,
192+
bool include_lower_dots) {
193+
include_upper_dots_ = include_upper_dots;
194+
include_lower_dots_ = include_lower_dots;
195+
}
196+
182197
/**
183198
* Returns the bounding rectangle of the current object at the given level.
184199
* See comment on coordinate system above.
@@ -332,6 +347,9 @@ class TESS_API PageIterator {
332347
* Owned by this ResultIterator.
333348
*/
334349
C_BLOB_IT* cblob_it_;
350+
/** Control over what to include in bounding boxes. */
351+
bool include_upper_dots_;
352+
bool include_lower_dots_;
335353
/** Parameters saved from the Thresholder. Needed to rebuild coordinates.*/
336354
int scale_;
337355
int scaled_yres_;

ccmain/pagesegmain.cpp

+26-14
Original file line numberDiff line numberDiff line change
@@ -134,12 +134,20 @@ int Tesseract::SegmentPage(const STRING* input_file, BLOCK_LIST* blocks,
134134
// UNLV file present. Use PSM_SINGLE_BLOCK.
135135
pageseg_mode = PSM_SINGLE_BLOCK;
136136
}
137+
// The diacritic_blobs holds noise blobs that may be diacritics. They
138+
// are separated out on areas of the image that seem noisy and short-circuit
139+
// the layout process, going straight from the initial partition creation
140+
// right through to after word segmentation, where they are added to the
141+
// rej_cblobs list of the most appropriate word. From there classification
142+
// will determine whether they are used.
143+
BLOBNBOX_LIST diacritic_blobs;
137144
int auto_page_seg_ret_val = 0;
138145
TO_BLOCK_LIST to_blocks;
139146
if (PSM_OSD_ENABLED(pageseg_mode) || PSM_BLOCK_FIND_ENABLED(pageseg_mode) ||
140147
PSM_SPARSE(pageseg_mode)) {
141-
auto_page_seg_ret_val =
142-
AutoPageSeg(pageseg_mode, blocks, &to_blocks, osd_tess, osr);
148+
auto_page_seg_ret_val = AutoPageSeg(
149+
pageseg_mode, blocks, &to_blocks,
150+
enable_noise_removal ? &diacritic_blobs : NULL, osd_tess, osr);
143151
if (pageseg_mode == PSM_OSD_ONLY)
144152
return auto_page_seg_ret_val;
145153
// To create blobs from the image region bounds uncomment this line:
@@ -171,7 +179,7 @@ int Tesseract::SegmentPage(const STRING* input_file, BLOCK_LIST* blocks,
171179

172180
textord_.TextordPage(pageseg_mode, reskew_, width, height, pix_binary_,
173181
pix_thresholds_, pix_grey_, splitting || cjk_mode,
174-
blocks, &to_blocks);
182+
&diacritic_blobs, blocks, &to_blocks);
175183
return auto_page_seg_ret_val;
176184
}
177185

@@ -197,7 +205,6 @@ static void WriteDebugBackgroundImage(bool printable, Pix* pix_binary) {
197205
pixDestroy(&grey_pix);
198206
}
199207

200-
201208
/**
202209
* Auto page segmentation. Divide the page image into blocks of uniform
203210
* text linespacing and images.
@@ -207,19 +214,25 @@ static void WriteDebugBackgroundImage(bool printable, Pix* pix_binary) {
207214
* The output goes in the blocks list with corresponding TO_BLOCKs in the
208215
* to_blocks list.
209216
*
210-
* If single_column is true, then no attempt is made to divide the image
211-
* into columns, but multiple blocks are still made if the text is of
212-
* non-uniform linespacing.
217+
* If !PSM_COL_FIND_ENABLED(pageseg_mode), then no attempt is made to divide
218+
* the image into columns, but multiple blocks are still made if the text is
219+
* of non-uniform linespacing.
220+
*
221+
* If diacritic_blobs is non-null, then diacritics/noise blobs, that would
222+
* confuse layout analysis by causing textline overlap, are placed there,
223+
* with the expectation that they will be reassigned to words later and
224+
* noise/diacriticness determined via classification.
213225
*
214226
* If osd (orientation and script detection) is true then that is performed
215227
* as well. If only_osd is true, then only orientation and script detection is
216228
* performed. If osd is desired (osd or only_osd), then osd_tess must be
217229
* another Tesseract that was initialized especially for osd, and the results
218230
* will be output into osr (orientation and script result).
219231
*/
220-
int Tesseract::AutoPageSeg(PageSegMode pageseg_mode,
221-
BLOCK_LIST* blocks, TO_BLOCK_LIST* to_blocks,
222-
Tesseract* osd_tess, OSResults* osr) {
232+
int Tesseract::AutoPageSeg(PageSegMode pageseg_mode, BLOCK_LIST* blocks,
233+
TO_BLOCK_LIST* to_blocks,
234+
BLOBNBOX_LIST* diacritic_blobs, Tesseract* osd_tess,
235+
OSResults* osr) {
223236
if (textord_debug_images) {
224237
WriteDebugBackgroundImage(textord_debug_printable, pix_binary_);
225238
}
@@ -247,10 +260,9 @@ int Tesseract::AutoPageSeg(PageSegMode pageseg_mode,
247260
if (equ_detect_) {
248261
finder->SetEquationDetect(equ_detect_);
249262
}
250-
result = finder->FindBlocks(pageseg_mode, scaled_color_, scaled_factor_,
251-
to_block, photomask_pix,
252-
pix_thresholds_, pix_grey_,
253-
&found_blocks, to_blocks);
263+
result = finder->FindBlocks(
264+
pageseg_mode, scaled_color_, scaled_factor_, to_block, photomask_pix,
265+
pix_thresholds_, pix_grey_, &found_blocks, diacritic_blobs, to_blocks);
254266
if (result >= 0)
255267
finder->GetDeskewVectors(&deskew_, &reskew_);
256268
delete finder;

ccmain/pgedit.cpp

+2-1
Original file line numberDiff line numberDiff line change
@@ -655,7 +655,8 @@ void show_point(PAGE_RES* page_res, float x, float y) {
655655
FCOORD pt(x, y);
656656
PAGE_RES_IT pr_it(page_res);
657657

658-
char msg[160];
658+
const int kBufsize = 512;
659+
char msg[kBufsize];
659660
char *msg_ptr = msg;
660661

661662
msg_ptr += sprintf(msg_ptr, "Pt:(%0.3f, %0.3f) ", x, y);

ccmain/recogtraining.cpp

+1-2
Original file line numberDiff line numberDiff line change
@@ -207,8 +207,7 @@ void Tesseract::ambigs_classify_and_output(const char *label,
207207
fflush(stdout);
208208
WordData word_data(*pr_it);
209209
SetupWordPassN(1, &word_data);
210-
classify_word_and_language(&Tesseract::classify_word_pass1,
211-
pr_it, &word_data);
210+
classify_word_and_language(1, pr_it, &word_data);
212211
WERD_RES* werd_res = word_data.word;
213212
WERD_CHOICE *best_choice = werd_res->best_choice;
214213
ASSERT_HOST(best_choice != NULL);

0 commit comments

Comments
 (0)