@@ -1292,8 +1292,7 @@ WERD_RES* PAGE_RES_IT::InsertSimpleCloneWord(const WERD_RES& clone_res,
1292
1292
// Helper computes the boundaries between blobs in the word. The blob bounds
1293
1293
// are likely very poor, if they come from LSTM, where it only outputs the
1294
1294
// character at one pixel within it, so we find the midpoints between them.
1295
- static void ComputeBlobEnds (const WERD_RES& word, const TBOX& clip_box,
1296
- C_BLOB_LIST* next_word_blobs,
1295
+ static void ComputeBlobEnds (const WERD_RES& word, C_BLOB_LIST* next_word_blobs,
1297
1296
GenericVector<int >* blob_ends) {
1298
1297
C_BLOB_IT blob_it (word.word ->cblob_list ());
1299
1298
for (int i = 0 ; i < word.best_state .size (); ++i) {
@@ -1313,74 +1312,8 @@ static void ComputeBlobEnds(const WERD_RES& word, const TBOX& clip_box,
1313
1312
blob_it.set_to_list (next_word_blobs);
1314
1313
blob_end = (blob_box.right () + blob_it.data ()->bounding_box ().left ()) / 2 ;
1315
1314
}
1316
- blob_end = ClipToRange<int >(blob_end, clip_box.left (), clip_box.right ());
1317
1315
blob_ends->push_back (blob_end);
1318
1316
}
1319
- blob_ends->back () = clip_box.right ();
1320
- }
1321
-
1322
- // Helper computes the bounds of a word by restricting it to existing words
1323
- // that significantly overlap.
1324
- static TBOX ComputeWordBounds (const tesseract::PointerVector<WERD_RES>& words,
1325
- int w_index, TBOX prev_box, WERD_RES_IT w_it) {
1326
- constexpr int kSignificantOverlapFraction = 4 ;
1327
- TBOX clipped_box;
1328
- TBOX current_box = words[w_index]->word ->bounding_box ();
1329
- TBOX next_box;
1330
- if (w_index + 1 < words.size () && words[w_index + 1 ] != nullptr &&
1331
- words[w_index + 1 ]->word != nullptr )
1332
- next_box = words[w_index + 1 ]->word ->bounding_box ();
1333
- for (w_it.forward (); !w_it.at_first () && w_it.data ()->part_of_combo ;
1334
- w_it.forward ()) {
1335
- if (w_it.data () == nullptr || w_it.data ()->word == nullptr ) continue ;
1336
- TBOX w_box = w_it.data ()->word ->bounding_box ();
1337
- int height_limit = std::min<int >(w_box.height (), w_box.width () / 2 );
1338
- int width_limit = w_box.width () / kSignificantOverlapFraction ;
1339
- int min_significant_overlap = std::max (height_limit, width_limit);
1340
- int overlap = w_box.intersection (current_box).width ();
1341
- int prev_overlap = w_box.intersection (prev_box).width ();
1342
- int next_overlap = w_box.intersection (next_box).width ();
1343
- if (overlap > min_significant_overlap) {
1344
- if (prev_overlap > min_significant_overlap) {
1345
- // We have no choice but to use the LSTM word edge.
1346
- clipped_box.set_left (current_box.left ());
1347
- } else if (next_overlap > min_significant_overlap) {
1348
- // We have no choice but to use the LSTM word edge.
1349
- clipped_box.set_right (current_box.right ());
1350
- } else {
1351
- clipped_box += w_box;
1352
- }
1353
- }
1354
- }
1355
- if (clipped_box.height () <= 0 ) {
1356
- clipped_box.set_top (current_box.top ());
1357
- clipped_box.set_bottom (current_box.bottom ());
1358
- }
1359
- if (clipped_box.width () <= 0 ) clipped_box = current_box;
1360
- return clipped_box;
1361
- }
1362
-
1363
- // Helper moves the blob from src to dest. If it isn't contained by clip_box,
1364
- // the blob is replaced by a fake that is contained.
1365
- static TBOX MoveAndClipBlob (C_BLOB_IT* src_it, C_BLOB_IT* dest_it,
1366
- const TBOX& clip_box) {
1367
- C_BLOB* src_blob = src_it->extract ();
1368
- TBOX box = src_blob->bounding_box ();
1369
- if (!clip_box.contains (box)) {
1370
- int left =
1371
- ClipToRange<int >(box.left (), clip_box.left (), clip_box.right () - 1 );
1372
- int right =
1373
- ClipToRange<int >(box.right (), clip_box.left () + 1 , clip_box.right ());
1374
- int top =
1375
- ClipToRange<int >(box.top (), clip_box.bottom () + 1 , clip_box.top ());
1376
- int bottom =
1377
- ClipToRange<int >(box.bottom (), clip_box.bottom (), clip_box.top () - 1 );
1378
- box = TBOX (left, bottom, right, top);
1379
- delete src_blob;
1380
- src_blob = C_BLOB::FakeBlob (box);
1381
- }
1382
- dest_it->add_after_then_move (src_blob);
1383
- return box;
1384
1317
}
1385
1318
1386
1319
// Replaces the current WERD/WERD_RES with the given words. The given words
@@ -1431,45 +1364,66 @@ void PAGE_RES_IT::ReplaceCurrentWord(
1431
1364
src_b_it.sort (&C_BLOB::SortByXMiddle);
1432
1365
C_BLOB_IT rej_b_it (input_word->word ->rej_cblob_list ());
1433
1366
rej_b_it.sort (&C_BLOB::SortByXMiddle);
1434
- TBOX clip_box;
1435
1367
for (int w = 0 ; w < words->size (); ++w) {
1436
1368
WERD_RES* word_w = (*words)[w];
1437
- clip_box = ComputeWordBounds (*words, w, clip_box, wr_it_of_current_word);
1438
1369
// Compute blob boundaries.
1439
1370
GenericVector<int > blob_ends;
1440
1371
C_BLOB_LIST* next_word_blobs =
1441
1372
w + 1 < words->size () ? (*words)[w + 1 ]->word ->cblob_list () : nullptr ;
1442
- ComputeBlobEnds (*word_w, clip_box, next_word_blobs, &blob_ends);
1443
- // Remove the fake blobs on the current word, but keep safe for back-up if
1444
- // no blob can be found.
1445
- C_BLOB_LIST fake_blobs;
1446
- C_BLOB_IT fake_b_it (&fake_blobs);
1447
- fake_b_it.add_list_after (word_w->word ->cblob_list ());
1448
- fake_b_it.move_to_first ();
1373
+ ComputeBlobEnds (*word_w, next_word_blobs, &blob_ends);
1374
+ // Delete the fake blobs on the current word.
1449
1375
word_w->word ->cblob_list ()->clear ();
1450
1376
C_BLOB_IT dest_it (word_w->word ->cblob_list ());
1451
1377
// Build the box word as we move the blobs.
1452
1378
tesseract::BoxWord* box_word = new tesseract::BoxWord;
1453
- for (int i = 0 ; i < blob_ends.size (); ++i, fake_b_it. forward () ) {
1379
+ for (int i = 0 ; i < blob_ends.size (); ++i) {
1454
1380
int end_x = blob_ends[i];
1455
1381
TBOX blob_box;
1456
1382
// Add the blobs up to end_x.
1457
1383
while (!src_b_it.empty () &&
1458
1384
src_b_it.data ()->bounding_box ().x_middle () < end_x) {
1459
- blob_box += MoveAndClipBlob (&src_b_it, &dest_it, clip_box);
1385
+ blob_box += src_b_it.data ()->bounding_box ();
1386
+ dest_it.add_after_then_move (src_b_it.extract ());
1460
1387
src_b_it.forward ();
1461
1388
}
1462
1389
while (!rej_b_it.empty () &&
1463
1390
rej_b_it.data ()->bounding_box ().x_middle () < end_x) {
1464
- blob_box += MoveAndClipBlob (&rej_b_it, &dest_it, clip_box);
1391
+ blob_box += rej_b_it.data ()->bounding_box ();
1392
+ dest_it.add_after_then_move (rej_b_it.extract ());
1465
1393
rej_b_it.forward ();
1466
1394
}
1467
- if (blob_box.null_box ()) {
1468
- // Use the original box as a back-up.
1469
- blob_box = MoveAndClipBlob (&fake_b_it, &dest_it, clip_box);
1470
- }
1395
+ // Clip to the previously computed bounds. Although imperfectly accurate,
1396
+ // it is good enough, and much more complicated to determine where else
1397
+ // to clip.
1398
+ if (i > 0 && blob_box.left () < blob_ends[i - 1 ])
1399
+ blob_box.set_left (blob_ends[i - 1 ]);
1400
+ if (blob_box.right () > end_x)
1401
+ blob_box.set_right (end_x);
1471
1402
box_word->InsertBox (i, blob_box);
1472
1403
}
1404
+ // Fix empty boxes. If a very joined blob sits over multiple characters,
1405
+ // then we will have some empty boxes from using the middle, so look for
1406
+ // overlaps.
1407
+ for (int i = 0 ; i < box_word->length (); ++i) {
1408
+ TBOX box = box_word->BlobBox (i);
1409
+ if (box.null_box ()) {
1410
+ // Nothing has its middle in the bounds of this blob, so use anything
1411
+ // that overlaps.
1412
+ for (dest_it.mark_cycle_pt (); !dest_it.cycled_list ();
1413
+ dest_it.forward ()) {
1414
+ TBOX blob_box = dest_it.data ()->bounding_box ();
1415
+ if (blob_box.left () < blob_ends[i] &&
1416
+ (i == 0 || blob_box.right () >= blob_ends[i - 1 ])) {
1417
+ if (i > 0 && blob_box.left () < blob_ends[i - 1 ])
1418
+ blob_box.set_left (blob_ends[i - 1 ]);
1419
+ if (blob_box.right () > blob_ends[i])
1420
+ blob_box.set_right (blob_ends[i]);
1421
+ box_word->ChangeBox (i, blob_box);
1422
+ break ;
1423
+ }
1424
+ }
1425
+ }
1426
+ }
1473
1427
delete word_w->box_word ;
1474
1428
word_w->box_word = box_word;
1475
1429
if (!input_word->combination ) {
@@ -1590,7 +1544,6 @@ void PAGE_RES_IT::ResetWordIterator() {
1590
1544
}
1591
1545
}
1592
1546
ASSERT_HOST (!word_res_it.cycled_list ());
1593
- wr_it_of_next_word = word_res_it;
1594
1547
word_res_it.forward ();
1595
1548
} else {
1596
1549
// word_res_it is OK, but reset word_res and prev_word_res if needed.
@@ -1628,7 +1581,6 @@ WERD_RES *PAGE_RES_IT::internal_forward(bool new_block, bool empty_ok) {
1628
1581
block_res = next_block_res;
1629
1582
row_res = next_row_res;
1630
1583
word_res = next_word_res;
1631
- wr_it_of_current_word = wr_it_of_next_word;
1632
1584
next_block_res = nullptr ;
1633
1585
next_row_res = nullptr ;
1634
1586
next_word_res = nullptr ;
@@ -1657,7 +1609,6 @@ WERD_RES *PAGE_RES_IT::internal_forward(bool new_block, bool empty_ok) {
1657
1609
next_block_res = block_res_it.data ();
1658
1610
next_row_res = row_res_it.data ();
1659
1611
next_word_res = word_res_it.data ();
1660
- wr_it_of_next_word = word_res_it;
1661
1612
word_res_it.forward ();
1662
1613
goto foundword;
1663
1614
}
0 commit comments