Skip to content

Commit f346eb3

Browse files
author
Stefanie
committed
Revert "fix issue tesseract-ocr#1192"
This reverts commit ce88adb.
1 parent c39a95c commit f346eb3

File tree

2 files changed

+38
-91
lines changed

2 files changed

+38
-91
lines changed

src/ccstruct/pageres.cpp

+38 -87
Original file line number | Diff line number | Diff line change
@@ -1292,8 +1292,7 @@ WERD_RES* PAGE_RES_IT::InsertSimpleCloneWord(const WERD_RES& clone_res,
12921292
// Helper computes the boundaries between blobs in the word. The blob bounds
12931293
// are likely very poor, if they come from LSTM, where it only outputs the
12941294
// character at one pixel within it, so we find the midpoints between them.
1295-
static void ComputeBlobEnds(const WERD_RES& word, const TBOX& clip_box,
1296-
C_BLOB_LIST* next_word_blobs,
1295+
static void ComputeBlobEnds(const WERD_RES& word, C_BLOB_LIST* next_word_blobs,
12971296
GenericVector<int>* blob_ends) {
12981297
C_BLOB_IT blob_it(word.word->cblob_list());
12991298
for (int i = 0; i < word.best_state.size(); ++i) {
@@ -1313,74 +1312,8 @@ static void ComputeBlobEnds(const WERD_RES& word, const TBOX& clip_box,
13131312
blob_it.set_to_list(next_word_blobs);
13141313
blob_end = (blob_box.right() + blob_it.data()->bounding_box().left()) / 2;
13151314
}
1316-
blob_end = ClipToRange<int>(blob_end, clip_box.left(), clip_box.right());
13171315
blob_ends->push_back(blob_end);
13181316
}
1319-
blob_ends->back() = clip_box.right();
1320-
}
1321-
1322-
// Helper computes the bounds of a word by restricting it to existing words
1323-
// that significantly overlap.
1324-
static TBOX ComputeWordBounds(const tesseract::PointerVector<WERD_RES>& words,
1325-
int w_index, TBOX prev_box, WERD_RES_IT w_it) {
1326-
constexpr int kSignificantOverlapFraction = 4;
1327-
TBOX clipped_box;
1328-
TBOX current_box = words[w_index]->word->bounding_box();
1329-
TBOX next_box;
1330-
if (w_index + 1 < words.size() && words[w_index + 1] != nullptr &&
1331-
words[w_index + 1]->word != nullptr)
1332-
next_box = words[w_index + 1]->word->bounding_box();
1333-
for (w_it.forward(); !w_it.at_first() && w_it.data()->part_of_combo;
1334-
w_it.forward()) {
1335-
if (w_it.data() == nullptr || w_it.data()->word == nullptr) continue;
1336-
TBOX w_box = w_it.data()->word->bounding_box();
1337-
int height_limit = std::min<int>(w_box.height(), w_box.width() / 2);
1338-
int width_limit = w_box.width() / kSignificantOverlapFraction;
1339-
int min_significant_overlap = std::max(height_limit, width_limit);
1340-
int overlap = w_box.intersection(current_box).width();
1341-
int prev_overlap = w_box.intersection(prev_box).width();
1342-
int next_overlap = w_box.intersection(next_box).width();
1343-
if (overlap > min_significant_overlap) {
1344-
if (prev_overlap > min_significant_overlap) {
1345-
// We have no choice but to use the LSTM word edge.
1346-
clipped_box.set_left(current_box.left());
1347-
} else if (next_overlap > min_significant_overlap) {
1348-
// We have no choice but to use the LSTM word edge.
1349-
clipped_box.set_right(current_box.right());
1350-
} else {
1351-
clipped_box += w_box;
1352-
}
1353-
}
1354-
}
1355-
if (clipped_box.height() <= 0) {
1356-
clipped_box.set_top(current_box.top());
1357-
clipped_box.set_bottom(current_box.bottom());
1358-
}
1359-
if (clipped_box.width() <= 0) clipped_box = current_box;
1360-
return clipped_box;
1361-
}
1362-
1363-
// Helper moves the blob from src to dest. If it isn't contained by clip_box,
1364-
// the blob is replaced by a fake that is contained.
1365-
static TBOX MoveAndClipBlob(C_BLOB_IT* src_it, C_BLOB_IT* dest_it,
1366-
const TBOX& clip_box) {
1367-
C_BLOB* src_blob = src_it->extract();
1368-
TBOX box = src_blob->bounding_box();
1369-
if (!clip_box.contains(box)) {
1370-
int left =
1371-
ClipToRange<int>(box.left(), clip_box.left(), clip_box.right() - 1);
1372-
int right =
1373-
ClipToRange<int>(box.right(), clip_box.left() + 1, clip_box.right());
1374-
int top =
1375-
ClipToRange<int>(box.top(), clip_box.bottom() + 1, clip_box.top());
1376-
int bottom =
1377-
ClipToRange<int>(box.bottom(), clip_box.bottom(), clip_box.top() - 1);
1378-
box = TBOX(left, bottom, right, top);
1379-
delete src_blob;
1380-
src_blob = C_BLOB::FakeBlob(box);
1381-
}
1382-
dest_it->add_after_then_move(src_blob);
1383-
return box;
13841317
}
13851318

13861319
// Replaces the current WERD/WERD_RES with the given words. The given words
@@ -1431,45 +1364,66 @@ void PAGE_RES_IT::ReplaceCurrentWord(
14311364
src_b_it.sort(&C_BLOB::SortByXMiddle);
14321365
C_BLOB_IT rej_b_it(input_word->word->rej_cblob_list());
14331366
rej_b_it.sort(&C_BLOB::SortByXMiddle);
1434-
TBOX clip_box;
14351367
for (int w = 0; w < words->size(); ++w) {
14361368
WERD_RES* word_w = (*words)[w];
1437-
clip_box = ComputeWordBounds(*words, w, clip_box, wr_it_of_current_word);
14381369
// Compute blob boundaries.
14391370
GenericVector<int> blob_ends;
14401371
C_BLOB_LIST* next_word_blobs =
14411372
w + 1 < words->size() ? (*words)[w + 1]->word->cblob_list() : nullptr;
1442-
ComputeBlobEnds(*word_w, clip_box, next_word_blobs, &blob_ends);
1443-
// Remove the fake blobs on the current word, but keep safe for back-up if
1444-
// no blob can be found.
1445-
C_BLOB_LIST fake_blobs;
1446-
C_BLOB_IT fake_b_it(&fake_blobs);
1447-
fake_b_it.add_list_after(word_w->word->cblob_list());
1448-
fake_b_it.move_to_first();
1373+
ComputeBlobEnds(*word_w, next_word_blobs, &blob_ends);
1374+
// Delete the fake blobs on the current word.
14491375
word_w->word->cblob_list()->clear();
14501376
C_BLOB_IT dest_it(word_w->word->cblob_list());
14511377
// Build the box word as we move the blobs.
14521378
tesseract::BoxWord* box_word = new tesseract::BoxWord;
1453-
for (int i = 0; i < blob_ends.size(); ++i, fake_b_it.forward()) {
1379+
for (int i = 0; i < blob_ends.size(); ++i) {
14541380
int end_x = blob_ends[i];
14551381
TBOX blob_box;
14561382
// Add the blobs up to end_x.
14571383
while (!src_b_it.empty() &&
14581384
src_b_it.data()->bounding_box().x_middle() < end_x) {
1459-
blob_box += MoveAndClipBlob(&src_b_it, &dest_it, clip_box);
1385+
blob_box += src_b_it.data()->bounding_box();
1386+
dest_it.add_after_then_move(src_b_it.extract());
14601387
src_b_it.forward();
14611388
}
14621389
while (!rej_b_it.empty() &&
14631390
rej_b_it.data()->bounding_box().x_middle() < end_x) {
1464-
blob_box += MoveAndClipBlob(&rej_b_it, &dest_it, clip_box);
1391+
blob_box += rej_b_it.data()->bounding_box();
1392+
dest_it.add_after_then_move(rej_b_it.extract());
14651393
rej_b_it.forward();
14661394
}
1467-
if (blob_box.null_box()) {
1468-
// Use the original box as a back-up.
1469-
blob_box = MoveAndClipBlob(&fake_b_it, &dest_it, clip_box);
1470-
}
1395+
// Clip to the previously computed bounds. Although imperfectly accurate,
1396+
// it is good enough, and much more complicated to determine where else
1397+
// to clip.
1398+
if (i > 0 && blob_box.left() < blob_ends[i - 1])
1399+
blob_box.set_left(blob_ends[i - 1]);
1400+
if (blob_box.right() > end_x)
1401+
blob_box.set_right(end_x);
14711402
box_word->InsertBox(i, blob_box);
14721403
}
1404+
// Fix empty boxes. If a very joined blob sits over multiple characters,
1405+
// then we will have some empty boxes from using the middle, so look for
1406+
// overlaps.
1407+
for (int i = 0; i < box_word->length(); ++i) {
1408+
TBOX box = box_word->BlobBox(i);
1409+
if (box.null_box()) {
1410+
// Nothing has its middle in the bounds of this blob, so use anything
1411+
// that overlaps.
1412+
for (dest_it.mark_cycle_pt(); !dest_it.cycled_list();
1413+
dest_it.forward()) {
1414+
TBOX blob_box = dest_it.data()->bounding_box();
1415+
if (blob_box.left() < blob_ends[i] &&
1416+
(i == 0 || blob_box.right() >= blob_ends[i - 1])) {
1417+
if (i > 0 && blob_box.left() < blob_ends[i - 1])
1418+
blob_box.set_left(blob_ends[i - 1]);
1419+
if (blob_box.right() > blob_ends[i])
1420+
blob_box.set_right(blob_ends[i]);
1421+
box_word->ChangeBox(i, blob_box);
1422+
break;
1423+
}
1424+
}
1425+
}
1426+
}
14731427
delete word_w->box_word;
14741428
word_w->box_word = box_word;
14751429
if (!input_word->combination) {
@@ -1590,7 +1544,6 @@ void PAGE_RES_IT::ResetWordIterator() {
15901544
}
15911545
}
15921546
ASSERT_HOST(!word_res_it.cycled_list());
1593-
wr_it_of_next_word = word_res_it;
15941547
word_res_it.forward();
15951548
} else {
15961549
// word_res_it is OK, but reset word_res and prev_word_res if needed.
@@ -1628,7 +1581,6 @@ WERD_RES *PAGE_RES_IT::internal_forward(bool new_block, bool empty_ok) {
16281581
block_res = next_block_res;
16291582
row_res = next_row_res;
16301583
word_res = next_word_res;
1631-
wr_it_of_current_word = wr_it_of_next_word;
16321584
next_block_res = nullptr;
16331585
next_row_res = nullptr;
16341586
next_word_res = nullptr;
@@ -1657,7 +1609,6 @@ WERD_RES *PAGE_RES_IT::internal_forward(bool new_block, bool empty_ok) {
16571609
next_block_res = block_res_it.data();
16581610
next_row_res = row_res_it.data();
16591611
next_word_res = word_res_it.data();
1660-
wr_it_of_next_word = word_res_it;
16611612
word_res_it.forward();
16621613
goto foundword;
16631614
}

src/ccstruct/pageres.h

-4
Original file line number | Diff line number | Diff line change
@@ -787,9 +787,5 @@ class PAGE_RES_IT {
787787
BLOCK_RES_IT block_res_it; // iterators
788788
ROW_RES_IT row_res_it;
789789
WERD_RES_IT word_res_it;
790-
// Iterators used to get the state of word_res_it for the current word.
791-
// Since word_res_it is 2 words further on, this is otherwise hard to do.
792-
WERD_RES_IT wr_it_of_current_word;
793-
WERD_RES_IT wr_it_of_next_word;
794790
};
795791
#endif

0 commit comments

Comments
 (0)