Skip to content

Commit 36883b4

Browse files
committed
preserve interword spaces patch - Issue 1409
1 parent e0441d0 commit 36883b4

File tree

4 files changed

+22
-5
lines changed

4 files changed

+22
-5
lines changed

ccmain/resultiterator.cpp

+10 -2
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,12 @@ ResultIterator::ResultIterator(const LTRResultIterator &resit)
3434
: LTRResultIterator(resit) {
3535
in_minor_direction_ = false;
3636
at_beginning_of_minor_run_ = false;
37+
38+
BoolParam *p = ParamUtils::FindParam<BoolParam>(
39+
"preserve_interword_spaces", GlobalParams()->bool_params,
40+
tesseract_->params()->bool_params);
41+
if (p != NULL) preserve_interword_spaces_ = (bool)(*p);
42+
3743
current_paragraph_is_ltr_ = CurrentParagraphIsLtr();
3844
MoveToLogicalStartOfTextline();
3945
}
@@ -629,14 +635,16 @@ void ResultIterator::IterateAndAppendUTF8TextlineText(STRING *text) {
629635

630636
int words_appended = 0;
631637
do {
638+
int numSpaces = preserve_interword_spaces_ ? it_->word()->word->space() : 1;
639+
for(int i = 0 ; i < numSpaces ; ++i) {
640+
*text += " ";
641+
}
632642
AppendUTF8WordText(text);
633643
words_appended++;
634-
*text += " ";
635644
} while (Next(RIL_WORD) && !IsAtBeginningOf(RIL_TEXTLINE));
636645
if (BidiDebug(1)) {
637646
tprintf("%d words printed\n", words_appended);
638647
}
639-
text->truncate_at(text->length() - 1);
640648
*text += line_separator_;
641649
// If we just finished a paragraph, add an extra newline.
642650
if (it_->block() == NULL || IsAtBeginningOf(RIL_PARA))

ccmain/resultiterator.h

+9 -3
Original file line numberDiff line numberDiff line change
@@ -46,8 +46,8 @@ class TESS_API ResultIterator : public LTRResultIterator {
4646
virtual ~ResultIterator() {}
4747

4848
// ============= Moving around within the page ============.
49-
/**
50-
* Moves the iterator to point to the start of the page to begin
49+
/**
50+
* Moves the iterator to point to the start of the page to begin
5151
* an iteration.
5252
*/
5353
virtual void Begin();
@@ -181,7 +181,7 @@ class TESS_API ResultIterator : public LTRResultIterator {
181181
void MoveToLogicalStartOfTextline();
182182

183183
/**
184-
* Precondition: current_paragraph_is_ltr_ and in_minor_direction_
184+
* Precondition: current_paragraph_is_ltr_ and in_minor_direction_
185185
* are set.
186186
*/
187187
void MoveToLogicalStartOfWord();
@@ -231,6 +231,12 @@ class TESS_API ResultIterator : public LTRResultIterator {
231231

232232
/** Is the currently pointed-at character in a minor-direction sequence? */
233233
bool in_minor_direction_;
234+
235+
/**
236+
* Should detected inter-word spaces be preserved, or "compressed" to a single
237+
* space character (default behavior).
238+
*/
239+
bool preserve_interword_spaces_ = false;
234240
};
235241

236242
} // namespace tesseract.

ccmain/tesseractclass.cpp

+2
Original file line numberDiff line numberDiff line change
@@ -440,6 +440,8 @@ Tesseract::Tesseract()
440440
this->params()),
441441
INT_MEMBER(tessedit_parallelize, 0, "Run in parallel where possible",
442442
this->params()),
443+
BOOL_MEMBER(preserve_interword_spaces, false,
444+
"Preserve multiple interword spaces", this->params()),
443445

444446
// The following parameters were deprecated and removed from their original
445447
// locations. The parameters are temporarily kept here to give Tesseract

ccmain/tesseractclass.h

+1
Original file line numberDiff line numberDiff line change
@@ -1009,6 +1009,7 @@ class Tesseract : public Wordrec {
10091009
double_VAR_H(textord_tabfind_aligned_gap_fraction, 0.75,
10101010
"Fraction of height used as a minimum gap for aligned blobs.");
10111011
INT_VAR_H(tessedit_parallelize, 0, "Run in parallel where possible");
1012+
BOOL_VAR_H(preserve_interword_spaces, false, "Preserve multiple interword spaces");
10121013

10131014
// The following parameters were deprecated and removed from their original
10141015
// locations. The parameters are temporarily kept here to give Tesseract

0 commit comments

Comments
 (0)