Skip to content

Commit 4d13892

Browse files
sundarcftfmorris
authored and committed
Adds TessHOcrTsvRenderer class for rendering HOCR info in tsv format.
1 parent d04e325 commit 4d13892

File tree

2 files changed

+72
-0
lines changed

2 files changed

+72
-0
lines changed

api/renderer.cpp

+55
Original file line numberDiff line numberDiff line change
@@ -179,6 +179,61 @@ bool TessHOcrRenderer::AddImageHandler(TessBaseAPI* api) {
179179
return true;
180180
}
181181

182+
/**********************************************************************
183+
* HOcr Tsv Renderer interface implementation
184+
**********************************************************************/
185+
/**
 * Builds a renderer with font information disabled.
 *
 * Forwards outputbase and the literal "hocr.tsv" to the TessResultRenderer
 * base constructor (presumably the output file extension -- confirm against
 * the base class).
 *
 * @param outputbase passed through unchanged to the base class.
 */
TessHOcrTsvRenderer::TessHOcrTsvRenderer(const char *outputbase)
    : TessResultRenderer(outputbase, "hocr.tsv"),
      font_info_(false) {
}
189+
190+
/**
 * Builds a renderer with an explicit font-information setting.
 *
 * Forwards outputbase and the literal "hocr.tsv" to the TessResultRenderer
 * base constructor (presumably the output file extension -- confirm against
 * the base class).
 *
 * @param outputbase passed through unchanged to the base class.
 * @param font_info  stored in font_info_; when true the document header
 *                   additionally lists the ocrp_* capabilities.
 */
TessHOcrTsvRenderer::TessHOcrTsvRenderer(const char *outputbase, bool font_info)
    : TessResultRenderer(outputbase, "hocr.tsv"),
      font_info_(font_info) {
}
194+
195+
bool TessHOcrTsvRenderer::BeginDocumentHandler() {
196+
AppendString(
197+
"<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
198+
"<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\"\n"
199+
" \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n"
200+
"<html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\" "
201+
"lang=\"en\">\n <head>\n <title>\n");
202+
AppendString(title());
203+
AppendString(
204+
"</title>\n"
205+
"<meta http-equiv=\"Content-Type\" content=\"text/html;"
206+
"charset=utf-8\" />\n"
207+
" <meta name='ocr-system' content='tesseract " TESSERACT_VERSION_STR
208+
"' />\n"
209+
" <meta name='ocr-capabilities' content='ocr_page ocr_carea ocr_par"
210+
" ocr_line ocrx_word");
211+
if (font_info_)
212+
AppendString(
213+
" ocrp_lang ocrp_dir ocrp_font ocrp_fsize ocrp_wconf");
214+
AppendString(
215+
"'/>\n"
216+
"</head>\n<body>\n");
217+
218+
return true;
219+
}
220+
221+
/**
 * Appends the closing </body> and </html> tags that match the preamble
 * written by BeginDocumentHandler().
 *
 * @return always true.
 */
bool TessHOcrTsvRenderer::EndDocumentHandler() {
  static const char kDocumentEnd[] = " </body>\n</html>\n";
  AppendString(kDocumentEnd);
  return true;
}
226+
227+
/**
 * Appends the hOCR text of the current image to the output.
 *
 * Asks the API for the hOCR text of page imagenum(), appends it verbatim
 * and then releases the returned buffer with delete[].
 *
 * @param api engine instance queried via GetHOCRText().
 * @return false when GetHOCRText() returns NULL (nothing is appended),
 *         true otherwise.
 */
bool TessHOcrTsvRenderer::AddImageHandler(TessBaseAPI* api) {
  char* page_markup = api->GetHOCRText(imagenum());
  if (page_markup != NULL) {
    AppendString(page_markup);
    // This function is responsible for freeing the returned array.
    delete[] page_markup;
    return true;
  }
  return false;
}
236+
182237
/**********************************************************************
183238
* UNLV Text Renderer interface implementation
184239
**********************************************************************/

api/renderer.h

+17
Original file line numberDiff line numberDiff line change
@@ -162,6 +162,23 @@ class TESS_API TessHOcrRenderer : public TessResultRenderer {
162162
bool font_info_; // whether to print font information
163163
};
164164

165+
/**
166+
* Renders tesseract output into an hocr tsv string
167+
*/
168+
class TESS_API TessHOcrTsvRenderer : public TessResultRenderer {
169+
public:
170+
explicit TessHOcrTsvRenderer(const char *outputbase, bool font_info);
171+
explicit TessHOcrTsvRenderer(const char *outputbase);
172+
173+
protected:
174+
virtual bool BeginDocumentHandler();
175+
virtual bool AddImageHandler(TessBaseAPI* api);
176+
virtual bool EndDocumentHandler();
177+
178+
private:
179+
bool font_info_; // whether to print font information
180+
};
181+
165182
/**
166183
* Renders tesseract output into searchable PDF
167184
*/

0 commit comments

Comments
 (0)