|
| 1 | +// File: altorenderer.cpp |
| 2 | +// Description: ALTO rendering interface |
| 3 | +// Author: Jake Sebright |
| 4 | + |
| 5 | +// (C) Copyright 2018 |
| 6 | +// Licensed under the Apache License, Version 2.0 (the "License"); |
| 7 | +// you may not use this file except in compliance with the License. |
| 8 | +// You may obtain a copy of the License at |
| 9 | +// http://www.apache.org/licenses/LICENSE-2.0 |
| 10 | +// Unless required by applicable law or agreed to in writing, software |
| 11 | +// distributed under the License is distributed on an "AS IS" BASIS, |
| 12 | +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 13 | +// See the License for the specific language governing permissions and |
| 14 | +// limitations under the License. |
| 15 | + |
| 16 | +#include "baseapi.h" |
| 17 | +#include <memory> |
| 18 | +#include "renderer.h" |
| 19 | + |
| 20 | +namespace tesseract { |
| 21 | + |
| 22 | + /// |
| 23 | + /// Add coordinates to specified TextBlock, TextLine, or String bounding box |
| 24 | + /// Add word confidence if adding to a String bounding box |
| 25 | + /// |
| 26 | + static void AddBoxToAlto(const ResultIterator *it, PageIteratorLevel level, |
| 27 | + STRING *alto_str) { |
| 28 | + int left, top, right, bottom; |
| 29 | + it->BoundingBox(level, &left, &top, &right, &bottom); |
| 30 | + |
| 31 | + int hpos = left; |
| 32 | + int vpos = top; |
| 33 | + int height = bottom - top; |
| 34 | + int width = right - left; |
| 35 | + |
| 36 | + *alto_str += " HPOS=\""; |
| 37 | + alto_str->add_str_int("", hpos); |
| 38 | + *alto_str += "\""; |
| 39 | + *alto_str += " VPOS=\""; |
| 40 | + alto_str->add_str_int("", vpos); |
| 41 | + *alto_str += "\""; |
| 42 | + *alto_str += " WIDTH=\""; |
| 43 | + alto_str->add_str_int("", width); |
| 44 | + *alto_str += "\""; |
| 45 | + *alto_str += " HEIGHT=\""; |
| 46 | + alto_str->add_str_int("", height); |
| 47 | + *alto_str += "\""; |
| 48 | + |
| 49 | + if (level == RIL_WORD) { |
| 50 | + int wc = it->Confidence(RIL_WORD); |
| 51 | + *alto_str += " WC=\"0."; |
| 52 | + alto_str->add_str_int("", wc); |
| 53 | + *alto_str += "\""; |
| 54 | + } |
| 55 | + if (level != RIL_WORD) { |
| 56 | + |
| 57 | + *alto_str += ">"; |
| 58 | + } |
| 59 | + } |
| 60 | + |
| 61 | + /// |
| 62 | + /// Add a unique ID to an ALTO element |
| 63 | + /// |
| 64 | + static void AddIdToAlto(STRING *alto_str, const std::string base, int num1) { |
| 65 | + const size_t BUFSIZE = 64; |
| 66 | + char id_buffer[BUFSIZE]; |
| 67 | + snprintf(id_buffer, BUFSIZE - 1, "%s_%d", base.c_str(), num1); |
| 68 | + id_buffer[BUFSIZE - 1] = '\0'; |
| 69 | + *alto_str += " ID=\""; |
| 70 | + *alto_str += id_buffer; |
| 71 | + *alto_str += "\""; |
| 72 | + } |
| 73 | + |
| 74 | + /// |
| 75 | + /// Append the ALTO XML for the beginning of the document |
| 76 | + /// |
| 77 | + bool TessAltoRenderer::BeginDocumentHandler() { |
| 78 | + AppendString( |
| 79 | + "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" |
| 80 | + "<alto xmlns=\"http://www.loc.gov/standards/alto/ns-v3#\" xmlns:xlink=\"http://www.w3.org/1999/xlink\" xmlns:xsi=\"http://www.w3.org/2001/XMLSchema-instance\" xsi:schemaLocation=\"http://www.loc.gov/standards/alto/ns-v3# http://www.loc.gov/alto/v3/alto-3-0.xsd\">\n" |
| 81 | + "\t<Description>\n" |
| 82 | + "\t\t<MeasurementUnit>pixel</MeasurementUnit>\n" |
| 83 | + "\t\t<sourceImageInformation>\n" |
| 84 | + "\t\t\t<fileName>"); |
| 85 | + |
| 86 | + AppendString(title()); |
| 87 | + |
| 88 | + AppendString("\t\t\t</fileName>\n" |
| 89 | + "\t\t</sourceImageInformation>\n" |
| 90 | + "\t\t<OCRProcessing ID=\"OCR_0\">\n" |
| 91 | + "\t\t\t<ocrProcessingStep>\n" |
| 92 | + "\t\t\t\t<processingSoftware>\n" |
| 93 | + "\t\t\t\t\t<softwareName>tesseract "); |
| 94 | + AppendString(TessBaseAPI::Version()); |
| 95 | + AppendString("</softwareName>\n" |
| 96 | + "\t\t\t\t</processingSoftware>\n" |
| 97 | + "\t\t\t</ocrProcessingStep>\n" |
| 98 | + "\t\t</OCRProcessing>\n" |
| 99 | + "\t</Description>\n" |
| 100 | + "\t<Layout>\n"); |
| 101 | + |
| 102 | + return true; |
| 103 | + } |
| 104 | + |
| 105 | + /// |
| 106 | + /// Append the ALTO XML for the layout of the image |
| 107 | + /// |
| 108 | + bool TessAltoRenderer::AddImageHandler(TessBaseAPI* api) { |
| 109 | + const std::unique_ptr<const char[]> hocr(api->GetAltoText(imagenum())); |
| 110 | + if (hocr == nullptr) return false; |
| 111 | + |
| 112 | + AppendString(hocr.get()); |
| 113 | + |
| 114 | + return true; |
| 115 | + } |
| 116 | + |
| 117 | + /// |
| 118 | + /// Append the ALTO XML for the end of the document |
| 119 | + /// |
| 120 | + bool TessAltoRenderer::EndDocumentHandler() { |
| 121 | + AppendString("\t</Layout>\n</alto>\n"); |
| 122 | + |
| 123 | + return true; |
| 124 | + } |
| 125 | + |
| 126 | + TessAltoRenderer::TessAltoRenderer(const char *outputbase) |
| 127 | + : TessResultRenderer(outputbase, "xml") { |
| 128 | + } |
| 129 | + |
| 130 | + /// |
| 131 | + /// Make an XML-formatted string with ALTO markup from the internal |
| 132 | + /// data structures. |
| 133 | + /// |
| 134 | + char *TessBaseAPI::GetAltoText(int page_number) { |
| 135 | + return GetAltoText(nullptr, page_number); |
| 136 | + } |
| 137 | + |
| 138 | + /// |
| 139 | + /// Make an XML-formatted string with ALTO markup from the internal |
| 140 | + /// data structures. |
| 141 | + /// |
| 142 | + char *TessBaseAPI::GetAltoText(ETEXT_DESC *monitor, int page_number) { |
| 143 | + if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(monitor) < 0)) |
| 144 | + return nullptr; |
| 145 | + |
| 146 | + int lcnt = 0, bcnt = 0, wcnt = 0; |
| 147 | + int page_id = page_number; |
| 148 | + |
| 149 | + STRING alto_str(""); |
| 150 | + |
| 151 | + if (input_file_ == nullptr) |
| 152 | + SetInputName(nullptr); |
| 153 | + |
| 154 | + #ifdef _WIN32 |
| 155 | + // convert input name from ANSI encoding to utf-8 |
| 156 | + int str16_len = |
| 157 | + MultiByteToWideChar(CP_ACP, 0, input_file_->string(), -1, nullptr, 0); |
| 158 | + wchar_t *uni16_str = new WCHAR[str16_len]; |
| 159 | + str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_->string(), -1, |
| 160 | + uni16_str, str16_len); |
| 161 | + int utf8_len = WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, nullptr, 0, |
| 162 | + nullptr, nullptr); |
| 163 | + char *utf8_str = new char[utf8_len]; |
| 164 | + WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, utf8_str, |
| 165 | + utf8_len, nullptr, nullptr); |
| 166 | + *input_file_ = utf8_str; |
| 167 | + delete[] uni16_str; |
| 168 | + delete[] utf8_str; |
| 169 | + #endif |
| 170 | + |
| 171 | + alto_str += "\t\t<Page WIDTH=\""; |
| 172 | + alto_str.add_str_int("", rect_width_); |
| 173 | + alto_str += "\" HEIGHT=\""; |
| 174 | + alto_str.add_str_int("", rect_height_); |
| 175 | + alto_str += "\" PHYSICAL_IMG_NR=\""; |
| 176 | + alto_str.add_str_int("", rect_height_); |
| 177 | + alto_str += "\""; |
| 178 | + AddIdToAlto(&alto_str, "page", page_id); |
| 179 | + alto_str += ">\n"; |
| 180 | + alto_str += ("\t\t\t<PrintSpace HPOS=\"0\" " |
| 181 | + "VPOS=\"0\"" |
| 182 | + " WIDTH=\""); |
| 183 | + alto_str.add_str_int("", rect_width_); |
| 184 | + alto_str += "\" HEIGHT=\""; |
| 185 | + alto_str.add_str_int("", rect_height_); |
| 186 | + alto_str += "\">\n"; |
| 187 | + |
| 188 | + ResultIterator *res_it = GetIterator(); |
| 189 | + while (!res_it->Empty(RIL_BLOCK)) { |
| 190 | + if (res_it->Empty(RIL_WORD)) { |
| 191 | + res_it->Next(RIL_WORD); |
| 192 | + continue; |
| 193 | + } |
| 194 | + |
| 195 | + if (res_it->IsAtBeginningOf(RIL_BLOCK)) { |
| 196 | + alto_str += "\t\t\t\t<TextBlock "; |
| 197 | + AddIdToAlto(&alto_str, "block", bcnt); |
| 198 | + AddBoxToAlto(res_it, RIL_BLOCK, &alto_str); |
| 199 | + alto_str += "\n"; |
| 200 | + } |
| 201 | + |
| 202 | + if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) { |
| 203 | + |
| 204 | + alto_str += "\t\t\t\t\t<TextLine "; |
| 205 | + AddIdToAlto(&alto_str, "line", lcnt); |
| 206 | + AddBoxToAlto(res_it, RIL_TEXTLINE, &alto_str); |
| 207 | + alto_str += "\n"; |
| 208 | + } |
| 209 | + |
| 210 | + alto_str += "\t\t\t\t\t\t<String "; |
| 211 | + AddIdToAlto(&alto_str, "string", wcnt); |
| 212 | + AddBoxToAlto(res_it, RIL_WORD, &alto_str); |
| 213 | + alto_str += " CONTENT=\""; |
| 214 | + |
| 215 | + |
| 216 | + bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD); |
| 217 | + bool last_word_in_block = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD); |
| 218 | + |
| 219 | + do { |
| 220 | + const std::unique_ptr<const char[]> grapheme( |
| 221 | + res_it->GetUTF8Text(RIL_SYMBOL)); |
| 222 | + if (grapheme && grapheme[0] != 0) { |
| 223 | + alto_str += HOcrEscape(grapheme.get()); |
| 224 | + } |
| 225 | + res_it->Next(RIL_SYMBOL); |
| 226 | + } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD)); |
| 227 | + |
| 228 | + alto_str += "\"/>\n"; |
| 229 | + |
| 230 | + wcnt++; |
| 231 | + |
| 232 | + if (last_word_in_line) { |
| 233 | + alto_str += "\t\t\t\t\t</TextLine>\n"; |
| 234 | + lcnt++; |
| 235 | + } |
| 236 | + |
| 237 | + if (last_word_in_block) { |
| 238 | + alto_str += "\t\t\t\t</TextBlock>\n"; |
| 239 | + bcnt++; |
| 240 | + } |
| 241 | + } |
| 242 | + |
| 243 | + alto_str += "\t\t\t</PrintSpace>\n"; |
| 244 | + alto_str += "\t\t</Page>\n"; |
| 245 | + |
| 246 | + char *ret = new char[alto_str.length() + 1]; |
| 247 | + strcpy(ret, alto_str.string()); |
| 248 | + delete res_it; |
| 249 | + return ret; |
| 250 | + } |
| 251 | + |
| 252 | +} |
0 commit comments