Skip to content

Commit d5e4191

Browse files
chg: improved HTML annotator.
1 parent 6c89cc7 commit d5e4191

File tree

6 files changed

+77
-27
lines changed

6 files changed

+77
-27
lines changed

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "inscriptis"
3-
version = "2.6.0"
3+
version = "2.6.1"
44
authors = ["Albert Weichselbraun <[email protected]>", "Fabian Odoni <[email protected]>"]
55
description = "inscriptis - HTML to text converter."
66
keywords = ["HTML", "converter", "text"]

src/inscriptis/annotation/__init__.py

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -28,6 +28,12 @@ class Annotation(NamedTuple):
2828
metadata: str
2929
"""the tag to be attached to the annotation."""
3030

31+
def __lt__(self, other):
32+
return (self.start, -self.end) < (other.start, -other.end)
33+
34+
def __gt__(self, other):
35+
return not self.__lt__(other)
36+
3137

3238
def horizontal_shift(
3339
annotations: List[Annotation],

src/inscriptis/annotation/output/html.py

Lines changed: 17 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -4,6 +4,7 @@
44
from itertools import cycle
55
from typing import Dict, Any, List
66

7+
from inscriptis.annotation import Annotation
78
from inscriptis.annotation.output import AnnotationProcessor
89

910
COLOR_SCHEMA = ("#D8115980", "#8F2D5680", "#21838080", "#FBB13C80", "#73D2DE80")
@@ -20,12 +21,11 @@ class HtmlExtractor(AnnotationProcessor):
2021

2122
def __call__(self, annotated_text: Dict[str, Any]) -> str:
2223
tag_dict = defaultdict(list)
23-
24-
for start, end, label in reversed(annotated_text["label"]):
25-
tag_dict[start].append(
26-
f'<span class="{label}-label">{label}</span><span class="{label}">'
27-
)
28-
tag_dict[end].insert(0, "</span>")
24+
for start, end, label in sorted(
25+
Annotation(s, e, t) for s, e, t in reversed(annotated_text["label"])
26+
):
27+
tag_dict[start].append(f'<span class="{label}" data-label="{label}">')
28+
tag_dict[end].insert(0, f"</span>")
2929

3030
tagged_content = [
3131
"<html><head><style>",
@@ -36,10 +36,10 @@ def __call__(self, annotated_text: Dict[str, Any]) -> str:
3636
text = annotated_text["text"]
3737
current_idx = 0
3838
for idx, tags in sorted(tag_dict.items()):
39-
tagged_content.append(text[current_idx:idx].replace("\n", "</pre>\n<pre>"))
39+
tagged_content.append(text[current_idx:idx])
4040
current_idx = idx
4141
tagged_content.extend(tags)
42-
tagged_content.append(text[current_idx:].replace("\n", "</pre>\n</pre>"))
42+
tagged_content.append(text[current_idx:])
4343
return "".join(tagged_content) + "</pre></body></html>"
4444

4545
@staticmethod
@@ -72,17 +72,16 @@ def _get_css(self, labels: List[str]) -> str:
7272
for label, color in sorted(self._get_label_colors(labels).items()):
7373
css.append(
7474
"pre{{"
75-
" position: relative;\n"
75+
"position: relative; white-space: pre; line-height: 2.5; font-family: monospace;"
7676
"}}\n"
77-
".{label} {{\n"
78-
" background-color: {color};\n"
79-
" border-radius: 0.4em;\n"
77+
".{label} {{ position: relative; display: inline-block; white-space: pre;"
78+
" background-color: {color}; \n"
79+
" border-radius: 0.4em; padding: 0 4px\n"
8080
"}}\n"
81-
".{label}-label {{\n"
82-
" top: -1.0em;\n"
83-
' content: "{label}";\n'
84-
" position: absolute;\n"
85-
" background-color: {color};\n"
86-
" font-size: 75%; }}\n".format(label=label, color=color)
81+
".{label}::before {{"
82+
"content: attr(data-label); position: absolute; top: -1.3em; left: 0;"
83+
"background-color: {color}; "
84+
"font-size: 65%; padding: 0px 2px; border-radius: 1px; white-space: nowrap; font-weight: bold;"
85+
" }}\n".format(label=label, color=color)
8786
)
8887
return "\n".join(css)

src/inscriptis/annotation/output/xml.py

Lines changed: 4 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -3,6 +3,7 @@
33
from collections import defaultdict
44
from typing import Dict, Any
55

6+
from inscriptis.annotation import Annotation
67
from inscriptis.annotation.output import AnnotationProcessor
78

89

@@ -13,7 +14,9 @@ class XmlExtractor(AnnotationProcessor):
1314

1415
def __call__(self, annotated_text: Dict[str, Any], root_element="content"):
1516
tag_dict = defaultdict(list)
16-
for start, end, tag in reversed(annotated_text["label"]):
17+
for start, end, tag in sorted(
18+
Annotation(s, e, t) for s, e, t in reversed(annotated_text["label"])
19+
): # noqa: C414
1720
tag_dict[start].append(f"<{tag}>")
1821
tag_dict[end].insert(0, f"</{tag}>")
1922

tests/test_annotation.py

Lines changed: 44 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -69,3 +69,47 @@ def test_horizontal_shift():
6969
align=HorizontalAlignment.center,
7070
shift=7,
7171
).pop() == Annotation(10, 14, "test")
72+
73+
74+
def test_comparison():
75+
a = Annotation(24, 62, "tableheading")
76+
b = Annotation(24, 600, "table")
77+
c = Annotation(51, 56, "tableheading")
78+
79+
assert b > b
80+
assert a < c
81+
assert b < c
82+
83+
84+
def test_sorting():
85+
annotations = [
86+
Annotation(24, 62, "tableheading"),
87+
Annotation(24, 600, "table"),
88+
Annotation(51, 56, "tableheading"),
89+
Annotation(59, 115, "tableheading"),
90+
Annotation(79, 104, "emphasis"),
91+
Annotation(125, 139, "tableheading"),
92+
Annotation(140, 160, "emphasis"),
93+
Annotation(254, 263, "link"),
94+
Annotation(254, 271, "bold"),
95+
Annotation(266, 268, "link"),
96+
Annotation(271, 280, "link"),
97+
Annotation(369, 385, "link"),
98+
Annotation(484, 498, "link"),
99+
]
100+
101+
assert sorted(annotations) == [
102+
Annotation(24, 600, "table"),
103+
Annotation(24, 62, "tableheading"),
104+
Annotation(51, 56, "tableheading"),
105+
Annotation(59, 115, "tableheading"),
106+
Annotation(79, 104, "emphasis"),
107+
Annotation(125, 139, "tableheading"),
108+
Annotation(140, 160, "emphasis"),
109+
Annotation(254, 271, "bold"),
110+
Annotation(254, 263, "link"),
111+
Annotation(266, 268, "link"),
112+
Annotation(271, 280, "link"),
113+
Annotation(369, 385, "link"),
114+
Annotation(484, 498, "link"),
115+
]

tests/test_annotation_output_processor.py

Lines changed: 5 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -63,13 +63,11 @@ def test_html_annotator():
6363
assert result.startswith("<html><head><style>")
6464
assert result.split("</style>")[1] == (
6565
"</head>"
66-
'<body><pre><span class="heading-label">heading'
67-
'</span><span class="heading">'
68-
'<span class="h1-label">h1</span><span class="h1">'
69-
"Chur</span></span></pre>\n"
70-
"<pre></pre>\n"
71-
'<pre><span class="emphasis-label">emphasis</span>'
72-
'<span class="emphasis">Chur</span> is the capital '
66+
'<body><pre><span class="heading" data-label="heading">'
67+
'<span class="h1" data-label="h1">'
68+
"Chur</span></span>\n\n"
69+
'<span class="emphasis" data-label="emphasis">'
70+
"Chur</span> is the capital "
7371
"and largest town of the Swiss canton of the "
7472
"Grisons and lies in the Grisonian Rhine Valley."
7573
"</pre></body></html>"

0 commit comments

Comments
 (0)