Skip to content

Commit 2ef7e3b

Browse files
Merge pull request #94 from weblyzard/fix/bug-93-incorrect-tag-order
Fix/bug 93 incorrect tag order
2 parents 0ea55b3 + 6c89cc7 commit 2ef7e3b

12 files changed

+135
-81
lines changed

README.rst

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -236,7 +236,7 @@ that are suitable for your particular application. Post processors can be
236236
specified with the ``-p`` or ``--postprocessor`` command line argument::
237237

238238
$ inscript https://www.fhgr.ch \
239-
-r ./annotation/examples/annotation-profile.json \
239+
-r ./examples/annotation/annotation-profile.json \
240240
-p surface
241241

242242

@@ -265,10 +265,12 @@ Currently, inscriptis supports the following postprocessors:
265265
- xml: returns an additional annotated text version::
266266

267267
<?xml version="1.0" encoding="UTF-8" ?>
268+
<content>
268269
<heading>Chur</heading>
269270

270271
<emphasis>Chur</emphasis> is the capital and largest town of the Swiss
271272
canton of the Grisons and lies in the Grisonian Rhine Valley.
273+
</content>
272274

273275
- html: creates an HTML file which contains the converted text and highlights all annotations as outlined below:
274276

@@ -282,7 +284,7 @@ Currently, inscriptis supports the following postprocessors:
282284
283285
inscript --annotation-rules ./wikipedia.json \
284286
--postprocessor html \
285-
https://en.wikipedia.org/wiki/Chur.html
287+
https://en.wikipedia.org/wiki/Chur
286288
287289
Annotation rules encoded in the ``wikipedia.json`` file:
288290

pyproject.toml

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "inscriptis"
3-
version = "2.5.3"
3+
version = "2.6.0"
44
authors = ["Albert Weichselbraun <[email protected]>", "Fabian Odoni <[email protected]>"]
55
description = "inscriptis - HTML to text converter."
66
keywords = ["HTML", "converter", "text"]
@@ -44,8 +44,11 @@ requests = ">=2.32.2"
4444
lxml = ">=4.9.3"
4545

4646
# optional dependencies
47-
fastapi = { version = "^0.109.1", optional = true }
48-
uvicorn = { version = "^0.27.1", optional = true }
47+
fastapi = { version = "^0.115.11", optional = true }
48+
uvicorn = { version = "^0.34.0", optional = true }
49+
50+
[tool.poetry.group.dev.dependencies]
51+
pytest = "^8.3.5"
4952

5053

5154
[build-system]

src/inscriptis/annotation/output/__init__.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,9 @@
1010
2. The overwritten :meth:`__call__` method may either extend the original
1111
dictionary which contains the extracted text and annotations (e.g.,
1212
:class:`~inscriptis.annotation.output.surface.SurfaceExtractor`) or
13-
may replace it with an custom output (e.g.,
13+
may replace it with a custom output (e.g.,
1414
:class:`~inscriptis.annotation.output.html.HtmlExtractor` and
15-
:class:`~inscriptis.annotation.output.xml.XmlExtractor`.
15+
:class:`~inscriptis.annotation.output.xml.XmlExtractor`).
1616
1717
Currently, Inscriptis supports the following built-in AnnotationProcessors:
1818
@@ -25,6 +25,7 @@
2525
of the extracted annotations.
2626
2727
"""
28+
2829
from typing import Dict, Any
2930

3031

src/inscriptis/annotation/output/html.py

Lines changed: 14 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
"""HTML Annotation Processor."""
2+
23
from collections import defaultdict
34
from itertools import cycle
45
from typing import Dict, Any, List
@@ -18,44 +19,27 @@ class HtmlExtractor(AnnotationProcessor):
1819
verbatim = True
1920

2021
def __call__(self, annotated_text: Dict[str, Any]) -> str:
21-
tag_indices = defaultdict(list)
22+
tag_dict = defaultdict(list)
2223

23-
for start, end, label in sorted(annotated_text["label"]):
24-
tag_indices[start].append(label)
25-
tag_indices[end].append("/" + label)
24+
for start, end, label in reversed(annotated_text["label"]):
25+
tag_dict[start].append(
26+
f'<span class="{label}-label">{label}</span><span class="{label}">'
27+
)
28+
tag_dict[end].insert(0, "</span>")
2629

27-
open_tags = []
2830
tagged_content = [
2931
"<html><head><style>",
3032
self._get_css(annotated_text["label"]),
3133
"</style></head><body><pre>",
3234
]
33-
for idx, ch in enumerate(annotated_text["text"]):
34-
if idx in tag_indices:
35-
tags = tag_indices[idx]
36-
# close tags:
37-
for _ in (t for t in sorted(tags, reverse=True) if t.startswith("/")):
38-
open_tags.pop()
39-
tagged_content.append("</span>")
40-
# open tags
41-
for tag in (
42-
t for t in sorted(tags, reverse=True) if not t.startswith("/")
43-
):
44-
open_tags.append(tag)
45-
tagged_content.append(
46-
'<span class="{tag}-label">{tag}</span>'
47-
'<span class="{tag}">'.format(tag=tag)
48-
)
49-
50-
if ch == "\n":
51-
tagged_content.extend(["</span>" for _ in open_tags])
52-
tagged_content.append("</pre>\n<pre>")
53-
tagged_content.extend(
54-
['<span class="{tag}">'.format(tag=tag) for tag in open_tags]
55-
)
56-
else:
57-
tagged_content.append(ch)
5835

36+
text = annotated_text["text"]
37+
current_idx = 0
38+
for idx, tags in sorted(tag_dict.items()):
39+
tagged_content.append(text[current_idx:idx].replace("\n", "</pre>\n<pre>"))
40+
current_idx = idx
41+
tagged_content.extend(tags)
42+
tagged_content.append(text[current_idx:].replace("\n", "</pre>\n<pre>"))
5943
return "".join(tagged_content) + "</pre></body></html>"
6044

6145
@staticmethod
Lines changed: 13 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
"""XML Annotation processor."""
2+
23
from collections import defaultdict
34
from typing import Dict, Any
45

@@ -10,40 +11,20 @@ class XmlExtractor(AnnotationProcessor):
1011

1112
verbatim = True
1213

13-
def __call__(self, annotated_text: Dict[str, Any]) -> str:
14-
"""Provide an XML version of the given text and annotations.
15-
16-
Args:
17-
annotated_text: a dictionary containing the plain text and the
18-
extracted annotations.
19-
20-
Returns:
21-
A string with the XML-version of the content.
22-
"""
23-
tag_indices = defaultdict(list)
24-
25-
for start, end, label in sorted(annotated_text["label"]):
26-
tag_indices[start].append(label)
27-
tag_indices[end].append("/" + label)
14+
def __call__(self, annotated_text: Dict[str, Any], root_element="content"):
15+
tag_dict = defaultdict(list)
16+
for start, end, tag in reversed(annotated_text["label"]):
17+
tag_dict[start].append(f"<{tag}>")
18+
tag_dict[end].insert(0, f"</{tag}>")
2819

2920
current_idx = 0
30-
tagged_content = ['<?xml version="1.0" encoding="UTF-8" ?>\n']
3121
text = annotated_text["text"]
32-
for index, tags in sorted(tag_indices.items()):
33-
tagged_content.append(text[current_idx:index])
34-
# close tags
35-
tagged_content.extend(
36-
[
37-
"<" + tag + ">"
38-
for tag in sorted(tags, reverse=True)
39-
if tag.startswith("/")
40-
]
41-
)
42-
# open tags
43-
tagged_content.extend(
44-
["<" + tag + ">" for tag in sorted(tags) if not tag.startswith("/")]
45-
)
46-
current_idx = index
47-
tagged_content.append(text[current_idx:])
22+
tagged_content = ['<?xml version="1.0" encoding="UTF-8" ?>\n', "<content>\n"]
23+
for idx, tags in sorted(tag_dict.items()):
24+
tagged_content.append(text[current_idx:idx])
25+
current_idx = idx
26+
tagged_content.extend(tags)
4827

28+
tagged_content.append(text[current_idx:])
29+
tagged_content.append("\n</content>")
4930
return "".join(tagged_content)

src/inscriptis/html_engine.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,9 @@ class Inscriptis:
5151
text = parser.get_text()
5252
"""
5353

54-
def __init__(self, html_tree: lxml.html.HtmlElement, config: ParserConfig = None):
54+
def __init__(
55+
self, html_tree: lxml.html.HtmlElement, config: ParserConfig = None
56+
) -> None:
5557
# use the default configuration, if no config object is provided
5658
config = config or ParserConfig()
5759

tests/test_annotation_engine.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,12 @@
1111
def test_get_annotation():
1212
"""Test get_annotations from the Inscriptis class"""
1313
html = "<b>Chur</b> is a City in <b>Switzerland</b>"
14-
rules = {'b': ['bold']}
14+
rules = {"b": ["bold"]}
1515

1616
inscriptis = Inscriptis(fromstring(html), ParserConfig(annotation_rules=rules))
1717

1818
assert inscriptis.get_text() == "Chur is a City in Switzerland"
19-
assert inscriptis.get_annotations() == [Annotation(start=0, end=4, metadata='bold'), Annotation(start=18, end=29, metadata='bold')]
19+
assert inscriptis.get_annotations() == [
20+
Annotation(start=0, end=4, metadata="bold"),
21+
Annotation(start=18, end=29, metadata="bold"),
22+
]

tests/test_annotation_output_processor.py

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
"text": "Chur\n\nChur is the capital and largest town of "
1616
"the Swiss canton of the Grisons and lies in the "
1717
"Grisonian Rhine Valley.",
18-
"label": [[0, 4, "heading"], [0, 4, "h1"], [6, 10, "emphasis"]],
18+
"label": [[0, 4, "h1"], [0, 4, "heading"], [6, 10, "emphasis"]],
1919
}
2020

2121

@@ -36,8 +36,8 @@ def test_surface_annotator():
3636

3737
# and we have additional information on surface forms :)
3838
assert result["surface"] == [
39-
("heading", "Chur"),
4039
("h1", "Chur"),
40+
("heading", "Chur"),
4141
("emphasis", "Chur"),
4242
]
4343

@@ -48,11 +48,11 @@ def test_xml_annotator():
4848

4949
# the annotations are serialized as nested XML tags inside a <content> root element
5050
assert result == (
51-
'<?xml version="1.0" encoding="UTF-8" ?>\n'
52-
"<h1><heading>Chur</heading></h1>\n\n<emphasis>"
51+
'<?xml version="1.0" encoding="UTF-8" ?>\n<content>\n'
52+
"<heading><h1>Chur</h1></heading>\n\n<emphasis>"
5353
"Chur</emphasis> is the capital and largest town "
5454
"of the Swiss canton of the Grisons and lies in "
55-
"the Grisonian Rhine Valley."
55+
"the Grisonian Rhine Valley.\n</content>"
5656
)
5757

5858

@@ -61,8 +61,8 @@ def test_html_annotator():
6161
result = processor(EXAMPLE_OUTPUT)
6262

6363
assert result.startswith("<html><head><style>")
64-
assert result.endswith(
65-
"</style></head>"
64+
assert result.split("</style>")[1] == (
65+
"</head>"
6666
'<body><pre><span class="heading-label">heading'
6767
'</span><span class="heading">'
6868
'<span class="h1-label">h1</span><span class="h1">'
@@ -81,6 +81,6 @@ def test_trailing_tag_annotation():
8181
result = processor({"text": "Ehre sei Gott!", "label": [[9, 14, "emphasis"]]})
8282

8383
assert result == (
84-
'<?xml version="1.0" encoding="UTF-8" ?>\n'
85-
"Ehre sei <emphasis>Gott!</emphasis>"
84+
'<?xml version="1.0" encoding="UTF-8" ?>\n<content>\n'
85+
"Ehre sei <emphasis>Gott!</emphasis>\n</content>"
8686
)

tests/test_annotation_output_xml.py

Lines changed: 75 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,75 @@
1+
#!/usr/bin/env python
2+
3+
"""
4+
Test the annotation XmlExtractor.
5+
"""
6+
from lxml.html import fromstring
7+
8+
from inscriptis import Inscriptis, ParserConfig
9+
from inscriptis.annotation.output.xml import XmlExtractor
10+
11+
12+
def test_tag_error_issue_93():
13+
"""
14+
Test for the correct tag order in the XmlExtractor output as described in Issue #93.
15+
"""
16+
html_issue_93 = """<html>
17+
<body>
18+
<div class="a">
19+
<span class="b">Item1</span>
20+
<span class="b">Item2</span>
21+
<span class="b">Item3</span>
22+
<span class="b">Item4</span>
23+
</div>
24+
</body>
25+
</html>"""
26+
27+
expected_output_issue_93 = (
28+
"""<?xml version="1.0" encoding="UTF-8" ?>\n<content>\n"""
29+
"<outer><inner> Item1 </inner><inner>Item2 </inner><inner>Item3 </inner>"
30+
"<inner>Item4</inner></outer>\n</content>"
31+
)
32+
rules = {"div#class=a": ["outer"], "span#class=b": ["inner"]}
33+
34+
inscriptis = Inscriptis(
35+
fromstring(html_issue_93), ParserConfig(annotation_rules=rules)
36+
)
37+
annotated_html = {
38+
"text": inscriptis.get_text(),
39+
"label": inscriptis.get_annotations(),
40+
}
41+
result = XmlExtractor()(annotated_html)
42+
assert result == expected_output_issue_93
43+
44+
45+
def test_tag_folding_issue_93_extended():
46+
html_issue_93 = """<html>
47+
<body>
48+
<div class="a">
49+
Some Test to add :)
50+
<span class="b">Item<b>1</b></span>
51+
<span class="b">Item2</span>
52+
<span class="b"><b>Item3</b></span>
53+
<span class="b"><b>It</b>e<b>m4</b></span>
54+
</div>
55+
</body>
56+
</html>"""
57+
58+
expected_output_issue_93 = (
59+
"""<?xml version="1.0" encoding="UTF-8" ?>\n"""
60+
"""<content>\n"""
61+
"""<outer> Some Test to add :) <inner>Item <bold>1</bold></inner> <inner>Item2 </inner>"""
62+
"""<inner><bold>Item3</bold></inner> <inner><bold>It</bold> e <bold>m4</bold></inner></outer>\n"""
63+
"""</content>"""
64+
)
65+
rules = {"div#class=a": ["outer"], "span#class=b": ["inner"], "b": ["bold"]}
66+
67+
inscriptis = Inscriptis(
68+
fromstring(html_issue_93), ParserConfig(annotation_rules=rules)
69+
)
70+
annotated_html = {
71+
"text": inscriptis.get_text(),
72+
"label": inscriptis.get_annotations(),
73+
}
74+
result = XmlExtractor()(annotated_html)
75+
assert result == expected_output_issue_93

tests/test_block.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
"""
22
Test cases for the Block class.
33
"""
4+
45
from inscriptis.model.canvas.block import Block
56
from inscriptis.model.canvas.prefix import Prefix
67

tests/test_cli.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
"""
22
Tests the Inscriptis CLI client.
33
"""
4+
45
from io import StringIO
56
from pathlib import Path
67
from json import loads

tests/test_custom_html_tag_handling.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
"""Test the custom HTML tag handling."""
2+
23
from lxml.html import fromstring
34

45
from inscriptis import Inscriptis, ParserConfig

0 commit comments

Comments
 (0)