Skip to content

Commit 3654c4f

Browse files
fix: #93 - correct output XML
1. Corrected the tag order. 2. Added a root tag with the default name <content> to ensure that valid XML is created.
1 parent d09574a commit 3654c4f

File tree

5 files changed

+52
-120
lines changed

5 files changed

+52
-120
lines changed

pyproject.toml

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,8 +44,11 @@ requests = ">=2.32.2"
4444
lxml = ">=4.9.3"
4545

4646
# optional dependencies
47-
fastapi = { version = "^0.109.1", optional = true }
48-
uvicorn = { version = "^0.27.1", optional = true }
47+
fastapi = { version = "^0.115.11", optional = true }
48+
uvicorn = { version = "^0.34.0", optional = true }
49+
50+
[tool.poetry.group.dev.dependencies]
51+
pytest = "^8.3.5"
4952

5053

5154
[build-system]

src/inscriptis/annotation/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
"""The model used for saving annotations."""
22

3+
from functools import total_ordering
34
from typing import List
45
from typing import NamedTuple
56

Lines changed: 11 additions & 92 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
"""XML Annotation processor."""
2+
23
from collections import defaultdict
3-
from typing import Dict, Any, Tuple
4+
from typing import Dict, Any
45

5-
from lxml import etree
66
from inscriptis.annotation.output import AnnotationProcessor
77

88

@@ -11,101 +11,20 @@ class XmlExtractor(AnnotationProcessor):
1111

1212
verbatim = True
1313

14-
def traverse_element(self, root, text, start, end, annotations, idx) -> int:
15-
while idx + 1 < len(annotations):
16-
idx += 1
17-
next_start, next_end, label = annotations[idx]["label"]
18-
# recurse?
19-
if next_start < end:
20-
leaf = etree.Element(root, label)
21-
cascaded_end = self.traverse_element(leaf, text, next_start, next_end, idx)
22-
else:
23-
root.tail += text[start: cascaded_end]
24-
25-
26-
27-
def __call__(self, annotated_text: Dict[str, Any], root_element='r') -> str:
28-
text = annotated_text["text"]
29-
annotations = sorted(annotated_text["label"])
30-
root = etree.Element(root_element)
31-
current_annotation_idx = 0
32-
while current_annotation_idx < len(annotations):
33-
current_annotation_idx = self.traverse_element(root, text, annotations, idx)
34-
35-
36-
for start, end, label in sorted(annotated_text["label"]):
37-
current_element = etree.SubElement(root, label)
38-
current_element.text = text[start:end]
39-
40-
return etree.tostring(root, pretty_print=True, xml_declaration=True, encoding="UTF-8")
41-
42-
def call3(self, annotated_text: Dict[str, Any]) -> str:
43-
tag_indices = defaultdict(list)
44-
45-
for start, end, label in sorted(annotated_text["label"]):
46-
length = end - start
47-
tag_indices[start].append((label, length))
48-
tag_indices[end].append(("/" + label, length))
14+
def __call__(self, annotated_text: Dict[str, Any], root_element="content"):
15+
tag_dict = defaultdict(list)
16+
for start, end, tag in reversed(annotated_text["label"]):
17+
tag_dict[start].append(f"<{tag}>")
18+
tag_dict[end].insert(0, f"</{tag}>")
4919

5020
current_idx = 0
51-
tagged_content = ['<?xml version="1.0" encoding="UTF-8" ?>\n']
5221
text = annotated_text["text"]
53-
for index, tags in sorted(tag_indices.items()):
22+
tagged_content = ['<?xml version="1.0" encoding="UTF-8" ?>\n', "<content>\n"]
23+
for index, tags in sorted(tag_dict.items()):
5424
tagged_content.append(text[current_idx:index])
55-
56-
# Separate closing vs opening tags
57-
closing_tags = [t for t in tags if t[0].startswith("/")]
58-
opening_tags = [t for t in tags if not t[0].startswith("/")]
59-
60-
# Sort closing tags by ascending length (so outer closes last)
61-
closing_tags.sort(key=lambda x: x[1])
62-
for tag, _ in closing_tags:
63-
tagged_content.append(f"<{tag}>")
64-
65-
# Sort opening tags by descending length (so outer opens first)
66-
opening_tags.sort(key=lambda x: x[1], reverse=True)
67-
for tag, _ in opening_tags:
68-
tagged_content.append(f"<{tag}>")
69-
7025
current_idx = index
71-
tagged_content.append(text[current_idx:])
72-
73-
return "".join(tagged_content)
74-
75-
def call2(self, annotated_text: Dict[str, Any]) -> str:
76-
"""Provide an XML version of the given text and annotations.
77-
78-
Args:
79-
annotated_text: a dictionary containing the plain text and the
80-
extracted annotations.
81-
82-
Returns:
83-
A string with the XML-version of the content.
84-
"""
85-
tag_indices = defaultdict(list)
26+
tagged_content.extend(tags)
8627

87-
for start, end, label in sorted(annotated_text["label"]):
88-
tag_indices[start].append(label)
89-
tag_indices[end].append("/" + label)
90-
91-
current_idx = 0
92-
tagged_content = ['<?xml version="1.0" encoding="UTF-8" ?>\n']
93-
text = annotated_text["text"]
94-
for index, tags in sorted(tag_indices.items()):
95-
tagged_content.append(text[current_idx:index])
96-
# close tags
97-
tagged_content.extend(
98-
[
99-
"<" + tag + ">"
100-
for tag in sorted(tags, reverse=True)
101-
if tag.startswith("/")
102-
]
103-
)
104-
# open tags
105-
tagged_content.extend(
106-
["<" + tag + ">" for tag in sorted(tags) if not tag.startswith("/")]
107-
)
108-
current_idx = index
10928
tagged_content.append(text[current_idx:])
110-
29+
tagged_content.append("\n</content>")
11130
return "".join(tagged_content)

src/inscriptis/html_engine.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,9 @@ class Inscriptis:
5151
text = parser.get_text()
5252
"""
5353

54-
def __init__(self, html_tree: lxml.html.HtmlElement, config: ParserConfig = None):
54+
def __init__(
55+
self, html_tree: lxml.html.HtmlElement, config: ParserConfig = None
56+
) -> None:
5557
# use the default configuration, if no config object is provided
5658
config = config or ParserConfig()
5759

tests/test_annotation_output_xml.py

Lines changed: 32 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -3,14 +3,12 @@
33
"""
44
Test the annotation XmlExtractor.
55
"""
6-
from platform import processor
7-
from xml.etree.ElementTree import fromstring
6+
from lxml.html import fromstring
87

98
from inscriptis import Inscriptis, ParserConfig
109
from inscriptis.annotation.output.xml import XmlExtractor
1110

1211

13-
1412
def test_tag_error_issue_93():
1513
"""
1614
Test for the correct tag order in the XmlOutput as described in Issue #93.
@@ -26,43 +24,52 @@ def test_tag_error_issue_93():
2624
</body>
2725
</html>"""
2826

29-
expected_output_issue_93 = ("""<?xml version="1.0" encoding="UTF-8" ?>\n"""
30-
"<outer><inner> Item1 </inner><inner>Item2 </inner><inner>Item3 </inner>"
31-
"<inner>Item4</inner></outer>")
27+
expected_output_issue_93 = (
28+
"""<?xml version="1.0" encoding="UTF-8" ?>\n<content>\n"""
29+
"<outer><inner> Item1 </inner><inner>Item2 </inner><inner>Item3 </inner>"
30+
"<inner>Item4</inner></outer>\n</content>"
31+
)
3232
rules = {"div#class=a": ["outer"], "span#class=b": ["inner"]}
3333

34-
inscriptis = Inscriptis(fromstring(html_issue_93), ParserConfig(annotation_rules=rules))
35-
annotated_html = {'text': inscriptis.get_text(),
36-
'label': inscriptis.get_annotations()}
37-
print(">>>", annotated_html)
38-
34+
inscriptis = Inscriptis(
35+
fromstring(html_issue_93), ParserConfig(annotation_rules=rules)
36+
)
37+
annotated_html = {
38+
"text": inscriptis.get_text(),
39+
"label": inscriptis.get_annotations(),
40+
}
3941
result = XmlExtractor()(annotated_html)
40-
print(result)
4142
assert result == expected_output_issue_93
4243

44+
4345
def test_tag_folding_issue_93_extended():
4446
html_issue_93 = """<html>
4547
<body>
4648
<div class="a">
4749
Some Test to add :)
48-
<span class="b">Item1</span>
50+
<span class="b">Item<b>1</b></span>
4951
<span class="b">Item2</span>
50-
<span class="b">Item3</span>
51-
<span class="b">Item4</span>
52+
<span class="b"><b>Item3</b></span>
53+
<span class="b"><b>It</b>e<b>m4</b></span>
5254
</div>
5355
</body>
5456
</html>"""
5557

56-
expected_output_issue_93 = ("""<?xml version="1.0" encoding="UTF-8" ?>\n"""
57-
"<outer> Some Test to add :) <inner>Item1 </inner><inner>Item2 </inner><inner>Item3 </inner>"
58-
"<inner>Item4</inner></outer>")
59-
rules = {"div#class=a": ["outer"], "span#class=b": ["inner"]}
60-
61-
inscriptis = Inscriptis(fromstring(html_issue_93), ParserConfig(annotation_rules=rules))
62-
annotated_html = {'text': inscriptis.get_text(),
63-
'label': inscriptis.get_annotations()}
64-
print(">>>", annotated_html)
58+
expected_output_issue_93 = (
59+
"""<?xml version="1.0" encoding="UTF-8" ?>\n"""
60+
"""<content>\n"""
61+
"""<outer> Some Test to add :) <inner>Item <bold>1</bold></inner> <inner>Item2 </inner>"""
62+
"""<inner><bold>Item3</bold></inner> <inner><bold>It</bold> e <bold>m4</bold></inner></outer>\n"""
63+
"""</content>"""
64+
)
65+
rules = {"div#class=a": ["outer"], "span#class=b": ["inner"], "b": ["bold"]}
6566

67+
inscriptis = Inscriptis(
68+
fromstring(html_issue_93), ParserConfig(annotation_rules=rules)
69+
)
70+
annotated_html = {
71+
"text": inscriptis.get_text(),
72+
"label": inscriptis.get_annotations(),
73+
}
6674
result = XmlExtractor()(annotated_html)
67-
print(result)
6875
assert result == expected_output_issue_93

0 commit comments

Comments
 (0)