Skip to content

Commit d09574a

Browse files
wip: lxml-based XML generation.
1 parent 0ea55b3 commit d09574a

File tree

2 files changed

+132
-2
lines changed

2 files changed

+132
-2
lines changed

src/inscriptis/annotation/output/xml.py

Lines changed: 64 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
"""XML Annotation processor."""
22
from collections import defaultdict
3-
from typing import Dict, Any
3+
from typing import Dict, Any, Tuple
44

5+
from lxml import etree
56
from inscriptis.annotation.output import AnnotationProcessor
67

78

@@ -10,7 +11,68 @@ class XmlExtractor(AnnotationProcessor):
1011

1112
verbatim = True
1213

13-
def __call__(self, annotated_text: Dict[str, Any]) -> str:
14+
def traverse_element(self, root, text, start, end, annotations, idx) -> int:
15+
while idx + 1 < len(annotations):
16+
idx += 1
17+
next_start, next_end, label = annotations[idx]["label"]
18+
# recurse?
19+
if next_start < end:
20+
leaf = etree.Element(root, label)
21+
cascaded_end = self.traverse_element(leaf, text, next_start, next_end, idx)
22+
else:
23+
root.tail += text[start: cascaded_end]
24+
25+
26+
27+
def __call__(self, annotated_text: Dict[str, Any], root_element='r') -> str:
28+
text = annotated_text["text"]
29+
annotations = sorted(annotated_text["label"])
30+
root = etree.Element(root_element)
31+
current_annotation_idx = 0
32+
while current_annotation_idx < len(annotations):
33+
current_annotation_idx = self.traverse_element(root, text, annotations, idx)
34+
35+
36+
for start, end, label in sorted(annotated_text["label"]):
37+
current_element = etree.SubElement(root, label)
38+
current_element.text = text[start:end]
39+
40+
return etree.tostring(root, pretty_print=True, xml_declaration=True, encoding="UTF-8")
41+
42+
def call3(self, annotated_text: Dict[str, Any]) -> str:
43+
tag_indices = defaultdict(list)
44+
45+
for start, end, label in sorted(annotated_text["label"]):
46+
length = end - start
47+
tag_indices[start].append((label, length))
48+
tag_indices[end].append(("/" + label, length))
49+
50+
current_idx = 0
51+
tagged_content = ['<?xml version="1.0" encoding="UTF-8" ?>\n']
52+
text = annotated_text["text"]
53+
for index, tags in sorted(tag_indices.items()):
54+
tagged_content.append(text[current_idx:index])
55+
56+
# Separate closing vs opening tags
57+
closing_tags = [t for t in tags if t[0].startswith("/")]
58+
opening_tags = [t for t in tags if not t[0].startswith("/")]
59+
60+
# Sort closing tags by ascending length (so outer closes last)
61+
closing_tags.sort(key=lambda x: x[1])
62+
for tag, _ in closing_tags:
63+
tagged_content.append(f"<{tag}>")
64+
65+
# Sort opening tags by descending length (so outer opens first)
66+
opening_tags.sort(key=lambda x: x[1], reverse=True)
67+
for tag, _ in opening_tags:
68+
tagged_content.append(f"<{tag}>")
69+
70+
current_idx = index
71+
tagged_content.append(text[current_idx:])
72+
73+
return "".join(tagged_content)
74+
75+
def call2(self, annotated_text: Dict[str, Any]) -> str:
1476
"""Provide an XML version of the given text and annotations.
1577
1678
Args:

tests/test_annotation_output_xml.py

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
#!/usr/bin/env python
2+
3+
"""
4+
Test the annotation XmlExtractor.
5+
"""
6+
from platform import processor
7+
from xml.etree.ElementTree import fromstring
8+
9+
from inscriptis import Inscriptis, ParserConfig
10+
from inscriptis.annotation.output.xml import XmlExtractor
11+
12+
13+
14+
def test_tag_error_issue_93():
15+
"""
16+
Test for the correct tag order in the XmlOutput as described in Issue #93.
17+
"""
18+
html_issue_93 = """<html>
19+
<body>
20+
<div class="a">
21+
<span class="b">Item1</span>
22+
<span class="b">Item2</span>
23+
<span class="b">Item3</span>
24+
<span class="b">Item4</span>
25+
</div>
26+
</body>
27+
</html>"""
28+
29+
expected_output_issue_93 = ("""<?xml version="1.0" encoding="UTF-8" ?>\n"""
30+
"<outer><inner> Item1 </inner><inner>Item2 </inner><inner>Item3 </inner>"
31+
"<inner>Item4</inner></outer>")
32+
rules = {"div#class=a": ["outer"], "span#class=b": ["inner"]}
33+
34+
inscriptis = Inscriptis(fromstring(html_issue_93), ParserConfig(annotation_rules=rules))
35+
annotated_html = {'text': inscriptis.get_text(),
36+
'label': inscriptis.get_annotations()}
37+
print(">>>", annotated_html)
38+
39+
result = XmlExtractor()(annotated_html)
40+
print(result)
41+
assert result == expected_output_issue_93
42+
43+
def test_tag_folding_issue_93_extended():
44+
html_issue_93 = """<html>
45+
<body>
46+
<div class="a">
47+
Some Test to add :)
48+
<span class="b">Item1</span>
49+
<span class="b">Item2</span>
50+
<span class="b">Item3</span>
51+
<span class="b">Item4</span>
52+
</div>
53+
</body>
54+
</html>"""
55+
56+
expected_output_issue_93 = ("""<?xml version="1.0" encoding="UTF-8" ?>\n"""
57+
"<outer> Some Test to add :) <inner>Item1 </inner><inner>Item2 </inner><inner>Item3 </inner>"
58+
"<inner>Item4</inner></outer>")
59+
rules = {"div#class=a": ["outer"], "span#class=b": ["inner"]}
60+
61+
inscriptis = Inscriptis(fromstring(html_issue_93), ParserConfig(annotation_rules=rules))
62+
annotated_html = {'text': inscriptis.get_text(),
63+
'label': inscriptis.get_annotations()}
64+
print(">>>", annotated_html)
65+
66+
result = XmlExtractor()(annotated_html)
67+
print(result)
68+
assert result == expected_output_issue_93

0 commit comments

Comments
 (0)