wip: lxml-based XML generation.

AlbertWeichselbraun · AlbertWeichselbraun · commit d09574a736a4 · 2025-03-22T15:05:19.000+01:00
diff --git a/src/inscriptis/annotation/output/xml.py b/src/inscriptis/annotation/output/xml.py
@@ -1,7 +1,8 @@
 """XML Annotation processor."""
 from collections import defaultdict
-from typing import Dict, Any
+from typing import Dict, Any, Tuple
 
+from lxml import etree
 from inscriptis.annotation.output import AnnotationProcessor
 
 
@@ -10,7 +11,68 @@ class XmlExtractor(AnnotationProcessor):
 
     verbatim = True
 
-    def __call__(self, annotated_text: Dict[str, Any]) -> str:
+    def traverse_element(self, root, text, start, end, annotations, idx) -> int:
+        """Fill root with text[start:end], nesting annotations[idx:] that start before end.
+
+        Returns the index of the first annotation that is not placed below root.
+        """
+        def next_start(i):  # where the next child of root begins (end if there is none)
+            return min(annotations[i][0], end) if i < len(annotations) else end
+
+        root.text = text[start:next_start(idx)]
+        while next_start(idx) < end:
+            child_start, child_end, label = annotations[idx]
+            child_end = min(child_end, end)  # clip spans that cross the parent's end
+            child = etree.SubElement(root, label)
+            idx = self.traverse_element(child, text, child_start, child_end, annotations, idx + 1)
+            child.tail = text[child_end:next_start(idx)]
+        return idx
+
+    def __call__(self, annotated_text: Dict[str, Any], root_element='r') -> str:
+        """Return the annotated text as an XML document wrapped in root_element."""
+        text = annotated_text["text"]
+        # Outer (longer) annotations sort first so that they enclose inner ones.
+        annotations = sorted(annotated_text["label"], key=lambda a: (a[0], -a[1]))
+        root = etree.Element(root_element)
+        self.traverse_element(root, text, 0, len(text), annotations, 0)
+        # No pretty printing: added indentation could alter the verbatim text.
+        xml = etree.tostring(root, xml_declaration=True, encoding="UTF-8")
+        return xml.decode("UTF-8")
+
+    def call3(self, annotated_text: Dict[str, Any]) -> str:
+        """Return the text with its annotations inserted as inline XML tags.
+
+        Args:
+            annotated_text: dict holding the "text" and its "label" annotations,
+                each given as a (start, end, label) triple.
+        """
+        tag_indices = defaultdict(list)
+        for start, end, label in sorted(annotated_text["label"]):
+            length = end - start
+            tag_indices[start].append((label, length))
+            tag_indices[end].append(("/" + label, length))
+
+        current_idx = 0
+        tagged_content = ['<?xml version="1.0" encoding="UTF-8" ?>\n']
+        text = annotated_text["text"]
+        for index, tags in sorted(tag_indices.items()):
+            tagged_content.append(text[current_idx:index])
+
+            # Closing tags: shortest span first (so outer closes last). Reversing
+            # first makes identical spans close in the opposite order they opened.
+            closing_tags = [t for t in reversed(tags) if t[0].startswith("/")]
+            closing_tags.sort(key=lambda x: x[1])
+            tagged_content.extend(f"<{tag}>" for tag, _ in closing_tags)
+
+            # Opening tags: longest span first (so outer opens first).
+            opening_tags = [t for t in tags if not t[0].startswith("/")]
+            opening_tags.sort(key=lambda x: x[1], reverse=True)
+            tagged_content.extend(f"<{tag}>" for tag, _ in opening_tags)
+            current_idx = index
+        tagged_content.append(text[current_idx:])
+        return "".join(tagged_content)
+
+    def call2(self, annotated_text: Dict[str, Any]) -> str:
         """Provide an XML version of the given text and annotations.
 
         Args:
diff --git a/tests/test_annotation_output_xml.py b/tests/test_annotation_output_xml.py
@@ -0,0 +1,68 @@
+#!/usr/bin/env python
+
+"""
+Test the annotation XmlExtractor.
+"""
+from platform import processor
+from xml.etree.ElementTree import fromstring
+
+from inscriptis import Inscriptis, ParserConfig
+from inscriptis.annotation.output.xml import XmlExtractor
+
+
+
+def test_tag_error_issue_93():
+    """
+    Check that nested annotations yield correctly ordered XML tags (Issue #93).
+    """
+    tag_rules = {"div#class=a": ["outer"], "span#class=b": ["inner"]}
+    expected_xml = (
+        '<?xml version="1.0" encoding="UTF-8" ?>\n'
+        "<outer><inner>  Item1 </inner><inner>Item2 </inner><inner>Item3 </inner>"
+        "<inner>Item4</inner></outer>"
+    )
+    markup = """<html>
+       <body>
+         <div class="a">
+            <span class="b">Item1</span>
+            <span class="b">Item2</span>
+            <span class="b">Item3</span>
+            <span class="b">Item4</span>
+         </div>
+       </body>
+    </html>"""
+
+    parser = Inscriptis(fromstring(markup), ParserConfig(annotation_rules=tag_rules))
+    annotated = {"text": parser.get_text(), "label": parser.get_annotations()}
+    print(">>>", annotated)
+
+    xml_output = XmlExtractor()(annotated)
+    print(xml_output)
+    assert xml_output == expected_xml
+
+def test_tag_folding_issue_93_extended():
+    """Check the XML output when text precedes the nested items (Issue #93)."""
+    tag_rules = {"div#class=a": ["outer"], "span#class=b": ["inner"]}
+    expected_xml = (
+        '<?xml version="1.0" encoding="UTF-8" ?>\n'
+        "<outer>  Some Test to add :) <inner>Item1 </inner><inner>Item2 </inner><inner>Item3 </inner>"
+        "<inner>Item4</inner></outer>"
+    )
+    markup = """<html>
+       <body>
+         <div class="a">
+         Some Test to add :)
+            <span class="b">Item1</span>
+            <span class="b">Item2</span>
+            <span class="b">Item3</span>
+            <span class="b">Item4</span>
+         </div>
+       </body>
+    </html>"""
+
+    parser = Inscriptis(fromstring(markup), ParserConfig(annotation_rules=tag_rules))
+    annotated = {"text": parser.get_text(), "label": parser.get_annotations()}
+    print(">>>", annotated)
+    xml_output = XmlExtractor()(annotated)
+    print(xml_output)
+    assert xml_output == expected_xml