fix: #93 - correct output XML

AlbertWeichselbraun · AlbertWeichselbraun · commit 3654c4f33e8e · 2025-03-22T17:26:31.000+01:00
1. corrected the tag order.
2. added a root tag with the default name <content> to ensure that valid XML
   is created.
diff --git a/pyproject.toml b/pyproject.toml
@@ -44,8 +44,11 @@ requests = ">=2.32.2"
 lxml = ">=4.9.3"
 
 # optional dependencies
-fastapi = { version = "^0.109.1", optional = true }
-uvicorn = { version = "^0.27.1", optional = true }
+fastapi = { version = "^0.115.11", optional = true }
+uvicorn = { version = "^0.34.0", optional = true }
+
+[tool.poetry.group.dev.dependencies]
+pytest = "^8.3.5"
 
 
 [build-system]
diff --git a/src/inscriptis/annotation/__init__.py b/src/inscriptis/annotation/__init__.py
@@ -1,5 +1,6 @@
 """The model used for saving annotations."""
 
+from functools import total_ordering
 from typing import List
 from typing import NamedTuple
 
diff --git a/src/inscriptis/annotation/output/xml.py b/src/inscriptis/annotation/output/xml.py
@@ -1,8 +1,8 @@
 """XML Annotation processor."""
+
 from collections import defaultdict
-from typing import Dict, Any, Tuple
+from typing import Dict, Any
 
-from lxml import etree
 from inscriptis.annotation.output import AnnotationProcessor
 
 
@@ -11,101 +11,20 @@ class XmlExtractor(AnnotationProcessor):
 
     verbatim = True
 
-    def traverse_element(self, root, text, start, end, annotations, idx) -> int:
-        while idx + 1 < len(annotations):
-            idx += 1
-            next_start, next_end, label = annotations[idx]["label"]
-            # recurse?
-            if next_start < end:
-                leaf = etree.Element(root, label)
-                cascaded_end = self.traverse_element(leaf, text, next_start, next_end, idx)
-            else:
-                root.tail += text[start: cascaded_end]
-
-
-
-    def __call__(self, annotated_text: Dict[str, Any], root_element='r') -> str:
-        text = annotated_text["text"]
-        annotations = sorted(annotated_text["label"])
-        root = etree.Element(root_element)
-        current_annotation_idx = 0
-        while current_annotation_idx < len(annotations):
-            current_annotation_idx = self.traverse_element(root, text, annotations, idx)
-
-
-        for start, end, label in sorted(annotated_text["label"]):
-            current_element = etree.SubElement(root, label)
-            current_element.text = text[start:end]
-
-        return etree.tostring(root, pretty_print=True, xml_declaration=True, encoding="UTF-8")
-
-    def call3(self, annotated_text: Dict[str, Any]) -> str:
-        tag_indices = defaultdict(list)
-
-        for start, end, label in sorted(annotated_text["label"]):
-            length = end - start
-            tag_indices[start].append((label, length))
-            tag_indices[end].append(("/" + label, length))
+    def __call__(self, annotated_text: Dict[str, Any], root_element="content"):
+        tag_dict = defaultdict(list)
+        for start, end, tag in reversed(annotated_text["label"]):
+            tag_dict[start].append(f"<{tag}>")
+            tag_dict[end].insert(0, f"</{tag}>")
 
         current_idx = 0
-        tagged_content = ['<?xml version="1.0" encoding="UTF-8" ?>\n']
         text = annotated_text["text"]
-        for index, tags in sorted(tag_indices.items()):
+        tagged_content = ['<?xml version="1.0" encoding="UTF-8" ?>\n', "<content>\n"]
+        for index, tags in sorted(tag_dict.items()):
             tagged_content.append(text[current_idx:index])
-
-            # Separate closing vs opening tags
-            closing_tags = [t for t in tags if t[0].startswith("/")]
-            opening_tags = [t for t in tags if not t[0].startswith("/")]
-
-            # Sort closing tags by ascending length (so outer closes last)
-            closing_tags.sort(key=lambda x: x[1])
-            for tag, _ in closing_tags:
-                tagged_content.append(f"<{tag}>")
-
-            # Sort opening tags by descending length (so outer opens first)
-            opening_tags.sort(key=lambda x: x[1], reverse=True)
-            for tag, _ in opening_tags:
-                tagged_content.append(f"<{tag}>")
-
             current_idx = index
-        tagged_content.append(text[current_idx:])
-
-        return "".join(tagged_content)
-
-    def call2(self, annotated_text: Dict[str, Any]) -> str:
-        """Provide an XML version of the given text and annotations.
-
-        Args:
-            annotated_text: a dictionary containing the plain text and the
-                            extracted annotations.
-
-        Returns:
-            A string with the XML-version of the content.
-        """
-        tag_indices = defaultdict(list)
+            tagged_content.extend(tags)
 
-        for start, end, label in sorted(annotated_text["label"]):
-            tag_indices[start].append(label)
-            tag_indices[end].append("/" + label)
-
-        current_idx = 0
-        tagged_content = ['<?xml version="1.0" encoding="UTF-8" ?>\n']
-        text = annotated_text["text"]
-        for index, tags in sorted(tag_indices.items()):
-            tagged_content.append(text[current_idx:index])
-            # close tags
-            tagged_content.extend(
-                [
-                    "<" + tag + ">"
-                    for tag in sorted(tags, reverse=True)
-                    if tag.startswith("/")
-                ]
-            )
-            # open tags
-            tagged_content.extend(
-                ["<" + tag + ">" for tag in sorted(tags) if not tag.startswith("/")]
-            )
-            current_idx = index
         tagged_content.append(text[current_idx:])
-
+        tagged_content.append("\n</content>")
         return "".join(tagged_content)
diff --git a/src/inscriptis/html_engine.py b/src/inscriptis/html_engine.py
@@ -51,7 +51,9 @@ class Inscriptis:
       text = parser.get_text()
     """
 
-    def __init__(self, html_tree: lxml.html.HtmlElement, config: ParserConfig = None):
+    def __init__(
+        self, html_tree: lxml.html.HtmlElement, config: ParserConfig = None
+    ) -> None:
         # use the default configuration, if no config object is provided
         config = config or ParserConfig()
 
diff --git a/tests/test_annotation_output_xml.py b/tests/test_annotation_output_xml.py
@@ -3,14 +3,12 @@
 """
 Test the annotation XmlExtractor.
 """
-from platform import processor
-from xml.etree.ElementTree import fromstring
+from lxml.html import fromstring
 
 from inscriptis import Inscriptis, ParserConfig
 from inscriptis.annotation.output.xml import XmlExtractor
 
 
-
 def test_tag_error_issue_93():
     """
     Test for the correct tag order in the XmlOutput as described in Issue #93.
@@ -26,43 +24,52 @@ def test_tag_error_issue_93():
        </body>
     </html>"""
 
-    expected_output_issue_93 = ("""<?xml version="1.0" encoding="UTF-8" ?>\n"""
-                                "<outer><inner>  Item1 </inner><inner>Item2 </inner><inner>Item3 </inner>"
-                                "<inner>Item4</inner></outer>")
+    expected_output_issue_93 = (
+        """<?xml version="1.0" encoding="UTF-8" ?>\n<content>\n"""
+        "<outer><inner>  Item1 </inner><inner>Item2 </inner><inner>Item3 </inner>"
+        "<inner>Item4</inner></outer>\n</content>"
+    )
     rules = {"div#class=a": ["outer"], "span#class=b": ["inner"]}
 
-    inscriptis = Inscriptis(fromstring(html_issue_93), ParserConfig(annotation_rules=rules))
-    annotated_html = {'text': inscriptis.get_text(),
-                      'label': inscriptis.get_annotations()}
-    print(">>>", annotated_html)
-
+    inscriptis = Inscriptis(
+        fromstring(html_issue_93), ParserConfig(annotation_rules=rules)
+    )
+    annotated_html = {
+        "text": inscriptis.get_text(),
+        "label": inscriptis.get_annotations(),
+    }
     result = XmlExtractor()(annotated_html)
-    print(result)
     assert result == expected_output_issue_93
 
+
 def test_tag_folding_issue_93_extended():
     html_issue_93 = """<html>
        <body>
          <div class="a">
          Some Test to add :)
-            <span class="b">Item1</span>
+            <span class="b">Item<b>1</b></span>
             <span class="b">Item2</span>
-            <span class="b">Item3</span>
-            <span class="b">Item4</span>
+            <span class="b"><b>Item3</b></span>
+            <span class="b"><b>It</b>e<b>m4</b></span>
          </div>
        </body>
     </html>"""
 
-    expected_output_issue_93 = ("""<?xml version="1.0" encoding="UTF-8" ?>\n"""
-                                "<outer>  Some Test to add :) <inner>Item1 </inner><inner>Item2 </inner><inner>Item3 </inner>"
-                                "<inner>Item4</inner></outer>")
-    rules = {"div#class=a": ["outer"], "span#class=b": ["inner"]}
-
-    inscriptis = Inscriptis(fromstring(html_issue_93), ParserConfig(annotation_rules=rules))
-    annotated_html = {'text': inscriptis.get_text(),
-                      'label': inscriptis.get_annotations()}
-    print(">>>", annotated_html)
+    expected_output_issue_93 = (
+        """<?xml version="1.0" encoding="UTF-8" ?>\n"""
+        """<content>\n"""
+        """<outer>  Some Test to add :) <inner>Item <bold>1</bold></inner> <inner>Item2 </inner>"""
+        """<inner><bold>Item3</bold></inner> <inner><bold>It</bold> e <bold>m4</bold></inner></outer>\n"""
+        """</content>"""
+    )
+    rules = {"div#class=a": ["outer"], "span#class=b": ["inner"], "b": ["bold"]}
 
+    inscriptis = Inscriptis(
+        fromstring(html_issue_93), ParserConfig(annotation_rules=rules)
+    )
+    annotated_html = {
+        "text": inscriptis.get_text(),
+        "label": inscriptis.get_annotations(),
+    }
     result = XmlExtractor()(annotated_html)
-    print(result)
     assert result == expected_output_issue_93