Merge pull request #94 from weblyzard/fix/bug-93-incorrect-tag-order

AlbertWeichselbraun · web-flow · commit 2ef7e3bdc428 · 2025-03-22T19:49:26.000+01:00
Fix/bug 93 incorrect tag order
diff --git a/README.rst b/README.rst
@@ -236,7 +236,7 @@ that are suitable for your particular application. Post processors can be
 specified with the ``-p`` or ``--postprocessor`` command line argument::
 
   $ inscript https://www.fhgr.ch \
-          -r ./annotation/examples/annotation-profile.json \
+          -r ./examples/annotation/annotation-profile.json \
           -p surface
 
 
@@ -265,10 +265,12 @@ Currently, inscriptis supports the following postprocessors:
 - xml: returns an additional annotated text version::
 
     <?xml version="1.0" encoding="UTF-8" ?>
+    <content>
     <heading>Chur</heading>
 
     <emphasis>Chur</emphasis> is the capital and largest town of the Swiss
     canton of the Grisons and lies in the Grisonian Rhine Valley.
+    </content>
 
 - html: creates an HTML file which contains the converted text and highlights all annotations as outlined below:
 
@@ -282,7 +284,7 @@ Currently, inscriptis supports the following postprocessors:
 
       inscript --annotation-rules ./wikipedia.json \
                   --postprocessor html \
-                  https://en.wikipedia.org/wiki/Chur.html
+                  https://en.wikipedia.org/wiki/Chur
 
    Annotation rules encoded in the ``wikipedia.json`` file:
 
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "inscriptis"
-version = "2.5.3"
+version = "2.6.0"
 authors = ["Albert Weichselbraun <albert.weichselbraun@fhgr.ch>", "Fabian Odoni <fabian.odoni@fhgr.ch>"]
 description = "inscriptis - HTML to text converter."
 keywords = ["HTML", "converter", "text"]
@@ -44,8 +44,11 @@ requests = ">=2.32.2"
 lxml = ">=4.9.3"
 
 # optional dependencies
-fastapi = { version = "^0.109.1", optional = true }
-uvicorn = { version = "^0.27.1", optional = true }
+fastapi = { version = "^0.115.11", optional = true }
+uvicorn = { version = "^0.34.0", optional = true }
+
+[tool.poetry.group.dev.dependencies]
+pytest = "^8.3.5"
 
 
 [build-system]
diff --git a/src/inscriptis/annotation/output/__init__.py b/src/inscriptis/annotation/output/__init__.py
@@ -10,9 +10,9 @@
     2. The overwritten :meth:`__call__` method may either extend the original
        dictionary which contains the extracted text and annotations (e.g.,
        :class:`~inscriptis.annotation.output.surface.SurfaceExtractor`) or
-       may replace it with an custom output (e.g.,
+       may replace it with a custom output (e.g.,
        :class:`~inscriptis.annotation.output.html.HtmlExtractor` and
-       :class:`~inscriptis.annotation.output.xml.XmlExtractor`.
+       :class:`~inscriptis.annotation.output.xml.XmlExtractor`).
 
 Currently, Inscriptis supports the following built-in AnnotationProcessors:
 
@@ -25,6 +25,7 @@
     of the extracted annotations.
 
 """
+
 from typing import Dict, Any
 
 
diff --git a/src/inscriptis/annotation/output/html.py b/src/inscriptis/annotation/output/html.py
@@ -1,4 +1,5 @@
 """HTML Annotation Processor."""
+
 from collections import defaultdict
 from itertools import cycle
 from typing import Dict, Any, List
@@ -18,44 +19,41 @@ class HtmlExtractor(AnnotationProcessor):
     verbatim = True
 
     def __call__(self, annotated_text: Dict[str, Any]) -> str:
+        """Provide an HTML version of the given text and annotations.
+
+        Args:
+            annotated_text: a dictionary containing the plain text and the
+                            extracted annotations.
+
+        Returns:
+            A string with the HTML-version of the content.
+        """
-        tag_indices = defaultdict(list)
+        tag_dict = defaultdict(list)
 
-        for start, end, label in sorted(annotated_text["label"]):
-            tag_indices[start].append(label)
-            tag_indices[end].append("/" + label)
+        # Walk the annotations in reverse: opening tags are appended and closing
+        # tags are prepended, so closing tags precede opening tags at an index and
+        # an annotation listed later opens first and closes last.
+        for start, end, label in reversed(annotated_text["label"]):
+            tag_dict[start].append(
+                f'<span class="{label}-label">{label}</span><span class="{label}">'
+            )
+            tag_dict[end].insert(0, "</span>")
 
-        open_tags = []
         tagged_content = [
             "<html><head><style>",
             self._get_css(annotated_text["label"]),
             "</style></head><body><pre>",
         ]
-        for idx, ch in enumerate(annotated_text["text"]):
-            if idx in tag_indices:
-                tags = tag_indices[idx]
-                # close tags:
-                for _ in (t for t in sorted(tags, reverse=True) if t.startswith("/")):
-                    open_tags.pop()
-                    tagged_content.append("</span>")
-                # open tags
-                for tag in (
-                    t for t in sorted(tags, reverse=True) if not t.startswith("/")
-                ):
-                    open_tags.append(tag)
-                    tagged_content.append(
-                        '<span class="{tag}-label">{tag}</span>'
-                        '<span class="{tag}">'.format(tag=tag)
-                    )
-
-            if ch == "\n":
-                tagged_content.extend(["</span>" for _ in open_tags])
-                tagged_content.append("</pre>\n<pre>")
-                tagged_content.extend(
-                    ['<span class="{tag}">'.format(tag=tag) for tag in open_tags]
-                )
-            else:
-                tagged_content.append(ch)
 
+        text = annotated_text["text"]
+        current_idx = 0
+        for idx, tags in sorted(tag_dict.items()):
+            tagged_content.append(text[current_idx:idx].replace("\n", "</pre>\n<pre>"))
+            current_idx = idx
+            tagged_content.extend(tags)
+        # Text after the last tag: a newline closes the current <pre> and opens a
+        # new one, exactly as in the loop above.
+        tagged_content.append(text[current_idx:].replace("\n", "</pre>\n<pre>"))
         return "".join(tagged_content) + "</pre></body></html>"
 
     @staticmethod
diff --git a/src/inscriptis/annotation/output/xml.py b/src/inscriptis/annotation/output/xml.py
@@ -1,4 +1,5 @@
 """XML Annotation processor."""
+
 from collections import defaultdict
 from typing import Dict, Any
 
@@ -10,40 +11,38 @@ class XmlExtractor(AnnotationProcessor):
 
     verbatim = True
 
-    def __call__(self, annotated_text: Dict[str, Any]) -> str:
-        """Provide an XML version of the given text and annotations.
-
-        Args:
-            annotated_text: a dictionary containing the plain text and the
-                            extracted annotations.
-
-        Returns:
-            A string with the XML-version of the content.
-        """
-        tag_indices = defaultdict(list)
-
-        for start, end, label in sorted(annotated_text["label"]):
-            tag_indices[start].append(label)
-            tag_indices[end].append("/" + label)
+    def __call__(
+        self, annotated_text: Dict[str, Any], root_element: str = "content"
+    ) -> str:
+        """Provide an XML version of the given text and annotations.
+
+        Args:
+            annotated_text: a dictionary containing the plain text and the
+                            extracted annotations.
+            root_element: name of the element that wraps the annotated text.
+
+        Returns:
+            A string with the XML-version of the content.
+        """
+        tag_dict = defaultdict(list)
+        # Walk the annotations in reverse: opening tags are appended and closing
+        # tags are prepended, so closing tags precede opening tags at an index and
+        # an annotation listed later opens first and closes last.
+        for start, end, tag in reversed(annotated_text["label"]):
+            tag_dict[start].append(f"<{tag}>")
+            tag_dict[end].insert(0, f"</{tag}>")
 
         current_idx = 0
-        tagged_content = ['<?xml version="1.0" encoding="UTF-8" ?>\n']
         text = annotated_text["text"]
-        for index, tags in sorted(tag_indices.items()):
-            tagged_content.append(text[current_idx:index])
-            # close tags
-            tagged_content.extend(
-                [
-                    "<" + tag + ">"
-                    for tag in sorted(tags, reverse=True)
-                    if tag.startswith("/")
-                ]
-            )
-            # open tags
-            tagged_content.extend(
-                ["<" + tag + ">" for tag in sorted(tags) if not tag.startswith("/")]
-            )
-            current_idx = index
-        tagged_content.append(text[current_idx:])
+        tagged_content = [
+            '<?xml version="1.0" encoding="UTF-8" ?>\n',
+            f"<{root_element}>\n",
+        ]
+        for idx, tags in sorted(tag_dict.items()):
+            tagged_content.append(text[current_idx:idx])
+            current_idx = idx
+            tagged_content.extend(tags)
 
+        tagged_content.append(text[current_idx:])
+        tagged_content.append(f"\n</{root_element}>")
         return "".join(tagged_content)
diff --git a/src/inscriptis/html_engine.py b/src/inscriptis/html_engine.py
@@ -51,7 +51,9 @@ class Inscriptis:
       text = parser.get_text()
     """
 
-    def __init__(self, html_tree: lxml.html.HtmlElement, config: ParserConfig = None):
+    def __init__(
+        self, html_tree: lxml.html.HtmlElement, config: ParserConfig = None
+    ) -> None:
         # use the default configuration, if no config object is provided
         config = config or ParserConfig()
 
diff --git a/tests/test_annotation_engine.py b/tests/test_annotation_engine.py
@@ -11,9 +11,12 @@
 def test_get_annotation():
     """Test get_anntation from the Inscriptis class"""
     html = "<b>Chur</b> is a City in <b>Switzerland</b>"
-    rules = {'b': ['bold']}
+    rules = {"b": ["bold"]}
 
     inscriptis = Inscriptis(fromstring(html), ParserConfig(annotation_rules=rules))
 
     assert inscriptis.get_text() == "Chur is a City in Switzerland"
-    assert inscriptis.get_annotations() == [Annotation(start=0, end=4, metadata='bold'), Annotation(start=18, end=29, metadata='bold')]
+    assert inscriptis.get_annotations() == [
+        Annotation(start=0, end=4, metadata="bold"),
+        Annotation(start=18, end=29, metadata="bold"),
+    ]
diff --git a/tests/test_annotation_output_processor.py b/tests/test_annotation_output_processor.py
@@ -15,7 +15,7 @@
     "text": "Chur\n\nChur is the capital and largest town of "
     "the Swiss canton of the Grisons and lies in the "
     "Grisonian Rhine Valley.",
-    "label": [[0, 4, "heading"], [0, 4, "h1"], [6, 10, "emphasis"]],
+    "label": [[0, 4, "h1"], [0, 4, "heading"], [6, 10, "emphasis"]],
 }
 
 
@@ -36,8 +36,8 @@ def test_surface_annotator():
 
     # and we have additional information on surface forms :)
     assert result["surface"] == [
-        ("heading", "Chur"),
         ("h1", "Chur"),
+        ("heading", "Chur"),
         ("emphasis", "Chur"),
     ]
 
@@ -48,11 +48,11 @@ def test_xml_annotator():
 
     # and we have additional information on surface forms :)
     assert result == (
-        '<?xml version="1.0" encoding="UTF-8" ?>\n'
-        "<h1><heading>Chur</heading></h1>\n\n<emphasis>"
+        '<?xml version="1.0" encoding="UTF-8" ?>\n<content>\n'
+        "<heading><h1>Chur</h1></heading>\n\n<emphasis>"
         "Chur</emphasis> is the capital and largest town "
         "of the Swiss canton of the Grisons and lies in "
-        "the Grisonian Rhine Valley."
+        "the Grisonian Rhine Valley.\n</content>"
     )
 
 
@@ -61,8 +61,8 @@ def test_html_annotator():
     result = processor(EXAMPLE_OUTPUT)
 
     assert result.startswith("<html><head><style>")
-    assert result.endswith(
-        "</style></head>"
+    assert result.split("</style>")[1] == (
+        "</head>"
         '<body><pre><span class="heading-label">heading'
         '</span><span class="heading">'
         '<span class="h1-label">h1</span><span class="h1">'
@@ -81,6 +81,6 @@ def test_trailing_tag_annotation():
     result = processor({"text": "Ehre sei Gott!", "label": [[9, 14, "emphasis"]]})
 
     assert result == (
-        '<?xml version="1.0" encoding="UTF-8" ?>\n'
-        "Ehre sei <emphasis>Gott!</emphasis>"
+        '<?xml version="1.0" encoding="UTF-8" ?>\n<content>\n'
+        "Ehre sei <emphasis>Gott!</emphasis>\n</content>"
     )
diff --git a/tests/test_annotation_output_xml.py b/tests/test_annotation_output_xml.py
@@ -0,0 +1,75 @@
+#!/usr/bin/env python
+
+"""
+Test the annotation XmlExtractor.
+"""
+from lxml.html import fromstring
+
+from inscriptis import Inscriptis, ParserConfig
+from inscriptis.annotation.output.xml import XmlExtractor
+
+
+def test_tag_error_issue_93():
+    """
+    Test for the correct tag order in the XmlOutput as described in Issue #93.
+    """
+    html_issue_93 = """<html>
+       <body>
+         <div class="a">
+            <span class="b">Item1</span>
+            <span class="b">Item2</span>
+            <span class="b">Item3</span>
+            <span class="b">Item4</span>
+         </div>
+       </body>
+    </html>"""
+
+    expected_output_issue_93 = (
+        """<?xml version="1.0" encoding="UTF-8" ?>\n<content>\n"""
+        "<outer><inner>  Item1 </inner><inner>Item2 </inner><inner>Item3 </inner>"
+        "<inner>Item4</inner></outer>\n</content>"
+    )
+    rules = {"div#class=a": ["outer"], "span#class=b": ["inner"]}
+
+    inscriptis = Inscriptis(
+        fromstring(html_issue_93), ParserConfig(annotation_rules=rules)
+    )
+    annotated_html = {
+        "text": inscriptis.get_text(),
+        "label": inscriptis.get_annotations(),
+    }
+    result = XmlExtractor()(annotated_html)
+    assert result == expected_output_issue_93
+
+
+def test_tag_folding_issue_93_extended():
+    html_issue_93 = """<html>
+       <body>
+         <div class="a">
+         Some Test to add :)
+            <span class="b">Item<b>1</b></span>
+            <span class="b">Item2</span>
+            <span class="b"><b>Item3</b></span>
+            <span class="b"><b>It</b>e<b>m4</b></span>
+         </div>
+       </body>
+    </html>"""
+
+    expected_output_issue_93 = (
+        """<?xml version="1.0" encoding="UTF-8" ?>\n"""
+        """<content>\n"""
+        """<outer>  Some Test to add :) <inner>Item <bold>1</bold></inner> <inner>Item2 </inner>"""
+        """<inner><bold>Item3</bold></inner> <inner><bold>It</bold> e <bold>m4</bold></inner></outer>\n"""
+        """</content>"""
+    )
+    rules = {"div#class=a": ["outer"], "span#class=b": ["inner"], "b": ["bold"]}
+
+    inscriptis = Inscriptis(
+        fromstring(html_issue_93), ParserConfig(annotation_rules=rules)
+    )
+    annotated_html = {
+        "text": inscriptis.get_text(),
+        "label": inscriptis.get_annotations(),
+    }
+    result = XmlExtractor()(annotated_html)
+    assert result == expected_output_issue_93
diff --git a/tests/test_block.py b/tests/test_block.py
@@ -1,6 +1,7 @@
 """
 Test cases for the Block class.
 """
+
 from inscriptis.model.canvas.block import Block
 from inscriptis.model.canvas.prefix import Prefix
 
diff --git a/tests/test_cli.py b/tests/test_cli.py
@@ -1,6 +1,7 @@
 """
 Tests the Inscriptis CLI client.
 """
+
 from io import StringIO
 from pathlib import Path
 from json import loads
diff --git a/tests/test_custom_html_tag_handling.py b/tests/test_custom_html_tag_handling.py
@@ -1,4 +1,5 @@
 """Test the custom HTML tag handling."""
+
 from lxml.html import fromstring
 
 from inscriptis import Inscriptis, ParserConfig

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -1,4 +1,5 @@` |
| `1` | `1` | `"""Test the custom HTML tag handling."""` |
| | `2` | `+` |
| `2` | `3` | `from lxml.html import fromstring` |
| `3` | `4` | |
| `4` | `5` | `from inscriptis import Inscriptis, ParserConfig` |