Skip to content

Fix/bug 93 incorrect tag order #94

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 11 commits into from
Mar 22, 2025
6 changes: 4 additions & 2 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -236,7 +236,7 @@ that are suitable for your particular application. Post processors can be
specified with the ``-p`` or ``--postprocessor`` command line argument::

$ inscript https://www.fhgr.ch \
-r ./annotation/examples/annotation-profile.json \
-r ./examples/annotation/annotation-profile.json \
-p surface


Expand Down Expand Up @@ -265,10 +265,12 @@ Currently, inscriptis supports the following postprocessors:
- xml: returns an additional annotated text version::

<?xml version="1.0" encoding="UTF-8" ?>
<content>
<heading>Chur</heading>

<emphasis>Chur</emphasis> is the capital and largest town of the Swiss
canton of the Grisons and lies in the Grisonian Rhine Valley.
</content>

- html: creates an HTML file which contains the converted text and highlights all annotations as outlined below:

Expand All @@ -282,7 +284,7 @@ Currently, inscriptis supports the following postprocessors:

inscript --annotation-rules ./wikipedia.json \
--postprocessor html \
https://en.wikipedia.org/wiki/Chur.html
https://en.wikipedia.org/wiki/Chur

Annotation rules encoded in the ``wikipedia.json`` file:

Expand Down
9 changes: 6 additions & 3 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "inscriptis"
version = "2.5.3"
version = "2.6.0"
authors = ["Albert Weichselbraun <[email protected]>", "Fabian Odoni <[email protected]>"]
description = "inscriptis - HTML to text converter."
keywords = ["HTML", "converter", "text"]
Expand Down Expand Up @@ -44,8 +44,11 @@ requests = ">=2.32.2"
lxml = ">=4.9.3"

# optional dependencies
fastapi = { version = "^0.109.1", optional = true }
uvicorn = { version = "^0.27.1", optional = true }
fastapi = { version = "^0.115.11", optional = true }
uvicorn = { version = "^0.34.0", optional = true }

[tool.poetry.group.dev.dependencies]
pytest = "^8.3.5"


[build-system]
Expand Down
5 changes: 3 additions & 2 deletions src/inscriptis/annotation/output/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,9 @@
2. The overwritten :meth:`__call__` method may either extend the original
dictionary which contains the extracted text and annotations (e.g.,
:class:`~inscriptis.annotation.output.surface.SurfaceExtractor`) or
may replace it with an custom output (e.g.,
may replace it with a custom output (e.g.,
:class:`~inscriptis.annotation.output.html.HtmlExtractor` and
:class:`~inscriptis.annotation.output.xml.XmlExtractor`.
:class:`~inscriptis.annotation.output.xml.XmlExtractor`).

Currently, Inscriptis supports the following built-in AnnotationProcessors:

Expand All @@ -25,6 +25,7 @@
of the extracted annotations.

"""

from typing import Dict, Any


Expand Down
44 changes: 14 additions & 30 deletions src/inscriptis/annotation/output/html.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""HTML Annotation Processor."""

from collections import defaultdict
from itertools import cycle
from typing import Dict, Any, List
Expand All @@ -18,44 +19,27 @@ class HtmlExtractor(AnnotationProcessor):
verbatim = True

def __call__(self, annotated_text: Dict[str, Any]) -> str:
    """Provide an HTML version of the given text and annotations.

    Args:
        annotated_text: a dictionary containing the plain text (key
            ``text``) and the extracted annotations (key ``label``) as
            ``(start, end, label)`` triples.

    Returns:
        A string with the HTML version of the content in which every
        annotation is wrapped into a ``<span>`` that is preceded by a
        span showing the annotation's label.
    """
    tag_dict = defaultdict(list)

    # Walk the annotations back to front: opening tags are appended and
    # closing tags are prepended. At every index the closing tags therefore
    # precede the opening ones, and an annotation listed later in the input
    # encloses earlier ones that share the same start or end index.
    for start, end, label in reversed(annotated_text["label"]):
        tag_dict[start].append(
            f'<span class="{label}-label">{label}</span><span class="{label}">'
        )
        tag_dict[end].insert(0, "</span>")

    tagged_content = [
        "<html><head><style>",
        self._get_css(annotated_text["label"]),
        "</style></head><body><pre>",
    ]

    # NOTE(review): text and labels are inserted verbatim (no HTML escaping),
    # and spans that cross a line break are not closed and reopened around
    # the "</pre>\n<pre>" boundary.
    line_break = "</pre>\n<pre>"
    text = annotated_text["text"]
    current_idx = 0
    for idx, tags in sorted(tag_dict.items()):
        tagged_content.append(text[current_idx:idx].replace("\n", line_break))
        current_idx = idx
        tagged_content.extend(tags)
    # The text after the last tag needs the same line-break handling as the
    # text between tags; emitting "</pre>\n</pre>" here would produce
    # unbalanced <pre> elements.
    tagged_content.append(text[current_idx:].replace("\n", line_break))
    return "".join(tagged_content) + "</pre></body></html>"

@staticmethod
Expand Down
45 changes: 13 additions & 32 deletions src/inscriptis/annotation/output/xml.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""XML Annotation processor."""

from collections import defaultdict
from typing import Dict, Any

Expand All @@ -10,40 +11,20 @@ class XmlExtractor(AnnotationProcessor):

verbatim = True

def __call__(
    self, annotated_text: Dict[str, Any], root_element: str = "content"
) -> str:
    """Provide an XML version of the given text and annotations.

    Args:
        annotated_text: a dictionary containing the plain text (key
            ``text``) and the extracted annotations (key ``label``) as
            ``(start, end, tag)`` triples.
        root_element: name of the XML element that encloses the
            annotated text.

    Returns:
        A string with the XML-version of the content.
    """
    tag_dict = defaultdict(list)
    # Walk the annotations back to front: opening tags are appended and
    # closing tags are prepended. At every index the closing tags therefore
    # precede the opening ones, and an annotation listed later in the input
    # encloses earlier ones that share the same start or end index.
    for start, end, tag in reversed(annotated_text["label"]):
        tag_dict[start].append(f"<{tag}>")
        tag_dict[end].insert(0, f"</{tag}>")

    text = annotated_text["text"]
    tagged_content = [
        '<?xml version="1.0" encoding="UTF-8" ?>\n',
        f"<{root_element}>\n",
    ]
    current_idx = 0
    for idx, tags in sorted(tag_dict.items()):
        tagged_content.append(text[current_idx:idx])
        current_idx = idx
        tagged_content.extend(tags)

    tagged_content.append(text[current_idx:])
    tagged_content.append(f"\n</{root_element}>")
    return "".join(tagged_content)
4 changes: 3 additions & 1 deletion src/inscriptis/html_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,9 @@ class Inscriptis:
text = parser.get_text()
"""

def __init__(self, html_tree: lxml.html.HtmlElement, config: ParserConfig = None):
def __init__(
self, html_tree: lxml.html.HtmlElement, config: ParserConfig = None
) -> None:
# use the default configuration, if no config object is provided
config = config or ParserConfig()

Expand Down
7 changes: 5 additions & 2 deletions tests/test_annotation_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,12 @@
def test_get_annotation():
    """Test get_annotations from the Inscriptis class."""
    markup = "<b>Chur</b> is a City in <b>Switzerland</b>"
    tree = fromstring(markup)
    parser_config = ParserConfig(annotation_rules={"b": ["bold"]})
    parser = Inscriptis(tree, parser_config)

    # both <b> elements are expected to be annotated as "bold"
    expected_annotations = [
        Annotation(start=0, end=4, metadata="bold"),
        Annotation(start=18, end=29, metadata="bold"),
    ]

    assert parser.get_text() == "Chur is a City in Switzerland"
    assert parser.get_annotations() == expected_annotations
18 changes: 9 additions & 9 deletions tests/test_annotation_output_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
"text": "Chur\n\nChur is the capital and largest town of "
"the Swiss canton of the Grisons and lies in the "
"Grisonian Rhine Valley.",
"label": [[0, 4, "heading"], [0, 4, "h1"], [6, 10, "emphasis"]],
"label": [[0, 4, "h1"], [0, 4, "heading"], [6, 10, "emphasis"]],
}


Expand All @@ -36,8 +36,8 @@ def test_surface_annotator():

# and we have additional information on surface forms :)
assert result["surface"] == [
("heading", "Chur"),
("h1", "Chur"),
("heading", "Chur"),
("emphasis", "Chur"),
]

Expand All @@ -48,11 +48,11 @@ def test_xml_annotator():

# the result is an XML document with correctly nested annotation tags
assert result == (
'<?xml version="1.0" encoding="UTF-8" ?>\n'
"<h1><heading>Chur</heading></h1>\n\n<emphasis>"
'<?xml version="1.0" encoding="UTF-8" ?>\n<content>\n'
"<heading><h1>Chur</h1></heading>\n\n<emphasis>"
"Chur</emphasis> is the capital and largest town "
"of the Swiss canton of the Grisons and lies in "
"the Grisonian Rhine Valley."
"the Grisonian Rhine Valley.\n</content>"
)


Expand All @@ -61,8 +61,8 @@ def test_html_annotator():
result = processor(EXAMPLE_OUTPUT)

assert result.startswith("<html><head><style>")
assert result.endswith(
"</style></head>"
assert result.split("</style>")[1] == (
"</head>"
'<body><pre><span class="heading-label">heading'
'</span><span class="heading">'
'<span class="h1-label">h1</span><span class="h1">'
Expand All @@ -81,6 +81,6 @@ def test_trailing_tag_annotation():
result = processor({"text": "Ehre sei Gott!", "label": [[9, 14, "emphasis"]]})

assert result == (
'<?xml version="1.0" encoding="UTF-8" ?>\n'
"Ehre sei <emphasis>Gott!</emphasis>"
'<?xml version="1.0" encoding="UTF-8" ?>\n<content>\n'
"Ehre sei <emphasis>Gott!</emphasis>\n</content>"
)
75 changes: 75 additions & 0 deletions tests/test_annotation_output_xml.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
#!/usr/bin/env python

"""
Test the annotation XmlExtractor.
"""
from lxml.html import fromstring

from inscriptis import Inscriptis, ParserConfig
from inscriptis.annotation.output.xml import XmlExtractor


def test_tag_error_issue_93():
    """
    Verify the tag order produced by the XmlExtractor for the document
    from Issue #93 (``inner`` spans nested in an ``outer`` div).
    """
    annotation_rules = {"div#class=a": ["outer"], "span#class=b": ["inner"]}
    source_html = """<html>
<body>
<div class="a">
<span class="b">Item1</span>
<span class="b">Item2</span>
<span class="b">Item3</span>
<span class="b">Item4</span>
</div>
</body>
</html>"""

    parser = Inscriptis(
        fromstring(source_html), ParserConfig(annotation_rules=annotation_rules)
    )
    extraction = {
        "text": parser.get_text(),
        "label": parser.get_annotations(),
    }
    xml_output = XmlExtractor()(extraction)

    assert xml_output == (
        """<?xml version="1.0" encoding="UTF-8" ?>\n<content>\n"""
        "<outer><inner> Item1 </inner><inner>Item2 </inner><inner>Item3 </inner>"
        "<inner>Item4</inner></outer>\n</content>"
    )


def test_tag_folding_issue_93_extended():
    """
    Verify the XmlExtractor's tag nesting for an extended variant of the
    Issue #93 document: leading text in the ``outer`` div and ``bold``
    annotations nested within the ``inner`` spans.
    """
    annotation_rules = {
        "div#class=a": ["outer"],
        "span#class=b": ["inner"],
        "b": ["bold"],
    }
    source_html = """<html>
<body>
<div class="a">
Some Test to add :)
<span class="b">Item<b>1</b></span>
<span class="b">Item2</span>
<span class="b"><b>Item3</b></span>
<span class="b"><b>It</b>e<b>m4</b></span>
</div>
</body>
</html>"""

    parser = Inscriptis(
        fromstring(source_html), ParserConfig(annotation_rules=annotation_rules)
    )
    extraction = {
        "text": parser.get_text(),
        "label": parser.get_annotations(),
    }
    xml_output = XmlExtractor()(extraction)

    assert xml_output == (
        """<?xml version="1.0" encoding="UTF-8" ?>\n"""
        """<content>\n"""
        """<outer> Some Test to add :) <inner>Item <bold>1</bold></inner> <inner>Item2 </inner>"""
        """<inner><bold>Item3</bold></inner> <inner><bold>It</bold> e <bold>m4</bold></inner></outer>\n"""
        """</content>"""
    )
1 change: 1 addition & 0 deletions tests/test_block.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""
Test cases for the Block class.
"""

from inscriptis.model.canvas.block import Block
from inscriptis.model.canvas.prefix import Prefix

Expand Down
1 change: 1 addition & 0 deletions tests/test_cli.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
"""
Tests the Inscriptis CLI client.
"""

from io import StringIO
from pathlib import Path
from json import loads
Expand Down
1 change: 1 addition & 0 deletions tests/test_custom_html_tag_handling.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Test the custom HTML tag handling."""

from lxml.html import fromstring

from inscriptis import Inscriptis, ParserConfig
Expand Down
Loading