Skip to content

Commit 667b356

Browse files
Merge pull request #84 from weblyzard/fix/bug-81-custom-html-handling2
Fix/bug 81 custom html handling2
2 parents fc9ee5c + 504863d commit 667b356

40 files changed

+652
-261
lines changed

README.rst

Lines changed: 17 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -185,7 +185,7 @@ HTML to annotated text conversion
185185
---------------------------------
186186
Convert and annotate HTML from a Web page using the provided annotation rules.
187187

188-
Download the example `annotation-profile.json <https://github.com/weblyzard/inscriptis/blob/master/examples/annotation-profile.json>`_ and save it to your working directory::
188+
Download the example `annotation-profile.json <https://github.com/weblyzard/inscriptis/blob/master/examples/annotation/annotation-profile.json>`_ and save it to your working directory::
189189

190190
$ inscript https://www.fhgr.ch -r annotation-profile.json
191191

@@ -236,7 +236,7 @@ that are suitable for your particular application. Post processors can be
236236
specified with the ``-p`` or ``--postprocessor`` command line argument::
237237

238238
$ inscript https://www.fhgr.ch \
239-
-r ./examples/annotation-profile.json \
239+
-r ./examples/annotation/annotation-profile.json \
240240
-p surface
241241

242242

@@ -474,7 +474,8 @@ be used within a program:
474474
.. code-block:: python
475475
476476
import urllib.request
477-
from inscriptis import get_annotated_text, ParserConfig
477+
from inscriptis import get_annotated_text
478+
from inscriptis.model.config import ParserConfig
478479
479480
url = "https://www.fhgr.ch"
480481
html = urllib.request.urlopen(url).read().decode('utf-8')
@@ -533,15 +534,24 @@ If the fine-tuning options discussed above are not sufficient, you may even over
533534
534535
.. code-block:: python
535536
536-
inscriptis = Inscriptis(html, config)
537+
from inscriptis import ParserConfig
538+
from inscriptis.html_engine import Inscriptis
539+
from inscriptis.model.tag import CustomHtmlTagHandlerMapping
537540
538-
inscriptis.start_tag_handler_dict['a'] = my_handle_start_a
539-
inscriptis.end_tag_handler_dict['a'] = my_handle_end_a
541+
my_mapping = CustomHtmlTagHandlerMapping(
542+
start_tag_mapping={'a': my_handle_start_a},
543+
end_tag_mapping={'a': my_handle_end_a}
544+
)
545+
inscriptis = Inscriptis(html_tree,
546+
ParserConfig(custom_html_tag_handler_mapping=my_mapping))
540547
text = inscriptis.get_text()
541548
542549
543550
In the example, the standard HTML handlers for the ``a`` tag are overridden with custom versions (i.e., ``my_handle_start_a`` and ``my_handle_end_a``).
544-
You may define custom handlers for any tag, regardless of whether it already exists in ``start_tag_handler_dict`` or ``end_tag_handler_dict``.
551+
You may define custom handlers for any tag, regardless of whether it already exists in the standard mapping.
552+
553+
Please refer to `custom-html-handling.py <https://github.com/weblyzard/inscriptis/blob/master/examples/custom-html-handling.py>`_ for a working example.
554+
The standard HTML tag handlers can be found in the `inscriptis.model.tag <https://github.com/weblyzard/inscriptis/blob/master/src/inscriptis/model/tag>`_ package.
545555
546556
Optimizing memory consumption
547557
-----------------------------
File renamed without changes.
File renamed without changes.
File renamed without changes.

examples/custom-html-handling.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,43 @@
1+
#!/usr/bin/env python3
2+
3+
"""
4+
Custom HTML tag handling example.
5+
6+
Add a custom HTML handler for the bold <b> tag which encloses
7+
bold text with "**".
8+
9+
Example:
10+
"Welcome to <b>Chur</b>" is rendered as "Welcome to **Chur**".
11+
"""
12+
from typing import Dict
13+
14+
from inscriptis import ParserConfig
15+
from inscriptis.html_engine import Inscriptis
16+
from inscriptis.model.html_document_state import HtmlDocumentState
17+
from inscriptis.model.tag import CustomHtmlTagHandlerMapping
18+
from lxml.html import fromstring
19+
20+
21+
def my_handle_start_b(state: HtmlDocumentState, _: Dict) -> None:
22+
"""Handle the opening <b> tag."""
23+
state.tags[-1].write("**")
24+
25+
26+
def my_handle_end_b(state: HtmlDocumentState) -> None:
27+
"""Handle the closing </b> tag."""
28+
state.tags[-1].write("**")
29+
30+
31+
MY_MAPPING = CustomHtmlTagHandlerMapping(
32+
start_tag_mapping={"b": my_handle_start_b},
33+
end_tag_mapping={"b": my_handle_end_b},
34+
)
35+
36+
37+
HTML = "Welcome to <b>Chur</b>"
38+
39+
html_tree = fromstring(HTML)
40+
inscriptis = Inscriptis(
41+
html_tree, ParserConfig(custom_html_tag_handler_mapping=MY_MAPPING)
42+
)
43+
print(inscriptis.get_text())

pyproject.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
[tool.poetry]
22
name = "inscriptis"
3-
version = "2.4.0.1"
3+
version = "2.5.0"
44
authors = ["Albert Weichselbraun <[email protected]>", "Fabian Odoni <[email protected]>"]
55
description = "inscriptis - HTML to text converter."
66
keywords = ["HTML", "converter", "text"]
@@ -59,5 +59,5 @@ line-length = 88
5959
target-version = ["py38", "py39", "py310", "py311", "py312"]
6060
extend-exclude = '\.html$|\.json$|\.txt$|/a$|/b$'
6161
include = '''
62-
^/src/|^/tests/|^/benchmarking/
62+
^/src/|^/tests/|^/benchmarking/|^/examples/
6363
'''

src/inscriptis/__init__.py

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -60,12 +60,12 @@
6060
"""
6161

6262
import re
63-
from lxml.html import fromstring, HtmlElement
64-
from lxml.etree import ParserError
65-
6663
from typing import Dict, Optional, Any
67-
6864
from inscriptis.model.config import ParserConfig
65+
66+
from lxml.etree import ParserError
67+
from lxml.html import fromstring, HtmlElement
68+
6969
from inscriptis.html_engine import Inscriptis
7070

7171
RE_STRIP_XML_DECLARATION = re.compile(r"^<\?xml [^>]+?\?>")
@@ -132,5 +132,6 @@ def get_annotated_text(
132132
return {}
133133

134134
inscriptis = Inscriptis(html_tree, config)
135+
text = inscriptis.get_text()
135136
labels = [(a.start, a.end, a.metadata) for a in inscriptis.get_annotations()]
136-
return {"text": inscriptis.get_text(), "label": labels}
137+
return {"text": text, "label": labels}

src/inscriptis/annotation/__init__.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
"""The model used for saving annotations."""
22

3-
from typing import NamedTuple, Tuple
43
from typing import List
4+
from typing import NamedTuple
55

66
from inscriptis.html_properties import HorizontalAlignment
77

@@ -25,8 +25,8 @@ class Annotation(NamedTuple):
2525
"""the annotation's start index within the text output."""
2626
end: int
2727
"""the annotation's end index within the text output."""
28-
metadata: Tuple[str]
29-
"""a tuple of tags to be attached to the annotation."""
28+
metadata: str
29+
"""the tag to be attached to the annotation."""
3030

3131

3232
def horizontal_shift(

src/inscriptis/annotation/parser.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818
"""
1919
from collections import defaultdict
2020
from copy import copy
21+
from typing import Dict, Tuple, List
2122

2223
from inscriptis.model.html_element import HtmlElement, DEFAULT_HTML_ELEMENT
2324

@@ -85,7 +86,7 @@ def __init__(self, css_profile, model: dict):
8586
self.css = css_profile
8687

8788
@staticmethod
88-
def _parse(model: dict) -> "AnnotationModel":
89+
def _parse(model: dict) -> Tuple[Dict, List]:
8990
"""Compute the AnnotationModel from a model dictionary.
9091
9192
Returns:

src/inscriptis/cli/inscript.py

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -5,14 +5,14 @@
55
import argparse
66
import sys
77
from json import load, dumps
8-
from typing import Optional
98
from pathlib import Path
9+
from typing import Optional
1010

1111
import requests
1212

1313
from inscriptis import get_text, get_annotated_text
14-
from inscriptis.metadata import __version__, __copyright__, __license__
1514
from inscriptis.css_profiles import CSS_PROFILES
15+
from inscriptis.metadata import __version__, __copyright__, __license__
1616
from inscriptis.model.config import ParserConfig
1717

1818
DEFAULT_ENCODING = "utf8"
@@ -148,24 +148,23 @@ def get_html_content(url: str, timeout: int, encoding: str = None) -> Optional[s
148148
Args:
149149
url: URL to the HTML content, or None if the content is obtained from stdin.
150150
encoding: used encoding.
151+
timeout: timeout in seconds for retrieving the URL.
151152
152153
Returns:
153154
The html_content or None, if no content could be extracted.
154155
155156
"""
156157
if not url:
157158
return sys.stdin.read()
158-
elif Path(url).is_file():
159-
with Path(url).open(
160-
encoding=encoding or DEFAULT_ENCODING, errors="ignore"
161-
) as f:
159+
elif (p := Path(url)).is_file():
160+
with p.open(encoding=encoding or DEFAULT_ENCODING, errors="ignore") as f:
162161
return f.read()
163162
elif url.startswith("http://") or url.startswith("https://"):
164163
req = requests.get(url, timeout=timeout)
165164
return req.content.decode(encoding or req.encoding)
166165

167166

168-
def cli():
167+
def cli() -> None:
169168
"""Run the inscript command line client."""
170169
args = parse_command_line()
171170
if not (html_content := get_html_content(args.input, args.timeout, args.encoding)):

src/inscriptis/css_profiles.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,8 +8,8 @@
88
preventing cases where two words stick together.
99
"""
1010

11-
from inscriptis.model.html_element import HtmlElement
1211
from inscriptis.html_properties import Display, WhiteSpace
12+
from inscriptis.model.html_element import HtmlElement
1313

1414
STRICT_CSS_PROFILE = {
1515
"body": HtmlElement(display=Display.inline, whitespace=WhiteSpace.normal),

0 commit comments

Comments
 (0)