Skip to content

Commit 364d60d

Browse files
authored
<fix>: fix remove script tail (#505)
1 parent be5f371 commit 364d60d

File tree

1 file changed

+8
-2
lines changed

1 file changed

+8
-2
lines changed

llm_web_kit/main_html_parser/parser/layout_batch_parser.py

Lines changed: 8 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -286,8 +286,14 @@ def htmll_to_content2(self, body_str):
286286
body = html.fromstring(body_str)
287287
tags_to_remove = ['header', 'footer', 'nav', 'aside', 'script', 'style']
288288
for tag in tags_to_remove:
289-
for element in body.xpath(f'//{tag}'):
290-
element.getparent().remove(element)
289+
for element in list(body.xpath(f'//{tag}')):
290+
prev = element.getprevious()
291+
parent = element.getparent()
292+
if prev is not None:
293+
prev.tail = (prev.tail or '') + (element.tail or '')
294+
else:
295+
parent.text = (parent.text or '') + (element.tail or '')
296+
parent.remove(element)
291297
self.add_newline_after_tags(body, ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'br', 'span', 'div', 'p', 'li'])
292298
output = []
293299
main_content = re.split(r'\n{1,}', self.get_text_with_newlines(body))

0 commit comments

Comments
 (0)