Skip to content

Commit bd1e4c8

Browse files
fix: 修复title、list、table、text管线中换行不正确以及缺失内容 (#506)
1 parent 364d60d commit bd1e4c8

23 files changed

+10306
-135
lines changed

llm_web_kit/extractor/html/extractor.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,7 @@ def _do_extract(self, data_json: DataJson) -> DataJson:
9595

9696
# main_html, method, title = self._extract_main_html(raw_html, base_url, page_layout_type)
9797
main_html_element = html_to_element(main_html)
98-
parsed_html = [(main_html_element, raw_html)]
98+
parsed_html = [(main_html_element, main_html)]
9999
for extract_func in [self._extract_code, self._extract_table, self._extract_math, self._extract_list,
100100
self._extract_image,
101101
self._extract_title, self._extract_paragraph]:

llm_web_kit/extractor/html/recognizer/list.py

Lines changed: 38 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,12 @@
88
from llm_web_kit.extractor.html.recognizer.recognizer import (
99
BaseHTMLElementRecognizer, CCTag)
1010
from llm_web_kit.libs.doc_element_type import DocElementType, ParagraphTextType
11-
from llm_web_kit.libs.html_utils import process_sub_sup_tags
11+
from llm_web_kit.libs.html_utils import (html_normalize_space,
12+
process_sub_sup_tags)
1213
from llm_web_kit.libs.text_utils import normalize_text_segment
1314

15+
from .text import inline_tags
16+
1417

1518
class ListAttribute():
1619
"""列表属性."""
@@ -130,7 +133,7 @@ def __extract_list_item_text_recusive(el: HtmlElement):
130133
elif el.tag == CCTag.CC_CODE_INLINE and el.text and el.text.strip():
131134
paragraph.append({'c': f'`{el.text}`', 't': ParagraphTextType.CODE_INLINE})
132135
elif el.tag == 'br':
133-
paragraph.append({'c': '\n\n', 't': ParagraphTextType.TEXT})
136+
paragraph.append({'c': '$br$', 't': ParagraphTextType.TEXT})
134137
elif el.tag == 'sub' or el.tag == 'sup':
135138
# 处理sub和sup标签,转换为GitHub Flavored Markdown格式
136139
current_text = ''
@@ -154,9 +157,17 @@ def __extract_list_item_text_recusive(el: HtmlElement):
154157
result['child_list'] = child_list
155158
else:
156159
if el.text and el.text.strip():
157-
paragraph.append({'c': el.text, 't': ParagraphTextType.TEXT})
160+
_new_text = html_normalize_space(el.text.strip())
161+
if len(el) == 0 and el.tag not in inline_tags:
162+
_new_text += '$br$'
163+
paragraph.append({'c': _new_text, 't': ParagraphTextType.TEXT})
158164
el.text = None
159-
for child in el.getchildren():
165+
166+
for child in el:
167+
if child.tag not in inline_tags:
168+
if paragraph:
169+
paragraph[-1]['c'] += '$br$'
170+
160171
p = __extract_list_item_text_recusive(child)
161172
if len(p) > 0:
162173
# 如果子元素有child_list,需要保存
@@ -166,24 +177,40 @@ def __extract_list_item_text_recusive(el: HtmlElement):
166177
if 'c' in p:
167178
if p['c'] != '':
168179
paragraph.append({'c': p['c'], 't': p.get('t', ParagraphTextType.TEXT)})
180+
else:
181+
if paragraph:
182+
last_paragraph = paragraph[-1]['c']
183+
if last_paragraph == '$br$':
184+
del paragraph[-1]
185+
else:
186+
if last_paragraph.endswith('$br$'):
187+
paragraph[-1]['c'] = last_paragraph[:-4]
188+
169189
if el.tag != 'li' and el.tail and el.tail.strip():
190+
_new_tail = html_normalize_space(el.tail.strip())
170191
if is_sub_sup:
171192
# 如果尾部文本跟在sub/sup后面,直接附加到最后一个文本段落中
172193
if len(paragraph) > 0 and paragraph[-1]['t'] == ParagraphTextType.TEXT:
173-
paragraph[-1]['c'] += el.tail
174-
else:
175-
paragraph.append({'c': el.tail, 't': ParagraphTextType.TEXT})
194+
paragraph[-1]['c'] += _new_tail
176195
else:
177-
paragraph.append({'c': el.tail, 't': ParagraphTextType.TEXT})
196+
paragraph.append({'c': _new_tail, 't': ParagraphTextType.TEXT})
197+
178198
if paragraph:
179199
# item['c'].strip(): 会导致前面处理br标签,添加的\n\n失效
180200
result['c'] = ' '.join(normalize_text_segment(item['c'].strip()) for item in paragraph)
181201
return result
182-
list_item_tags = ('li', 'dd', 'dt', 'ul', 'div', 'p')
202+
list_item_tags = ('li', 'dd', 'dt', 'ul', 'div', 'p', 'span')
183203
if child.tag in list_item_tags:
184204
paragraph = __extract_list_item_text_recusive(child)
185205
if len(paragraph) > 0:
186-
text_paragraph.append(paragraph)
206+
tem_json = json.dumps(paragraph).replace('$br$\"}', '\"}')
207+
new_paragraph = json.loads(tem_json)
208+
text_paragraph.append(new_paragraph)
209+
210+
for n, item in enumerate(text_paragraph):
211+
tem_json = json.dumps(item).replace('$br$', '\\n\\n')
212+
text_paragraph[n] = json.loads(tem_json)
213+
187214
return text_paragraph
188215

189216
def __get_list_content_list(self, ele: HtmlElement, list_nest_level: int) -> list:
@@ -201,7 +228,7 @@ def __get_list_content_list(self, ele: HtmlElement, list_nest_level: int) -> lis
201228
# 处理根元素文本
202229
if ele.text and ele.text.strip():
203230
# 检查元素是否包含数学或代码相关属性
204-
text_content = ele.text.strip()
231+
text_content = html_normalize_space(ele.text.strip())
205232
root_item = {
206233
'c': text_content,
207234
't': ParagraphTextType.TEXT,

llm_web_kit/extractor/html/recognizer/table.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -289,6 +289,8 @@ def __get_table_body(self, table_type, table_nest_level, table_root):
289289
def __do_extract_tables(self, root: HtmlElement) -> None:
290290
"""递归处理所有子标签."""
291291
if root.tag in ['table']:
292+
temp_tail = root.tail
293+
root.tail = None
292294
table_raw_html = self._element_to_html(root)
293295
table_type = self.__get_table_type(root)
294296
table_nest_level = self.__is_table_nested(root)
@@ -297,6 +299,7 @@ def __do_extract_tables(self, root: HtmlElement) -> None:
297299
cc_element = self._build_cc_element(
298300
CCTag.CC_TABLE, table_body, tail_text, table_type=table_type, table_nest_level=table_nest_level,
299301
html=table_raw_html)
302+
cc_element.tail = temp_tail
300303
self._replace_element(root, cc_element)
301304
return
302305
for child in root.iterchildren():

llm_web_kit/extractor/html/recognizer/text.py

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,8 @@
1212
BaseHTMLElementRecognizer, CCTag)
1313
from llm_web_kit.libs.doc_element_type import DocElementType, ParagraphTextType
1414
from llm_web_kit.libs.html_utils import (element_to_html_unescaped,
15-
html_to_element, process_sub_sup_tags)
15+
html_normalize_space, html_to_element,
16+
process_sub_sup_tags)
1617

1718
special_symbols = [ # TODO 从文件读取
1819
'®', # 注册商标符号
@@ -231,29 +232,33 @@ def __get_paragraph_text_recusive(el: HtmlElement, text: str) -> str:
231232
text = ''
232233
para_text.append({'c': el.text, 't': ParagraphTextType.CODE_INLINE})
233234
elif el.tag in ['br']:
234-
text += PARAGRAPH_SEPARATOR # TODO 这个地方直接加换行是错误点做法,需要利用数据结构来保证段落。
235+
text += '$br$'
235236
elif el.tag == 'sub' or el.tag == 'sup':
236237
text = process_sub_sup_tags(el, text, recursive=False)
237238
elif el.tag == 'audio': # 避免audio被识别为paragraph
238239
pass
239240
else:
240241
if el.text and el.text.strip():
241-
text = self.__combine_text(text, el.text.strip(), language)
242+
tem_text = html_normalize_space(text)
243+
_text = html_normalize_space(el.text.strip())
244+
text = self.__combine_text(tem_text, _text, language)
242245
for child in el:
243246
text = __get_paragraph_text_recusive(child, text)
244247

245248
# 处理尾部文本
246249
if el.tail and el.tail.strip():
247250
if is_sub_sup:
248-
text += el.tail
251+
_new_tail = html_normalize_space(el.tail.strip())
252+
text += _new_tail
249253
else:
250-
new_tail = f' {el.tail.strip()}' if el.tail.startswith(' ') and el.tail.strip()[0] in string.punctuation else el.tail.strip()
254+
_new_tail = html_normalize_space(el.tail.strip())
255+
new_tail = f' {_new_tail}' if el.tail.startswith(' ') and el.tail.strip()[0] in string.punctuation else _new_tail
251256
text = self.__combine_text(text, new_tail, language)
252257

253258
return text
254259

255260
if final := __get_paragraph_text_recusive(root, ''):
256-
para_text.append({'c': final, 't': ParagraphTextType.TEXT})
261+
para_text.append({'c': final.replace('$br$', PARAGRAPH_SEPARATOR), 't': ParagraphTextType.TEXT})
257262

258263
return para_text
259264

llm_web_kit/extractor/html/recognizer/title.py

Lines changed: 14 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,10 @@
88
from llm_web_kit.extractor.html.recognizer.recognizer import (
99
BaseHTMLElementRecognizer, CCTag)
1010
from llm_web_kit.libs.doc_element_type import DocElementType
11-
from llm_web_kit.libs.html_utils import process_sub_sup_tags
11+
from llm_web_kit.libs.html_utils import (html_normalize_space,
12+
process_sub_sup_tags)
13+
14+
from .text import PARAGRAPH_SEPARATOR
1215

1316

1417
class TitleRecognizer(BaseHTMLElementRecognizer):
@@ -124,16 +127,18 @@ def __extract_title_text_recusive(el: HtmlElement, with_tail: bool = True) -> li
124127

125128
if el.tag == CCTag.CC_CODE_INLINE:
126129
blks.append(f'`{el.text}`')
127-
elif el.tag in ['sub', 'sup']:
128-
# 使用process_sub_sup_tags保留原始的sub/sup标签
129-
processed_text = process_sub_sup_tags(el, '', 'en', True)
130-
if processed_text:
131-
blks.append(processed_text)
130+
elif el.tag in ['br']:
131+
blks.extend(['$br$'])
132132
else:
133-
blks.append((el.text or '').strip())
133+
if el.text and el.text.strip():
134+
_new_text = html_normalize_space(el.text.strip())
135+
blks.append(_new_text)
134136

135137
for child in el.getchildren():
136-
blks.extend(__extract_title_text_recusive(child))
138+
if child.tag == 'sub' or child.tag == 'sup':
139+
blks.extend([process_sub_sup_tags(child, '', recursive=False), child.tail])
140+
else:
141+
blks.extend(__extract_title_text_recusive(child))
137142

138143
if with_tail:
139144
blks.append((el.tail or '').strip())
@@ -143,7 +148,7 @@ def __extract_title_text_recusive(el: HtmlElement, with_tail: bool = True) -> li
143148
# 根元素不保留结尾
144149
blks = __extract_title_text_recusive(header_el, False)
145150

146-
return ' '.join(blk for blk in blks if blk)
151+
return ' '.join(blk for blk in blks if blk).replace('$br$', PARAGRAPH_SEPARATOR)
147152

148153
def __get_attribute(self, html:HtmlElement) -> Tuple[int, str]:
149154
"""获取element的属性."""

llm_web_kit/libs/html_utils.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import string
44
from copy import deepcopy
55

6+
from lxml import html as lxmlhtml
67
from lxml.html import HtmlElement, HTMLParser, fromstring, tostring
78

89
special_symbols = [ # TODO 从文件读取
@@ -430,3 +431,22 @@ def get_cc_select_html(element: HtmlElement) -> HtmlElement:
430431
container.append(elem_copy)
431432

432433
return container
434+
435+
436+
def html_normalize_space(text: str) -> str:
    """Normalize whitespace in a text string taken from an HTML node.

    Applies the same rules as XPath ``normalize-space()``: leading and
    trailing whitespace is removed and every internal run of space, tab,
    carriage return or line feed is collapsed to a single space.

    The input is treated as plain text and is never re-parsed as HTML, so
    content that merely looks like markup (``<div>``) or like an entity
    (``&amp;``) is preserved verbatim instead of being dropped or decoded
    a second time.

    Args:
        text: Text to normalize (typically ``element.text`` or
            ``element.tail``). ``None`` or an empty string yields ``''``.

    Returns:
        The whitespace-normalized text, or ``''`` when the input is empty
        or consists only of whitespace.
    """
    if not text:
        return ''
    # Keep the original outer strip(): it also trims Unicode whitespace such
    # as a non-breaking space at either end of the string.
    stripped = text.strip()
    if not stripped:
        return ''
    # XPath whitespace is exactly space, tab, CR and LF. Map the latter three
    # to a plain space so a single split/join collapses every run; characters
    # such as U+00A0 inside the text are deliberately left untouched.
    for ws_char in '\t\r\n':
        stripped = stripped.replace(ws_char, ' ')
    return ' '.join(piece for piece in stripped.split(' ') if piece)

0 commit comments

Comments
 (0)