Skip to content

Commit 6ceb74e

Browse files
committed
Merge remote-tracking branch 'origin/asciimath2' into asciimath2
# Conflicts:
#   llm_web_kit/extractor/html/recognizer/cc_math/render/mathjax.py
#   tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/math_test_ascii_delimiter.html
#   tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html_data_input.jsonl
#   tests/llm_web_kit/extractor/test_extractor_chain.py
2 parents 2c008b5 + 7479ec2 commit 6ceb74e

File tree

21 files changed

+1990
-180
lines changed

21 files changed

+1990
-180
lines changed

llm_web_kit/extractor/html/extractor.py

Lines changed: 20 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,7 @@ def _do_extract(self, data_json: DataJson) -> DataJson:
9090
raw_html:str = data_json['html']
9191
base_url:str = data_json['url']
9292
main_html:str = data_json['main_html']
93+
language:str = data_json.get('language', 'en')
9394
# page_layout_type:str = data_json.get('page_layout_type', HTMLPageLayoutType.LAYOUT_ARTICLE) # 默认是文章类型
9495

9596
# main_html, method, title = self._extract_main_html(raw_html, base_url, page_layout_type)
@@ -98,7 +99,7 @@ def _do_extract(self, data_json: DataJson) -> DataJson:
9899
for extract_func in [self._extract_code, self._extract_table, self._extract_math, self._extract_list,
99100
self._extract_image,
100101
self._extract_title, self._extract_paragraph]:
101-
parsed_html = extract_func(base_url, parsed_html, raw_html)
102+
parsed_html = extract_func(base_url, parsed_html, raw_html, language)
102103

103104
# 过滤掉包含script和style标签的元素,在这里改,是因为math提取需要保留script标签
104105
filtered_parsed_html = []
@@ -111,7 +112,7 @@ def _do_extract(self, data_json: DataJson) -> DataJson:
111112
# data_json['title'] = title
112113
return data_json
113114

114-
def _extract_code(self, base_url:str, html_lst:List[Tuple[HtmlElement, HtmlElement]], raw_html:str) -> List[Tuple[HtmlElement,HtmlElement]]:
115+
def _extract_code(self, base_url:str, html_lst:List[Tuple[HtmlElement, HtmlElement]], raw_html:str, language:str) -> List[Tuple[HtmlElement,HtmlElement]]:
115116
"""从html文本中提取代码.
116117
117118
Args:
@@ -121,10 +122,10 @@ def _extract_code(self, base_url:str, html_lst:List[Tuple[HtmlElement, HtmlEleme
121122
Returns:
122123
"""
123124

124-
lst = self.__code_recognizer.recognize(base_url, html_lst, raw_html)
125+
lst = self.__code_recognizer.recognize(base_url, html_lst, raw_html, language)
125126
return lst
126127

127-
def _extract_math(self, base_url:str, html_lst:List[Tuple[str,str]], raw_html:str) -> List[Tuple[str,str]]:
128+
def _extract_math(self, base_url:str, html_lst:List[Tuple[str,str]], raw_html:str, language:str) -> List[Tuple[str,str]]:
128129
"""从html文本中提取数学公式.
129130
130131
Args:
@@ -135,10 +136,10 @@ def _extract_math(self, base_url:str, html_lst:List[Tuple[str,str]], raw_html:st
135136
Returns:
136137
"""
137138

138-
lst = self.__math_recognizer.recognize(base_url, html_lst, raw_html)
139+
lst = self.__math_recognizer.recognize(base_url, html_lst, raw_html, language)
139140
return lst
140141

141-
def _extract_image(self, base_url:str, html_lst:List[Tuple[str,str]], raw_html:str) -> List[Tuple[str,str]]:
142+
def _extract_image(self, base_url:str, html_lst:List[Tuple[str,str]], raw_html:str, language:str) -> List[Tuple[str,str]]:
142143
"""从html文本中提取图片.
143144
144145
Args:
@@ -149,10 +150,10 @@ def _extract_image(self, base_url:str, html_lst:List[Tuple[str,str]], raw_html:s
149150
Returns:
150151
"""
151152

152-
lst = self.__image_recognizer.recognize(base_url, html_lst, raw_html)
153+
lst = self.__image_recognizer.recognize(base_url, html_lst, raw_html, language)
153154
return lst
154155

155-
def _extract_audio(self, base_url:str, html_lst:List[Tuple[str,str]], raw_html:str) -> List[Tuple[str,str]]:
156+
def _extract_audio(self, base_url:str, html_lst:List[Tuple[str,str]], raw_html:str, language:str) -> List[Tuple[str,str]]:
156157
"""从html文本中提取音频.
157158
158159
Args:
@@ -163,10 +164,10 @@ def _extract_audio(self, base_url:str, html_lst:List[Tuple[str,str]], raw_html:s
163164
Returns:
164165
"""
165166

166-
lst = self.__audio_recognizer.recognize(base_url, html_lst, raw_html)
167+
lst = self.__audio_recognizer.recognize(base_url, html_lst, raw_html, language)
167168
return lst
168169

169-
def _extract_video(self, base_url:str, html_lst:List[Tuple[str,str]], raw_html:str) -> List[Tuple[str,str]]:
170+
def _extract_video(self, base_url:str, html_lst:List[Tuple[str,str]], raw_html:str, language:str) -> List[Tuple[str,str]]:
170171
"""从html文本中提取视频.
171172
172173
Args:
@@ -177,10 +178,10 @@ def _extract_video(self, base_url:str, html_lst:List[Tuple[str,str]], raw_html:s
177178
Returns:
178179
"""
179180

180-
lst = self.__video_recognizer.recognize(base_url, html_lst, raw_html)
181+
lst = self.__video_recognizer.recognize(base_url, html_lst, raw_html, language)
181182
return lst
182183

183-
def _extract_table(self, base_url:str, html_lst:List[Tuple[str,str]], raw_html:str) -> List[Tuple[str,str]]:
184+
def _extract_table(self, base_url:str, html_lst:List[Tuple[str,str]], raw_html:str, language:str) -> List[Tuple[str,str]]:
184185
"""从html文本中提取表格.
185186
186187
Args:
@@ -191,10 +192,10 @@ def _extract_table(self, base_url:str, html_lst:List[Tuple[str,str]], raw_html:s
191192
Returns:
192193
"""
193194

194-
lst = self.__table_recognizer.recognize(base_url, html_lst, raw_html)
195+
lst = self.__table_recognizer.recognize(base_url, html_lst, raw_html, language)
195196
return lst
196197

197-
def _extract_list(self, base_url:str, html_lst:List[Tuple[str,str]], raw_html:str) -> List[Tuple[str,str]]:
198+
def _extract_list(self, base_url:str, html_lst:List[Tuple[str,str]], raw_html:str, language:str) -> List[Tuple[str,str]]:
198199
"""从html文本中提取列表.
199200
200201
Args:
@@ -205,10 +206,10 @@ def _extract_list(self, base_url:str, html_lst:List[Tuple[str,str]], raw_html:st
205206
Returns:
206207
"""
207208

208-
lst = self.__list_recognizer.recognize(base_url, html_lst, raw_html)
209+
lst = self.__list_recognizer.recognize(base_url, html_lst, raw_html, language)
209210
return lst
210211

211-
def _extract_title(self, base_url:str, html_lst:List[Tuple[str,str]], raw_html:str) -> List[Tuple[str,str]]:
212+
def _extract_title(self, base_url:str, html_lst:List[Tuple[str,str]], raw_html:str, language:str) -> List[Tuple[str,str]]:
212213
"""从html文本中提取标题.
213214
214215
Args:
@@ -219,10 +220,10 @@ def _extract_title(self, base_url:str, html_lst:List[Tuple[str,str]], raw_html:s
219220
Returns:
220221
"""
221222

222-
lst = self.__title_recognizer.recognize(base_url, html_lst, raw_html)
223+
lst = self.__title_recognizer.recognize(base_url, html_lst, raw_html, language)
223224
return lst
224225

225-
def _extract_paragraph(self, base_url:str, html_lst:List[Tuple[str,str]], raw_html:str) -> List[Tuple[str,str]]:
226+
def _extract_paragraph(self, base_url:str, html_lst:List[Tuple[str,str]], raw_html:str, language:str) -> List[Tuple[str,str]]:
226227
"""从html文本中提取段落.
227228
228229
Args:
@@ -233,7 +234,7 @@ def _extract_paragraph(self, base_url:str, html_lst:List[Tuple[str,str]], raw_ht
233234
Returns:
234235
"""
235236

236-
lst = self.__paragraph_recognizer.recognize(base_url, html_lst, raw_html)
237+
lst = self.__paragraph_recognizer.recognize(base_url, html_lst, raw_html, language)
237238
return lst
238239

239240
def __is_valid_node(self, node: dict) -> bool:

llm_web_kit/extractor/html/recognizer/audio.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
class AudioRecognizer(BaseHTMLElementRecognizer):
1111
"""解析音频元素."""
1212
@override
13-
def recognize(self, base_url:str, main_html_lst: List[Tuple[HtmlElement,HtmlElement]], raw_html:str) -> List[Tuple[HtmlElement,HtmlElement]]:
13+
def recognize(self, base_url:str, main_html_lst: List[Tuple[HtmlElement,HtmlElement]], raw_html:str, language:str = 'en') -> List[Tuple[HtmlElement,HtmlElement]]:
1414
"""父类,解析音频元素.
1515
1616
Args:

llm_web_kit/extractor/html/recognizer/cccode.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,8 @@ def recognize(
2727
self,
2828
base_url: str,
2929
main_html_lst: List[Tuple[HtmlElement, HtmlElement]],
30-
raw_html: str
30+
raw_html: str,
31+
language:str = 'en'
3132
) -> List[Tuple[HtmlElement, HtmlElement]]:
3233
"""父类,解析代码元素.
3334

llm_web_kit/extractor/html/recognizer/ccmath.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@ def __init__(self):
2626
self.cm = CCMATH()
2727

2828
@override
29-
def recognize(self, base_url: str, main_html_lst: List[Tuple[HtmlElement, HtmlElement]], raw_html: str) -> List[Tuple[HtmlElement, HtmlElement]]:
29+
def recognize(self, base_url: str, main_html_lst: List[Tuple[HtmlElement, HtmlElement]], raw_html: str, language:str = 'en') -> List[Tuple[HtmlElement, HtmlElement]]:
3030
"""父类,解析数学公式元素.
3131
3232
Args:

llm_web_kit/extractor/html/recognizer/image.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ def __ccimg_to_content_list(self, raw_html_segment: str, html_obj: HtmlElement)
6767
return result
6868

6969
@override
70-
def recognize(self, base_url: str, main_html_lst: List[Tuple[HtmlElement, HtmlElement]], raw_html: str) -> List[
70+
def recognize(self, base_url: str, main_html_lst: List[Tuple[HtmlElement, HtmlElement]], raw_html: str, language:str = 'en') -> List[
7171
Tuple[HtmlElement, HtmlElement]]:
7272
"""父类,解析图片元素.
7373

llm_web_kit/extractor/html/recognizer/list.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_h
4848
return ele_node
4949

5050
@override
51-
def recognize(self, base_url: str, main_html_lst: List[Tuple[HtmlElement, HtmlElement]], raw_html: str) -> List[Tuple[HtmlElement, HtmlElement]]:
51+
def recognize(self, base_url: str, main_html_lst: List[Tuple[HtmlElement, HtmlElement]], raw_html: str, language:str = 'en') -> List[Tuple[HtmlElement, HtmlElement]]:
5252
"""父类,解析列表元素.
5353
5454
Args:

llm_web_kit/extractor/html/recognizer/recognizer.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ class BaseHTMLElementRecognizer(ABC):
2929

3030
"""基本的元素解析类."""
3131
@abstractmethod
32-
def recognize(self, base_url:str, main_html_lst: List[Tuple[HtmlElement, HtmlElement]], raw_html:str) -> List[Tuple[HtmlElement, HtmlElement]]:
32+
def recognize(self, base_url:str, main_html_lst: List[Tuple[HtmlElement, HtmlElement]], raw_html:str, language:str) -> List[Tuple[HtmlElement, HtmlElement]]:
3333
"""父类,解析html中的元素.
3434
3535
Args:

llm_web_kit/extractor/html/recognizer/table.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,8 @@ def __init__(self):
2424
def recognize(self,
2525
base_url: str,
2626
main_html_lst: List[Tuple[HtmlElement, HtmlElement]],
27-
raw_html: str) -> List[Tuple[HtmlElement, HtmlElement]]:
27+
raw_html: str,
28+
language:str = 'en') -> List[Tuple[HtmlElement, HtmlElement]]:
2829
"""父类,解析表格元素.
2930
3031
Args:
@@ -256,7 +257,8 @@ def __simplify_td_th_content(self, table_nest_level, elem: HtmlElement) -> None:
256257
def __get_table_body(self, table_type, table_nest_level, table_root):
257258
"""获取并处理table body,返回处理后的HTML字符串。"""
258259
if table_type == 'empty':
259-
return None
260+
content = table_root.text_content()
261+
return content
260262
allowed_attributes = ['colspan', 'rowspan']
261263
# 清理除了colspan和rowspan之外的属性
262264
if len(table_root.attrib) > 0:

llm_web_kit/extractor/html/recognizer/text.py

Lines changed: 14 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,9 @@
6767
'mjx-container', 'mjx-assistive-mml', 'strike', 'wbr', 'ins'
6868
}
6969

70+
# 词间无分隔符的语言
71+
no_separation_language = ['zh', 'ja', 'ko', 'wuu', 'th', 'km', 'lo', 'bo', 'ii', 'jv']
72+
7073

7174
class TextParagraphRecognizer(BaseHTMLElementRecognizer):
7275
"""解析文本段落元素."""
@@ -93,7 +96,7 @@ def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_h
9396
return node
9497

9598
@override
96-
def recognize(self, base_url:str, main_html_lst: List[Tuple[HtmlElement | str, HtmlElement | str]], raw_html:str) -> List[Tuple[HtmlElement, HtmlElement]]:
99+
def recognize(self, base_url:str, main_html_lst: List[Tuple[HtmlElement | str, HtmlElement | str]], raw_html:str, language:str = 'en') -> List[Tuple[HtmlElement, HtmlElement]]:
97100
"""父类,解析文本段落元素.
98101
99102
Args:
@@ -111,11 +114,11 @@ def recognize(self, base_url:str, main_html_lst: List[Tuple[HtmlElement | str, H
111114
new_html_lst.append((html_element, raw_html_element))
112115
else:
113116
lst = list(self.__extract_paragraphs(html_element))
114-
new_lst = self.__to_cctext_lst(lst)
117+
new_lst = self.__to_cctext_lst(lst, language)
115118
new_html_lst.extend(new_lst)
116119
return new_html_lst
117120

118-
def __to_cctext_lst(self, lst: List[Tuple[HtmlElement | str, HtmlElement | str]]) -> List[Tuple[HtmlElement, HtmlElement]]:
121+
def __to_cctext_lst(self, lst: List[Tuple[HtmlElement | str, HtmlElement | str]], language:str) -> List[Tuple[HtmlElement, HtmlElement]]:
119122
"""将lst[Element, raw_html] 进行处理. 提出Element里的文字,做成<<cctext>>标签.
120123
121124
Args:
@@ -129,7 +132,7 @@ def __to_cctext_lst(self, lst: List[Tuple[HtmlElement | str, HtmlElement | str]]
129132
el_element = html_to_element(el) if isinstance(el, str) else el
130133
raw_html_element = html_to_element(raw_html) if isinstance(raw_html, str) else raw_html
131134

132-
para_text = self.__get_paragraph_text(el_element)
135+
para_text = self.__get_paragraph_text(el_element, language)
133136
if para_text:
134137
cctext_el = self._build_cc_element(CCTag.CC_TEXT, json.dumps(para_text, ensure_ascii=False, indent=4), '', html=element_to_html_unescaped(raw_html_element))
135138
new_lst.append((cctext_el, raw_html_element))
@@ -185,20 +188,20 @@ def __combine_text(self, text1:str, text2:str, lang='en') -> str:
185188
lang: str: 语言 TODO 实现根据语言连接文本的不同方式, 还有就是一些特殊符号开头的连接不加空格。
186189
"""
187190
text1 = text1.strip(' ') if text1 else ''
188-
text2 = text2.strip(' ') if text2 else ''
189-
if lang == 'zh':
191+
text2 = text2.rstrip(' ') if text2 else ''
192+
if lang in no_separation_language:
190193
txt = text1 + text2
191194
return self.replace_entities(txt.strip(), entities_map)
192195
else:
193196
# 根据text1的最后一个字符和text2的第一个字符判断两个text之间的连接
194197
if (text2[0] in string.punctuation) or (text2[0] in special_symbols) or (text2[0] in other_symbols) or (text1 and text1[-1] in other_symbols):
195198
words_sep = ''
196-
else :
199+
else:
197200
words_sep = ' '
198201
txt = text1 + words_sep + text2
199202
return self.replace_entities(txt.strip(), entities_map)
200203

201-
def __get_paragraph_text(self, root: HtmlElement) -> List[dict]:
204+
def __get_paragraph_text(self, root: HtmlElement, language:str = 'en') -> List[dict]:
202205
"""
203206
获取段落全部的文本.
204207
对于段落里的行内公式<equation-inline>需要特定处理,转换为段落格式:
@@ -235,7 +238,7 @@ def __get_paragraph_text_recusive(el: HtmlElement, text: str) -> str:
235238
pass
236239
else:
237240
if el.text and el.text.strip():
238-
text = self.__combine_text(text, el.text.strip())
241+
text = self.__combine_text(text, el.text.strip(), language)
239242
for child in el:
240243
text = __get_paragraph_text_recusive(child, text)
241244

@@ -244,7 +247,8 @@ def __get_paragraph_text_recusive(el: HtmlElement, text: str) -> str:
244247
if is_sub_sup:
245248
text += el.tail
246249
else:
247-
text = self.__combine_text(text, el.tail.strip())
250+
new_tail = f' {el.tail.strip()}' if el.tail.startswith(' ') and el.tail.strip()[0] in string.punctuation else el.tail.strip()
251+
text = self.__combine_text(text, new_tail, language)
248252

249253
return text
250254

llm_web_kit/extractor/html/recognizer/title.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ def to_content_list_node(self, base_url: str, parsed_content: HtmlElement, raw_h
3939
return cctitle_content_node
4040

4141
@override
42-
def recognize(self, base_url: str, main_html_lst: List[Tuple[HtmlElement, HtmlElement]], raw_html: str) -> List[Tuple[HtmlElement, HtmlElement]]:
42+
def recognize(self, base_url: str, main_html_lst: List[Tuple[HtmlElement, HtmlElement]], raw_html: str, language:str = 'en') -> List[Tuple[HtmlElement, HtmlElement]]:
4343
"""父类,解析标题元素.
4444
4545
Args:

0 commit comments

Comments
 (0)