@@ -90,6 +90,7 @@ def _do_extract(self, data_json: DataJson) -> DataJson:
90
90
raw_html :str = data_json ['html' ]
91
91
base_url :str = data_json ['url' ]
92
92
main_html :str = data_json ['main_html' ]
93
+ language :str = data_json .get ('language' , 'en' )
93
94
# page_layout_type:str = data_json.get('page_layout_type', HTMLPageLayoutType.LAYOUT_ARTICLE) # 默认是文章类型
94
95
95
96
# main_html, method, title = self._extract_main_html(raw_html, base_url, page_layout_type)
@@ -98,7 +99,7 @@ def _do_extract(self, data_json: DataJson) -> DataJson:
98
99
for extract_func in [self ._extract_code , self ._extract_table , self ._extract_math , self ._extract_list ,
99
100
self ._extract_image ,
100
101
self ._extract_title , self ._extract_paragraph ]:
101
- parsed_html = extract_func (base_url , parsed_html , raw_html )
102
+ parsed_html = extract_func (base_url , parsed_html , raw_html , language )
102
103
103
104
# 过滤掉包含script和style标签的元素,在这里改,是因为math提取需要保留script标签
104
105
filtered_parsed_html = []
@@ -111,7 +112,7 @@ def _do_extract(self, data_json: DataJson) -> DataJson:
111
112
# data_json['title'] = title
112
113
return data_json
113
114
114
- def _extract_code (self , base_url :str , html_lst :List [Tuple [HtmlElement , HtmlElement ]], raw_html :str ) -> List [Tuple [HtmlElement ,HtmlElement ]]:
115
+ def _extract_code (self , base_url :str , html_lst :List [Tuple [HtmlElement , HtmlElement ]], raw_html :str , language : str ) -> List [Tuple [HtmlElement ,HtmlElement ]]:
115
116
"""从html文本中提取代码.
116
117
117
118
Args:
@@ -121,10 +122,10 @@ def _extract_code(self, base_url:str, html_lst:List[Tuple[HtmlElement, HtmlEleme
121
122
Returns:
122
123
"""
123
124
124
- lst = self .__code_recognizer .recognize (base_url , html_lst , raw_html )
125
+ lst = self .__code_recognizer .recognize (base_url , html_lst , raw_html , language )
125
126
return lst
126
127
127
- def _extract_math (self , base_url :str , html_lst :List [Tuple [str ,str ]], raw_html :str ) -> List [Tuple [str ,str ]]:
128
+ def _extract_math (self , base_url :str , html_lst :List [Tuple [str ,str ]], raw_html :str , language : str ) -> List [Tuple [str ,str ]]:
128
129
"""从html文本中提取数学公式.
129
130
130
131
Args:
@@ -135,10 +136,10 @@ def _extract_math(self, base_url:str, html_lst:List[Tuple[str,str]], raw_html:st
135
136
Returns:
136
137
"""
137
138
138
- lst = self .__math_recognizer .recognize (base_url , html_lst , raw_html )
139
+ lst = self .__math_recognizer .recognize (base_url , html_lst , raw_html , language )
139
140
return lst
140
141
141
- def _extract_image (self , base_url :str , html_lst :List [Tuple [str ,str ]], raw_html :str ) -> List [Tuple [str ,str ]]:
142
+ def _extract_image (self , base_url :str , html_lst :List [Tuple [str ,str ]], raw_html :str , language : str ) -> List [Tuple [str ,str ]]:
142
143
"""从html文本中提取图片.
143
144
144
145
Args:
@@ -149,10 +150,10 @@ def _extract_image(self, base_url:str, html_lst:List[Tuple[str,str]], raw_html:s
149
150
Returns:
150
151
"""
151
152
152
- lst = self .__image_recognizer .recognize (base_url , html_lst , raw_html )
153
+ lst = self .__image_recognizer .recognize (base_url , html_lst , raw_html , language )
153
154
return lst
154
155
155
- def _extract_audio (self , base_url :str , html_lst :List [Tuple [str ,str ]], raw_html :str ) -> List [Tuple [str ,str ]]:
156
+ def _extract_audio (self , base_url :str , html_lst :List [Tuple [str ,str ]], raw_html :str , language : str ) -> List [Tuple [str ,str ]]:
156
157
"""从html文本中提取音频.
157
158
158
159
Args:
@@ -163,10 +164,10 @@ def _extract_audio(self, base_url:str, html_lst:List[Tuple[str,str]], raw_html:s
163
164
Returns:
164
165
"""
165
166
166
- lst = self .__audio_recognizer .recognize (base_url , html_lst , raw_html )
167
+ lst = self .__audio_recognizer .recognize (base_url , html_lst , raw_html , language )
167
168
return lst
168
169
169
- def _extract_video (self , base_url :str , html_lst :List [Tuple [str ,str ]], raw_html :str ) -> List [Tuple [str ,str ]]:
170
+ def _extract_video (self , base_url :str , html_lst :List [Tuple [str ,str ]], raw_html :str , language : str ) -> List [Tuple [str ,str ]]:
170
171
"""从html文本中提取视频.
171
172
172
173
Args:
@@ -177,10 +178,10 @@ def _extract_video(self, base_url:str, html_lst:List[Tuple[str,str]], raw_html:s
177
178
Returns:
178
179
"""
179
180
180
- lst = self .__video_recognizer .recognize (base_url , html_lst , raw_html )
181
+ lst = self .__video_recognizer .recognize (base_url , html_lst , raw_html , language )
181
182
return lst
182
183
183
- def _extract_table (self , base_url :str , html_lst :List [Tuple [str ,str ]], raw_html :str ) -> List [Tuple [str ,str ]]:
184
+ def _extract_table (self , base_url :str , html_lst :List [Tuple [str ,str ]], raw_html :str , language : str ) -> List [Tuple [str ,str ]]:
184
185
"""从html文本中提取表格.
185
186
186
187
Args:
@@ -191,10 +192,10 @@ def _extract_table(self, base_url:str, html_lst:List[Tuple[str,str]], raw_html:s
191
192
Returns:
192
193
"""
193
194
194
- lst = self .__table_recognizer .recognize (base_url , html_lst , raw_html )
195
+ lst = self .__table_recognizer .recognize (base_url , html_lst , raw_html , language )
195
196
return lst
196
197
197
- def _extract_list (self , base_url :str , html_lst :List [Tuple [str ,str ]], raw_html :str ) -> List [Tuple [str ,str ]]:
198
+ def _extract_list (self , base_url :str , html_lst :List [Tuple [str ,str ]], raw_html :str , language : str ) -> List [Tuple [str ,str ]]:
198
199
"""从html文本中提取列表.
199
200
200
201
Args:
@@ -205,10 +206,10 @@ def _extract_list(self, base_url:str, html_lst:List[Tuple[str,str]], raw_html:st
205
206
Returns:
206
207
"""
207
208
208
- lst = self .__list_recognizer .recognize (base_url , html_lst , raw_html )
209
+ lst = self .__list_recognizer .recognize (base_url , html_lst , raw_html , language )
209
210
return lst
210
211
211
- def _extract_title (self , base_url :str , html_lst :List [Tuple [str ,str ]], raw_html :str ) -> List [Tuple [str ,str ]]:
212
+ def _extract_title (self , base_url :str , html_lst :List [Tuple [str ,str ]], raw_html :str , language : str ) -> List [Tuple [str ,str ]]:
212
213
"""从html文本中提取标题.
213
214
214
215
Args:
@@ -219,10 +220,10 @@ def _extract_title(self, base_url:str, html_lst:List[Tuple[str,str]], raw_html:s
219
220
Returns:
220
221
"""
221
222
222
- lst = self .__title_recognizer .recognize (base_url , html_lst , raw_html )
223
+ lst = self .__title_recognizer .recognize (base_url , html_lst , raw_html , language )
223
224
return lst
224
225
225
- def _extract_paragraph (self , base_url :str , html_lst :List [Tuple [str ,str ]], raw_html :str ) -> List [Tuple [str ,str ]]:
226
+ def _extract_paragraph (self , base_url :str , html_lst :List [Tuple [str ,str ]], raw_html :str , language : str ) -> List [Tuple [str ,str ]]:
226
227
"""从html文本中提取段落.
227
228
228
229
Args:
@@ -233,7 +234,7 @@ def _extract_paragraph(self, base_url:str, html_lst:List[Tuple[str,str]], raw_ht
233
234
Returns:
234
235
"""
235
236
236
- lst = self .__paragraph_recognizer .recognize (base_url , html_lst , raw_html )
237
+ lst = self .__paragraph_recognizer .recognize (base_url , html_lst , raw_html , language )
237
238
return lst
238
239
239
240
def __is_valid_node (self , node : dict ) -> bool :
0 commit comments