Skip to content

Commit 8c8363a

Browse files
authored
<fix>: fix table tag integrity (#497)
1 parent 1256ebd commit 8c8363a

File tree

5 files changed

+894
-28
lines changed

5 files changed

+894
-28
lines changed

llm_web_kit/main_html_parser/parser/layout_batch_parser.py

Lines changed: 25 additions & 13 deletions
Original file line number | Diff line number | Diff line change
@@ -154,18 +154,20 @@ def find_blocks_drop(self, element, depth, element_dict, parent_keyy, parent_lab
154154
# 匹配正文节点
155155
has_red = False
156156
layer_nodes_dict = dict()
157+
layer_nodes_dict_drop_tail = dict()
157158
layer_norm_eles = {}
158159
# 构造当前层的候选映射字典
159160
for ele_keyy, ele_value in layer_nodes.items():
160161
ele_parent_keyy = self.normalize_key(ele_value[1])
161162
if ele_parent_keyy is not None:
162163
ele_parent_keyy = tuple(ele_parent_keyy)
163164
ele_label = ele_value[0]
165+
is_drop_tail = ele_value[3]
164166
norm_ele_keyy = self.normalize_key(ele_keyy[:3])
165167
if norm_ele_keyy in layer_norm_eles:
166-
layer_norm_eles[norm_ele_keyy].append((ele_label, ele_keyy[:3], ele_parent_keyy))
168+
layer_norm_eles[norm_ele_keyy].append((ele_label, ele_keyy[:3], ele_parent_keyy, is_drop_tail))
167169
else:
168-
layer_norm_eles[norm_ele_keyy] = [(ele_label, ele_keyy[:3], ele_parent_keyy)]
170+
layer_norm_eles[norm_ele_keyy] = [(ele_label, ele_keyy[:3], ele_parent_keyy, is_drop_tail)]
169171
# 尝试匹配当前层每个节点,判断是否存在至少一个红色节点
170172
for current_layer_key, current_layer_value in current_layer_keys.items():
171173
current_layer_ori_key = current_layer_value[0]
@@ -175,20 +177,22 @@ def find_blocks_drop(self, element, depth, element_dict, parent_keyy, parent_lab
175177
if layer_norm_ele_value[2] != parent_keyy:
176178
continue
177179
node_label = layer_norm_ele_value[0]
178-
180+
is_drop_tail = layer_norm_ele_value[3]
179181
if current_layer_key in layer_nodes_dict:
180182
layer_nodes_dict[current_layer_key].append(node_label)
183+
layer_nodes_dict_drop_tail[current_layer_key].append(is_drop_tail)
181184
else:
182185
layer_nodes_dict[current_layer_key] = [node_label]
186+
layer_nodes_dict_drop_tail[current_layer_key] = [is_drop_tail]
183187
if node_label == 'red':
184188
has_red = True
185189
break
186190
# 动态id匹配逻辑
187191
elif self.dynamic_id_enable and current_layer_key[2]:
188-
node_label, matched_ele_key = self.__match_tag_class(layer_nodes, current_layer_ori_key, parent_keyy,
192+
node_label, matched_ele_key, is_drop_tail = self.__match_tag_class(layer_nodes, current_layer_ori_key, parent_keyy,
189193
node_html, template_doc)
190194
if node_label is None and self.dynamic_classid_enable:
191-
node_label, matched_ele_key = self.__match_tag(layer_nodes, current_layer_ori_key, parent_keyy,
195+
node_label, matched_ele_key, is_drop_tail = self.__match_tag(layer_nodes, current_layer_ori_key, parent_keyy,
192196
node_html,
193197
template_doc, False, True)
194198
if node_label is None:
@@ -199,13 +203,14 @@ def find_blocks_drop(self, element, depth, element_dict, parent_keyy, parent_lab
199203
element.set('id', matched_ele_key[2])
200204
if current_layer_key in layer_nodes_dict:
201205
layer_nodes_dict[matched_ele_key].append(node_label)
206+
layer_nodes_dict_drop_tail[matched_ele_key].append(is_drop_tail)
202207
else:
203208
layer_nodes_dict[matched_ele_key] = [node_label]
204-
209+
layer_nodes_dict_drop_tail[matched_ele_key] = [is_drop_tail]
205210
if node_label == 'red':
206211
has_red = True
207212
elif self.dynamic_id_enable and self.dynamic_classid_enable and current_layer_key[1]:
208-
node_label, matched_ele_key = self.__match_tag(layer_nodes, current_layer_ori_key, parent_keyy,
213+
node_label, matched_ele_key, is_drop_tail = self.__match_tag(layer_nodes, current_layer_ori_key, parent_keyy,
209214
node_html,
210215
template_doc, True, False)
211216
if node_label is None:
@@ -216,11 +221,12 @@ def find_blocks_drop(self, element, depth, element_dict, parent_keyy, parent_lab
216221
element.set('class', matched_ele_key[1])
217222
if current_layer_key in layer_nodes_dict:
218223
layer_nodes_dict[matched_ele_key].append(node_label)
224+
layer_nodes_dict_drop_tail[matched_ele_key].append(is_drop_tail)
219225
else:
220226
layer_nodes_dict[matched_ele_key] = [node_label]
227+
layer_nodes_dict_drop_tail[matched_ele_key] = [is_drop_tail]
221228
if node_label == 'red':
222229
has_red = True
223-
224230
if not has_red and parent_label != 'red':
225231
parent = element.getparent()
226232
if parent is not None:
@@ -244,6 +250,10 @@ def find_blocks_drop(self, element, depth, element_dict, parent_keyy, parent_lab
244250
return
245251
else:
246252
label = 'red'
253+
# 正文节点情况下还需要判断tail是否是正文
254+
if False not in layer_nodes_dict_drop_tail[keyy]:
255+
if element.tail:
256+
element.tail = None
247257
elif length > 0 or length_tail > 0 or tag in ['figure', 'img']:
248258
return
249259

@@ -321,6 +331,7 @@ def __match_tag_class(self, layer_nodes, current_layer_key, parent_key, node_htm
321331
if ele_parent_keyy is not None:
322332
ele_parent_keyy = tuple(ele_parent_keyy)
323333
ele_label = ele_value[0]
334+
is_drop_tail = ele_value[3]
324335
norm_ele_keyy = self.normalize_key((ele_keyy[0], ele_keyy[1], None))
325336
norm_ele_keyy_parent = (norm_ele_keyy, ele_parent_keyy)
326337
if current_norm_key == norm_ele_keyy_parent:
@@ -335,10 +346,10 @@ def __match_tag_class(self, layer_nodes, current_layer_key, parent_key, node_htm
335346
template_sim = similarity(feature1, feature2, layer_n=3)
336347

337348
if template_sim > DYNAMIC_ID_SIM_THRESHOLD:
338-
return ele_label, self.normalize_key(ele_keyy[0:3])
349+
return ele_label, self.normalize_key(ele_keyy[0:3]), is_drop_tail
339350
# else:
340351
# logger.info(f'{current_layer_key} and {ele_keyy} similarity is {template_sim}')
341-
return None, None
352+
return None, None, None
342353

343354
def __match_tag(self, layer_nodes, current_layer_key, parent_key, node_html, template_doc, class_must=False,
344355
id_exist=False):
@@ -357,6 +368,7 @@ def __match_tag(self, layer_nodes, current_layer_key, parent_key, node_html, tem
357368
if ele_parent_keyy is not None:
358369
ele_parent_keyy = tuple(ele_parent_keyy)
359370
ele_label = ele_value[0]
371+
is_drop_tail = ele_value[3]
360372
norm_ele_keyy = self.normalize_key((ele_keyy[0], None, None))
361373
norm_ele_keyy_parent = (norm_ele_keyy, ele_parent_keyy)
362374
# 相似度方案
@@ -370,14 +382,14 @@ def __match_tag(self, layer_nodes, current_layer_key, parent_key, node_html, tem
370382
continue
371383
template_sim = similarity(feature1, feature2, layer_n=3)
372384
if template_sim >= self.dynamic_classid_similarity_threshold:
373-
return ele_label, self.normalize_key(ele_keyy[0:3])
385+
return ele_label, self.normalize_key(ele_keyy[0:3]), is_drop_tail
374386
# first class方案
375387
norm_ele_keyy_with_first_class = self.normalize_key((ele_keyy[0], ele_keyy[1].strip().split(' ')[0], None))
376388
norm_ele_keyy_parent_with_first_class = (norm_ele_keyy_with_first_class, ele_parent_keyy)
377389
if current_norm_key_with_first_class == norm_ele_keyy_parent_with_first_class:
378-
return ele_label, self.normalize_key(ele_keyy[0:3])
390+
return ele_label, self.normalize_key(ele_keyy[0:3]), is_drop_tail
379391

380-
return None, None
392+
return None, None, None
381393

382394
def __is_natural_language(self, text, min_words=3):
383395
"""判断文本是否像自然语言.

llm_web_kit/main_html_parser/parser/tag_mapping.py

Lines changed: 22 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -124,27 +124,36 @@ def deal_element_direct(self, item_id, test_root):
124124
elements = test_root.xpath(f'//*[@_item_id="{item_id}"]')
125125
deal_element = elements[0]
126126
deal_element.set('magic_main_html', 'True')
127+
for ele in deal_element:
128+
ele.set('magic_main_html', 'True')
127129

128130
def find_affected_element_after_drop(self, element):
# Drop ``element``'s own tag with drop_tag() and report which neighbouring node
# is affected by the removal.
#
# Post-change return value is a 3-tuple ``(node, flag_a, flag_b)``:
#   * ``node``   - the previous sibling when one exists, otherwise the parent
#                  (always the parent when the element has children but no
#                  non-blank text);
#   * ``flag_a`` - ``not is_main`` when ``node`` is the parent, else False;
#   * ``flag_b`` - ``not is_main`` when ``node`` is the previous sibling, else False.
# Per the inline comment further down, the two flags mean "drop text" and
# "drop tail" for the returned node.
#
# Side effect: when the element carries ``magic_main_html`` and has children,
# every direct child is tagged ``magic_main_html='True'`` before the tag is dropped.
#
# NOTE(review): drop_tag() looks like lxml.html's HtmlElement.drop_tag - confirm.
# prev_sibling / parent are captured before the drop; presumably because the
# element is detached from the tree afterwards - verify.
129131
prev_sibling = element.getprevious()
130132
parent = element.getparent()
131-
133+
is_main = bool(element.get('magic_main_html', None))
132134
# Element wraps child nodes: the affected node is the element's parent
# (or the previous sibling when the element also has non-blank text, see below)
133135
if len(element) > 0:
134-
if element.get('magic_main_html', None):
136+
if is_main:
135137
for ele in element:
136138
ele.set('magic_main_html', 'True')
137139

138140
element.drop_tag()
139-
return parent
141+
# If the element contains child tags and also has text, that text may have become the sibling's tail
142+
if element.text and element.text.strip():
143+
if prev_sibling is not None:
144+
# Returned flags: whether to drop the sibling's text, whether to drop its tail
145+
return prev_sibling, False, not is_main
146+
else:
147+
return parent, not is_main, False
148+
return parent, False, False
140149

141150
# Text-only case: return the element's previous sibling, or the parent when there is none
142151
element.drop_tag()
143152

144153
if prev_sibling is not None:
145-
return prev_sibling
154+
return prev_sibling, False, not is_main
146155
else:
147-
return parent
156+
return parent, not is_main, False
148157

149158
def process_element(self, element):
150159
# 前序遍历元素树(先处理子元素)
@@ -154,9 +163,13 @@ def process_element(self, element):
154163
# 如果是cc-alg-uc-text标签,用drop_tag()删除标签但保留子元素
155164
if element.tag == 'cc-alg-uc-text':
156165
is_main = element.get('magic_main_html', None)
157-
affected = self.find_affected_element_after_drop(element)
166+
affected, drop_text, drop_tail = self.find_affected_element_after_drop(element)
158167
if is_main:
159168
affected.set('magic_main_html', 'True')
169+
if drop_text:
170+
affected.set('drop_text', 'True')
171+
if drop_tail:
172+
affected.set('drop_tail', 'True')
160173

161174
return
162175

@@ -234,6 +247,7 @@ def process_main_tree(self, element, depth, layer_index_counter, all_dict, all_s
234247
all_dict[depth] = {}
235248
all_set[depth] = {}
236249
is_main_html = element.get('magic_main_html', None)
250+
is_drop_tail = element.get('drop_tail', None)
237251
current_dict = all_dict[depth]
238252
current_set = all_set[depth]
239253
tag = element.tag
@@ -260,10 +274,10 @@ def process_main_tree(self, element, depth, layer_index_counter, all_dict, all_s
260274
# 写入该层元素key,如果有重复的green节点,只保留一个
261275
if keyy_for_sim in current_set:
262276
if is_main_html and current_set[keyy_for_sim][0] == 'green':
263-
current_dict[keyy] = ('red', parent_keyy, xpath)
277+
current_dict[keyy] = ('red', parent_keyy, xpath, bool(is_drop_tail))
264278
current_set[keyy_for_sim] = ('red', parent_keyy)
265279
else:
266-
current_dict[keyy] = (color, parent_keyy, xpath)
280+
current_dict[keyy] = (color, parent_keyy, xpath, bool(is_drop_tail))
267281
current_set[keyy_for_sim] = (color, parent_keyy)
268282

269283
for ele in element:

0 commit comments

Comments
 (0)