@@ -154,18 +154,20 @@ def find_blocks_drop(self, element, depth, element_dict, parent_keyy, parent_lab
154
154
# 匹配正文节点
155
155
has_red = False
156
156
layer_nodes_dict = dict ()
157
+ layer_nodes_dict_drop_tail = dict ()
157
158
layer_norm_eles = {}
158
159
# 构造当前层的候选映射字典
159
160
for ele_keyy , ele_value in layer_nodes .items ():
160
161
ele_parent_keyy = self .normalize_key (ele_value [1 ])
161
162
if ele_parent_keyy is not None :
162
163
ele_parent_keyy = tuple (ele_parent_keyy )
163
164
ele_label = ele_value [0 ]
165
+ is_drop_tail = ele_value [3 ]
164
166
norm_ele_keyy = self .normalize_key (ele_keyy [:3 ])
165
167
if norm_ele_keyy in layer_norm_eles :
166
- layer_norm_eles [norm_ele_keyy ].append ((ele_label , ele_keyy [:3 ], ele_parent_keyy ))
168
+ layer_norm_eles [norm_ele_keyy ].append ((ele_label , ele_keyy [:3 ], ele_parent_keyy , is_drop_tail ))
167
169
else :
168
- layer_norm_eles [norm_ele_keyy ] = [(ele_label , ele_keyy [:3 ], ele_parent_keyy )]
170
+ layer_norm_eles [norm_ele_keyy ] = [(ele_label , ele_keyy [:3 ], ele_parent_keyy , is_drop_tail )]
169
171
# 尝试匹配当前层每个节点,判断是否存在至少一个红色节点
170
172
for current_layer_key , current_layer_value in current_layer_keys .items ():
171
173
current_layer_ori_key = current_layer_value [0 ]
@@ -175,20 +177,22 @@ def find_blocks_drop(self, element, depth, element_dict, parent_keyy, parent_lab
175
177
if layer_norm_ele_value [2 ] != parent_keyy :
176
178
continue
177
179
node_label = layer_norm_ele_value [0 ]
178
-
180
+ is_drop_tail = layer_norm_ele_value [ 3 ]
179
181
if current_layer_key in layer_nodes_dict :
180
182
layer_nodes_dict [current_layer_key ].append (node_label )
183
+ layer_nodes_dict_drop_tail [current_layer_key ].append (is_drop_tail )
181
184
else :
182
185
layer_nodes_dict [current_layer_key ] = [node_label ]
186
+ layer_nodes_dict_drop_tail [current_layer_key ] = [is_drop_tail ]
183
187
if node_label == 'red' :
184
188
has_red = True
185
189
break
186
190
# 动态id匹配逻辑
187
191
elif self .dynamic_id_enable and current_layer_key [2 ]:
188
- node_label , matched_ele_key = self .__match_tag_class (layer_nodes , current_layer_ori_key , parent_keyy ,
192
+ node_label , matched_ele_key , is_drop_tail = self .__match_tag_class (layer_nodes , current_layer_ori_key , parent_keyy ,
189
193
node_html , template_doc )
190
194
if node_label is None and self .dynamic_classid_enable :
191
- node_label , matched_ele_key = self .__match_tag (layer_nodes , current_layer_ori_key , parent_keyy ,
195
+ node_label , matched_ele_key , is_drop_tail = self .__match_tag (layer_nodes , current_layer_ori_key , parent_keyy ,
192
196
node_html ,
193
197
template_doc , False , True )
194
198
if node_label is None :
@@ -199,13 +203,14 @@ def find_blocks_drop(self, element, depth, element_dict, parent_keyy, parent_lab
199
203
element .set ('id' , matched_ele_key [2 ])
200
204
if current_layer_key in layer_nodes_dict :
201
205
layer_nodes_dict [matched_ele_key ].append (node_label )
206
+ layer_nodes_dict_drop_tail [matched_ele_key ].append (is_drop_tail )
202
207
else :
203
208
layer_nodes_dict [matched_ele_key ] = [node_label ]
204
-
209
+ layer_nodes_dict_drop_tail [ matched_ele_key ] = [ is_drop_tail ]
205
210
if node_label == 'red' :
206
211
has_red = True
207
212
elif self .dynamic_id_enable and self .dynamic_classid_enable and current_layer_key [1 ]:
208
- node_label , matched_ele_key = self .__match_tag (layer_nodes , current_layer_ori_key , parent_keyy ,
213
+ node_label , matched_ele_key , is_drop_tail = self .__match_tag (layer_nodes , current_layer_ori_key , parent_keyy ,
209
214
node_html ,
210
215
template_doc , True , False )
211
216
if node_label is None :
@@ -216,11 +221,12 @@ def find_blocks_drop(self, element, depth, element_dict, parent_keyy, parent_lab
216
221
element .set ('class' , matched_ele_key [1 ])
217
222
if current_layer_key in layer_nodes_dict :
218
223
layer_nodes_dict [matched_ele_key ].append (node_label )
224
+ layer_nodes_dict_drop_tail [matched_ele_key ].append (is_drop_tail )
219
225
else :
220
226
layer_nodes_dict [matched_ele_key ] = [node_label ]
227
+ layer_nodes_dict_drop_tail [matched_ele_key ] = [is_drop_tail ]
221
228
if node_label == 'red' :
222
229
has_red = True
223
-
224
230
if not has_red and parent_label != 'red' :
225
231
parent = element .getparent ()
226
232
if parent is not None :
@@ -244,6 +250,10 @@ def find_blocks_drop(self, element, depth, element_dict, parent_keyy, parent_lab
244
250
return
245
251
else :
246
252
label = 'red'
253
+ # 正文节点情况下还需要判断tail是否是正文
254
+ if False not in layer_nodes_dict_drop_tail [keyy ]:
255
+ if element .tail :
256
+ element .tail = None
247
257
elif length > 0 or length_tail > 0 or tag in ['figure' , 'img' ]:
248
258
return
249
259
@@ -321,6 +331,7 @@ def __match_tag_class(self, layer_nodes, current_layer_key, parent_key, node_htm
321
331
if ele_parent_keyy is not None :
322
332
ele_parent_keyy = tuple (ele_parent_keyy )
323
333
ele_label = ele_value [0 ]
334
+ is_drop_tail = ele_value [3 ]
324
335
norm_ele_keyy = self .normalize_key ((ele_keyy [0 ], ele_keyy [1 ], None ))
325
336
norm_ele_keyy_parent = (norm_ele_keyy , ele_parent_keyy )
326
337
if current_norm_key == norm_ele_keyy_parent :
@@ -335,10 +346,10 @@ def __match_tag_class(self, layer_nodes, current_layer_key, parent_key, node_htm
335
346
template_sim = similarity (feature1 , feature2 , layer_n = 3 )
336
347
337
348
if template_sim > DYNAMIC_ID_SIM_THRESHOLD :
338
- return ele_label , self .normalize_key (ele_keyy [0 :3 ])
349
+ return ele_label , self .normalize_key (ele_keyy [0 :3 ]), is_drop_tail
339
350
# else:
340
351
# logger.info(f'{current_layer_key} and {ele_keyy} similarity is {template_sim}')
341
- return None , None
352
+ return None , None , None
342
353
343
354
def __match_tag (self , layer_nodes , current_layer_key , parent_key , node_html , template_doc , class_must = False ,
344
355
id_exist = False ):
@@ -357,6 +368,7 @@ def __match_tag(self, layer_nodes, current_layer_key, parent_key, node_html, tem
357
368
if ele_parent_keyy is not None :
358
369
ele_parent_keyy = tuple (ele_parent_keyy )
359
370
ele_label = ele_value [0 ]
371
+ is_drop_tail = ele_value [3 ]
360
372
norm_ele_keyy = self .normalize_key ((ele_keyy [0 ], None , None ))
361
373
norm_ele_keyy_parent = (norm_ele_keyy , ele_parent_keyy )
362
374
# 相似度方案
@@ -370,14 +382,14 @@ def __match_tag(self, layer_nodes, current_layer_key, parent_key, node_html, tem
370
382
continue
371
383
template_sim = similarity (feature1 , feature2 , layer_n = 3 )
372
384
if template_sim >= self .dynamic_classid_similarity_threshold :
373
- return ele_label , self .normalize_key (ele_keyy [0 :3 ])
385
+ return ele_label , self .normalize_key (ele_keyy [0 :3 ]), is_drop_tail
374
386
# first class方案
375
387
norm_ele_keyy_with_first_class = self .normalize_key ((ele_keyy [0 ], ele_keyy [1 ].strip ().split (' ' )[0 ], None ))
376
388
norm_ele_keyy_parent_with_first_class = (norm_ele_keyy_with_first_class , ele_parent_keyy )
377
389
if current_norm_key_with_first_class == norm_ele_keyy_parent_with_first_class :
378
- return ele_label , self .normalize_key (ele_keyy [0 :3 ])
390
+ return ele_label , self .normalize_key (ele_keyy [0 :3 ]), is_drop_tail
379
391
380
- return None , None
392
+ return None , None , None
381
393
382
394
def __is_natural_language (self , text , min_words = 3 ):
383
395
"""判断文本是否像自然语言.
0 commit comments