Skip to content

Commit be5f371

Browse files
authored
<improv>: modify typical main html similarity threshold to 0.92 (#504)
1 parent 1e2a794 commit be5f371

File tree

4 files changed

+10
-2
lines changed

4 files changed

+10
-2
lines changed

llm_web_kit/input/pre_data_json.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,8 @@ class PreDataJsonKey:
3030
TYPICAL_MAIN_HTML = 'typical_main_html'
3131
# 模版网页提取正文成功标签, bool类型
3232
TYPICAL_MAIN_HTML_SUCCESS = 'typical_main_html_success'
33+
# similarity between the typical main html and the original html
34+
TYPICAL_MAIN_HTML_SIM = 'typical_main_html_sim'
3335
# 用于生成element dict的html
3436
TYPICAL_DICT_HTML = 'typical_dict_html'
3537
# 动态id开关
@@ -44,6 +46,8 @@ class PreDataJsonKey:
4446
HTML_SOURCE = 'html_source'
4547
# 推广网页提取正文成功标签, bool类型
4648
MAIN_HTML_SUCCESS = 'main_html_success'
49+
# similarity between the main html and the typical main html
50+
MAIN_HTML_SIM = 'main_html_sim'
4751
# 推广网页提取正文文本
4852
MAIN_HTML = 'main_html'
4953
# 推广网页提取正文树

llm_web_kit/main_html_parser/parser/layout_batch_parser.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,7 @@ def parse(self, pre_data: PreDataJson) -> PreDataJson:
8686
sim = None
8787
if feature1 is not None and feature2 is not None:
8888
sim = similarity(feature1, feature2, layer_n=layer)
89+
pre_data[PreDataJsonKey.MAIN_HTML_SIM] = sim
8990
if sim is None or sim < SIMILARITY_THRESHOLD:
9091
pre_data[PreDataJsonKey.MAIN_HTML_SUCCESS] = False
9192
else:

llm_web_kit/main_html_parser/parser/tag_mapping.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
LayoutBatchParser
88
from llm_web_kit.main_html_parser.parser.parser import BaseMainHtmlParser
99

10-
SIMILAR_THRESHOLD = 0.9
10+
SIMILAR_THRESHOLD = 0.92
1111

1212

1313
class MapItemToHtmlTagsParser(BaseMainHtmlParser):
@@ -74,8 +74,9 @@ def parse(self, pre_data: PreDataJson) -> PreDataJson:
7474
template_sim = None
7575
if feature1 is not None and feature2 is not None:
7676
template_sim = similarity(feature1, feature2, layer_n=layer)
77+
pre_data[PreDataJsonKey.TYPICAL_MAIN_HTML_SIM] = template_sim
7778

78-
# 比较模版正文html与原html相似度
79+
# 比较模版正文html与原html相似度
7980
if template_sim is None or template_sim > SIMILAR_THRESHOLD:
8081
pre_data[PreDataJsonKey.TYPICAL_MAIN_HTML_SUCCESS] = False
8182
else:

tests/llm_web_kit/input/test_pre_data_json.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -394,6 +394,8 @@ def test_pre_data_json_key_constants(self):
394394
assert hasattr(PreDataJsonKey, 'DYNAMIC_CLASSID_ENABLE')
395395
assert hasattr(PreDataJsonKey, 'DYNAMIC_CLASSID_SIM_THRESH')
396396
assert hasattr(PreDataJsonKey, 'MORE_NOISE_ENABLE')
397+
assert hasattr(PreDataJsonKey, 'TYPICAL_MAIN_HTML_SIM')
398+
assert hasattr(PreDataJsonKey, 'MAIN_HTML_SIM')
397399

398400
# Check actual values
399401
assert PreDataJsonKey.DOMAIN_NAME == 'domain_name'

0 commit comments

Comments
 (0)