Skip to content

Commit 1256ebd

Browse files
feat: noclip管线新增预处理:删除表单交互式元素 (#495)
1 parent 69e0626 commit 1256ebd

File tree

5 files changed

+1025
-0
lines changed

5 files changed

+1025
-0
lines changed
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
{
2+
"extractor_pipe": {
3+
"enable": true,
4+
"validate_input_format": false,
5+
"pre_extractor": [
6+
{
7+
"enable": true,
8+
"python_class": "llm_web_kit.extractor.html.pre_extractor.TestHTMLFileToDataJsonPreExtractor",
9+
"class_init_kwargs": {
10+
"html_parent_dir": "tests/llm_web_kit/extractor/assets/extractor_chain_input/good_data/html/"
11+
}
12+
},
13+
{
14+
"enable": true,
15+
"python_class": "llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatNoClipPreExtractor",
16+
"class_init_kwargs": {}
17+
},
18+
{
19+
"enable": true,
20+
"python_class": "llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatFilterTablePreExtractor"
21+
},
22+
{
23+
"enable": true,
24+
"python_class": "llm_web_kit.extractor.html.pre_extractor.HTMLFileFormatCleanTagsPreExtractor",
25+
"class_init_kwargs": {}
26+
}
27+
],
28+
"extractor": [
29+
{
30+
"enable": true,
31+
"python_class": "llm_web_kit.extractor.html.extractor.NoClipHTMLFIleFormatorExtractor",
32+
"class_init_kwargs": {}
33+
}
34+
],
35+
"post_extractor": [
36+
{
37+
"enable": true,
38+
"python_class": "llm_web_kit.extractor.html.post_extractor.HTMLStripSpacePostExtractor"
39+
}
40+
]
41+
}
42+
}

llm_web_kit/extractor/html/pre_extractor.py

Lines changed: 73 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -111,3 +111,76 @@ def __clean_invisible_elements(self, data_json: DataJson) -> str:
111111
for element in elements:
112112
remove_element(element)
113113
return element_to_html(tree)
114+
115+
116+
class TestHTMLFileToDataJsonPreExtractor(HTMLFileFormatFilterPreExtractor):
    """Test helper for the noclip pipeline.

    Reads the raw html file and the main_html file referenced by the incoming
    record and stores their contents in the DataJson, so test data can be fed
    to the pipeline in the standard input format.
    """

    def __init__(self, config: dict, html_parent_dir: str):
        """Initialise the pre-extractor.

        Args:
            config: configuration forwarded unchanged to the parent class.
            html_parent_dir: directory, joined under the project root
                directory, in which the referenced html files are looked up.
        """
        super().__init__(config)
        self.__html_parent_path = html_parent_dir

    @override
    def _do_pre_extract(self, data_json: DataJson) -> DataJson:
        """Load ``html`` and ``main_html`` from disk into ``data_json``.

        The file names are taken from the ``path`` and ``main_path`` entries;
        each entry is deleted once its file content has been stored.

        Args:
            data_json: record carrying the ``path`` and ``main_path`` entries.

        Returns:
            The same ``data_json`` object, now holding ``html`` and
            ``main_html``.
        """
        root_dir = get_proj_root_dir()
        parent_dir = self.__html_parent_path
        # Resolve both locations up front so that a missing entry fails before
        # the record has been modified.
        pending = (
            ('path', 'html', os.path.join(root_dir, parent_dir, data_json.get('path'))),
            ('main_path', 'main_html', os.path.join(root_dir, parent_dir, data_json.get('main_path'))),
        )
        for source_key, target_key, file_path in pending:
            with open(file_path, 'r', encoding='utf-8') as handle:
                data_json[target_key] = handle.read()
                del data_json[source_key]
        return data_json
146+
147+
148+
class HTMLFileFormatNoClipPreExtractor(HTMLFileFormatFilterPreExtractor):
    """Pre-process ``main_html`` for the noclip pipeline.

    Interactive form controls (``input``, ``select``, ``textarea``,
    ``button``), the labels bound to them, and forms that end up empty are
    stripped from ``main_html`` before extraction.
    """

    def __init__(self, config: dict):
        """Initialise the pre-extractor.

        Args:
            config: configuration forwarded unchanged to the parent class.
        """
        super().__init__(config)

    @override
    def _do_pre_extract(self, data_json: DataJson) -> DataJson:
        """Replace ``data_json['main_html']`` with its cleaned version.

        Args:
            data_json: record whose ``main_html`` entry is cleaned.

        Returns:
            The same ``data_json`` object with ``main_html`` rewritten.
        """
        data_json['main_html'] = self.__clean_interactive_elements(data_json)
        return data_json

    @staticmethod
    def __detach_keep_tail(element) -> None:
        """Remove ``element`` from its parent without losing its tail text.

        lxml stores the text that follows an element in ``element.tail`` and
        drops it together with the element on ``parent.remove``. The tail is
        therefore moved onto the previous sibling (or the parent's text) first.
        Elements without a parent are left untouched.
        """
        parent = element.getparent()
        if parent is None:
            return
        tail = element.tail
        if tail:
            previous = element.getprevious()
            if previous is not None:
                previous.tail = (previous.tail or '') + tail
            else:
                parent.text = (parent.text or '') + tail
        parent.remove(element)

    def __clean_interactive_elements(self, data_json: DataJson) -> str:
        """Remove interactive elements from ``main_html``.

        Args:
            data_json: record providing the ``main_html`` markup.

        Returns:
            The cleaned markup serialised back to an HTML string.
        """
        html_content = data_json['main_html']
        tree = html_to_element(html_content)
        interactive_tags = ('input', 'select', 'textarea', 'button')

        # Remove interactive controls inside <body> together with the labels
        # that point at them through their ``for`` attribute.
        for tag in interactive_tags:
            for element in tree.xpath(f'//body//{tag}'):
                self.__detach_keep_tail(element)

                element_id = element.get('id')
                if element_id is None:
                    continue
                # The id is passed as an XPath variable rather than pasted into
                # the expression, so ids containing quotes cannot break or
                # alter the query.
                for label in tree.xpath('//body//label[@for=$label_for]', label_for=element_id):
                    self.__detach_keep_tail(label)

        # Strip the remaining controls, labels and images from every <form>.
        for form in tree.xpath('//form'):
            form_elements = form.xpath('.//input | .//select | .//textarea | .//button | .//label | .//img')
            for element in form_elements:
                self.__detach_keep_tail(element)

            # A form is dropped when it has no child elements left or when its
            # remaining text content is only whitespace.
            # NOTE(review): the first condition also drops a form that holds
            # plain text but no child elements - confirm this is intended.
            if len(form) == 0 or not form.text_content().strip():
                # Text following the form belongs to the surrounding content,
                # so it is kept; a form without a parent is left in place.
                self.__detach_keep_tail(form)
        return element_to_html(tree)

0 commit comments

Comments
 (0)