@@ -111,3 +111,76 @@ def __clean_invisible_elements(self, data_json: DataJson) -> str:
111
111
for element in elements :
112
112
remove_element (element )
113
113
return element_to_html (tree )
114
+
115
+
116
class TestHTMLFileToDataJsonPreExtractor(HTMLFileFormatFilterPreExtractor):
    """Load test HTML files from disk into a DataJson record.

    Makes it convenient for the noclip pipeline to run against test data:
    the html file and the main_html file are read by path and their contents
    are stored in the DataJson, giving the standard input format.
    """

    def __init__(self, config: dict, html_parent_dir: str):
        """Initialize the pre-extractor.

        Args:
            config: configuration passed through to the parent constructor.
            html_parent_dir: directory that is joined between the project
                root and the per-record ``path`` / ``main_path`` values.
        """
        super().__init__(config)
        self.__html_parent_path = html_parent_dir

    @override
    def _do_pre_extract(self, data_json: DataJson) -> DataJson:
        """Read the html and main_html files and place their text in ``data_json``.

        The ``path`` and ``main_path`` entries are replaced by ``html`` and
        ``main_html`` entries holding the UTF-8 decoded file contents.

        Args:
            data_json: record providing ``path`` and ``main_path``.

        Returns:
            The same ``data_json`` object, updated in place.
        """
        base_dir = os.path.join(get_proj_root_dir(), self.__html_parent_path)

        # Resolve both file locations before touching data_json, so that a
        # record with a missing path entry fails before anything is modified.
        sources = [
            (path_key, content_key, os.path.join(base_dir, data_json.get(path_key)))
            for path_key, content_key in (('path', 'html'), ('main_path', 'main_html'))
        ]

        for path_key, content_key, file_path in sources:
            with open(file_path, 'r', encoding='utf-8') as handle:
                data_json[content_key] = handle.read()
            # The path entry is no longer needed once the content is loaded.
            del data_json[path_key]

        return data_json
146
+
147
+
148
class HTMLFileFormatNoClipPreExtractor(HTMLFileFormatFilterPreExtractor):
    """Pre-process ``main_html`` for the noclip pipeline."""

    def __init__(self, config: dict):
        """Initialize the pre-extractor.

        Args:
            config: configuration passed through to the parent constructor.
        """
        super().__init__(config)

    @override
    def _do_pre_extract(self, data_json: DataJson) -> DataJson:
        """Replace ``data_json['main_html']`` with its cleaned version.

        Args:
            data_json: record whose ``main_html`` entry is cleaned.

        Returns:
            The same ``data_json`` object, updated in place.
        """
        data_json['main_html'] = self.__clean_interactive_elements(data_json)
        return data_json

    def __clean_interactive_elements(self, data_json: DataJson) -> str:
        """Remove interactive elements from ``main_html``.

        Two passes are made over the parsed tree:

        1. Every ``input``/``select``/``textarea``/``button`` under ``<body>``
           is removed, together with any ``<label>`` whose ``for`` attribute
           equals the removed element's ``id``.
        2. Inside every ``<form>``, the remaining interactive elements plus
           ``label`` and ``img`` elements are removed; a form that is left
           without children or without non-blank text is removed as well.

        Args:
            data_json: record providing the ``main_html`` markup.

        Returns:
            The cleaned markup serialized by ``element_to_html``.
        """
        tree = html_to_element(data_json['main_html'])

        # Pass 1: interactive elements under <body> and their associated labels.
        # The tags are processed one at a time (not as a single union query) so
        # that each query only sees what earlier removals left in the tree.
        for tag in ('input', 'select', 'textarea', 'button'):
            for element in tree.xpath(f'//body//{tag}'):
                parent = element.getparent()
                if parent is not None:
                    # NOTE(review): with lxml, remove() also discards the text
                    # that follows the element (its tail) - confirm that this
                    # is the intended behavior for main_html content.
                    parent.remove(element)

                element_id = element.get('id')
                if element_id is None:
                    continue

                # Bind the id as an XPath variable rather than interpolating it
                # into the expression: an id containing a double quote would
                # otherwise produce an invalid XPath and abort the extraction.
                for label in tree.xpath('//body//label[@for=$element_id]', element_id=element_id):
                    label_parent = label.getparent()
                    if label_parent is not None:
                        label_parent.remove(label)

        # Pass 2: clean up what is left inside <form> elements.
        for form in tree.xpath('//form'):
            form_elements = form.xpath(
                './/input | .//select | .//textarea | .//button | .//label | .//img'
            )
            for element in form_elements:
                element.getparent().remove(element)

            # Drop the form when it has no child elements or only blank text.
            if len(form) == 0 or not form.text_content().strip():
                form_parent = form.getparent()
                # A <form> that is the root of the parsed fragment has no
                # parent and cannot be detached; leave it in place.
                if form_parent is not None:
                    form_parent.remove(form)

        return element_to_html(tree)
0 commit comments