ccprocessor
diff --git a/‎llm_web_kit/extractor/html/extractor.py
Lines changed: 1 addition & 1 deletion b/‎llm_web_kit/extractor/html/extractor.py
Lines changed: 1 addition & 1 deletion
diff --git a/‎llm_web_kit/extractor/html/recognizer/list.py
Lines changed: 38 additions & 11 deletions b/‎llm_web_kit/extractor/html/recognizer/list.py
Lines changed: 38 additions & 11 deletions
diff --git a/‎llm_web_kit/extractor/html/recognizer/table.py
Lines changed: 3 additions & 0 deletions b/‎llm_web_kit/extractor/html/recognizer/table.py
Lines changed: 3 additions & 0 deletions
diff --git a/‎llm_web_kit/extractor/html/recognizer/text.py
Lines changed: 11 additions & 6 deletions b/‎llm_web_kit/extractor/html/recognizer/text.py
Lines changed: 11 additions & 6 deletions
diff --git a/‎llm_web_kit/extractor/html/recognizer/title.py
Lines changed: 14 additions & 9 deletions b/‎llm_web_kit/extractor/html/recognizer/title.py
Lines changed: 14 additions & 9 deletions
diff --git a/‎llm_web_kit/libs/html_utils.py
Lines changed: 20 additions & 0 deletions b/‎llm_web_kit/libs/html_utils.py
Lines changed: 20 additions & 0 deletions
@@ -95,7 +95,7 @@ def _do_extract(self, data_json: DataJson) -> DataJson:
 
         # main_html, method, title = self._extract_main_html(raw_html, base_url, page_layout_type)
         main_html_element = html_to_element(main_html)
-        parsed_html = [(main_html_element, raw_html)]
+        parsed_html = [(main_html_element, main_html)]
         for extract_func in [self._extract_code, self._extract_table, self._extract_math, self._extract_list,
                              self._extract_image,
                              self._extract_title, self._extract_paragraph]:
 
@@ -8,9 +8,12 @@
 from llm_web_kit.extractor.html.recognizer.recognizer import (
     BaseHTMLElementRecognizer, CCTag)
 from llm_web_kit.libs.doc_element_type import DocElementType, ParagraphTextType
-from llm_web_kit.libs.html_utils import process_sub_sup_tags
+from llm_web_kit.libs.html_utils import (html_normalize_space,
+                                         process_sub_sup_tags)
 from llm_web_kit.libs.text_utils import normalize_text_segment
 
+from .text import inline_tags
+
 
 class ListAttribute():
     """列表属性."""
@@ -130,7 +133,7 @@ def __extract_list_item_text_recusive(el: HtmlElement):
             elif el.tag == CCTag.CC_CODE_INLINE and el.text and el.text.strip():
                 paragraph.append({'c': f'`{el.text}`', 't': ParagraphTextType.CODE_INLINE})
             elif el.tag == 'br':
-                paragraph.append({'c': '\n\n', 't': ParagraphTextType.TEXT})
+                paragraph.append({'c': '$br$', 't': ParagraphTextType.TEXT})
             elif el.tag == 'sub' or el.tag == 'sup':
                 # 处理sub和sup标签，转换为GitHub Flavored Markdown格式
                 current_text = ''
@@ -154,9 +157,17 @@ def __extract_list_item_text_recusive(el: HtmlElement):
                     result['child_list'] = child_list
             else:
                 if el.text and el.text.strip():
-                    paragraph.append({'c': el.text, 't': ParagraphTextType.TEXT})
+                    _new_text = html_normalize_space(el.text.strip())
+                    if len(el) == 0 and el.tag not in inline_tags:
+                        _new_text += '$br$'
+                    paragraph.append({'c': _new_text, 't': ParagraphTextType.TEXT})
                     el.text = None
-                for child in el.getchildren():
+
+                for child in el:
+                    if child.tag not in inline_tags:
+                        if paragraph:
+                            paragraph[-1]['c'] += '$br$'
+
                     p = __extract_list_item_text_recusive(child)
                     if len(p) > 0:
                         # 如果子元素有child_list，需要保存
@@ -166,24 +177,40 @@ def __extract_list_item_text_recusive(el: HtmlElement):
                         if 'c' in p:
                             if p['c'] != '':
                                 paragraph.append({'c': p['c'], 't': p.get('t', ParagraphTextType.TEXT)})
+                    else:
+                        if paragraph:
+                            last_paragraph = paragraph[-1]['c']
+                            if last_paragraph == '$br$':
+                                del paragraph[-1]
+                            else:
+                                if last_paragraph.endswith('$br$'):
+                                    paragraph[-1]['c'] = last_paragraph[:-4]
+
             if el.tag != 'li' and el.tail and el.tail.strip():
+                _new_tail = html_normalize_space(el.tail.strip())
                 if is_sub_sup:
                     # 如果尾部文本跟在sub/sup后面，直接附加到最后一个文本段落中
                     if len(paragraph) > 0 and paragraph[-1]['t'] == ParagraphTextType.TEXT:
-                        paragraph[-1]['c'] += el.tail
-                    else:
-                        paragraph.append({'c': el.tail, 't': ParagraphTextType.TEXT})
+                        paragraph[-1]['c'] += _new_tail
                 else:
-                    paragraph.append({'c': el.tail, 't': ParagraphTextType.TEXT})
+                    paragraph.append({'c': _new_tail, 't': ParagraphTextType.TEXT})
+
             if paragraph:
                 # item['c'].strip(): 会导致前面处理br标签，添加的\n\n失效
                 result['c'] = ' '.join(normalize_text_segment(item['c'].strip()) for item in paragraph)
             return result
-        list_item_tags = ('li', 'dd', 'dt', 'ul', 'div', 'p')
+        list_item_tags = ('li', 'dd', 'dt', 'ul', 'div', 'p', 'span')
         if child.tag in list_item_tags:
             paragraph = __extract_list_item_text_recusive(child)
             if len(paragraph) > 0:
-                text_paragraph.append(paragraph)
+                tem_json = json.dumps(paragraph).replace('$br$\"}', '\"}')
+                new_paragraph = json.loads(tem_json)
+                text_paragraph.append(new_paragraph)
+
+        for n, item in enumerate(text_paragraph):
+            tem_json = json.dumps(item).replace('$br$', '\\n\\n')
+            text_paragraph[n] = json.loads(tem_json)
+
         return text_paragraph
 
     def __get_list_content_list(self, ele: HtmlElement, list_nest_level: int) -> list:
@@ -201,7 +228,7 @@ def __get_list_content_list(self, ele: HtmlElement, list_nest_level: int) -> lis
         # 处理根元素文本
         if ele.text and ele.text.strip():
             # 检查元素是否包含数学或代码相关属性
-            text_content = ele.text.strip()
+            text_content = html_normalize_space(ele.text.strip())
             root_item = {
                 'c': text_content,
                 't': ParagraphTextType.TEXT,
 
@@ -289,6 +289,8 @@ def __get_table_body(self, table_type, table_nest_level, table_root):
     def __do_extract_tables(self, root: HtmlElement) -> None:
         """递归处理所有子标签."""
         if root.tag in ['table']:
+            temp_tail = root.tail
+            root.tail = None
             table_raw_html = self._element_to_html(root)
             table_type = self.__get_table_type(root)
             table_nest_level = self.__is_table_nested(root)
@@ -297,6 +299,7 @@ def __do_extract_tables(self, root: HtmlElement) -> None:
             cc_element = self._build_cc_element(
                 CCTag.CC_TABLE, table_body, tail_text, table_type=table_type, table_nest_level=table_nest_level,
                 html=table_raw_html)
+            cc_element.tail = temp_tail
             self._replace_element(root, cc_element)
             return
         for child in root.iterchildren():
 
@@ -12,7 +12,8 @@
     BaseHTMLElementRecognizer, CCTag)
 from llm_web_kit.libs.doc_element_type import DocElementType, ParagraphTextType
 from llm_web_kit.libs.html_utils import (element_to_html_unescaped,
-                                         html_to_element, process_sub_sup_tags)
+                                         html_normalize_space, html_to_element,
+                                         process_sub_sup_tags)
 
 special_symbols = [  # TODO 从文件读取
     '®',  # 注册商标符号
@@ -231,29 +232,33 @@ def __get_paragraph_text_recusive(el: HtmlElement, text: str) -> str:
                     text = ''
                 para_text.append({'c': el.text, 't': ParagraphTextType.CODE_INLINE})
             elif el.tag in ['br']:
-                text += PARAGRAPH_SEPARATOR  # TODO 这个地方直接加换行是错误点做法，需要利用数据结构来保证段落。
+                text += '$br$'
             elif el.tag == 'sub' or el.tag == 'sup':
                 text = process_sub_sup_tags(el, text, recursive=False)
             elif el.tag == 'audio':  # 避免audio被识别为paragraph
                 pass
             else:
                 if el.text and el.text.strip():
-                    text = self.__combine_text(text, el.text.strip(), language)
+                    tem_text = html_normalize_space(text)
+                    _text = html_normalize_space(el.text.strip())
+                    text = self.__combine_text(tem_text, _text, language)
                 for child in el:
                     text = __get_paragraph_text_recusive(child, text)
 
             # 处理尾部文本
             if el.tail and el.tail.strip():
                 if is_sub_sup:
-                    text += el.tail
+                    _new_tail = html_normalize_space(el.tail.strip())
+                    text += _new_tail
                 else:
-                    new_tail = f' {el.tail.strip()}' if el.tail.startswith(' ') and el.tail.strip()[0] in string.punctuation else el.tail.strip()
+                    _new_tail = html_normalize_space(el.tail.strip())
+                    new_tail = f' {_new_tail}' if el.tail.startswith(' ') and el.tail.strip()[0] in string.punctuation else _new_tail
                     text = self.__combine_text(text, new_tail, language)
 
             return text
 
         if final := __get_paragraph_text_recusive(root, ''):
-            para_text.append({'c': final, 't': ParagraphTextType.TEXT})
+            para_text.append({'c': final.replace('$br$', PARAGRAPH_SEPARATOR), 't': ParagraphTextType.TEXT})
 
         return para_text
 
 
@@ -8,7 +8,10 @@
 from llm_web_kit.extractor.html.recognizer.recognizer import (
     BaseHTMLElementRecognizer, CCTag)
 from llm_web_kit.libs.doc_element_type import DocElementType
-from llm_web_kit.libs.html_utils import process_sub_sup_tags
+from llm_web_kit.libs.html_utils import (html_normalize_space,
+                                         process_sub_sup_tags)
+
+from .text import PARAGRAPH_SEPARATOR
 
 
 class TitleRecognizer(BaseHTMLElementRecognizer):
@@ -124,16 +127,18 @@ def __extract_title_text_recusive(el: HtmlElement, with_tail: bool = True) -> li
 
             if el.tag == CCTag.CC_CODE_INLINE:
                 blks.append(f'`{el.text}`')
-            elif el.tag in ['sub', 'sup']:
-                # 使用process_sub_sup_tags保留原始的sub/sup标签
-                processed_text = process_sub_sup_tags(el, '', 'en', True)
-                if processed_text:
-                    blks.append(processed_text)
+            elif el.tag in ['br']:
+                blks.extend(['$br$'])
             else:
-                blks.append((el.text or '').strip())
+                if el.text and el.text.strip():
+                    _new_text = html_normalize_space(el.text.strip())
+                    blks.append(_new_text)
 
             for child in el.getchildren():
-                blks.extend(__extract_title_text_recusive(child))
+                if child.tag == 'sub' or child.tag == 'sup':
+                    blks.extend([process_sub_sup_tags(child, '', recursive=False), child.tail])
+                else:
+                    blks.extend(__extract_title_text_recusive(child))
 
             if with_tail:
                 blks.append((el.tail or '').strip())
@@ -143,7 +148,7 @@ def __extract_title_text_recusive(el: HtmlElement, with_tail: bool = True) -> li
         # 根元素不保留结尾
         blks = __extract_title_text_recusive(header_el, False)
 
-        return ' '.join(blk for blk in blks if blk)
+        return ' '.join(blk for blk in blks if blk).replace('$br$', PARAGRAPH_SEPARATOR)
 
     def __get_attribute(self, html:HtmlElement) -> Tuple[int, str]:
         """获取element的属性."""
 
@@ -3,6 +3,7 @@
 import string
 from copy import deepcopy
 
+from lxml import html as lxmlhtml
 from lxml.html import HtmlElement, HTMLParser, fromstring, tostring
 
 special_symbols = [  # TODO 从文件读取
@@ -430,3 +431,22 @@ def get_cc_select_html(element: HtmlElement) -> HtmlElement:
         container.append(elem_copy)
 
     return container
+
+
+def html_normalize_space(text: str) -> str:
+    """Collapse whitespace in a text node the way XPath ``normalize-space()`` does.
+
+    Args:
+        text: already-decoded text (e.g. ``el.text`` / ``el.tail``); may be empty.
+
+    Returns:
+        The text with leading/trailing whitespace removed and every run of
+        space, tab, CR or LF replaced by one space; '' for blank input.
+    """
+    if not text or not text.strip():
+        return ''
+    # Do not re-parse the text as HTML: it is already decoded, so a literal
+    # "a<b and c>d" would be read as markup and silently lose content.
+    for ws in ('\t', '\r', '\n'):
+        text = text.replace(ws, ' ')
+    return ' '.join(part for part in text.strip().split(' ') if part)