refactor: improve handling of leading punctuation removal #10761

Merged · 3 commits · Nov 18, 2024
7 changes: 2 additions & 5 deletions api/core/indexing_runner.py
@@ -29,6 +29,7 @@
      FixedRecursiveCharacterTextSplitter,
  )
  from core.rag.splitter.text_splitter import TextSplitter
+ from core.tools.utils.text_processing_utils import remove_leading_symbols
  from extensions.ext_database import db
  from extensions.ext_redis import redis_client
  from extensions.ext_storage import storage
@@ -500,11 +501,7 @@ def _split_to_documents(
  document_node.metadata["doc_hash"] = hash
  # delete Splitter character
  page_content = document_node.page_content
- if page_content.startswith(".") or page_content.startswith("。"):
-     page_content = page_content[1:]
- else:
-     page_content = page_content
- document_node.page_content = page_content
+ document_node.page_content = remove_leading_symbols(page_content)

  if document_node.page_content:
      split_documents.append(document_node)
7 changes: 2 additions & 5 deletions api/core/rag/index_processor/processor/paragraph_index_processor.py
@@ -11,6 +11,7 @@
  from core.rag.extractor.extract_processor import ExtractProcessor
  from core.rag.index_processor.index_processor_base import BaseIndexProcessor
  from core.rag.models.document import Document
+ from core.tools.utils.text_processing_utils import remove_leading_symbols
  from libs import helper
  from models.dataset import Dataset

@@ -43,11 +44,7 @@ def transform(self, documents: list[Document], **kwargs) -> list[Document]:
  document_node.metadata["doc_id"] = doc_id
  document_node.metadata["doc_hash"] = hash
  # delete Splitter character
- page_content = document_node.page_content
- if page_content.startswith(".") or page_content.startswith("。"):
-     page_content = page_content[1:].strip()
- else:
-     page_content = page_content
+ page_content = remove_leading_symbols(document_node.page_content).strip()
  if len(page_content) > 0:
      document_node.page_content = page_content
      split_documents.append(document_node)
7 changes: 2 additions & 5 deletions api/core/rag/index_processor/processor/qa_index_processor.py
@@ -18,6 +18,7 @@
  from core.rag.extractor.extract_processor import ExtractProcessor
  from core.rag.index_processor.index_processor_base import BaseIndexProcessor
  from core.rag.models.document import Document
+ from core.tools.utils.text_processing_utils import remove_leading_symbols
  from libs import helper
  from models.dataset import Dataset

@@ -53,11 +54,7 @@ def transform(self, documents: list[Document], **kwargs) -> list[Document]:
  document_node.metadata["doc_hash"] = hash
  # delete Splitter character
  page_content = document_node.page_content
- if page_content.startswith(".") or page_content.startswith("。"):
-     page_content = page_content[1:]
- else:
-     page_content = page_content
- document_node.page_content = page_content
+ document_node.page_content = remove_leading_symbols(page_content)
  split_documents.append(document_node)
  all_documents.extend(split_documents)
  for i in range(0, len(all_documents), 10):
16 changes: 16 additions & 0 deletions api/core/tools/utils/text_processing_utils.py
@@ -0,0 +1,16 @@
import re


def remove_leading_symbols(text: str) -> str:
    """
    Remove leading punctuation or symbols from the given text.

    Args:
        text (str): The input text to process.

    Returns:
        str: The text with leading punctuation or symbols removed.
    """
    # Match Unicode ranges for punctuation and symbols
    pattern = r"^[\u2000-\u206F\u2E00-\u2E7F\u3000-\u303F!\"#$%&'()*+,\-./:;<=>?@\[\]^_`{|}~]+"
    return re.sub(pattern, "", text)
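
For context (not part of the diff): the three escaped ranges in the pattern correspond to the Unicode blocks General Punctuation (U+2000-U+206F), Supplemental Punctuation (U+2E00-U+2E7F), and CJK Symbols and Punctuation (U+3000-U+303F), plus the ASCII punctuation listed explicitly. A minimal usage sketch with illustrative inputs:

from core.tools.utils.text_processing_utils import remove_leading_symbols

print(remove_leading_symbols("。、测试"))     # -> "测试"      (CJK punctuation, U+3000-U+303F)
print(remove_leading_symbols("…--Hello"))    # -> "Hello"     (ellipsis U+2026 plus ASCII hyphens)
print(remove_leading_symbols("- item one"))  # -> " item one" (whitespace is not stripped)
print(remove_leading_symbols("done..."))     # -> "done..."   (only the leading run is removed)
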
20 changes: 20 additions & 0 deletions api/tests/unit_tests/utils/test_text_processing.py
@@ -0,0 +1,20 @@
from textwrap import dedent

import pytest

from core.tools.utils.text_processing_utils import remove_leading_symbols


@pytest.mark.parametrize(
    ("input_text", "expected_output"),
    [
        ("...Hello, World!", "Hello, World!"),
        ("。测试中文标点", "测试中文标点"),
        ("!@#Test symbols", "Test symbols"),
        ("Hello, World!", "Hello, World!"),
        ("", ""),
        (" ", " "),
    ],
)
def test_remove_leading_symbols(input_text, expected_output):
    assert remove_leading_symbols(input_text) == expected_output
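
A note on the behavioral difference, recoverable from the hunks above: the removed inline checks stripped at most one leading "." or "。", while remove_leading_symbols strips the entire leading run of punctuation and symbols. A small illustrative sketch (not part of the PR):

text = "...。Hello"

# Old inline logic: drops only the first character, and only if it is "." or "。"
old_result = text[1:] if text.startswith((".", "。")) else text   # -> "..。Hello"

# New helper: drops the whole leading punctuation run
from core.tools.utils.text_processing_utils import remove_leading_symbols
new_result = remove_leading_symbols(text)                          # -> "Hello"

The new unit tests can be run directly with pytest against api/tests/unit_tests/utils/test_text_processing.py.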