Skip to content

reopen PR for #14411 #16148

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 27 commits into from
Mar 19, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
2cae7f9
fix function call support issue for ollama and xinference
cyflhn Nov 20, 2024
1cfdf8d
update
cyflhn Nov 23, 2024
3e8c85b
add invoke timeout retry settings for xinference
cyflhn Nov 23, 2024
36430b3
fix bug xinference glm4 function tool call
cyflhn Nov 25, 2024
076b95e
merge from dify
cyflhn Nov 25, 2024
3e90af4
Merge branch 'main' of github.com:cyflhn/dify into main
cyflhn Dec 13, 2024
bc819c3
support user defined conversation id
cyflhn Dec 13, 2024
f54f15a
modify format
cyflhn Dec 13, 2024
5ee5159
ruff format
cyflhn Dec 13, 2024
d2df2ed
add customized conversation id validation
cyflhn Dec 13, 2024
197a756
fix format problem
cyflhn Dec 14, 2024
6894579
remove unnecessary uuid check
cyflhn Dec 14, 2024
412f494
resolve conflict
cyflhn Dec 17, 2024
27c103d
remove unused code
cyflhn Dec 17, 2024
13626f0
resolve conflict
cyflhn Dec 18, 2024
7f5e8b7
handle fix
cyflhn Feb 5, 2025
1055df4
Merge branch 'main' of https://github.com/cyflhn/dify into main
cyflhn Feb 17, 2025
bfb1d09
merge code and fix docx extraction order bug
cyflhn Feb 26, 2025
84d2fba
remove unused code
cyflhn Feb 26, 2025
f87588b
remove unused code
cyflhn Feb 26, 2025
da99b9c
fix style problem
cyflhn Feb 27, 2025
8568847
update unit test
cyflhn Feb 27, 2025
3683558
update unit test
cyflhn Feb 27, 2025
8369717
fix style problem
cyflhn Feb 27, 2025
8f82ef5
fix style problem
cyflhn Feb 27, 2025
0810fec
Merge branch 'main' of https://github.com/cyflhn/dify into main
cyflhn Mar 18, 2025
2b308aa
Merge branch 'main' of https://github.com/cyflhn/dify into main
cyflhn Mar 19, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 17 additions & 11 deletions api/core/workflow/nodes/document_extractor/node.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
import io
import json
import logging
import operator
import os
import tempfile
from collections.abc import Mapping, Sequence
Expand All @@ -12,6 +11,9 @@
import pandas as pd
import pypdfium2 # type: ignore
import yaml # type: ignore
from docx.document import Document
from docx.oxml.table import CT_Tbl
from docx.oxml.text.paragraph import CT_P
from docx.table import Table
from docx.text.paragraph import Paragraph

Expand Down Expand Up @@ -231,6 +233,13 @@ def _extract_text_from_doc(file_content: bytes) -> str:
raise TextExtractionError(f"Failed to extract text from DOC: {str(e)}") from e


def paser_docx_part(block, doc: Document, content_items, i):
    """Wrap one DOCX body element and record it in ``content_items``.

    Args:
        block: A child element of the document body.
        doc: The parent document, passed through to the wrapper objects.
        content_items: List that receives ``(i, kind, wrapper)`` tuples;
            it is mutated in place.
        i: Position index stored as the first element of the tuple.

    A ``CT_P`` element is recorded as ``(i, "paragraph", Paragraph(block, doc))``
    and a ``CT_Tbl`` element as ``(i, "table", Table(block, doc))``. Any other
    element type is ignored and nothing is appended.
    """
    if isinstance(block, CT_P):
        entry = (i, "paragraph", Paragraph(block, doc))
    elif isinstance(block, CT_Tbl):
        entry = (i, "table", Table(block, doc))
    else:
        # Neither a paragraph nor a table: leave the list untouched.
        return
    content_items.append(entry)


def _extract_text_from_docx(file_content: bytes) -> str:
"""
Extract text from a DOCX file.
Expand All @@ -244,16 +253,13 @@ def _extract_text_from_docx(file_content: bytes) -> str:
# Keep track of paragraph and table positions
content_items: list[tuple[int, str, Table | Paragraph]] = []

# Process paragraphs and tables
for i, paragraph in enumerate(doc.paragraphs):
if paragraph.text.strip():
content_items.append((i, "paragraph", paragraph))

for i, table in enumerate(doc.tables):
content_items.append((i, "table", table))

# Sort content items based on their original position
content_items.sort(key=operator.itemgetter(0))
it = iter(doc.element.body)
part = next(it, None)
i = 0
while part is not None:
paser_docx_part(part, doc, content_items, i)
i = i + 1
part = next(it, None)

# Process sorted content
for _, item_type, item in content_items:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from unittest.mock import Mock, patch

import pytest
from docx.oxml.text.paragraph import CT_P

from core.file import File, FileTransferMethod
from core.variables import ArrayFileSegment
Expand Down Expand Up @@ -169,7 +170,12 @@ def test_extract_text_from_docx(mock_document):
mock_paragraph2 = Mock()
mock_paragraph2.text = "Paragraph 2"
mock_document.return_value.paragraphs = [mock_paragraph1, mock_paragraph2]

mock_ct_p1 = Mock(spec=CT_P)
mock_ct_p1.text = "Paragraph 1"
mock_ct_p2 = Mock(spec=CT_P)
mock_ct_p2.text = "Paragraph 2"
mock_element = Mock(body=[mock_ct_p1, mock_ct_p2])
mock_document.return_value.element = mock_element
text = _extract_text_from_docx(b"PK\x03\x04")
assert text == "Paragraph 1\nParagraph 2"

Expand Down
Loading