Skip to content

Commit d9271e5

Browse files
laipz8200 authored and Nov1c444 committed
feat(document_extractor): integrate unstructured API for PPTX extraction (#10180)
1 parent 5aab1ce commit d9271e5

File tree

1 file changed

+10
-1
lines changed
  • api/core/workflow/nodes/document_extractor

1 file changed

+10
-1
lines changed

api/core/workflow/nodes/document_extractor/node.py

+10-1
Original file line numberDiff line numberDiff line change
@@ -6,12 +6,14 @@
66
import pandas as pd
77
import pypdfium2
88
import yaml
9+
from unstructured.partition.api import partition_via_api
910
from unstructured.partition.email import partition_email
1011
from unstructured.partition.epub import partition_epub
1112
from unstructured.partition.msg import partition_msg
1213
from unstructured.partition.ppt import partition_ppt
1314
from unstructured.partition.pptx import partition_pptx
1415

16+
from configs import dify_config
1517
from core.file import File, FileTransferMethod, file_manager
1618
from core.helper import ssrf_proxy
1719
from core.variables import ArrayFileSegment
@@ -263,7 +265,14 @@ def _extract_text_from_ppt(file_content: bytes) -> str:
263265
def _extract_text_from_pptx(file_content: bytes) -> str:
    """Extract the text of a PPTX presentation.

    When both ``dify_config.UNSTRUCTURED_API_URL`` and
    ``dify_config.UNSTRUCTURED_API_KEY`` are set, the bytes are sent to the
    Unstructured API through ``partition_via_api``; otherwise they are parsed
    locally with ``partition_pptx``.

    Args:
        file_content: Raw bytes of the PPTX file.

    Returns:
        The ``text`` of every extracted element joined with newlines.
        Elements without a ``text`` attribute contribute an empty string.

    Raises:
        TextExtractionError: If partitioning fails for any reason; the
            original exception is chained as the cause.
    """
    try:
        with io.BytesIO(file_content) as file:
            if dify_config.UNSTRUCTURED_API_URL and dify_config.UNSTRUCTURED_API_KEY:
                elements = partition_via_api(
                    file=file,
                    # NOTE(review): partition_via_api requires a filename when it
                    # is handed a file object instead of a path (it raises
                    # ValueError without one) -- confirm against the pinned
                    # unstructured release. The in-memory buffer has no name, so
                    # a placeholder with the .pptx suffix is supplied.
                    metadata_filename="document.pptx",
                    api_url=dify_config.UNSTRUCTURED_API_URL,
                    api_key=dify_config.UNSTRUCTURED_API_KEY,
                )
            else:
                elements = partition_pptx(file=file)
        return "\n".join([getattr(element, "text", "") for element in elements])
    except Exception as e:
        raise TextExtractionError(f"Failed to extract text from PPTX: {str(e)}") from e

0 commit comments

Comments
 (0)