Skip to content

Commit 811e4bd

Browse files
authored
fix unstructured setting (#12116)
1 parent 49feff0 commit 811e4bd

9 files changed

+17 -15 lines changed

api/configs/feature/__init__.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -601,7 +601,7 @@ class RagEtlConfig(BaseSettings):
601601

602602
UNSTRUCTURED_API_KEY: Optional[str] = Field(
603603
description="API key for Unstructured.io service",
604-
default=None,
604+
default="",
605605
)
606606

607607
SCARF_NO_ANALYTICS: Optional[str] = Field(

api/core/rag/extractor/extract_processor.py

+3-4
Original file line number | Diff line number | Diff line change
@@ -102,12 +102,11 @@ def extract(
102102
input_file = Path(file_path)
103103
file_extension = input_file.suffix.lower()
104104
etl_type = dify_config.ETL_TYPE
105-
unstructured_api_url = dify_config.UNSTRUCTURED_API_URL
106-
unstructured_api_key = dify_config.UNSTRUCTURED_API_KEY
107-
assert unstructured_api_url is not None, "unstructured_api_url is required"
108-
assert unstructured_api_key is not None, "unstructured_api_key is required"
109105
extractor: Optional[BaseExtractor] = None
110106
if etl_type == "Unstructured":
107+
unstructured_api_url = dify_config.UNSTRUCTURED_API_URL
108+
unstructured_api_key = dify_config.UNSTRUCTURED_API_KEY or ""
109+
111110
if file_extension in {".xlsx", ".xls"}:
112111
extractor = ExcelExtractor(file_path)
113112
elif file_extension == ".pdf":

api/core/rag/extractor/unstructured/unstructured_eml_extractor.py

+2-1
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,6 @@
11
import base64
22
import logging
3+
from typing import Optional
34

45
from bs4 import BeautifulSoup # type: ignore
56

@@ -15,7 +16,7 @@ class UnstructuredEmailExtractor(BaseExtractor):
1516
file_path: Path to the file to load.
1617
"""
1718

18-
def __init__(self, file_path: str, api_url: str, api_key: str):
19+
def __init__(self, file_path: str, api_url: Optional[str] = None, api_key: str = ""):
1920
"""Initialize with file path."""
2021
self._file_path = file_path
2122
self._api_url = api_url

api/core/rag/extractor/unstructured/unstructured_epub_extractor.py

+1-4
Original file line number | Diff line number | Diff line change
@@ -19,7 +19,7 @@ def __init__(
1919
self,
2020
file_path: str,
2121
api_url: Optional[str] = None,
22-
api_key: Optional[str] = None,
22+
api_key: str = "",
2323
):
2424
"""Initialize with file path."""
2525
self._file_path = file_path
@@ -30,9 +30,6 @@ def extract(self) -> list[Document]:
3030
if self._api_url:
3131
from unstructured.partition.api import partition_via_api
3232

33-
if self._api_key is None:
34-
raise ValueError("api_key is required")
35-
3633
elements = partition_via_api(filename=self._file_path, api_url=self._api_url, api_key=self._api_key)
3734
else:
3835
from unstructured.partition.epub import partition_epub

api/core/rag/extractor/unstructured/unstructured_markdown_extractor.py

+2-1
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,5 @@
11
import logging
2+
from typing import Optional
23

34
from core.rag.extractor.extractor_base import BaseExtractor
45
from core.rag.models.document import Document
@@ -24,7 +25,7 @@ class UnstructuredMarkdownExtractor(BaseExtractor):
2425
if the specified encoding fails.
2526
"""
2627

27-
def __init__(self, file_path: str, api_url: str, api_key: str):
28+
def __init__(self, file_path: str, api_url: Optional[str] = None, api_key: str = ""):
2829
"""Initialize with file path."""
2930
self._file_path = file_path
3031
self._api_url = api_url

api/core/rag/extractor/unstructured/unstructured_msg_extractor.py

+2-1
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,5 @@
11
import logging
2+
from typing import Optional
23

34
from core.rag.extractor.extractor_base import BaseExtractor
45
from core.rag.models.document import Document
@@ -14,7 +15,7 @@ class UnstructuredMsgExtractor(BaseExtractor):
1415
file_path: Path to the file to load.
1516
"""
1617

17-
def __init__(self, file_path: str, api_url: str, api_key: str):
18+
def __init__(self, file_path: str, api_url: Optional[str] = None, api_key: str = ""):
1819
"""Initialize with file path."""
1920
self._file_path = file_path
2021
self._api_url = api_url

api/core/rag/extractor/unstructured/unstructured_ppt_extractor.py

+2-1
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,5 @@
11
import logging
2+
from typing import Optional
23

34
from core.rag.extractor.extractor_base import BaseExtractor
45
from core.rag.models.document import Document
@@ -14,7 +15,7 @@ class UnstructuredPPTExtractor(BaseExtractor):
1415
file_path: Path to the file to load.
1516
"""
1617

17-
def __init__(self, file_path: str, api_url: str, api_key: str):
18+
def __init__(self, file_path: str, api_url: Optional[str] = None, api_key: str = ""):
1819
"""Initialize with file path."""
1920
self._file_path = file_path
2021
self._api_url = api_url

api/core/rag/extractor/unstructured/unstructured_pptx_extractor.py

+2-1
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,5 @@
11
import logging
2+
from typing import Optional
23

34
from core.rag.extractor.extractor_base import BaseExtractor
45
from core.rag.models.document import Document
@@ -14,7 +15,7 @@ class UnstructuredPPTXExtractor(BaseExtractor):
1415
file_path: Path to the file to load.
1516
"""
1617

17-
def __init__(self, file_path: str, api_url: str, api_key: str):
18+
def __init__(self, file_path: str, api_url: Optional[str] = None, api_key: str = ""):
1819
"""Initialize with file path."""
1920
self._file_path = file_path
2021
self._api_url = api_url

api/core/rag/extractor/unstructured/unstructured_xml_extractor.py

+2-1
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,5 @@
11
import logging
2+
from typing import Optional
23

34
from core.rag.extractor.extractor_base import BaseExtractor
45
from core.rag.models.document import Document
@@ -14,7 +15,7 @@ class UnstructuredXmlExtractor(BaseExtractor):
1415
file_path: Path to the file to load.
1516
"""
1617

17-
def __init__(self, file_path: str, api_url: str, api_key: str):
18+
def __init__(self, file_path: str, api_url: Optional[str] = None, api_key: str = ""):
1819
"""Initialize with file path."""
1920
self._file_path = file_path
2021
self._api_url = api_url

0 commit comments

Comments (0)