3 mesiacov pred · 811e4bd0cf
--- a/api/configs/feature/__init__.py
+++ b/api/configs/feature/__init__.py
@@ -601,7 +601,7 @@ class RagEtlConfig(BaseSettings):
 
				 
			
 
				     UNSTRUCTURED_API_KEY: Optional[str] = Field(
			
 
				         description="API key for Unstructured.io service",
			
 
				-        default=None,
			
 
				+        default="",
			
 
				     )
			
 
				 
			
 
				     SCARF_NO_ANALYTICS: Optional[str] = Field(
			
--- a/api/core/rag/extractor/extract_processor.py
+++ b/api/core/rag/extractor/extract_processor.py
@@ -102,12 +102,11 @@ class ExtractProcessor:
 
				                 input_file = Path(file_path)
			
 
				                 file_extension = input_file.suffix.lower()
			
 
				                 etl_type = dify_config.ETL_TYPE
			
 
				-                unstructured_api_url = dify_config.UNSTRUCTURED_API_URL
			
 
				-                unstructured_api_key = dify_config.UNSTRUCTURED_API_KEY
			
 
				-                assert unstructured_api_url is not None, "unstructured_api_url is required"
			
 
				-                assert unstructured_api_key is not None, "unstructured_api_key is required"
			
 
				                 extractor: Optional[BaseExtractor] = None
			
 
				                 if etl_type == "Unstructured":
			
 
				+                    unstructured_api_url = dify_config.UNSTRUCTURED_API_URL
			
 
				+                    unstructured_api_key = dify_config.UNSTRUCTURED_API_KEY or ""
			
 
				+
			
 
				                     if file_extension in {".xlsx", ".xls"}:
			
 
				                         extractor = ExcelExtractor(file_path)
			
 
				                     elif file_extension == ".pdf":
			
--- a/api/core/rag/extractor/unstructured/unstructured_eml_extractor.py
+++ b/api/core/rag/extractor/unstructured/unstructured_eml_extractor.py
@@ -1,5 +1,6 @@
 
				 import base64
			
 
				 import logging
			
 
				+from typing import Optional
			
 
				 
			
 
				 from bs4 import BeautifulSoup  # type: ignore
			
 
				 
			
@@ -15,7 +16,7 @@ class UnstructuredEmailExtractor(BaseExtractor):
 
				         file_path: Path to the file to load.
			
 
				     """
			
 
				 
			
 
				-    def __init__(self, file_path: str, api_url: str, api_key: str):
			
 
				+    def __init__(self, file_path: str, api_url: Optional[str] = None, api_key: str = ""):
			
 
				         """Initialize with file path."""
			
 
				         self._file_path = file_path
			
 
				         self._api_url = api_url
			
--- a/api/core/rag/extractor/unstructured/unstructured_epub_extractor.py
+++ b/api/core/rag/extractor/unstructured/unstructured_epub_extractor.py
@@ -19,7 +19,7 @@ class UnstructuredEpubExtractor(BaseExtractor):
 
				         self,
			
 
				         file_path: str,
			
 
				         api_url: Optional[str] = None,
			
 
				-        api_key: Optional[str] = None,
			
 
				+        api_key: str = "",
			
 
				     ):
			
 
				         """Initialize with file path."""
			
 
				         self._file_path = file_path
			
@@ -30,9 +30,6 @@ class UnstructuredEpubExtractor(BaseExtractor):
 
				         if self._api_url:
			
 
				             from unstructured.partition.api import partition_via_api
			
 
				 
			
 
				-            if self._api_key is None:
			
 
				-                raise ValueError("api_key is required")
			
 
				-
			
 
				             elements = partition_via_api(filename=self._file_path, api_url=self._api_url, api_key=self._api_key)
			
 
				         else:
			
 
				             from unstructured.partition.epub import partition_epub
			
--- a/api/core/rag/extractor/unstructured/unstructured_markdown_extractor.py
+++ b/api/core/rag/extractor/unstructured/unstructured_markdown_extractor.py
@@ -1,4 +1,5 @@
 
				 import logging
			
 
				+from typing import Optional
			
 
				 
			
 
				 from core.rag.extractor.extractor_base import BaseExtractor
			
 
				 from core.rag.models.document import Document
			
@@ -24,7 +25,7 @@ class UnstructuredMarkdownExtractor(BaseExtractor):
 
				             if the specified encoding fails.
			
 
				     """
			
 
				 
			
 
				-    def __init__(self, file_path: str, api_url: str, api_key: str):
			
 
				+    def __init__(self, file_path: str, api_url: Optional[str] = None, api_key: str = ""):
			
 
				         """Initialize with file path."""
			
 
				         self._file_path = file_path
			
 
				         self._api_url = api_url
			
--- a/api/core/rag/extractor/unstructured/unstructured_msg_extractor.py
+++ b/api/core/rag/extractor/unstructured/unstructured_msg_extractor.py
@@ -1,4 +1,5 @@
 
				 import logging
			
 
				+from typing import Optional
			
 
				 
			
 
				 from core.rag.extractor.extractor_base import BaseExtractor
			
 
				 from core.rag.models.document import Document
			
@@ -14,7 +15,7 @@ class UnstructuredMsgExtractor(BaseExtractor):
 
				         file_path: Path to the file to load.
			
 
				     """
			
 
				 
			
 
				-    def __init__(self, file_path: str, api_url: str, api_key: str):
			
 
				+    def __init__(self, file_path: str, api_url: Optional[str] = None, api_key: str = ""):
			
 
				         """Initialize with file path."""
			
 
				         self._file_path = file_path
			
 
				         self._api_url = api_url
			
--- a/api/core/rag/extractor/unstructured/unstructured_ppt_extractor.py
+++ b/api/core/rag/extractor/unstructured/unstructured_ppt_extractor.py
@@ -1,4 +1,5 @@
 
				 import logging
			
 
				+from typing import Optional
			
 
				 
			
 
				 from core.rag.extractor.extractor_base import BaseExtractor
			
 
				 from core.rag.models.document import Document
			
@@ -14,7 +15,7 @@ class UnstructuredPPTExtractor(BaseExtractor):
 
				         file_path: Path to the file to load.
			
 
				     """
			
 
				 
			
 
				-    def __init__(self, file_path: str, api_url: str, api_key: str):
			
 
				+    def __init__(self, file_path: str, api_url: Optional[str] = None, api_key: str = ""):
			
 
				         """Initialize with file path."""
			
 
				         self._file_path = file_path
			
 
				         self._api_url = api_url
			
--- a/api/core/rag/extractor/unstructured/unstructured_pptx_extractor.py
+++ b/api/core/rag/extractor/unstructured/unstructured_pptx_extractor.py
@@ -1,4 +1,5 @@
 
				 import logging
			
 
				+from typing import Optional
			
 
				 
			
 
				 from core.rag.extractor.extractor_base import BaseExtractor
			
 
				 from core.rag.models.document import Document
			
@@ -14,7 +15,7 @@ class UnstructuredPPTXExtractor(BaseExtractor):
 
				         file_path: Path to the file to load.
			
 
				     """
			
 
				 
			
 
				-    def __init__(self, file_path: str, api_url: str, api_key: str):
			
 
				+    def __init__(self, file_path: str, api_url: Optional[str] = None, api_key: str = ""):
			
 
				         """Initialize with file path."""
			
 
				         self._file_path = file_path
			
 
				         self._api_url = api_url
			
--- a/api/core/rag/extractor/unstructured/unstructured_xml_extractor.py
+++ b/api/core/rag/extractor/unstructured/unstructured_xml_extractor.py
@@ -1,4 +1,5 @@
 
				 import logging
			
 
				+from typing import Optional
			
 
				 
			
 
				 from core.rag.extractor.extractor_base import BaseExtractor
			
 
				 from core.rag.models.document import Document
			
@@ -14,7 +15,7 @@ class UnstructuredXmlExtractor(BaseExtractor):
 
				         file_path: Path to the file to load.
			
 
				     """
			
 
				 
			
 
				-    def __init__(self, file_path: str, api_url: str, api_key: str):
			
 
				+    def __init__(self, file_path: str, api_url: Optional[str] = None, api_key: str = ""):
			
 
				         """Initialize with file path."""
			
 
				         self._file_path = file_path
			
 
				         self._api_url = api_url