пре 1 година · 5e34f938c1
--- a/api/.env.example
+++ b/api/.env.example
@@ -117,3 +117,6 @@ HOSTED_ANTHROPIC_API_BASE=
 
				 HOSTED_ANTHROPIC_API_KEY=
			
 
				 HOSTED_ANTHROPIC_QUOTA_LIMIT=600000
			
 
				 HOSTED_ANTHROPIC_PAID_ENABLED=false
			
 
				+
			
 
				+ETL_TYPE=dify
			
 
				+UNSTRUCTURED_API_URL=
			
--- a/api/config.py
+++ b/api/config.py
@@ -54,7 +54,8 @@ DEFAULTS = {
 
				     'UPLOAD_IMAGE_FILE_SIZE_LIMIT': 10,
			
 
				     'OUTPUT_MODERATION_BUFFER_SIZE': 300,
			
 
				     'MULTIMODAL_SEND_IMAGE_FORMAT': 'base64',
			
 
				-    'INVITE_EXPIRY_HOURS': 72
			
 
				+    'INVITE_EXPIRY_HOURS': 72,
			
 
				+    'ETL_TYPE': 'dify',
			
 
				 }
			
 
				 
			
 
				 
			
@@ -276,6 +277,9 @@ class Config:
 
				         self.HOSTED_MODERATION_ENABLED = get_bool_env('HOSTED_MODERATION_ENABLED')
			
 
				         self.HOSTED_MODERATION_PROVIDERS = get_env('HOSTED_MODERATION_PROVIDERS')
			
 
				 
			
 
				+        self.ETL_TYPE = get_env('ETL_TYPE')
			
 
				+        self.UNSTRUCTURED_API_URL = get_env('UNSTRUCTURED_API_URL')
			
 
				+
			
 
				 
			
 
				 class CloudEditionConfig(Config):
			
 
				 
			
--- a/api/controllers/console/datasets/file.py
+++ b/api/controllers/console/datasets/file.py
@@ -69,5 +69,20 @@ class FilePreviewApi(Resource):
 
				         return {'content': text}
			
 
				 
			
 
				 
			
 
				+class FileeSupportTypApi(Resource):
			
 
				+    @setup_required
			
 
				+    @login_required
			
 
				+    @account_initialization_required
			
 
				+    def get(self):
			
 
				+        etl_type = current_app.config['ETL_TYPE']
			
 
				+        if etl_type == 'Unstructured':
			
 
				+            allowed_extensions = ['txt', 'markdown', 'md', 'pdf', 'html', 'htm', 'xlsx',
			
 
				+                                  'docx', 'csv', 'eml', 'msg', 'pptx', 'ppt', 'xml']
			
 
				+        else:
			
 
				+            allowed_extensions = ['txt', 'markdown', 'md', 'pdf', 'html', 'htm', 'xlsx', 'docx', 'csv']
			
 
				+        return {'allowed_extensions': allowed_extensions}
			
 
				+
			
 
				+
			
 
				 api.add_resource(FileApi, '/files/upload')
			
 
				 api.add_resource(FilePreviewApi, '/files/<uuid:file_id>/preview')
			
 
				+api.add_resource(FileeSupportTypApi, '/files/support-type')
			
--- a/api/core/data_loader/file_extractor.py
+++ b/api/core/data_loader/file_extractor.py
@@ -3,7 +3,8 @@ from pathlib import Path
 
				 from typing import List, Union, Optional
			
 
				 
			
 
				 import requests
			
 
				-from langchain.document_loaders import TextLoader, Docx2txtLoader, UnstructuredFileLoader, UnstructuredAPIFileLoader
			
 
				+from flask import current_app
			
 
				+from langchain.document_loaders import TextLoader, Docx2txtLoader
			
 
				 from langchain.schema import Document
			
 
				 
			
 
				 from core.data_loader.loader.csv_loader import CSVLoader
			
@@ -11,6 +12,13 @@ from core.data_loader.loader.excel import ExcelLoader
 
				 from core.data_loader.loader.html import HTMLLoader
			
 
				 from core.data_loader.loader.markdown import MarkdownLoader
			
 
				 from core.data_loader.loader.pdf import PdfLoader
			
 
				+from core.data_loader.loader.unstructured.unstructured_eml import UnstructuredEmailLoader
			
 
				+from core.data_loader.loader.unstructured.unstructured_markdown import UnstructuredMarkdownLoader
			
 
				+from core.data_loader.loader.unstructured.unstructured_msg import UnstructuredMsgLoader
			
 
				+from core.data_loader.loader.unstructured.unstructured_ppt import UnstructuredPPTLoader
			
 
				+from core.data_loader.loader.unstructured.unstructured_pptx import UnstructuredPPTXLoader
			
 
				+from core.data_loader.loader.unstructured.unstructured_text import UnstructuredTextLoader
			
 
				+from core.data_loader.loader.unstructured.unstructured_xml import UnstructuredXmlLoader
			
 
				 from extensions.ext_storage import storage
			
 
				 from models.model import UploadFile
			
 
				 
			
@@ -49,14 +57,34 @@ class FileExtractor:
 
				         input_file = Path(file_path)
			
 
				         delimiter = '\n'
			
 
				         file_extension = input_file.suffix.lower()
			
 
				-        if is_automatic:
			
 
				-            loader = UnstructuredFileLoader(
			
 
				-                file_path, strategy="hi_res", mode="elements"
			
 
				-            )
			
 
				-            # loader = UnstructuredAPIFileLoader(
			
 
				-            #     file_path=filenames[0],
			
 
				-            #     api_key="FAKE_API_KEY",
			
 
				-            # )
			
 
				+        etl_type = current_app.config['ETL_TYPE']
			
 
				+        unstructured_api_url = current_app.config['UNSTRUCTURED_API_URL']
			
 
				+        if etl_type == 'Unstructured':
			
 
				+            if file_extension == '.xlsx':
			
 
				+                loader = ExcelLoader(file_path)
			
 
				+            elif file_extension == '.pdf':
			
 
				+                loader = PdfLoader(file_path, upload_file=upload_file)
			
 
				+            elif file_extension in ['.md', '.markdown']:
			
 
				+                loader = UnstructuredMarkdownLoader(file_path, unstructured_api_url)
			
 
				+            elif file_extension in ['.htm', '.html']:
			
 
				+                loader = HTMLLoader(file_path)
			
 
				+            elif file_extension == '.docx':
			
 
				+                loader = Docx2txtLoader(file_path)
			
 
				+            elif file_extension == '.csv':
			
 
				+                loader = CSVLoader(file_path, autodetect_encoding=True)
			
 
				+            elif file_extension == '.msg':
			
 
				+                loader = UnstructuredMsgLoader(file_path, unstructured_api_url)
			
 
				+            elif file_extension == '.eml':
			
 
				+                loader = UnstructuredEmailLoader(file_path, unstructured_api_url)
			
 
				+            elif file_extension == '.ppt':
			
 
				+                loader = UnstructuredPPTLoader(file_path, unstructured_api_url)
			
 
				+            elif file_extension == '.pptx':
			
 
				+                loader = UnstructuredPPTXLoader(file_path, unstructured_api_url)
			
 
				+            elif file_extension == '.xml':
			
 
				+                loader = UnstructuredXmlLoader(file_path, unstructured_api_url)
			
 
				+            else:
			
 
				+                # txt
			
 
				+                loader = UnstructuredTextLoader(file_path, unstructured_api_url)
			
 
				         else:
			
 
				             if file_extension == '.xlsx':
			
 
				                 loader = ExcelLoader(file_path)
			
--- a/api/core/data_loader/loader/unstructured/unstructured_eml.py
+++ b/api/core/data_loader/loader/unstructured/unstructured_eml.py
@@ -0,0 +1,41 @@
 
				+import logging
			
 
				+import re
			
 
				+from typing import Optional, List, Tuple, cast
			
 
				+
			
 
				+from langchain.document_loaders.base import BaseLoader
			
 
				+from langchain.document_loaders.helpers import detect_file_encodings
			
 
				+from langchain.schema import Document
			
 
				+
			
 
				+logger = logging.getLogger(__name__)
			
 
				+
			
 
				+
			
 
				+class UnstructuredEmailLoader(BaseLoader):
			
 
				+    """Load eml files.
			
 
				+
			
 
				+
			
 
				+    Args:
			
 
				+        file_path: Path to the file to load.
			
 
				+    """
			
 
				+
			
 
				+    def __init__(
			
 
				+        self,
			
 
				+        file_path: str,
			
 
				+        api_url: str,
			
 
				+    ):
			
 
				+        """Initialize with file path and Unstructured API URL."""
			
 
				+        self._file_path = file_path
			
 
				+        self._api_url = api_url
			
 
				+
			
 
				+
			
 
				+    def load(self) -> List[Document]:
			
 
				+        from unstructured.partition.email import partition_email
			
 
				+
			
 
				+        elements = partition_email(filename=self._file_path, api_url=self._api_url)
			
 
				+        from unstructured.chunking.title import chunk_by_title
			
 
				+        chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
			
 
				+        documents = []
			
 
				+        for chunk in chunks:
			
 
				+            text = chunk.text.strip()
			
 
				+            documents.append(Document(page_content=text))
			
 
				+
			
 
				+        return documents
			
--- a/api/core/data_loader/loader/unstructured/unstructured_markdown.py
+++ b/api/core/data_loader/loader/unstructured/unstructured_markdown.py
@@ -0,0 +1,48 @@
 
				+import logging
			
 
				+from typing import List
			
 
				+
			
 
				+from langchain.document_loaders.base import BaseLoader
			
 
				+from langchain.schema import Document
			
 
				+
			
 
				+logger = logging.getLogger(__name__)
			
 
				+
			
 
				+
			
 
				+class UnstructuredMarkdownLoader(BaseLoader):
			
 
				+    """Load md files.
			
 
				+
			
 
				+
			
 
				+    Args:
			
 
				+        file_path: Path to the file to load.
			
 
				+
			
 
				+        api_url: URL of the Unstructured API; it is forwarded to
			
 
				+            `partition_md` as its `api_url` argument.
			
 
				+
			
 
				+    Only these two arguments are accepted; there are no
			
 
				+    `remove_hyperlinks`, `remove_images`, `encoding` or
			
 
				+    `autodetect_encoding` options.
			
 
				+
			
 
				+    `load` chunks the parsed elements with `chunk_by_title` and
			
 
				+    returns one `Document` per chunk.
			
 
				+    """
			
 
				+
			
 
				+    def __init__(
			
 
				+        self,
			
 
				+        file_path: str,
			
 
				+        api_url: str,
			
 
				+    ):
			
 
				+        """Initialize with file path and Unstructured API URL."""
			
 
				+        self._file_path = file_path
			
 
				+        self._api_url = api_url
			
 
				+
			
 
				+    def load(self) -> List[Document]:
			
 
				+        from unstructured.partition.md import partition_md
			
 
				+
			
 
				+        elements = partition_md(filename=self._file_path, api_url=self._api_url)
			
 
				+        from unstructured.chunking.title import chunk_by_title
			
 
				+        chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
			
 
				+        documents = []
			
 
				+        for chunk in chunks:
			
 
				+            text = chunk.text.strip()
			
 
				+            documents.append(Document(page_content=text))
			
 
				+
			
 
				+        return documents
			
--- a/api/core/data_loader/loader/unstructured/unstructured_msg.py
+++ b/api/core/data_loader/loader/unstructured/unstructured_msg.py
@@ -0,0 +1,40 @@
 
				+import logging
			
 
				+import re
			
 
				+from typing import Optional, List, Tuple, cast
			
 
				+
			
 
				+from langchain.document_loaders.base import BaseLoader
			
 
				+from langchain.document_loaders.helpers import detect_file_encodings
			
 
				+from langchain.schema import Document
			
 
				+
			
 
				+logger = logging.getLogger(__name__)
			
 
				+
			
 
				+
			
 
				+class UnstructuredMsgLoader(BaseLoader):
			
 
				+    """Load msg files.
			
 
				+
			
 
				+
			
 
				+    Args:
			
 
				+        file_path: Path to the file to load.
			
 
				+    """
			
 
				+
			
 
				+    def __init__(
			
 
				+        self,
			
 
				+        file_path: str,
			
 
				+        api_url: str
			
 
				+    ):
			
 
				+        """Initialize with file path and Unstructured API URL."""
			
 
				+        self._file_path = file_path
			
 
				+        self._api_url = api_url
			
 
				+
			
 
				+    def load(self) -> List[Document]:
			
 
				+        from unstructured.partition.msg import partition_msg
			
 
				+
			
 
				+        elements = partition_msg(filename=self._file_path, api_url=self._api_url)
			
 
				+        from unstructured.chunking.title import chunk_by_title
			
 
				+        chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
			
 
				+        documents = []
			
 
				+        for chunk in chunks:
			
 
				+            text = chunk.text.strip()
			
 
				+            documents.append(Document(page_content=text))
			
 
				+
			
 
				+        return documents
			
--- a/api/core/data_loader/loader/unstructured/unstructured_ppt.py
+++ b/api/core/data_loader/loader/unstructured/unstructured_ppt.py
@@ -0,0 +1,40 @@
 
				+import logging
			
 
				+import re
			
 
				+from typing import Optional, List, Tuple, cast
			
 
				+
			
 
				+from langchain.document_loaders.base import BaseLoader
			
 
				+from langchain.document_loaders.helpers import detect_file_encodings
			
 
				+from langchain.schema import Document
			
 
				+
			
 
				+logger = logging.getLogger(__name__)
			
 
				+
			
 
				+
			
 
				+class UnstructuredPPTLoader(BaseLoader):
			
 
				+    """Load ppt files.
			
 
				+
			
 
				+
			
 
				+    Args:
			
 
				+        file_path: Path to the file to load.
			
 
				+    """
			
 
				+
			
 
				+    def __init__(
			
 
				+        self,
			
 
				+        file_path: str,
			
 
				+        api_url: str
			
 
				+    ):
			
 
				+        """Initialize with file path and Unstructured API URL."""
			
 
				+        self._file_path = file_path
			
 
				+        self._api_url = api_url
			
 
				+
			
 
				+    def load(self) -> List[Document]:
			
 
				+        from unstructured.partition.ppt import partition_ppt
			
 
				+
			
 
				+        elements = partition_ppt(filename=self._file_path, api_url=self._api_url)
			
 
				+        from unstructured.chunking.title import chunk_by_title
			
 
				+        chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
			
 
				+        documents = []
			
 
				+        for chunk in chunks:
			
 
				+            text = chunk.text.strip()
			
 
				+            documents.append(Document(page_content=text))
			
 
				+
			
 
				+        return documents
			
--- a/api/core/data_loader/loader/unstructured/unstructured_pptx.py
+++ b/api/core/data_loader/loader/unstructured/unstructured_pptx.py
@@ -0,0 +1,40 @@
 
				+import logging
			
 
				+import re
			
 
				+from typing import Optional, List, Tuple, cast
			
 
				+
			
 
				+from langchain.document_loaders.base import BaseLoader
			
 
				+from langchain.document_loaders.helpers import detect_file_encodings
			
 
				+from langchain.schema import Document
			
 
				+
			
 
				+logger = logging.getLogger(__name__)
			
 
				+
			
 
				+
			
 
				+class UnstructuredPPTXLoader(BaseLoader):
			
 
				+    """Load pptx files.
			
 
				+
			
 
				+
			
 
				+    Args:
			
 
				+        file_path: Path to the file to load.
			
 
				+    """
			
 
				+
			
 
				+    def __init__(
			
 
				+        self,
			
 
				+        file_path: str,
			
 
				+        api_url: str
			
 
				+    ):
			
 
				+        """Initialize with file path and Unstructured API URL."""
			
 
				+        self._file_path = file_path
			
 
				+        self._api_url = api_url
			
 
				+
			
 
				+    def load(self) -> List[Document]:
			
 
				+        from unstructured.partition.pptx import partition_pptx
			
 
				+
			
 
				+        elements = partition_pptx(filename=self._file_path, api_url=self._api_url)
			
 
				+        from unstructured.chunking.title import chunk_by_title
			
 
				+        chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
			
 
				+        documents = []
			
 
				+        for chunk in chunks:
			
 
				+            text = chunk.text.strip()
			
 
				+            documents.append(Document(page_content=text))
			
 
				+
			
 
				+        return documents
			
--- a/api/core/data_loader/loader/unstructured/unstructured_text.py
+++ b/api/core/data_loader/loader/unstructured/unstructured_text.py
@@ -0,0 +1,40 @@
 
				+import logging
			
 
				+import re
			
 
				+from typing import Optional, List, Tuple, cast
			
 
				+
			
 
				+from langchain.document_loaders.base import BaseLoader
			
 
				+from langchain.document_loaders.helpers import detect_file_encodings
			
 
				+from langchain.schema import Document
			
 
				+
			
 
				+logger = logging.getLogger(__name__)
			
 
				+
			
 
				+
			
 
				+class UnstructuredTextLoader(BaseLoader):
			
 
				+    """Load text files.
			
 
				+
			
 
				+
			
 
				+    Args:
			
 
				+        file_path: Path to the file to load.
			
 
				+    """
			
 
				+
			
 
				+    def __init__(
			
 
				+        self,
			
 
				+        file_path: str,
			
 
				+        api_url: str
			
 
				+    ):
			
 
				+        """Initialize with file path and Unstructured API URL."""
			
 
				+        self._file_path = file_path
			
 
				+        self._api_url = api_url
			
 
				+
			
 
				+    def load(self) -> List[Document]:
			
 
				+        from unstructured.partition.text import partition_text
			
 
				+
			
 
				+        elements = partition_text(filename=self._file_path, api_url=self._api_url)
			
 
				+        from unstructured.chunking.title import chunk_by_title
			
 
				+        chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
			
 
				+        documents = []
			
 
				+        for chunk in chunks:
			
 
				+            text = chunk.text.strip()
			
 
				+            documents.append(Document(page_content=text))
			
 
				+
			
 
				+        return documents
			
--- a/api/core/data_loader/loader/unstructured/unstructured_xml.py
+++ b/api/core/data_loader/loader/unstructured/unstructured_xml.py
@@ -0,0 +1,40 @@
 
				+import logging
			
 
				+import re
			
 
				+from typing import Optional, List, Tuple, cast
			
 
				+
			
 
				+from langchain.document_loaders.base import BaseLoader
			
 
				+from langchain.document_loaders.helpers import detect_file_encodings
			
 
				+from langchain.schema import Document
			
 
				+
			
 
				+logger = logging.getLogger(__name__)
			
 
				+
			
 
				+
			
 
				+class UnstructuredXmlLoader(BaseLoader):
			
 
				+    """Load xml files.
			
 
				+
			
 
				+
			
 
				+    Args:
			
 
				+        file_path: Path to the file to load.
			
 
				+    """
			
 
				+
			
 
				+    def __init__(
			
 
				+        self,
			
 
				+        file_path: str,
			
 
				+        api_url: str
			
 
				+    ):
			
 
				+        """Initialize with file path and Unstructured API URL."""
			
 
				+        self._file_path = file_path
			
 
				+        self._api_url = api_url
			
 
				+
			
 
				+    def load(self) -> List[Document]:
			
 
				+        from unstructured.partition.xml import partition_xml
			
 
				+
			
 
				+        elements = partition_xml(filename=self._file_path, xml_keep_tags=True, api_url=self._api_url)
			
 
				+        from unstructured.chunking.title import chunk_by_title
			
 
				+        chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=0)
			
 
				+        documents = []
			
 
				+        for chunk in chunks:
			
 
				+            text = chunk.text.strip()
			
 
				+            documents.append(Document(page_content=text))
			
 
				+
			
 
				+        return documents
			
--- a/api/core/indexing_runner.py
+++ b/api/core/indexing_runner.py
@@ -397,7 +397,7 @@ class IndexingRunner:
 
				                 one_or_none()
			
 
				 
			
 
				             if file_detail:
			
 
				-                text_docs = FileExtractor.load(file_detail, is_automatic=False)
			
 
				+                text_docs = FileExtractor.load(file_detail, is_automatic=True)
			
 
				         elif dataset_document.data_source_type == 'notion_import':
			
 
				             loader = NotionLoader.from_document(dataset_document)
			
 
				             text_docs = loader.load()
			
--- a/api/models/dataset.py
+++ b/api/models/dataset.py
@@ -135,7 +135,7 @@ class DatasetProcessRule(db.Model):
 
				         ],
			
 
				         'segmentation': {
			
 
				             'delimiter': '\n',
			
 
				-            'max_tokens': 512
			
 
				+            'max_tokens': 1000
			
 
				         }
			
 
				     }
			
 
				 
			
--- a/api/requirements.txt
+++ b/api/requirements.txt
@@ -53,4 +53,6 @@ zhipuai==1.0.7
 
				 werkzeug==2.3.7
			
 
				 pymilvus==2.3.0
			
 
				 qdrant-client==1.6.4
			
 
				-cohere~=4.32
			
 
				+cohere~=4.32
			
 
				+unstructured~=0.10.27
			
 
				+unstructured[docx,pptx]~=0.10.27
			
--- a/api/services/file_service.py
+++ b/api/services/file_service.py
@@ -27,7 +27,13 @@ class FileService:
 
				     @staticmethod
			
 
				     def upload_file(file: FileStorage, user: Union[Account, EndUser], only_image: bool = False) -> UploadFile:
			
 
				         extension = file.filename.split('.')[-1]
			
 
				-        if extension.lower() not in ALLOWED_EXTENSIONS:
			
 
				+        etl_type = current_app.config['ETL_TYPE']
			
 
				+        if etl_type == 'Unstructured':
			
 
				+            allowed_extensions = ['txt', 'markdown', 'md', 'pdf', 'html', 'htm', 'xlsx',
			
 
				+                                  'docx', 'csv', 'eml', 'msg', 'pptx', 'ppt', 'xml']
			
 
				+        else:
			
 
				+            allowed_extensions = ['txt', 'markdown', 'md', 'pdf', 'html', 'htm', 'xlsx', 'docx', 'csv']
			
 
				+        if extension.lower() not in allowed_extensions:
			
 
				             raise UnsupportedFileTypeError()
			
 
				         elif only_image and extension.lower() not in IMAGE_EXTENSIONS:
			
 
				             raise UnsupportedFileTypeError()