
Fix some RAG bugs (#2570)

Co-authored-by: jyong <jyong@dify.ai>
Jyong, 1 year ago
Parent commit: 5b953c1ef2

+ 4 - 2
api/controllers/console/datasets/data_source.py

@@ -178,7 +178,8 @@ class DataSourceNotionApi(Resource):
             notion_workspace_id=workspace_id,
             notion_obj_id=page_id,
             notion_page_type=page_type,
-            notion_access_token=data_source_binding.access_token
+            notion_access_token=data_source_binding.access_token,
+            tenant_id=current_user.current_tenant_id
         )
 
         text_docs = extractor.extract()
@@ -208,7 +209,8 @@ class DataSourceNotionApi(Resource):
                     notion_info={
                         "notion_workspace_id": workspace_id,
                         "notion_obj_id": page['page_id'],
-                        "notion_page_type": page['type']
+                        "notion_page_type": page['type'],
+                        "tenant_id": current_user.current_tenant_id
                     },
                     document_model=args['doc_form']
                 )
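
The hunks above, and the matching ones in datasets.py, datasets_document.py, indexing_runner.py, and document_indexing_sync_task.py below, all add the same tenant_id key to the Notion extraction settings. In the web controllers it comes from current_user.current_tenant_id; in the background runner and sync task it is read from the stored document instead, since no logged-in user exists there. A minimal sketch of the payload shape, assuming the enclosing constructor in the full controller is ExtractSetting and using placeholder literals for the request-scoped values:

    from core.rag.extractor.entity.extract_setting import ExtractSetting

    extract_setting = ExtractSetting(
        datasource_type="notion_import",        # assumed literal; not shown in this diff
        notion_info={
            "notion_workspace_id": "ws-123",    # placeholder
            "notion_obj_id": "page-456",        # placeholder
            "notion_page_type": "page",         # placeholder
            "tenant_id": "tenant-789",          # current_user.current_tenant_id in the controllers
        },
        document_model="text_model",            # placeholder for args['doc_form']
    )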

+ 2 - 1
api/controllers/console/datasets/datasets.py

@@ -298,7 +298,8 @@ class DatasetIndexingEstimateApi(Resource):
                         notion_info={
                             "notion_workspace_id": workspace_id,
                             "notion_obj_id": page['page_id'],
-                            "notion_page_type": page['type']
+                            "notion_page_type": page['type'],
+                            "tenant_id": current_user.current_tenant_id
                         },
                         document_model=args['doc_form']
                     )

+ 2 - 1
api/controllers/console/datasets/datasets_document.py

@@ -455,7 +455,8 @@ class DocumentBatchIndexingEstimateApi(DocumentResource):
                     notion_info={
                         "notion_workspace_id": data_source_info['notion_workspace_id'],
                         "notion_obj_id": data_source_info['notion_page_id'],
-                        "notion_page_type": data_source_info['type']
+                        "notion_page_type": data_source_info['type'],
+                        "tenant_id": current_user.current_tenant_id
                     },
                     document_model=document.doc_form
                 )

+ 2 - 1
api/core/indexing_runner.py

@@ -366,7 +366,8 @@ class IndexingRunner:
                     "notion_workspace_id": data_source_info['notion_workspace_id'],
                     "notion_obj_id": data_source_info['notion_page_id'],
                     "notion_page_type": data_source_info['type'],
-                    "document": dataset_document
+                    "document": dataset_document,
+                    "tenant_id": dataset_document.tenant_id
                 },
                 document_model=dataset_document.doc_form
             )

+ 2 - 1
api/core/rag/datasource/retrieval_service.py

@@ -39,7 +39,8 @@ class RetrievalService:
                 'flask_app': current_app._get_current_object(),
                 'dataset_id': dataset_id,
                 'query': query,
-                'top_k': top_k
+                'top_k': top_k,
+                'all_documents': all_documents
             })
             threads.append(keyword_thread)
             keyword_thread.start()
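
Without the shared list, the keyword-search thread had no way to return its results to the caller; passing all_documents gives the thread target an accumulator to append into before the threads are joined. A minimal sketch of the pattern with a hypothetical thread target (the real one in retrieval_service.py is not shown in this diff; dataset_id, query, and top_k stand in for values from the surrounding method):

    import threading
    from flask import current_app

    def _keyword_search(flask_app, dataset_id, query, top_k, all_documents):
        # Hypothetical sketch: run keyword retrieval inside the app context
        # and extend the shared list the caller passed in.
        with flask_app.app_context():
            documents = []  # ... perform keyword retrieval here ...
            all_documents.extend(documents)

    all_documents = []
    keyword_thread = threading.Thread(target=_keyword_search, kwargs={
        'flask_app': current_app._get_current_object(),
        'dataset_id': dataset_id,
        'query': query,
        'top_k': top_k,
        'all_documents': all_documents,   # the shared accumulator added by this hunk
    })
    keyword_thread.start()
    keyword_thread.join()
    # all_documents now holds whatever the thread appended.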

+ 1 - 0
api/core/rag/extractor/entity/extract_setting.py

@@ -12,6 +12,7 @@ class NotionInfo(BaseModel):
     notion_obj_id: str
     notion_page_type: str
     document: Document = None
+    tenant_id: str
 
     class Config:
         arbitrary_types_allowed = True
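
Because tenant_id is declared on NotionInfo without a default, pydantic treats it as required, so a caller that forgets to thread the field through fails at validation time rather than deep inside the extractor. A small usage sketch with placeholder values:

    from core.rag.extractor.entity.extract_setting import NotionInfo

    info = NotionInfo(
        notion_workspace_id="ws-123",
        notion_obj_id="page-456",
        notion_page_type="page",
        tenant_id="tenant-789",   # omitting this now raises a pydantic ValidationError
    )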

+ 2 - 1
api/core/rag/extractor/extract_processor.py

@@ -132,7 +132,8 @@ class ExtractProcessor:
                 notion_workspace_id=extract_setting.notion_info.notion_workspace_id,
                 notion_obj_id=extract_setting.notion_info.notion_obj_id,
                 notion_page_type=extract_setting.notion_info.notion_page_type,
-                document_model=extract_setting.notion_info.document
+                document_model=extract_setting.notion_info.document,
+                tenant_id=extract_setting.notion_info.tenant_id,
             )
             return extractor.extract()
         else:
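
With the field carried on NotionInfo, the processor can hand it straight to NotionExtractor alongside the other Notion identifiers. From a caller's point of view nothing changes beyond building the setting shown earlier; a hedged sketch, assuming the class-level entry point accepts that ExtractSetting (parameter name inferred, not shown in this diff):

    from core.rag.extractor.extract_processor import ExtractProcessor

    # extract_setting is the ExtractSetting instance from the earlier sketch.
    text_docs = ExtractProcessor.extract(extract_setting=extract_setting)
    for doc in text_docs:
        print(doc.page_content[:200])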

+ 13 - 50
api/core/rag/extractor/html_extractor.py

@@ -1,13 +1,14 @@
 """Abstract interface for document loader implementations."""
-from typing import Optional
+from bs4 import BeautifulSoup
 
 from core.rag.extractor.extractor_base import BaseExtractor
-from core.rag.extractor.helpers import detect_file_encodings
 from core.rag.models.document import Document
 
 
 class HtmlExtractor(BaseExtractor):
-    """Load html files.
+
+    """
+    Load html files.
 
 
     Args:
@@ -15,57 +16,19 @@ class HtmlExtractor(BaseExtractor):
     """
 
     def __init__(
-            self,
-            file_path: str,
-            encoding: Optional[str] = None,
-            autodetect_encoding: bool = False,
-            source_column: Optional[str] = None,
-            csv_args: Optional[dict] = None,
+        self,
+        file_path: str
     ):
         """Initialize with file path."""
         self._file_path = file_path
-        self._encoding = encoding
-        self._autodetect_encoding = autodetect_encoding
-        self.source_column = source_column
-        self.csv_args = csv_args or {}
 
     def extract(self) -> list[Document]:
-        """Load data into document objects."""
-        try:
-            with open(self._file_path, newline="", encoding=self._encoding) as csvfile:
-                docs = self._read_from_file(csvfile)
-        except UnicodeDecodeError as e:
-            if self._autodetect_encoding:
-                detected_encodings = detect_file_encodings(self._file_path)
-                for encoding in detected_encodings:
-                    try:
-                        with open(self._file_path, newline="", encoding=encoding.encoding) as csvfile:
-                            docs = self._read_from_file(csvfile)
-                        break
-                    except UnicodeDecodeError:
-                        continue
-            else:
-                raise RuntimeError(f"Error loading {self._file_path}") from e
-
-        return docs
+        return [Document(page_content=self._load_as_text())]
 
-    def _read_from_file(self, csvfile) -> list[Document]:
-        docs = []
-        csv_reader = csv.DictReader(csvfile, **self.csv_args)  # type: ignore
-        for i, row in enumerate(csv_reader):
-            content = "\n".join(f"{k.strip()}: {v.strip()}" for k, v in row.items())
-            try:
-                source = (
-                    row[self.source_column]
-                    if self.source_column is not None
-                    else ''
-                )
-            except KeyError:
-                raise ValueError(
-                    f"Source column '{self.source_column}' not found in CSV file."
-                )
-            metadata = {"source": source, "row": i}
-            doc = Document(page_content=content, metadata=metadata)
-            docs.append(doc)
+    def _load_as_text(self) -> str:
+        with open(self._file_path, "rb") as fp:
+            soup = BeautifulSoup(fp, 'html.parser')
+            text = soup.get_text()
+            text = text.strip() if text else ''
 
-        return docs
+        return text
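
The extractor is rewritten to parse HTML with BeautifulSoup instead of the CSV-reading logic it previously carried, and it now returns the page's visible text as a single Document. Opening the file in binary mode lets BeautifulSoup detect the encoding itself, which is why the old encoding-detection branches could be dropped. A short usage sketch (the path is a placeholder; BeautifulSoup is provided by the beautifulsoup4 package):

    from core.rag.extractor.html_extractor import HtmlExtractor

    extractor = HtmlExtractor("/tmp/example.html")   # placeholder path
    docs = extractor.extract()
    # A single Document whose page_content is the stripped text of the page.
    print(docs[0].page_content)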

+ 3 - 1
api/core/rag/extractor/notion_extractor.py

@@ -30,8 +30,10 @@ class NotionExtractor(BaseExtractor):
             notion_workspace_id: str,
             notion_obj_id: str,
             notion_page_type: str,
+            tenant_id: str,
             document_model: Optional[DocumentModel] = None,
-            notion_access_token: Optional[str] = None
+            notion_access_token: Optional[str] = None,
+
     ):
         self._notion_access_token = None
         self._document_model = document_model

+ 2 - 1
api/tasks/document_indexing_sync_task.py

@@ -58,7 +58,8 @@ def document_indexing_sync_task(dataset_id: str, document_id: str):
             notion_workspace_id=workspace_id,
             notion_obj_id=page_id,
             notion_page_type=page_type,
-            notion_access_token=data_source_binding.access_token
+            notion_access_token=data_source_binding.access_token,
+            tenant_id=document.tenant_id
         )
 
         last_edited_time = loader.get_notion_last_edited_time()
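
The sync task builds the extractor the same way and then asks Notion for the page's last-edited timestamp to decide whether the document needs re-indexing. A hedged sketch of that check; the stored field name is inferred from the surrounding task and not shown in this diff:

    # loader and data_source_info come from the surrounding (elided) task code.
    last_edited_time = loader.get_notion_last_edited_time()
    if last_edited_time != data_source_info.get('last_edited_time'):
        # The page changed in Notion since the last sync: clean up the old
        # index records and re-run indexing for this document.
        pass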