Procházet zdrojové kódy

fix custom splitter character (#1915)

Co-authored-by: jyong <jyong@dify.ai>
Jyong před 1 rokem
rodič
revize
4a3d15b6de

+ 4 - 2
api/core/data_loader/file_extractor.py

@@ -65,7 +65,8 @@ class FileExtractor:
             elif file_extension == '.pdf':
                 loader = PdfLoader(file_path, upload_file=upload_file)
             elif file_extension in ['.md', '.markdown']:
-                loader = UnstructuredMarkdownLoader(file_path, unstructured_api_url)
+                loader = UnstructuredMarkdownLoader(file_path, unstructured_api_url) if is_automatic \
+                    else MarkdownLoader(file_path, autodetect_encoding=True)
             elif file_extension in ['.htm', '.html']:
                 loader = HTMLLoader(file_path)
             elif file_extension == '.docx':
@@ -84,7 +85,8 @@ class FileExtractor:
                 loader = UnstructuredXmlLoader(file_path, unstructured_api_url)
             else:
                 # txt
-                loader = UnstructuredTextLoader(file_path, unstructured_api_url)
+                loader = UnstructuredTextLoader(file_path, unstructured_api_url) if is_automatic \
+                    else TextLoader(file_path, autodetect_encoding=True)
         else:
             if file_extension == '.xlsx':
                 loader = ExcelLoader(file_path)

+ 8 - 8
api/core/indexing_runner.py

@@ -59,7 +59,7 @@ class IndexingRunner:
                     first()
 
                 # load file
-                text_docs = self._load_data(dataset_document)
+                text_docs = self._load_data(dataset_document, processing_rule.mode == 'automatic')
 
                 # get splitter
                 splitter = self._get_splitter(processing_rule)
@@ -113,15 +113,14 @@ class IndexingRunner:
             for document_segment in document_segments:
                 db.session.delete(document_segment)
             db.session.commit()
-
-            # load file
-            text_docs = self._load_data(dataset_document)
-
             # get the process rule
             processing_rule = db.session.query(DatasetProcessRule). \
                 filter(DatasetProcessRule.id == dataset_document.dataset_process_rule_id). \
                 first()
 
+            # load file
+            text_docs = self._load_data(dataset_document, processing_rule.mode == 'automatic')
+
             # get splitter
             splitter = self._get_splitter(processing_rule)
 
@@ -238,14 +237,15 @@ class IndexingRunner:
         preview_texts = []
         total_segments = 0
         for file_detail in file_details:
-            # load data from file
-            text_docs = FileExtractor.load(file_detail)
 
             processing_rule = DatasetProcessRule(
                 mode=tmp_processing_rule["mode"],
                 rules=json.dumps(tmp_processing_rule["rules"])
             )
 
+            # load data from file
+            text_docs = FileExtractor.load(file_detail, is_automatic=processing_rule.mode == 'automatic')
+
             # get splitter
             splitter = self._get_splitter(processing_rule)
 
@@ -459,7 +459,7 @@ class IndexingRunner:
                 one_or_none()
 
             if file_detail:
-                text_docs = FileExtractor.load(file_detail, is_automatic=True)
+                text_docs = FileExtractor.load(file_detail, is_automatic=automatic)
         elif dataset_document.data_source_type == 'notion_import':
             loader = NotionLoader.from_document(dataset_document)
             text_docs = loader.load()