@@ -59,7 +59,7 @@ class IndexingRunner:
             first()

         # load file
-        text_docs = self._load_data(dataset_document)
+        text_docs = self._load_data(dataset_document, processing_rule.mode == 'automatic')

         # get splitter
         splitter = self._get_splitter(processing_rule)
@@ -113,15 +113,14 @@ class IndexingRunner:
         for document_segment in document_segments:
             db.session.delete(document_segment)
         db.session.commit()
-
-        # load file
-        text_docs = self._load_data(dataset_document)
-
         # get the process rule
         processing_rule = db.session.query(DatasetProcessRule). \
             filter(DatasetProcessRule.id == dataset_document.dataset_process_rule_id). \
             first()

+        # load file
+        text_docs = self._load_data(dataset_document, processing_rule.mode == 'automatic')
+
         # get splitter
         splitter = self._get_splitter(processing_rule)
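Note: both indexing paths above now derive the flag from the rule's mode instead of hard-coding it, which is also why the load call moved below the process-rule query: processing_rule must exist before its mode can be read. A minimal sketch of the assumed contract ('custom' as the other mode value is an assumption; the diff only shows 'automatic'):

    # assumption: DatasetProcessRule.mode is a plain string, e.g. 'automatic' or 'custom'
    is_automatic = processing_rule.mode == 'automatic'
    text_docs = self._load_data(dataset_document, is_automatic)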
@@ -238,14 +237,15 @@ class IndexingRunner:
         preview_texts = []
         total_segments = 0
         for file_detail in file_details:
-            # load data from file
-            text_docs = FileExtractor.load(file_detail)

             processing_rule = DatasetProcessRule(
                 mode=tmp_processing_rule["mode"],
                 rules=json.dumps(tmp_processing_rule["rules"])
             )

+            # load data from file
+            text_docs = FileExtractor.load(file_detail, is_automatic=processing_rule.mode == 'automatic')
+
             # get splitter
             splitter = self._get_splitter(processing_rule)
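Note: in the preview path the flag goes straight to the extractor. FileExtractor's internals are not part of this diff; the sketch below is an illustrative assumption about the dispatch the new keyword implies, not the project's actual implementation:

    from dataclasses import dataclass
    from typing import List

    @dataclass
    class Document:
        page_content: str

    class IllustrativeFileExtractor:
        """Stand-in for the real FileExtractor; only the keyword contract matters."""

        @classmethod
        def load(cls, file_path: str, is_automatic: bool = False) -> List[Document]:
            if is_automatic:
                # 'automatic' rule mode presumably selects a richer, structure-aware parser
                return cls._load_automatic(file_path)
            # default path: plain text extraction
            with open(file_path, encoding='utf-8') as f:
                return [Document(page_content=f.read())]

        @classmethod
        def _load_automatic(cls, file_path: str) -> List[Document]:
            # placeholder for the automatic extraction pipeline
            with open(file_path, encoding='utf-8') as f:
                return [Document(page_content=f.read())]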
@@ -459,7 +459,7 @@ class IndexingRunner:
             one_or_none()

         if file_detail:
-            text_docs = FileExtractor.load(file_detail, is_automatic=True)
+            text_docs = FileExtractor.load(file_detail, is_automatic=automatic)
         elif dataset_document.data_source_type == 'notion_import':
             loader = NotionLoader.from_document(dataset_document)
             text_docs = loader.load()
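Note: this last hunk is inside _load_data itself, where the previously hard-coded is_automatic=True becomes the automatic argument that the call sites above now supply. The updated signature is not shown in the diff; a minimal sketch of how the flag presumably threads through (signature, default value, and the empty-list fallback are assumptions):

    def _load_data(self, dataset_document, automatic: bool = False) -> list:
        if dataset_document.data_source_type == 'upload_file':
            file_detail = ...  # the one_or_none() lookup shown above
            if file_detail:
                # forward the callers' flag instead of hard-coding True
                return FileExtractor.load(file_detail, is_automatic=automatic)
        elif dataset_document.data_source_type == 'notion_import':
            # the Notion path is unchanged; the flag only affects file extraction
            loader = NotionLoader.from_document(dataset_document)
            return loader.load()
        return []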
|