|
@@ -62,7 +62,8 @@ class IndexingRunner:
|
|
|
text_docs = self._extract(index_processor, dataset_document, processing_rule.to_dict())
|
|
|
|
|
|
# transform
|
|
|
- documents = self._transform(index_processor, dataset, text_docs, processing_rule.to_dict())
|
|
|
+ documents = self._transform(index_processor, dataset, text_docs, dataset_document.doc_language,
|
|
|
+ processing_rule.to_dict())
|
|
|
# save segment
|
|
|
self._load_segments(dataset, dataset_document, documents)
|
|
|
|
|
@@ -120,7 +121,8 @@ class IndexingRunner:
|
|
|
text_docs = self._extract(index_processor, dataset_document, processing_rule.to_dict())
|
|
|
|
|
|
# transform
|
|
|
- documents = self._transform(index_processor, dataset, text_docs, processing_rule.to_dict())
|
|
|
+ documents = self._transform(index_processor, dataset, text_docs, dataset_document.doc_language,
|
|
|
+ processing_rule.to_dict())
|
|
|
# save segment
|
|
|
self._load_segments(dataset, dataset_document, documents)
|
|
|
|
|
@@ -750,7 +752,7 @@ class IndexingRunner:
|
|
|
index_processor.load(dataset, documents)
|
|
|
|
|
|
def _transform(self, index_processor: BaseIndexProcessor, dataset: Dataset,
|
|
|
- text_docs: list[Document], process_rule: dict) -> list[Document]:
|
|
|
+ text_docs: list[Document], doc_language: str, process_rule: dict) -> list[Document]:
|
|
|
# get embedding model instance
|
|
|
embedding_model_instance = None
|
|
|
if dataset.indexing_technique == 'high_quality':
|
|
@@ -768,7 +770,8 @@ class IndexingRunner:
|
|
|
)
|
|
|
|
|
|
documents = index_processor.transform(text_docs, embedding_model_instance=embedding_model_instance,
|
|
|
- process_rule=process_rule)
|
|
|
+ process_rule=process_rule, tenant_id=dataset.tenant_id,
|
|
|
+ doc_language=doc_language)
|
|
|
|
|
|
return documents
|
|
|
|