|
@@ -13,7 +13,7 @@ from core.docstore.dataset_docstore import DatasetDocumentStore
|
|
|
from core.errors.error import ProviderTokenNotInitError
|
|
|
from core.generator.llm_generator import LLMGenerator
|
|
|
from core.index.index import IndexBuilder
|
|
|
-from core.model_manager import ModelManager
|
|
|
+from core.model_manager import ModelManager, ModelInstance
|
|
|
from core.model_runtime.entities.model_entities import ModelType, PriceType
|
|
|
from core.model_runtime.model_providers.__base.large_language_model import LargeLanguageModel
|
|
|
from core.model_runtime.model_providers.__base.text_embedding_model import TextEmbeddingModel
|
|
@@ -61,8 +61,24 @@ class IndexingRunner:
|
|
|
# load file
|
|
|
text_docs = self._load_data(dataset_document, processing_rule.mode == 'automatic')
|
|
|
|
|
|
+ # get embedding model instance
|
|
|
+ embedding_model_instance = None
|
|
|
+ if dataset.indexing_technique == 'high_quality':
|
|
|
+ if dataset.embedding_model_provider:
|
|
|
+ embedding_model_instance = self.model_manager.get_model_instance(
|
|
|
+ tenant_id=dataset.tenant_id,
|
|
|
+ provider=dataset.embedding_model_provider,
|
|
|
+ model_type=ModelType.TEXT_EMBEDDING,
|
|
|
+ model=dataset.embedding_model
|
|
|
+ )
|
|
|
+ else:
|
|
|
+ embedding_model_instance = self.model_manager.get_default_model_instance(
|
|
|
+ tenant_id=dataset.tenant_id,
|
|
|
+ model_type=ModelType.TEXT_EMBEDDING,
|
|
|
+ )
|
|
|
+
|
|
|
# get splitter
|
|
|
- splitter = self._get_splitter(processing_rule)
|
|
|
+ splitter = self._get_splitter(processing_rule, embedding_model_instance)
|
|
|
|
|
|
# split to documents
|
|
|
documents = self._step_split(
|
|
@@ -121,8 +137,24 @@ class IndexingRunner:
|
|
|
# load file
|
|
|
text_docs = self._load_data(dataset_document, processing_rule.mode == 'automatic')
|
|
|
|
|
|
+ # get embedding model instance
|
|
|
+ embedding_model_instance = None
|
|
|
+ if dataset.indexing_technique == 'high_quality':
|
|
|
+ if dataset.embedding_model_provider:
|
|
|
+ embedding_model_instance = self.model_manager.get_model_instance(
|
|
|
+ tenant_id=dataset.tenant_id,
|
|
|
+ provider=dataset.embedding_model_provider,
|
|
|
+ model_type=ModelType.TEXT_EMBEDDING,
|
|
|
+ model=dataset.embedding_model
|
|
|
+ )
|
|
|
+ else:
|
|
|
+ embedding_model_instance = self.model_manager.get_default_model_instance(
|
|
|
+ tenant_id=dataset.tenant_id,
|
|
|
+ model_type=ModelType.TEXT_EMBEDDING,
|
|
|
+ )
|
|
|
+
|
|
|
# get splitter
|
|
|
- splitter = self._get_splitter(processing_rule)
|
|
|
+ splitter = self._get_splitter(processing_rule, embedding_model_instance)
|
|
|
|
|
|
# split to documents
|
|
|
documents = self._step_split(
|
|
@@ -253,7 +285,7 @@ class IndexingRunner:
|
|
|
text_docs = FileExtractor.load(file_detail, is_automatic=processing_rule.mode == 'automatic')
|
|
|
|
|
|
# get splitter
|
|
|
- splitter = self._get_splitter(processing_rule)
|
|
|
+ splitter = self._get_splitter(processing_rule, embedding_model_instance)
|
|
|
|
|
|
# split to documents
|
|
|
documents = self._split_to_documents_for_estimate(
|
|
@@ -384,7 +416,7 @@ class IndexingRunner:
|
|
|
)
|
|
|
|
|
|
# get splitter
|
|
|
- splitter = self._get_splitter(processing_rule)
|
|
|
+ splitter = self._get_splitter(processing_rule, embedding_model_instance)
|
|
|
|
|
|
# split to documents
|
|
|
documents = self._split_to_documents_for_estimate(
|
|
@@ -502,7 +534,8 @@ class IndexingRunner:
|
|
|
text = re.sub(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F\x80-\xFF]', '', text)
|
|
|
return text
|
|
|
|
|
|
- def _get_splitter(self, processing_rule: DatasetProcessRule) -> TextSplitter:
|
|
|
+ def _get_splitter(self, processing_rule: DatasetProcessRule,
|
|
|
+ embedding_model_instance: Optional[ModelInstance]) -> TextSplitter:
|
|
|
"""
|
|
|
Get the NodeParser object according to the processing rule.
|
|
|
"""
|
|
@@ -517,19 +550,20 @@ class IndexingRunner:
|
|
|
if separator:
|
|
|
separator = separator.replace('\\n', '\n')
|
|
|
|
|
|
-
|
|
|
- character_splitter = FixedRecursiveCharacterTextSplitter.from_gpt2_encoder(
|
|
|
+ character_splitter = FixedRecursiveCharacterTextSplitter.from_encoder(
|
|
|
chunk_size=segmentation["max_tokens"],
|
|
|
chunk_overlap=0,
|
|
|
fixed_separator=separator,
|
|
|
- separators=["\n\n", "。", ".", " ", ""]
|
|
|
+ separators=["\n\n", "。", ".", " ", ""],
|
|
|
+ embedding_model_instance=embedding_model_instance
|
|
|
)
|
|
|
else:
|
|
|
# Automatic segmentation
|
|
|
- character_splitter = EnhanceRecursiveCharacterTextSplitter.from_gpt2_encoder(
|
|
|
+ character_splitter = EnhanceRecursiveCharacterTextSplitter.from_encoder(
|
|
|
chunk_size=DatasetProcessRule.AUTOMATIC_RULES['segmentation']['max_tokens'],
|
|
|
chunk_overlap=0,
|
|
|
- separators=["\n\n", "。", ".", " ", ""]
|
|
|
+ separators=["\n\n", "。", ".", " ", ""],
|
|
|
+ embedding_model_instance=embedding_model_instance
|
|
|
)
|
|
|
|
|
|
return character_splitter
|
|
@@ -714,7 +748,7 @@ class IndexingRunner:
|
|
|
return text
|
|
|
|
|
|
def format_split_text(self, text):
|
|
|
- regex = r"Q\d+:\s*(.*?)\s*A\d+:\s*([\s\S]*?)(?=Q\d+:|$)"
|
|
|
+ regex = r"Q\d+:\s*(.*?)\s*A\d+:\s*([\s\S]*?)(?=Q\d+:|$)"
|
|
|
matches = re.findall(regex, text, re.UNICODE)
|
|
|
|
|
|
return [
|