преди 7 месеца · 14af87527f
--- a/api/core/indexing_runner.py
+++ b/api/core/indexing_runner.py
@@ -16,9 +16,7 @@ from configs import dify_config
 
				 from core.errors.error import ProviderTokenNotInitError
			
 
				 from core.llm_generator.llm_generator import LLMGenerator
			
 
				 from core.model_manager import ModelInstance, ModelManager
			
 
				-from core.model_runtime.entities.model_entities import ModelType, PriceType
			
 
				-from core.model_runtime.model_providers.__base.large_language_model import LargeLanguageModel
			
 
				-from core.model_runtime.model_providers.__base.text_embedding_model import TextEmbeddingModel
			
 
				+from core.model_runtime.entities.model_entities import ModelType
			
 
				 from core.rag.datasource.keyword.keyword_factory import Keyword
			
 
				 from core.rag.docstore.dataset_docstore import DatasetDocumentStore
			
 
				 from core.rag.extractor.entity.extract_setting import ExtractSetting
			
@@ -255,11 +253,8 @@ class IndexingRunner:
 
				                     tenant_id=tenant_id,
			
 
				                     model_type=ModelType.TEXT_EMBEDDING,
			
 
				                 )
			
 
				-        tokens = 0
			
 
				         preview_texts = []
			
 
				         total_segments = 0
			
 
				-        total_price = 0
			
 
				-        currency = 'USD'
			
 
				         index_type = doc_form
			
 
				         index_processor = IndexProcessorFactory(index_type).init_index_processor()
			
 
				         all_text_docs = []
			
@@ -286,54 +281,22 @@ class IndexingRunner:
 
				             for document in documents:
			
 
				                 if len(preview_texts) < 5:
			
 
				                     preview_texts.append(document.page_content)
			
 
				-                if indexing_technique == 'high_quality' or embedding_model_instance:
			
 
				-                    tokens += embedding_model_instance.get_text_embedding_num_tokens(
			
 
				-                        texts=[self.filter_string(document.page_content)]
			
 
				-                    )
			
 
				 
			
 
				         if doc_form and doc_form == 'qa_model':
			
 
				-            model_instance = self.model_manager.get_default_model_instance(
			
 
				-                tenant_id=tenant_id,
			
 
				-                model_type=ModelType.LLM
			
 
				-            )
			
 
				-
			
 
				-            model_type_instance = model_instance.model_type_instance
			
 
				-            model_type_instance = cast(LargeLanguageModel, model_type_instance)
			
 
				 
			
 
				             if len(preview_texts) > 0:
			
 
				                 # qa model document
			
 
				                 response = LLMGenerator.generate_qa_document(current_user.current_tenant_id, preview_texts[0],
			
 
				                                                              doc_language)
			
 
				                 document_qa_list = self.format_split_text(response)
			
 
				-                price_info = model_type_instance.get_price(
			
 
				-                    model=model_instance.model,
			
 
				-                    credentials=model_instance.credentials,
			
 
				-                    price_type=PriceType.INPUT,
			
 
				-                    tokens=total_segments * 2000,
			
 
				-                )
			
 
				+
			
 
				                 return {
			
 
				                     "total_segments": total_segments * 20,
			
 
				-                    "tokens": total_segments * 2000,
			
 
				-                    "total_price": '{:f}'.format(price_info.total_amount),
			
 
				-                    "currency": price_info.currency,
			
 
				                     "qa_preview": document_qa_list,
			
 
				                     "preview": preview_texts
			
 
				                 }
			
 
				-        if embedding_model_instance:
			
 
				-            embedding_model_type_instance = cast(TextEmbeddingModel, embedding_model_instance.model_type_instance)
			
 
				-            embedding_price_info = embedding_model_type_instance.get_price(
			
 
				-                model=embedding_model_instance.model,
			
 
				-                credentials=embedding_model_instance.credentials,
			
 
				-                price_type=PriceType.INPUT,
			
 
				-                tokens=tokens
			
 
				-            )
			
 
				-            total_price = '{:f}'.format(embedding_price_info.total_amount)
			
 
				-            currency = embedding_price_info.currency
			
 
				         return {
			
 
				             "total_segments": total_segments,
			
 
				-            "tokens": tokens,
			
 
				-            "total_price": total_price,
			
 
				-            "currency": currency,
			
 
				             "preview": preview_texts
			
 
				         }
			
 
				 
			
--- a/api/core/rag/splitter/text_splitter.py
+++ b/api/core/rag/splitter/text_splitter.py
@@ -108,7 +108,7 @@ class TextSplitter(BaseDocumentTransformer, ABC):
 
				         else:
			
 
				             return text
			
 
				 
			
 
				-    def _merge_splits(self, splits: Iterable[str], separator: str) -> list[str]:
			
 
				+    def _merge_splits(self, splits: Iterable[str], separator: str, lengths: list[int]) -> list[str]:
			
 
				         # We now want to combine these smaller pieces into medium size
			
 
				         # chunks to send to the LLM.
			
 
				         separator_len = self._length_function(separator)
			
@@ -116,8 +116,9 @@ class TextSplitter(BaseDocumentTransformer, ABC):
 
				         docs = []
			
 
				         current_doc: list[str] = []
			
 
				         total = 0
			
 
				+        index = 0
			
 
				         for d in splits:
			
 
				-            _len = self._length_function(d)
			
 
				+            _len = lengths[index]
			
 
				             if (
			
 
				                     total + _len + (separator_len if len(current_doc) > 0 else 0)
			
 
				                     > self._chunk_size
			
@@ -145,6 +146,7 @@ class TextSplitter(BaseDocumentTransformer, ABC):
 
				                         current_doc = current_doc[1:]
			
 
				             current_doc.append(d)
			
 
				             total += _len + (separator_len if len(current_doc) > 1 else 0)
			
 
				+            index += 1
			
 
				         doc = self._join_docs(current_doc, separator)
			
 
				         if doc is not None:
			
 
				             docs.append(doc)
			
@@ -493,11 +495,10 @@ class RecursiveCharacterTextSplitter(TextSplitter):
 
				         self._separators = separators or ["\n\n", "\n", " ", ""]
			
 
				 
			
 
				     def _split_text(self, text: str, separators: list[str]) -> list[str]:
			
 
				-        """Split incoming text and return chunks."""
			
 
				         final_chunks = []
			
 
				-        # Get appropriate separator to use
			
 
				         separator = separators[-1]
			
 
				         new_separators = []
			
 
				+
			
 
				         for i, _s in enumerate(separators):
			
 
				             if _s == "":
			
 
				                 separator = _s
			
@@ -508,25 +509,31 @@ class RecursiveCharacterTextSplitter(TextSplitter):
 
				                 break
			
 
				 
			
 
				         splits = _split_text_with_regex(text, separator, self._keep_separator)
			
 
				-        # Now go merging things, recursively splitting longer texts.
			
 
				         _good_splits = []
			
 
				+        _good_splits_lengths = []  # cache the lengths of the splits
			
 
				         _separator = "" if self._keep_separator else separator
			
 
				+
			
 
				         for s in splits:
			
 
				-            if self._length_function(s) < self._chunk_size:
			
 
				+            s_len = self._length_function(s)
			
 
				+            if s_len < self._chunk_size:
			
 
				                 _good_splits.append(s)
			
 
				+                _good_splits_lengths.append(s_len)
			
 
				             else:
			
 
				                 if _good_splits:
			
 
				-                    merged_text = self._merge_splits(_good_splits, _separator)
			
 
				+                    merged_text = self._merge_splits(_good_splits, _separator, _good_splits_lengths)
			
 
				                     final_chunks.extend(merged_text)
			
 
				                     _good_splits = []
			
 
				+                    _good_splits_lengths = []
			
 
				                 if not new_separators:
			
 
				                     final_chunks.append(s)
			
 
				                 else:
			
 
				                     other_info = self._split_text(s, new_separators)
			
 
				                     final_chunks.extend(other_info)
			
 
				+
			
 
				         if _good_splits:
			
 
				-            merged_text = self._merge_splits(_good_splits, _separator)
			
 
				+            merged_text = self._merge_splits(_good_splits, _separator, _good_splits_lengths)
			
 
				             final_chunks.extend(merged_text)
			
 
				+
			
 
				         return final_chunks
			
 
				 
			
 
				     def split_text(self, text: str) -> list[str]:
			
--- a/api/services/dataset_service.py
+++ b/api/services/dataset_service.py
@@ -1054,7 +1054,6 @@ class DocumentService:
 
				 
			
 
				             DocumentService.check_documents_upload_quota(count, features)
			
 
				 
			
 
				-        embedding_model = None
			
 
				         dataset_collection_binding_id = None
			
 
				         retrieval_model = None
			
 
				         if document_data["indexing_technique"] == "high_quality":
			
@@ -1082,10 +1081,10 @@ class DocumentService:
 
				             tenant_id=tenant_id,
			
 
				             name="",
			
 
				             data_source_type=document_data["data_source"]["type"],
			
 
				-            indexing_technique=document_data["indexing_technique"],
			
 
				+            indexing_technique=document_data.get("indexing_technique", "high_quality"),
			
 
				             created_by=account.id,
			
 
				-            embedding_model=embedding_model.model if embedding_model else None,
			
 
				-            embedding_model_provider=embedding_model.provider if embedding_model else None,
			
 
				+            embedding_model=document_data.get("embedding_model"),
			
 
				+            embedding_model_provider=document_data.get("embedding_model_provider"),
			
 
				             collection_binding_id=dataset_collection_binding_id,
			
 
				             retrieval_model=retrieval_model,
			
 
				         )
			
--- a/web/app/components/datasets/common/retrieval-method-config/index.tsx
+++ b/web/app/components/datasets/common/retrieval-method-config/index.tsx
@@ -11,6 +11,11 @@ import { FileSearch02 } from '@/app/components/base/icons/src/vender/solid/files
 
				 import { useProviderContext } from '@/context/provider-context'
			
 
				 import { useDefaultModel } from '@/app/components/header/account-setting/model-provider-page/hooks'
			
 
				 import { ModelTypeEnum } from '@/app/components/header/account-setting/model-provider-page/declarations'
			
 
				+import {
			
 
				+  DEFAULT_WEIGHTED_SCORE,
			
 
				+  RerankingModeEnum,
			
 
				+  WeightedScoreEnum,
			
 
				+} from '@/models/datasets'
			
 
				 
			
 
				 type Props = {
			
 
				   value: RetrievalConfig
			
@@ -32,6 +37,18 @@ const RetrievalMethodConfig: FC<Props> = ({
 
				           reranking_provider_name: rerankDefaultModel?.provider.provider || '',
			
 
				           reranking_model_name: rerankDefaultModel?.model || '',
			
 
				         },
			
 
				+        reranking_mode: passValue.reranking_mode || (rerankDefaultModel ? RerankingModeEnum.RerankingModel : RerankingModeEnum.WeightedScore),
			
 
				+        weights: passValue.weights || {
			
 
				+          weight_type: WeightedScoreEnum.Customized,
			
 
				+          vector_setting: {
			
 
				+            vector_weight: DEFAULT_WEIGHTED_SCORE.other.semantic,
			
 
				+            embedding_provider_name: '',
			
 
				+            embedding_model_name: '',
			
 
				+          },
			
 
				+          keyword_setting: {
			
 
				+            keyword_weight: DEFAULT_WEIGHTED_SCORE.other.keyword,
			
 
				+          },
			
 
				+        },
			
 
				       }
			
 
				     }
			
 
				     return passValue
			
--- a/web/app/components/datasets/create/embedding-process/index.tsx
+++ b/web/app/components/datasets/create/embedding-process/index.tsx
@@ -13,8 +13,7 @@ import cn from '@/utils/classnames'
 
				 import { FieldInfo } from '@/app/components/datasets/documents/detail/metadata'
			
 
				 import Button from '@/app/components/base/button'
			
 
				 import type { FullDocumentDetail, IndexingStatusResponse, ProcessRuleResponse } from '@/models/datasets'
			
 
				-import { formatNumber } from '@/utils/format'
			
 
				-import { fetchIndexingStatusBatch as doFetchIndexingStatus, fetchIndexingEstimateBatch, fetchProcessRule } from '@/service/datasets'
			
 
				+import { fetchIndexingStatusBatch as doFetchIndexingStatus, fetchProcessRule } from '@/service/datasets'
			
 
				 import { DataSourceType } from '@/models/datasets'
			
 
				 import NotionIcon from '@/app/components/base/notion-icon'
			
 
				 import PriorityLabel from '@/app/components/billing/priority-label'
			
@@ -142,14 +141,6 @@ const EmbeddingProcess: FC<Props> = ({ datasetId, batchId, documents = [], index
 
				   }, apiParams => fetchProcessRule(omit(apiParams, 'action')), {
			
 
				     revalidateOnFocus: false,
			
 
				   })
			
 
				-  // get cost
			
 
				-  const { data: indexingEstimateDetail } = useSWR({
			
 
				-    action: 'fetchIndexingEstimateBatch',
			
 
				-    datasetId,
			
 
				-    batchId,
			
 
				-  }, apiParams => fetchIndexingEstimateBatch(omit(apiParams, 'action')), {
			
 
				-    revalidateOnFocus: false,
			
 
				-  })
			
 
				 
			
 
				   const router = useRouter()
			
 
				   const navToDocumentList = () => {
			
@@ -190,28 +181,11 @@ const EmbeddingProcess: FC<Props> = ({ datasetId, batchId, documents = [], index
 
				 
			
 
				   return (
			
 
				     <>
			
 
				-      <div className='h-5 flex justify-between items-center mb-5'>
			
 
				+      <div className='h-5 flex items-center mb-5'>
			
 
				         <div className={s.embeddingStatus}>
			
 
				           {isEmbedding && t('datasetDocuments.embedding.processing')}
			
 
				           {isEmbeddingCompleted && t('datasetDocuments.embedding.completed')}
			
 
				         </div>
			
 
				-        <div className={s.cost}>
			
 
				-          {indexingType === 'high_quality' && (
			
 
				-            <div className='flex items-center'>
			
 
				-              <div className={cn(s.commonIcon, s.highIcon)} />
			
 
				-              {t('datasetDocuments.embedding.highQuality')} · {t('datasetDocuments.embedding.estimate')}
			
 
				-              <span className={s.tokens}>{formatNumber(indexingEstimateDetail?.tokens || 0)}</span>tokens
			
 
				-              (<span className={s.price}>${formatNumber(indexingEstimateDetail?.total_price || 0)}</span>)
			
 
				-            </div>
			
 
				-          )}
			
 
				-          {indexingType === 'economy' && (
			
 
				-            <div className='flex items-center'>
			
 
				-              <div className={cn(s.commonIcon, s.economyIcon)} />
			
 
				-              {t('datasetDocuments.embedding.economy')} · {t('datasetDocuments.embedding.estimate')}
			
 
				-              <span className={s.tokens}>0</span>tokens
			
 
				-            </div>
			
 
				-          )}
			
 
				-        </div>
			
 
				       </div>
			
 
				       {
			
 
				         enableBilling && plan.type !== Plan.team && (
			
--- a/web/app/components/datasets/create/step-two/index.module.css
+++ b/web/app/components/datasets/create/step-two/index.module.css
@@ -30,7 +30,7 @@
 
				 }
			
 
				 
			
 
				 .indexItem {
			
 
				-  min-height: 146px;
			
 
				+  min-height: 126px;
			
 
				 }
			
 
				 
			
 
				 .indexItem .disableMask {
			
@@ -121,10 +121,6 @@
 
				   @apply pb-1;
			
 
				 }
			
 
				 
			
 
				-.radioItem.indexItem .typeHeader .tip {
			
 
				-  @apply pb-3;
			
 
				-}
			
 
				-
			
 
				 .radioItem .typeIcon {
			
 
				   position: absolute;
			
 
				   top: 18px;
			
@@ -264,7 +260,7 @@
 
				 }
			
 
				 
			
 
				 .input {
			
 
				-  @apply inline-flex h-9 w-full py-1 px-2 rounded-lg text-xs leading-normal;
			
 
				+  @apply inline-flex h-9 w-full py-1 px-2 pr-14 rounded-lg text-xs leading-normal;
			
 
				   @apply bg-gray-100 caret-primary-600 hover:bg-gray-100 focus:ring-1 focus:ring-inset focus:ring-gray-200 focus-visible:outline-none focus:bg-white placeholder:text-gray-400;
			
 
				 }
			
 
				 
			
--- a/web/app/components/datasets/create/step-two/index.tsx
+++ b/web/app/components/datasets/create/step-two/index.tsx
@@ -14,7 +14,7 @@ import PreviewItem, { PreviewType } from './preview-item'
 
				 import LanguageSelect from './language-select'
			
 
				 import s from './index.module.css'
			
 
				 import cn from '@/utils/classnames'
			
 
				-import type { CrawlOptions, CrawlResultItem, CreateDocumentReq, CustomFile, FileIndexingEstimateResponse, FullDocumentDetail, IndexingEstimateParams, IndexingEstimateResponse, NotionInfo, PreProcessingRule, ProcessRule, Rules, createDocumentResponse } from '@/models/datasets'
			
 
				+import type { CrawlOptions, CrawlResultItem, CreateDocumentReq, CustomFile, FileIndexingEstimateResponse, FullDocumentDetail, IndexingEstimateParams, NotionInfo, PreProcessingRule, ProcessRule, Rules, createDocumentResponse } from '@/models/datasets'
			
 
				 import {
			
 
				   createDocument,
			
 
				   createFirstDocument,
			
@@ -41,8 +41,10 @@ import { IS_CE_EDITION } from '@/config'
 
				 import { RETRIEVE_METHOD } from '@/types/app'
			
 
				 import useBreakpoints, { MediaType } from '@/hooks/use-breakpoints'
			
 
				 import Tooltip from '@/app/components/base/tooltip'
			
 
				-import { useModelListAndDefaultModelAndCurrentProviderAndModel } from '@/app/components/header/account-setting/model-provider-page/hooks'
			
 
				+import { useDefaultModel, useModelList, useModelListAndDefaultModelAndCurrentProviderAndModel } from '@/app/components/header/account-setting/model-provider-page/hooks'
			
 
				 import { LanguagesSupported } from '@/i18n/language'
			
 
				+import ModelSelector from '@/app/components/header/account-setting/model-provider-page/model-selector'
			
 
				+import type { DefaultModel } from '@/app/components/header/account-setting/model-provider-page/declarations'
			
 
				 import { ModelTypeEnum } from '@/app/components/header/account-setting/model-provider-page/declarations'
			
 
				 import { Globe01 } from '@/app/components/base/icons/src/vender/line/mapsAndTravel'
			
 
				 
			
@@ -109,7 +111,7 @@ const StepTwo = ({
 
				   const [previewScrolled, setPreviewScrolled] = useState(false)
			
 
				   const [segmentationType, setSegmentationType] = useState<SegmentType>(SegmentType.AUTO)
			
 
				   const [segmentIdentifier, setSegmentIdentifier] = useState('\\n')
			
 
				-  const [max, setMax] = useState(500)
			
 
				+  const [max, setMax] = useState(5000) // default chunk length
			
 
				   const [overlap, setOverlap] = useState(50)
			
 
				   const [rules, setRules] = useState<PreProcessingRule[]>([])
			
 
				   const [defaultConfig, setDefaultConfig] = useState<Rules>()
			
@@ -131,7 +133,6 @@ const StepTwo = ({
 
				   const [showPreview, { setTrue: setShowPreview, setFalse: hidePreview }] = useBoolean()
			
 
				   const [customFileIndexingEstimate, setCustomFileIndexingEstimate] = useState<FileIndexingEstimateResponse | null>(null)
			
 
				   const [automaticFileIndexingEstimate, setAutomaticFileIndexingEstimate] = useState<FileIndexingEstimateResponse | null>(null)
			
 
				-  const [estimateTokes, setEstimateTokes] = useState<Pick<IndexingEstimateResponse, 'tokens' | 'total_price'> | null>(null)
			
 
				 
			
 
				   const fileIndexingEstimate = (() => {
			
 
				     return segmentationType === SegmentType.AUTO ? automaticFileIndexingEstimate : customFileIndexingEstimate
			
@@ -192,13 +193,10 @@ const StepTwo = ({
 
				   const fetchFileIndexingEstimate = async (docForm = DocForm.TEXT) => {
			
 
				     // eslint-disable-next-line @typescript-eslint/no-use-before-define
			
 
				     const res = await didFetchFileIndexingEstimate(getFileIndexingEstimateParams(docForm)!)
			
 
				-    if (segmentationType === SegmentType.CUSTOM) {
			
 
				+    if (segmentationType === SegmentType.CUSTOM)
			
 
				       setCustomFileIndexingEstimate(res)
			
 
				-    }
			
 
				-    else {
			
 
				+    else
			
 
				       setAutomaticFileIndexingEstimate(res)
			
 
				-      indexType === IndexingType.QUALIFIED && setEstimateTokes({ tokens: res.tokens, total_price: res.total_price })
			
 
				-    }
			
 
				   }
			
 
				 
			
 
				   const confirmChangeCustomConfig = () => {
			
@@ -310,6 +308,19 @@ const StepTwo = ({
 
				     defaultModel: rerankDefaultModel,
			
 
				     currentModel: isRerankDefaultModelVaild,
			
 
				   } = useModelListAndDefaultModelAndCurrentProviderAndModel(ModelTypeEnum.rerank)
			
 
				+  const { data: embeddingModelList } = useModelList(ModelTypeEnum.textEmbedding)
			
 
				+  const { data: defaultEmbeddingModel } = useDefaultModel(ModelTypeEnum.textEmbedding)
			
 
				+  const [embeddingModel, setEmbeddingModel] = useState<DefaultModel>(
			
 
				+    currentDataset?.embedding_model
			
 
				+      ? {
			
 
				+        provider: currentDataset.embedding_model_provider,
			
 
				+        model: currentDataset.embedding_model,
			
 
				+      }
			
 
				+      : {
			
 
				+        provider: defaultEmbeddingModel?.provider.provider || '',
			
 
				+        model: defaultEmbeddingModel?.model || '',
			
 
				+      },
			
 
				+  )
			
 
				   const getCreationParams = () => {
			
 
				     let params
			
 
				     if (segmentationType === SegmentType.CUSTOM && overlap > max) {
			
@@ -324,6 +335,8 @@ const StepTwo = ({
 
				         process_rule: getProcessRule(),
			
 
				         // eslint-disable-next-line @typescript-eslint/no-use-before-define
			
 
				         retrieval_model: retrievalConfig, // Readonly. If want to changed, just go to settings page.
			
 
				+        embedding_model: embeddingModel.model, // Readonly
			
 
				+        embedding_model_provider: embeddingModel.provider, // Readonly
			
 
				       } as CreateDocumentReq
			
 
				     }
			
 
				     else { // create
			
@@ -360,6 +373,8 @@ const StepTwo = ({
 
				         doc_language: docLanguage,
			
 
				 
			
 
				         retrieval_model: postRetrievalConfig,
			
 
				+        embedding_model: embeddingModel.model,
			
 
				+        embedding_model_provider: embeddingModel.provider,
			
 
				       } as CreateDocumentReq
			
 
				       if (dataSourceType === DataSourceType.FILE) {
			
 
				         params.data_source.info_list.file_info_list = {
			
@@ -613,14 +628,17 @@ const StepTwo = ({
 
				                   <div className={s.formRow}>
			
 
				                     <div className='w-full'>
			
 
				                       <div className={s.label}>{t('datasetCreation.stepTwo.maxLength')}</div>
			
 
				-                      <input
			
 
				-                        type="number"
			
 
				-                        className={s.input}
			
 
				-                        placeholder={t('datasetCreation.stepTwo.maxLength') || ''}
			
 
				-                        value={max}
			
 
				-                        min={1}
			
 
				-                        onChange={e => setMax(parseInt(e.target.value.replace(/^0+/, ''), 10))}
			
 
				-                      />
			
 
				+                      <div className='relative w-full'>
			
 
				+                        <input
			
 
				+                          type="number"
			
 
				+                          className={s.input}
			
 
				+                          placeholder={t('datasetCreation.stepTwo.maxLength') || ''}
			
 
				+                          value={max}
			
 
				+                          min={1}
			
 
				+                          onChange={e => setMax(parseInt(e.target.value.replace(/^0+/, ''), 10))}
			
 
				+                        />
			
 
				+                        <div className='absolute top-2.5 right-2.5 text-text-tertiary system-sm-regular'>Tokens</div>
			
 
				+                      </div>
			
 
				                     </div>
			
 
				                   </div>
			
 
				                   <div className={s.formRow}>
			
@@ -635,14 +653,17 @@ const StepTwo = ({
 
				                           }
			
 
				                         />
			
 
				                       </div>
			
 
				-                      <input
			
 
				-                        type="number"
			
 
				-                        className={s.input}
			
 
				-                        placeholder={t('datasetCreation.stepTwo.overlap') || ''}
			
 
				-                        value={overlap}
			
 
				-                        min={1}
			
 
				-                        onChange={e => setOverlap(parseInt(e.target.value.replace(/^0+/, ''), 10))}
			
 
				-                      />
			
 
				+                      <div className='relative w-full'>
			
 
				+                        <input
			
 
				+                          type="number"
			
 
				+                          className={s.input}
			
 
				+                          placeholder={t('datasetCreation.stepTwo.overlap') || ''}
			
 
				+                          value={overlap}
			
 
				+                          min={1}
			
 
				+                          onChange={e => setOverlap(parseInt(e.target.value.replace(/^0+/, ''), 10))}
			
 
				+                        />
			
 
				+                        <div className='absolute top-2.5 right-2.5 text-text-tertiary system-sm-regular'>Tokens</div>
			
 
				+                      </div>
			
 
				                     </div>
			
 
				                   </div>
			
 
				                   <div className={s.formRow}>
			
@@ -675,7 +696,7 @@ const StepTwo = ({
 
				                     !isAPIKeySet && s.disabled,
			
 
				                     !hasSetIndexType && indexType === IndexingType.QUALIFIED && s.active,
			
 
				                     hasSetIndexType && s.disabled,
			
 
				-                    hasSetIndexType && '!w-full',
			
 
				+                    hasSetIndexType && '!w-full !min-h-[96px]',
			
 
				                   )}
			
 
				                   onClick={() => {
			
 
				                     if (isAPIKeySet)
			
@@ -690,16 +711,6 @@ const StepTwo = ({
 
				                       {!hasSetIndexType && <span className={s.recommendTag}>{t('datasetCreation.stepTwo.recommend')}</span>}
			
 
				                     </div>
			
 
				                     <div className={s.tip}>{t('datasetCreation.stepTwo.qualifiedTip')}</div>
			
 
				-                    <div className='pb-0.5 text-xs font-medium text-gray-500'>{t('datasetCreation.stepTwo.emstimateCost')}</div>
			
 
				-                    {
			
 
				-                      estimateTokes
			
 
				-                        ? (
			
 
				-                          <div className='text-xs font-medium text-gray-800'>{formatNumber(estimateTokes.tokens)} tokens(<span className='text-yellow-500'>${formatNumber(estimateTokes.total_price)}</span>)</div>
			
 
				-                        )
			
 
				-                        : (
			
 
				-                          <div className={s.calculating}>{t('datasetCreation.stepTwo.calculating')}</div>
			
 
				-                        )
			
 
				-                    }
			
 
				                   </div>
			
 
				                   {!isAPIKeySet && (
			
 
				                     <div className={s.warningTip}>
			
@@ -717,7 +728,7 @@ const StepTwo = ({
 
				                     s.indexItem,
			
 
				                     !hasSetIndexType && indexType === IndexingType.ECONOMICAL && s.active,
			
 
				                     hasSetIndexType && s.disabled,
			
 
				-                    hasSetIndexType && '!w-full',
			
 
				+                    hasSetIndexType && '!w-full !min-h-[96px]',
			
 
				                   )}
			
 
				                   onClick={changeToEconomicalType}
			
 
				                 >
			
@@ -726,13 +737,11 @@ const StepTwo = ({
 
				                   <div className={s.typeHeader}>
			
 
				                     <div className={s.title}>{t('datasetCreation.stepTwo.economical')}</div>
			
 
				                     <div className={s.tip}>{t('datasetCreation.stepTwo.economicalTip')}</div>
			
 
				-                    <div className='pb-0.5 text-xs font-medium text-gray-500'>{t('datasetCreation.stepTwo.emstimateCost')}</div>
			
 
				-                    <div className='text-xs font-medium text-gray-800'>0 tokens</div>
			
 
				                   </div>
			
 
				                 </div>
			
 
				               )}
			
 
				             </div>
			
 
				-            {hasSetIndexType && (
			
 
				+            {hasSetIndexType && indexType === IndexingType.ECONOMICAL && (
			
 
				               <div className='mt-2 text-xs text-gray-500 font-medium'>
			
 
				                 {t('datasetCreation.stepTwo.indexSettedTip')}
			
 
				                 <Link className='text-[#155EEF]' href={`/datasets/${datasetId}/settings`}>{t('datasetCreation.stepTwo.datasetSettingLink')}</Link>
			
@@ -767,12 +776,32 @@ const StepTwo = ({
 
				                 )}
			
 
				               </div>
			
 
				             )}
			
 
				+            {/* Embedding model */}
			
 
				+            {indexType === IndexingType.QUALIFIED && (
			
 
				+              <div className='mb-2'>
			
 
				+                <div className={cn(s.label, datasetId && 'flex justify-between items-center')}>{t('datasetSettings.form.embeddingModel')}</div>
			
 
				+                <ModelSelector
			
 
				+                  readonly={!!datasetId}
			
 
				+                  defaultModel={embeddingModel}
			
 
				+                  modelList={embeddingModelList}
			
 
				+                  onSelect={(model: DefaultModel) => {
			
 
				+                    setEmbeddingModel(model)
			
 
				+                  }}
			
 
				+                />
			
 
				+                {!!datasetId && (
			
 
				+                  <div className='mt-2 text-xs text-gray-500 font-medium'>
			
 
				+                    {t('datasetCreation.stepTwo.indexSettedTip')}
			
 
				+                    <Link className='text-[#155EEF]' href={`/datasets/${datasetId}/settings`}>{t('datasetCreation.stepTwo.datasetSettingLink')}</Link>
			
 
				+                  </div>
			
 
				+                )}
			
 
				+              </div>
			
 
				+            )}
			
 
				             {/* Retrieval Method Config */}
			
 
				             <div>
			
 
				               {!datasetId
			
 
				                 ? (
			
 
				                   <div className={s.label}>
			
 
				-                    {t('datasetSettings.form.retrievalSetting.title')}
			
 
				+                    <div className='shrink-0 mr-4'>{t('datasetSettings.form.retrievalSetting.title')}</div>
			
 
				                     <div className='leading-[18px] text-xs font-normal text-gray-500'>
			
 
				                       <a target='_blank' rel='noopener noreferrer' href='https://docs.dify.ai/guides/knowledge-base/create-knowledge-and-upload-documents#id-6-retrieval-settings' className='text-[#155eef]'>{t('datasetSettings.form.retrievalSetting.learnMore')}</a>
			
 
				                       {t('datasetSettings.form.retrievalSetting.longDescription')}
			
--- a/web/app/components/datasets/create/steps-nav-bar/index.tsx
+++ b/web/app/components/datasets/create/steps-nav-bar/index.tsx
@@ -49,7 +49,7 @@ const StepsNavBar = ({
 
				             key={item}
			
 
				             className={cn(s.stepItem, s[`step${item}`], step === item && s.active, step > item && s.done, isMobile && 'px-0')}
			
 
				           >
			
 
				-            <div className={cn(s.stepNum)}>{item}</div>
			
 
				+            <div className={cn(s.stepNum)}>{step > item ? '' : item}</div>
			
 
				             <div className={cn(s.stepName)}>{isMobile ? '' : t(STEP_T_MAP[item])}</div>
			
 
				           </div>
			
 
				         ))}
			
--- a/web/app/components/datasets/documents/detail/embedding/index.tsx
+++ b/web/app/components/datasets/documents/detail/embedding/index.tsx
@@ -18,9 +18,7 @@ import { ToastContext } from '@/app/components/base/toast'
 
				 import type { FullDocumentDetail, ProcessRuleResponse } from '@/models/datasets'
			
 
				 import type { CommonResponse } from '@/models/common'
			
 
				 import { asyncRunSafe, sleep } from '@/utils'
			
 
				-import { formatNumber } from '@/utils/format'
			
 
				-import { fetchIndexingStatus as doFetchIndexingStatus, fetchIndexingEstimate, fetchProcessRule, pauseDocIndexing, resumeDocIndexing } from '@/service/datasets'
			
 
				-import DatasetDetailContext from '@/context/dataset-detail'
			
 
				+import { fetchIndexingStatus as doFetchIndexingStatus, fetchProcessRule, pauseDocIndexing, resumeDocIndexing } from '@/service/datasets'
			
 
				 import StopEmbeddingModal from '@/app/components/datasets/create/stop-embedding-modal'
			
 
				 
			
 
				 type Props = {
			
@@ -108,16 +106,14 @@ const RuleDetail: FC<{ sourceData?: ProcessRuleResponse; docName?: string }> = (
 
				   </div>
			
 
				 }
			
 
				 
			
 
				-const EmbeddingDetail: FC<Props> = ({ detail, stopPosition = 'top', datasetId: dstId, documentId: docId, indexingType, detailUpdate }) => {
			
 
				+const EmbeddingDetail: FC<Props> = ({ detail, stopPosition = 'top', datasetId: dstId, documentId: docId, detailUpdate }) => {
			
 
				   const onTop = stopPosition === 'top'
			
 
				   const { t } = useTranslation()
			
 
				   const { notify } = useContext(ToastContext)
			
 
				 
			
 
				   const { datasetId = '', documentId = '' } = useContext(DocumentContext)
			
 
				-  const { indexingTechnique } = useContext(DatasetDetailContext)
			
 
				   const localDatasetId = dstId ?? datasetId
			
 
				   const localDocumentId = docId ?? documentId
			
 
				-  const localIndexingTechnique = indexingType ?? indexingTechnique
			
 
				 
			
 
				   const [indexingStatusDetail, setIndexingStatusDetail] = useState<any>(null)
			
 
				   const fetchIndexingStatus = async () => {
			
@@ -160,14 +156,6 @@ const EmbeddingDetail: FC<Props> = ({ detail, stopPosition = 'top', datasetId: d
 
				     }
			
 
				   }, [startQueryStatus, stopQueryStatus])
			
 
				 
			
 
				-  const { data: indexingEstimateDetail, error: indexingEstimateErr } = useSWR({
			
 
				-    action: 'fetchIndexingEstimate',
			
 
				-    datasetId: localDatasetId,
			
 
				-    documentId: localDocumentId,
			
 
				-  }, apiParams => fetchIndexingEstimate(omit(apiParams, 'action')), {
			
 
				-    revalidateOnFocus: false,
			
 
				-  })
			
 
				-
			
 
				   const { data: ruleDetail, error: ruleError } = useSWR({
			
 
				     action: 'fetchProcessRule',
			
 
				     params: { documentId: localDocumentId },
			
@@ -250,21 +238,6 @@ const EmbeddingDetail: FC<Props> = ({ detail, stopPosition = 'top', datasetId: d
 
				       </div>
			
 
				       <div className={s.progressData}>
			
 
				         <div>{t('datasetDocuments.embedding.segments')} {indexingStatusDetail?.completed_segments}/{indexingStatusDetail?.total_segments} · {percent}%</div>
			
 
				-        {localIndexingTechnique === 'high_quaility' && (
			
 
				-          <div className='flex items-center'>
			
 
				-            <div className={cn(s.commonIcon, s.highIcon)} />
			
 
				-            {t('datasetDocuments.embedding.highQuality')} · {t('datasetDocuments.embedding.estimate')}
			
 
				-            <span className={s.tokens}>{formatNumber(indexingEstimateDetail?.tokens || 0)}</span>tokens
			
 
				-            (<span className={s.price}>${formatNumber(indexingEstimateDetail?.total_price || 0)}</span>)
			
 
				-          </div>
			
 
				-        )}
			
 
				-        {localIndexingTechnique === 'economy' && (
			
 
				-          <div className='flex items-center'>
			
 
				-            <div className={cn(s.commonIcon, s.economyIcon)} />
			
 
				-            {t('datasetDocuments.embedding.economy')} · {t('datasetDocuments.embedding.estimate')}
			
 
				-            <span className={s.tokens}>0</span>tokens
			
 
				-          </div>
			
 
				-        )}
			
 
				       </div>
			
 
				       <RuleDetail sourceData={ruleDetail} docName={detail?.name} />
			
 
				       {!onTop && (
			
--- a/web/app/components/datasets/documents/detail/embedding/style.module.css
+++ b/web/app/components/datasets/documents/detail/embedding/style.module.css
@@ -31,7 +31,7 @@
 
				   @apply rounded-r-md;
			
 
				 }
			
 
				 .progressData {
			
 
				-  @apply w-full flex justify-between items-center text-xs text-gray-700;
			
 
				+  @apply w-full flex items-center text-xs text-gray-700;
			
 
				 }
			
 
				 .previewTip {
			
 
				   @apply pb-1 pt-12 text-gray-900 text-sm font-medium;
			
--- a/web/i18n/en-US/dataset-creation.ts
+++ b/web/i18n/en-US/dataset-creation.ts
@@ -86,7 +86,7 @@ const translation = {
 
				     autoDescription: 'Automatically set chunk and preprocessing rules. Unfamiliar users are recommended to select this.',
			
 
				     custom: 'Custom',
			
 
				     customDescription: 'Customize chunks rules, chunks length, and preprocessing rules, etc.',
			
 
				-    separator: 'Segment identifier',
			
 
				+    separator: 'Delimiter',
			
 
				     separatorPlaceholder: 'For example, newline (\\\\n) or special separator (such as "***")',
			
 
				     maxLength: 'Maximum chunk length',
			
 
				     overlap: 'Chunk overlap',
			
@@ -135,8 +135,8 @@ const translation = {
 
				     previewSwitchTipStart: 'The current chunk preview is in text format, switching to a question-and-answer format preview will',
			
 
				     previewSwitchTipEnd: ' consume additional tokens',
			
 
				     characters: 'characters',
			
 
				-    indexSettedTip: 'To change the index method, please go to the ',
			
 
				-    retrivalSettedTip: 'To change the index method, please go to the ',
			
 
				+    indexSettedTip: 'To change the index method & embedding model, please go to the ',
			
 
				+    retrivalSettedTip: 'To change the retrieval setting, please go to the ',
			
 
				     datasetSettingLink: 'Knowledge settings.',
			
 
				   },
			
 
				   stepThree: {
			
--- a/web/i18n/ja-JP/dataset-creation.ts
+++ b/web/i18n/ja-JP/dataset-creation.ts
@@ -136,7 +136,7 @@ const translation = {
 
				     previewSwitchTipEnd: ' 追加のトークンが消費されます',
			
 
				     characters: '文字',
			
 
				     indexSettedTip: 'インデックス方法を変更するには、',
			
 
				-    retrivalSettedTip: 'インデックス方法を変更するには、',
			
 
				+    retrivalSettedTip: '検索方法を変更するには、',
			
 
				     datasetSettingLink: 'ナレッジ設定',
			
 
				   },
			
 
				   stepThree: {
			
--- a/web/i18n/zh-Hans/dataset-creation.ts
+++ b/web/i18n/zh-Hans/dataset-creation.ts
@@ -135,7 +135,7 @@ const translation = {
 
				     previewSwitchTipStart: '当前分段预览是文本模式，切换到 Q&A 模式将会',
			
 
				     previewSwitchTipEnd: '消耗额外的 token',
			
 
				     characters: '字符',
			
 
				-    indexSettedTip: '要更改索引方法，请转到',
			
 
				+    indexSettedTip: '要更改索引方法和 embedding 模型，请转到',
			
 
				     retrivalSettedTip: '要更改检索方法，请转到',
			
 
				     datasetSettingLink: '知识库设置。',
			
 
				   },
			
--- a/web/models/datasets.ts
+++ b/web/models/datasets.ts
@@ -227,6 +227,8 @@ export type DocumentReq = {
 
				 export type CreateDocumentReq = DocumentReq & {
			
 
				   data_source: DataSource
			
 
				   retrieval_model: RetrievalConfig
			
 
				+  embedding_model: string
			
 
				+  embedding_model_provider: string
			
 
				 }
			
 
				 
			
 
				 export type IndexingEstimateParams = DocumentReq & Partial<DataSource> & {