|
@@ -31,17 +31,17 @@ import LanguageSelect from './language-select'
|
|
|
import { DelimiterInput, MaxLengthInput, OverlapInput } from './inputs'
|
|
|
import cn from '@/utils/classnames'
|
|
|
import type { CrawlOptions, CrawlResultItem, CreateDocumentReq, CustomFile, DocumentItem, FullDocumentDetail, ParentMode, PreProcessingRule, ProcessRule, Rules, createDocumentResponse } from '@/models/datasets'
|
|
|
+import { ChunkingMode, DataSourceType, ProcessMode } from '@/models/datasets'
|
|
|
|
|
|
import Button from '@/app/components/base/button'
|
|
|
import FloatRightContainer from '@/app/components/base/float-right-container'
|
|
|
import RetrievalMethodConfig from '@/app/components/datasets/common/retrieval-method-config'
|
|
|
import EconomicalRetrievalMethodConfig from '@/app/components/datasets/common/economical-retrieval-method-config'
|
|
|
import { type RetrievalConfig } from '@/types/app'
|
|
|
-import { ensureRerankModelSelected, isReRankModelSelected } from '@/app/components/datasets/common/check-rerank-model'
|
|
|
+import { isReRankModelSelected } from '@/app/components/datasets/common/check-rerank-model'
|
|
|
import Toast from '@/app/components/base/toast'
|
|
|
import type { NotionPage } from '@/models/common'
|
|
|
import { DataSourceProvider } from '@/models/common'
|
|
|
-import { ChunkingMode, DataSourceType, RerankingModeEnum } from '@/models/datasets'
|
|
|
import { useDatasetDetailContext } from '@/context/dataset-detail'
|
|
|
import I18n from '@/context/i18n'
|
|
|
import { RETRIEVE_METHOD } from '@/types/app'
|
|
@@ -90,17 +90,13 @@ type StepTwoProps = {
|
|
|
onCancel?: () => void
|
|
|
}
|
|
|
|
|
|
-export enum SegmentType {
|
|
|
- AUTO = 'automatic',
|
|
|
- CUSTOM = 'custom',
|
|
|
-}
|
|
|
export enum IndexingType {
|
|
|
QUALIFIED = 'high_quality',
|
|
|
ECONOMICAL = 'economy',
|
|
|
}
|
|
|
|
|
|
const DEFAULT_SEGMENT_IDENTIFIER = '\\n\\n'
|
|
|
-const DEFAULT_MAXMIMUM_CHUNK_LENGTH = 500
|
|
|
+const DEFAULT_MAXIMUM_CHUNK_LENGTH = 500
|
|
|
const DEFAULT_OVERLAP = 50
|
|
|
|
|
|
type ParentChildConfig = {
|
|
@@ -131,7 +127,6 @@ const StepTwo = ({
|
|
|
isSetting,
|
|
|
documentDetail,
|
|
|
isAPIKeySet,
|
|
|
- onSetting,
|
|
|
datasetId,
|
|
|
indexingType,
|
|
|
dataSourceType: inCreatePageDataSourceType,
|
|
@@ -162,12 +157,12 @@ const StepTwo = ({
|
|
|
|
|
|
const isInCreatePage = !datasetId || (datasetId && !currentDataset?.data_source_type)
|
|
|
const dataSourceType = isInCreatePage ? inCreatePageDataSourceType : currentDataset?.data_source_type
|
|
|
- const [segmentationType, setSegmentationType] = useState<SegmentType>(SegmentType.CUSTOM)
|
|
|
+ const [segmentationType, setSegmentationType] = useState<ProcessMode>(ProcessMode.general)
|
|
|
const [segmentIdentifier, doSetSegmentIdentifier] = useState(DEFAULT_SEGMENT_IDENTIFIER)
|
|
|
const setSegmentIdentifier = useCallback((value: string, canEmpty?: boolean) => {
|
|
|
doSetSegmentIdentifier(value ? escape(value) : (canEmpty ? '' : DEFAULT_SEGMENT_IDENTIFIER))
|
|
|
}, [])
|
|
|
- const [maxChunkLength, setMaxChunkLength] = useState(DEFAULT_MAXMIMUM_CHUNK_LENGTH) // default chunk length
|
|
|
+ const [maxChunkLength, setMaxChunkLength] = useState(DEFAULT_MAXIMUM_CHUNK_LENGTH) // default chunk length
|
|
|
const [limitMaxChunkLength, setLimitMaxChunkLength] = useState(4000)
|
|
|
const [overlap, setOverlap] = useState(DEFAULT_OVERLAP)
|
|
|
const [rules, setRules] = useState<PreProcessingRule[]>([])
|
|
@@ -198,7 +193,6 @@ const StepTwo = ({
|
|
|
)
|
|
|
|
|
|
// QA Related
|
|
|
- const [isLanguageSelectDisabled, _setIsLanguageSelectDisabled] = useState(false)
|
|
|
const [isQAConfirmDialogOpen, setIsQAConfirmDialogOpen] = useState(false)
|
|
|
const [docForm, setDocForm] = useState<ChunkingMode>(
|
|
|
(datasetId && documentDetail) ? documentDetail.doc_form as ChunkingMode : ChunkingMode.text,
|
|
@@ -348,7 +342,7 @@ const StepTwo = ({
|
|
|
}
|
|
|
|
|
|
const updatePreview = () => {
|
|
|
- if (segmentationType === SegmentType.CUSTOM && maxChunkLength > 4000) {
|
|
|
+ if (segmentationType === ProcessMode.general && maxChunkLength > 4000) {
|
|
|
Toast.notify({ type: 'error', message: t('datasetCreation.stepTwo.maxLengthCheck') })
|
|
|
return
|
|
|
}
|
|
@@ -373,13 +367,42 @@ const StepTwo = ({
|
|
|
model: defaultEmbeddingModel?.model || '',
|
|
|
},
|
|
|
)
|
|
|
+ const [retrievalConfig, setRetrievalConfig] = useState(currentDataset?.retrieval_model_dict || {
|
|
|
+ search_method: RETRIEVE_METHOD.semantic,
|
|
|
+ reranking_enable: false,
|
|
|
+ reranking_model: {
|
|
|
+ reranking_provider_name: '',
|
|
|
+ reranking_model_name: '',
|
|
|
+ },
|
|
|
+ top_k: 3,
|
|
|
+ score_threshold_enabled: false,
|
|
|
+ score_threshold: 0.5,
|
|
|
+ } as RetrievalConfig)
|
|
|
+
|
|
|
+ useEffect(() => {
|
|
|
+ if (currentDataset?.retrieval_model_dict)
|
|
|
+ return
|
|
|
+ setRetrievalConfig({
|
|
|
+ search_method: RETRIEVE_METHOD.semantic,
|
|
|
+ reranking_enable: !!isRerankDefaultModelValid,
|
|
|
+ reranking_model: {
|
|
|
+ reranking_provider_name: isRerankDefaultModelValid ? rerankDefaultModel?.provider.provider ?? '' : '',
|
|
|
+ reranking_model_name: isRerankDefaultModelValid ? rerankDefaultModel?.model ?? '' : '',
|
|
|
+ },
|
|
|
+ top_k: 3,
|
|
|
+ score_threshold_enabled: false,
|
|
|
+ score_threshold: 0.5,
|
|
|
+ })
|
|
|
+ // eslint-disable-next-line react-hooks/exhaustive-deps
|
|
|
+ }, [rerankDefaultModel, isRerankDefaultModelValid])
|
|
|
+
|
|
|
const getCreationParams = () => {
|
|
|
let params
|
|
|
- if (segmentationType === SegmentType.CUSTOM && overlap > maxChunkLength) {
|
|
|
+ if (segmentationType === ProcessMode.general && overlap > maxChunkLength) {
|
|
|
Toast.notify({ type: 'error', message: t('datasetCreation.stepTwo.overlapCheck') })
|
|
|
return
|
|
|
}
|
|
|
- if (segmentationType === SegmentType.CUSTOM && maxChunkLength > limitMaxChunkLength) {
|
|
|
+ if (segmentationType === ProcessMode.general && maxChunkLength > limitMaxChunkLength) {
|
|
|
Toast.notify({ type: 'error', message: t('datasetCreation.stepTwo.maxLengthCheck', { limit: limitMaxChunkLength }) })
|
|
|
return
|
|
|
}
|
|
@@ -389,7 +412,6 @@ const StepTwo = ({
|
|
|
doc_form: currentDocForm,
|
|
|
doc_language: docLanguage,
|
|
|
process_rule: getProcessRule(),
|
|
|
- // eslint-disable-next-line @typescript-eslint/no-use-before-define
|
|
|
retrieval_model: retrievalConfig, // Readonly. If want to changed, just go to settings page.
|
|
|
embedding_model: embeddingModel.model, // Readonly
|
|
|
embedding_model_provider: embeddingModel.provider, // Readonly
|
|
@@ -400,10 +422,7 @@ const StepTwo = ({
|
|
|
const indexMethod = getIndexing_technique()
|
|
|
if (
|
|
|
!isReRankModelSelected({
|
|
|
- rerankDefaultModel,
|
|
|
- isRerankDefaultModelValid: !!isRerankDefaultModelValid,
|
|
|
rerankModelList,
|
|
|
- // eslint-disable-next-line @typescript-eslint/no-use-before-define
|
|
|
retrievalConfig,
|
|
|
indexMethod: indexMethod as string,
|
|
|
})
|
|
@@ -411,16 +430,6 @@ const StepTwo = ({
|
|
|
Toast.notify({ type: 'error', message: t('appDebug.datasetConfig.rerankModelRequired') })
|
|
|
return
|
|
|
}
|
|
|
- const postRetrievalConfig = ensureRerankModelSelected({
|
|
|
- rerankDefaultModel: rerankDefaultModel!,
|
|
|
- retrievalConfig: {
|
|
|
- // eslint-disable-next-line @typescript-eslint/no-use-before-define
|
|
|
- ...retrievalConfig,
|
|
|
- // eslint-disable-next-line @typescript-eslint/no-use-before-define
|
|
|
- reranking_enable: retrievalConfig.reranking_mode === RerankingModeEnum.RerankingModel,
|
|
|
- },
|
|
|
- indexMethod: indexMethod as string,
|
|
|
- })
|
|
|
params = {
|
|
|
data_source: {
|
|
|
type: dataSourceType,
|
|
@@ -432,8 +441,7 @@ const StepTwo = ({
|
|
|
process_rule: getProcessRule(),
|
|
|
doc_form: currentDocForm,
|
|
|
doc_language: docLanguage,
|
|
|
-
|
|
|
- retrieval_model: postRetrievalConfig,
|
|
|
+ retrieval_model: retrievalConfig,
|
|
|
embedding_model: embeddingModel.model,
|
|
|
embedding_model_provider: embeddingModel.provider,
|
|
|
} as CreateDocumentReq
|
|
@@ -490,7 +498,6 @@ const StepTwo = ({
|
|
|
|
|
|
const getDefaultMode = () => {
|
|
|
if (documentDetail)
|
|
|
- // @ts-expect-error fix after api refactored
|
|
|
setSegmentationType(documentDetail.dataset_process_rule.mode)
|
|
|
}
|
|
|
|
|
@@ -525,7 +532,6 @@ const StepTwo = ({
|
|
|
onSuccess(data) {
|
|
|
updateIndexingTypeCache && updateIndexingTypeCache(indexType as string)
|
|
|
updateResultCache && updateResultCache(data)
|
|
|
- // eslint-disable-next-line @typescript-eslint/no-use-before-define
|
|
|
updateRetrievalMethodCache && updateRetrievalMethodCache(retrievalConfig.search_method as string)
|
|
|
},
|
|
|
},
|
|
@@ -545,14 +551,6 @@ const StepTwo = ({
|
|
|
isSetting && onSave && onSave()
|
|
|
}
|
|
|
|
|
|
- const changeToEconomicalType = () => {
|
|
|
- if (docForm !== ChunkingMode.text)
|
|
|
- return
|
|
|
-
|
|
|
- if (!hasSetIndexType)
|
|
|
- setIndexType(IndexingType.ECONOMICAL)
|
|
|
- }
|
|
|
-
|
|
|
useEffect(() => {
|
|
|
// fetch rules
|
|
|
if (!isSetting) {
|
|
@@ -574,18 +572,6 @@ const StepTwo = ({
|
|
|
setIndexType(isAPIKeySet ? IndexingType.QUALIFIED : IndexingType.ECONOMICAL)
|
|
|
}, [isAPIKeySet, indexingType, datasetId])
|
|
|
|
|
|
- const [retrievalConfig, setRetrievalConfig] = useState(currentDataset?.retrieval_model_dict || {
|
|
|
- search_method: RETRIEVE_METHOD.semantic,
|
|
|
- reranking_enable: false,
|
|
|
- reranking_model: {
|
|
|
- reranking_provider_name: rerankDefaultModel?.provider.provider,
|
|
|
- reranking_model_name: rerankDefaultModel?.model,
|
|
|
- },
|
|
|
- top_k: 3,
|
|
|
- score_threshold_enabled: false,
|
|
|
- score_threshold: 0.5,
|
|
|
- } as RetrievalConfig)
|
|
|
-
|
|
|
const economyDomRef = useRef<HTMLDivElement>(null)
|
|
|
const isHoveringEconomy = useHover(economyDomRef)
|
|
|
|
|
@@ -984,12 +970,14 @@ const StepTwo = ({
|
|
|
getIndexing_technique() === IndexingType.QUALIFIED
|
|
|
? (
|
|
|
<RetrievalMethodConfig
|
|
|
+ disabled={!!datasetId}
|
|
|
value={retrievalConfig}
|
|
|
onChange={setRetrievalConfig}
|
|
|
/>
|
|
|
)
|
|
|
: (
|
|
|
<EconomicalRetrievalMethodConfig
|
|
|
+ disabled={!!datasetId}
|
|
|
value={retrievalConfig}
|
|
|
onChange={setRetrievalConfig}
|
|
|
/>
|
|
@@ -1010,7 +998,7 @@ const StepTwo = ({
|
|
|
)
|
|
|
: (
|
|
|
<div className='flex items-center mt-8 py-2'>
|
|
|
- <Button loading={isCreating} variant='primary' onClick={createHandle}>{t('datasetCreation.stepTwo.save')}</Button>
|
|
|
+ {!datasetId && <Button loading={isCreating} variant='primary' onClick={createHandle}>{t('datasetCreation.stepTwo.save')}</Button>}
|
|
|
<Button className='ml-2' onClick={onCancel}>{t('datasetCreation.stepTwo.cancel')}</Button>
|
|
|
</div>
|
|
|
)}
|
|
@@ -1081,11 +1069,11 @@ const StepTwo = ({
|
|
|
}
|
|
|
{
|
|
|
currentDocForm !== ChunkingMode.qa
|
|
|
- && <Badge text={t(
|
|
|
- 'datasetCreation.stepTwo.previewChunkCount', {
|
|
|
- count: estimate?.total_segments || 0,
|
|
|
- }) as string}
|
|
|
- />
|
|
|
+ && <Badge text={t(
|
|
|
+ 'datasetCreation.stepTwo.previewChunkCount', {
|
|
|
+ count: estimate?.total_segments || 0,
|
|
|
+ }) as string}
|
|
|
+ />
|
|
|
}
|
|
|
</div>
|
|
|
</PreviewHeader>}
|