|
@@ -12,7 +12,7 @@ import RetrievalMethodInfo from '../../common/retrieval-method-info'
|
|
import PreviewItem, { PreviewType } from './preview-item'
|
|
import PreviewItem, { PreviewType } from './preview-item'
|
|
import LanguageSelect from './language-select'
|
|
import LanguageSelect from './language-select'
|
|
import s from './index.module.css'
|
|
import s from './index.module.css'
|
|
-import type { CreateDocumentReq, CustomFile, FileIndexingEstimateResponse, FullDocumentDetail, IndexingEstimateParams, IndexingEstimateResponse, NotionInfo, PreProcessingRule, ProcessRule, Rules, createDocumentResponse } from '@/models/datasets'
|
|
|
|
|
|
+import type { CrawlOptions, CrawlResultItem, CreateDocumentReq, CustomFile, FileIndexingEstimateResponse, FullDocumentDetail, IndexingEstimateParams, IndexingEstimateResponse, NotionInfo, PreProcessingRule, ProcessRule, Rules, createDocumentResponse } from '@/models/datasets'
|
|
import {
|
|
import {
|
|
createDocument,
|
|
createDocument,
|
|
createFirstDocument,
|
|
createFirstDocument,
|
|
@@ -44,6 +44,7 @@ import TooltipPlus from '@/app/components/base/tooltip-plus'
|
|
import { useModelListAndDefaultModelAndCurrentProviderAndModel } from '@/app/components/header/account-setting/model-provider-page/hooks'
|
|
import { useModelListAndDefaultModelAndCurrentProviderAndModel } from '@/app/components/header/account-setting/model-provider-page/hooks'
|
|
import { LanguagesSupported } from '@/i18n/language'
|
|
import { LanguagesSupported } from '@/i18n/language'
|
|
import { ModelTypeEnum } from '@/app/components/header/account-setting/model-provider-page/declarations'
|
|
import { ModelTypeEnum } from '@/app/components/header/account-setting/model-provider-page/declarations'
|
|
|
|
+import { Globe01 } from '@/app/components/base/icons/src/vender/line/mapsAndTravel'
|
|
|
|
|
|
type ValueOf<T> = T[keyof T]
|
|
type ValueOf<T> = T[keyof T]
|
|
type StepTwoProps = {
|
|
type StepTwoProps = {
|
|
@@ -56,6 +57,9 @@ type StepTwoProps = {
|
|
dataSourceType: DataSourceType
|
|
dataSourceType: DataSourceType
|
|
files: CustomFile[]
|
|
files: CustomFile[]
|
|
notionPages?: NotionPage[]
|
|
notionPages?: NotionPage[]
|
|
|
|
+ websitePages?: CrawlResultItem[]
|
|
|
|
+ crawlOptions?: CrawlOptions
|
|
|
|
+ fireCrawlJobId?: string
|
|
onStepChange?: (delta: number) => void
|
|
onStepChange?: (delta: number) => void
|
|
updateIndexingTypeCache?: (type: string) => void
|
|
updateIndexingTypeCache?: (type: string) => void
|
|
updateResultCache?: (res: createDocumentResponse) => void
|
|
updateResultCache?: (res: createDocumentResponse) => void
|
|
@@ -79,9 +83,12 @@ const StepTwo = ({
|
|
onSetting,
|
|
onSetting,
|
|
datasetId,
|
|
datasetId,
|
|
indexingType,
|
|
indexingType,
|
|
- dataSourceType,
|
|
|
|
|
|
+ dataSourceType: inCreatePageDataSourceType,
|
|
files,
|
|
files,
|
|
notionPages = [],
|
|
notionPages = [],
|
|
|
|
+ websitePages = [],
|
|
|
|
+ crawlOptions,
|
|
|
|
+ fireCrawlJobId = '',
|
|
onStepChange,
|
|
onStepChange,
|
|
updateIndexingTypeCache,
|
|
updateIndexingTypeCache,
|
|
updateResultCache,
|
|
updateResultCache,
|
|
@@ -94,6 +101,8 @@ const StepTwo = ({
|
|
const isMobile = media === MediaType.mobile
|
|
const isMobile = media === MediaType.mobile
|
|
|
|
|
|
const { dataset: currentDataset, mutateDatasetRes } = useDatasetDetailContext()
|
|
const { dataset: currentDataset, mutateDatasetRes } = useDatasetDetailContext()
|
|
|
|
+ const isInCreatePage = !datasetId || (datasetId && !currentDataset?.data_source_type)
|
|
|
|
+ const dataSourceType = isInCreatePage ? inCreatePageDataSourceType : currentDataset?.data_source_type
|
|
const scrollRef = useRef<HTMLDivElement>(null)
|
|
const scrollRef = useRef<HTMLDivElement>(null)
|
|
const [scrolled, setScrolled] = useState(false)
|
|
const [scrolled, setScrolled] = useState(false)
|
|
const previewScrollRef = useRef<HTMLDivElement>(null)
|
|
const previewScrollRef = useRef<HTMLDivElement>(null)
|
|
@@ -242,6 +251,15 @@ const StepTwo = ({
|
|
}) as NotionInfo[]
|
|
}) as NotionInfo[]
|
|
}
|
|
}
|
|
|
|
|
|
|
|
+ const getWebsiteInfo = () => { // Builds the website info object from the crawl-related props; the visible call sites assign the result to `website_info_list`.
|
|


+ return {
|
|


+ provider: 'firecrawl', // Provider is hard-coded to this literal; no other provider value is produced here.
|
|


+ job_id: fireCrawlJobId, // Crawl job id prop, which defaults to '' in the component's parameter destructuring.
|
|


+ urls: websitePages.map(page => page.source_url), // One source URL per entry of websitePages, in the same order; empty array when no pages were passed.
|
|


+ only_main_content: crawlOptions?.only_main_content, // crawlOptions is an optional prop with no default, so this is undefined when it is not supplied.
|
|


+ }
|
|


+ }
|
|
|
|
+
|
|
const getFileIndexingEstimateParams = (docForm: DocForm): IndexingEstimateParams | undefined => {
|
|
const getFileIndexingEstimateParams = (docForm: DocForm): IndexingEstimateParams | undefined => {
|
|
if (dataSourceType === DataSourceType.FILE) {
|
|
if (dataSourceType === DataSourceType.FILE) {
|
|
return {
|
|
return {
|
|
@@ -271,6 +289,19 @@ const StepTwo = ({
|
|
dataset_id: datasetId as string,
|
|
dataset_id: datasetId as string,
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
+ if (dataSourceType === DataSourceType.WEB) {
|
|
|
|
+ return {
|
|
|
|
+ info_list: {
|
|
|
|
+ data_source_type: dataSourceType,
|
|
|
|
+ website_info_list: getWebsiteInfo(),
|
|
|
|
+ },
|
|
|
|
+ indexing_technique: getIndexing_technique() as string,
|
|
|
|
+ process_rule: getProcessRule(),
|
|
|
|
+ doc_form: docForm,
|
|
|
|
+ doc_language: docLanguage,
|
|
|
|
+ dataset_id: datasetId as string,
|
|
|
|
+ }
|
|
|
|
+ }
|
|
}
|
|
}
|
|
const {
|
|
const {
|
|
modelList: rerankModelList,
|
|
modelList: rerankModelList,
|
|
@@ -335,6 +366,9 @@ const StepTwo = ({
|
|
}
|
|
}
|
|
if (dataSourceType === DataSourceType.NOTION)
|
|
if (dataSourceType === DataSourceType.NOTION)
|
|
params.data_source.info_list.notion_info_list = getNotionInfo()
|
|
params.data_source.info_list.notion_info_list = getNotionInfo()
|
|
|
|
+
|
|
|
|
+ if (dataSourceType === DataSourceType.WEB)
|
|
|
|
+ params.data_source.info_list.website_info_list = getWebsiteInfo()
|
|
}
|
|
}
|
|
return params
|
|
return params
|
|
}
|
|
}
|
|
@@ -819,6 +853,22 @@ const StepTwo = ({
|
|
</div>
|
|
</div>
|
|
</>
|
|
</>
|
|
)}
|
|
)}
|
|
|
|
+ {dataSourceType === DataSourceType.WEB && (
|
|
|
|
+ <>
|
|
|
|
+ <div className='mb-2 text-xs font-medium text-gray-500'>{t('datasetCreation.stepTwo.websiteSource')}</div>
|
|
|
|
+ <div className='flex items-center text-sm leading-6 font-medium text-gray-800'>
|
|
|
|
+ <Globe01 className='shrink-0 mr-1' />
|
|
|
|
+ <span className='grow w-0 truncate'>{websitePages[0].source_url}</span>
|
|
|
|
+ {websitePages.length > 1 && (
|
|
|
|
+ <span className={s.sourceCount}>
|
|
|
|
+ <span>{t('datasetCreation.stepTwo.other')}</span>
|
|
|
|
+ <span>{websitePages.length - 1}</span>
|
|
|
|
+ <span>{t('datasetCreation.stepTwo.webpageUnit')}</span>
|
|
|
|
+ </span>
|
|
|
|
+ )}
|
|
|
|
+ </div>
|
|
|
|
+ </>
|
|
|
|
+ )}
|
|
</div>
|
|
</div>
|
|
<div className={s.divider} />
|
|
<div className={s.divider} />
|
|
<div className={s.segmentCount}>
|
|
<div className={s.segmentCount}>
|