// datasets.ts (13 KB)
  import type { DataSourceNotionPage, DataSourceProvider } from './common'
  import type { AppIconType, AppMode, RetrievalConfig } from '@/types/app'
  import type { Tag } from '@/app/components/base/tag-management/constant'
  import type { IndexingType } from '@/app/components/datasets/create/step-two'
  /**
   * Kinds of data source, each mapped to the string identifier used for it
   * in the other types of this module (e.g. `data_source_type`).
   */
  export enum DataSourceType {
    FILE = 'upload_file',
    NOTION = 'notion_import',
    WEB = 'website_crawl',
  }
  /** Permission level of a dataset, expressed as one of three string literals. */
  export type DatasetPermission = 'only_me' | 'all_team_members' | 'partial_members'
  /**
   * Chunking mode of a document or dataset (used as the type of `doc_form`).
   * Member names are plain identifiers; the values are the string identifiers.
   */
  export enum ChunkingMode {
    /** General text */
    text = 'text_model',
    /** General QA */
    qa = 'qa_model',
    /** Parent-Child */
    parentChild = 'hierarchical_model',
  }
  /**
   * A dataset (knowledge base) record.
   *
   * Besides identity and display fields, it carries indexing/embedding
   * settings, retrieval configuration, tags, and the settings that apply when
   * the dataset is backed by an external knowledge API.
   */
  export type DataSet = {
    id: string
    name: string
    icon: string
    icon_background: string
    description: string
    permission: DatasetPermission
    data_source_type: DataSourceType
    indexing_technique: IndexingType
    created_by: string
    updated_by: string
    updated_at: number
    app_count: number
    doc_form: ChunkingMode
    document_count: number
    word_count: number
    provider: string
    embedding_model: string
    embedding_model_provider: string
    embedding_available: boolean
    // NOTE(review): both fields below share the RetrievalConfig type; which one
    // a consumer should prefer is not visible from this file.
    retrieval_model_dict: RetrievalConfig
    retrieval_model: RetrievalConfig
    tags: Tag[]
    // NOTE(review): element type is untyped (`any`); tighten once the shape is confirmed.
    partial_member_list?: any[]
    external_knowledge_info: {
      external_knowledge_id: string
      external_knowledge_api_id: string
      external_knowledge_api_name: string
      external_knowledge_api_endpoint: string
    }
    external_retrieval_model: {
      top_k: number
      score_threshold: number
      score_threshold_enabled: boolean
    }
  }
  /**
   * An external knowledge API entry: its connection settings plus the
   * datasets bound to it (`dataset_bindings`).
   */
  export type ExternalAPIItem = {
    id: string
    tenant_id: string
    name: string
    description: string
    settings: {
      endpoint: string
      api_key: string
    }
    dataset_bindings: { id: string; name: string }[]
    created_by: string
    created_at: string
  }
  /**
   * A dataset record whose `provider` is the literal `'external'`.
   * `data_source_type` and `indexing_technique` are always `null` for this shape.
   */
  export type ExternalKnowledgeItem = {
    id: string
    name: string
    description: string | null
    provider: 'external'
    permission: DatasetPermission
    data_source_type: null
    indexing_technique: null
    app_count: number
    document_count: number
    word_count: number
    created_by: string
    created_at: string
    updated_by: string
    updated_at: string
    tags: Tag[]
  }
  /** Result of deleting an external API entry: either `'success'` or `'error'`. */
  export type ExternalAPIDeleteResponse = {
    result: 'success' | 'error'
  }
  /** Usage information for an external API entry: an in-use flag and a count. */
  export type ExternalAPIUsage = {
    is_using: boolean
    count: number
  }
  /**
   * A `File` extended with optional metadata fields
   * (`id`, `extension`, `mime_type`, `created_by`, `created_at`).
   */
  export type CustomFile = File & {
    id?: string
    extension?: string
    mime_type?: string
    created_by?: string
    created_at?: number
  }
  /** Minimal document reference: id, name and file extension. */
  export type DocumentItem = {
    id: string
    name: string
    extension: string
  }
  /**
   * Options for a website crawl.
   * `limit` and `max_depth` accept either a number or a string.
   */
  export type CrawlOptions = {
    crawl_sub_pages: boolean
    only_main_content: boolean
    includes: string
    excludes: string
    limit: number | string
    max_depth: number | string
    use_sitemap: boolean
  }
  /** One crawled page: title, markdown content, description and source URL. */
  export type CrawlResultItem = {
    title: string
    markdown: string
    description: string
    source_url: string
  }
  /** A file entry pairing a `CustomFile` with an identifier and a numeric progress value. */
  export type FileItem = {
    fileID: string
    file: CustomFile
    progress: number
  }
  /**
   * Arguments for fetching a page of datasets: the request `url` and its
   * query `params` (paging, optional tag and keyword filters).
   */
  export type FetchDatasetsParams = {
    url: string
    params: {
      page: number
      tag_ids?: string[]
      limit: number
      include_all: boolean
      keyword?: string
    }
  }
  /** Paginated list of `DataSet` records. */
  export type DataSetListResponse = {
    data: DataSet[]
    has_more: boolean
    limit: number
    page: number
    total: number
  }
  /** Paginated list of `ExternalAPIItem` records. */
  export type ExternalAPIListResponse = {
    data: ExternalAPIItem[]
    has_more: boolean
    limit: number
    page: number
    total: number
  }
  /** A question/answer pair. */
  export type QA = {
    question: string
    answer: string
  }
  /**
   * Indexing estimate: token/price totals, segment count, and a preview of
   * chunks (each with its child chunks). `qa_preview` is optional.
   */
  export type IndexingEstimateResponse = {
    tokens: number
    total_price: number
    currency: string
    total_segments: number
    preview: Array<{ content: string; child_chunks: string[] }>
    qa_preview?: QA[]
  }
  /** `IndexingEstimateResponse` extended with a `total_nodes` count. */
  export type FileIndexingEstimateResponse = {
    total_nodes: number
  } & IndexingEstimateResponse
  /**
   * Indexing progress of a single document: current status, per-stage
   * numeric timestamps, and completed/total segment counters.
   */
  export type IndexingStatusResponse = {
    id: string
    indexing_status: DocumentIndexingStatus
    processing_started_at: number
    parsing_completed_at: number
    cleaning_completed_at: number
    splitting_completed_at: number
    // NOTE(review): the four fields below are `any`; they are kept as-is for
    // compatibility. Narrow them once the actual value shapes are confirmed.
    completed_at: any
    paused_at: any
    error: any
    stopped_at: any
    completed_segments: number
    total_segments: number
  }
  /** A batch of `IndexingStatusResponse` entries under `data`. */
  export type IndexingStatusBatchResponse = {
    data: IndexingStatusResponse[]
  }
  /** Processing mode of a process rule, mapped to its string identifier. */
  export enum ProcessMode {
    general = 'custom',
    parentChild = 'hierarchical',
  }
  /** Parent mode used by `Rules.parent_mode`: `'full-doc'` or `'paragraph'`. */
  export type ParentMode = 'full-doc' | 'paragraph'
  /** A process rule (mode + rules) together with the applicable `limits`. */
  export type ProcessRuleResponse = {
    mode: ProcessMode
    rules: Rules
    limits: Limits
  }
  /**
   * Rule set of a process rule: pre-processing toggles, the main segmentation
   * settings, the parent mode, and a second segmentation for sub-chunks.
   */
  export type Rules = {
    pre_processing_rules: PreProcessingRule[]
    segmentation: Segmentation
    parent_mode: ParentMode
    subchunk_segmentation: Segmentation
  }
  /** Limits attached to a process rule response. */
  export type Limits = {
    indexing_max_segmentation_tokens_length: number
  }
  /** A single pre-processing rule, identified by `id` and toggled by `enabled`. */
  export type PreProcessingRule = {
    id: string
    enabled: boolean
  }
  /** Segmentation settings: separator, maximum tokens, and an optional chunk overlap. */
  export type Segmentation = {
    separator: string
    max_tokens: number
    chunk_overlap?: number
  }
  /** All document indexing status values, as a readonly tuple of string literals. */
  export const DocumentIndexingStatusList = [
    'waiting',
    'parsing',
    'cleaning',
    'splitting',
    'indexing',
    'paused',
    'error',
    'completed',
  ] as const

  /** Union of the literals in `DocumentIndexingStatusList`. */
  export type DocumentIndexingStatus = typeof DocumentIndexingStatusList[number]
  /** All document display status values, as a readonly tuple of string literals. */
  export const DisplayStatusList = [
    'queuing',
    'indexing',
    'paused',
    'error',
    'available',
    'enabled',
    'disabled',
    'archived',
  ] as const

  /** Union of the literals in `DisplayStatusList`. */
  export type DocumentDisplayStatus = typeof DisplayStatusList[number]
  /**
   * Source details attached to a document. One flat shape carries the
   * uploaded-file description, optional Notion identifiers, an optional
   * provider, and `job_id` / `url`.
   */
  export type DataSourceInfo = {
    upload_file: {
      id: string
      name: string
      size: number
      mime_type: string
      created_at: number
      created_by: string
      extension: string
    }
    notion_page_icon?: string
    notion_workspace_id?: string
    notion_page_id?: string
    provider?: DataSourceProvider
    job_id: string
    url: string
  }
  /**
   * Base document record: identity, owning dataset, data source, creation
   * info, indexing/display status and chunking settings.
   * `SimpleDocumentDetail` extends this shape.
   */
  export type InitialDocumentDetail = {
    id: string
    batch: string
    position: number
    dataset_id: string
    data_source_type: DataSourceType
    data_source_info: DataSourceInfo
    dataset_process_rule_id: string
    name: string
    created_from: 'api' | 'web'
    created_by: string
    created_at: number
    indexing_status: DocumentIndexingStatus
    display_status: DocumentDisplayStatus
    completed_segments?: number
    total_segments?: number
    doc_form: ChunkingMode
    doc_language: string
  }
  /**
   * `InitialDocumentDetail` extended with enable/archive flags, counters and
   * an optional error message.
   */
  export type SimpleDocumentDetail = InitialDocumentDetail & {
    enabled: boolean
    word_count: number
    is_qa: boolean // TODO waiting for backend to add this field
    error?: string | null
    archived: boolean
    updated_at: number
    hit_count: number
    // Declared optional here, but the intersection with InitialDocumentDetail
    // (where it is required) keeps the property required on the resulting type.
    dataset_process_rule_id?: string
    data_source_detail_dict?: {
      upload_file: {
        name: string
        extension: string
      }
    }
  }
  /** Paginated list of `SimpleDocumentDetail` records. */
  export type DocumentListResponse = {
    data: SimpleDocumentDetail[]
    has_more: boolean
    total: number
    page: number
    limit: number
  }
  /**
   * Common document request fields: chunking mode, language and process
   * rule, plus an optional original document id and indexing technique.
   */
  export type DocumentReq = {
    original_document_id?: string
    indexing_technique?: string
    doc_form: ChunkingMode
    doc_language: string
    process_rule: ProcessRule
  }
  /** `DocumentReq` extended with the data source, retrieval config and embedding model. */
  export type CreateDocumentReq = DocumentReq & {
    data_source: DataSource
    retrieval_model: RetrievalConfig
    embedding_model: string
    embedding_model_provider: string
  }
  /** `DocumentReq` plus optional `DataSource` fields and the target `dataset_id`. */
  export type IndexingEstimateParams = DocumentReq & Partial<DataSource> & {
    dataset_id: string
  }
  /**
   * Data source descriptor: its `type` plus an `info_list` holding one
   * optional sub-list per source kind (Notion, file, website).
   */
  export type DataSource = {
    type: DataSourceType
    info_list: {
      data_source_type: DataSourceType
      notion_info_list?: NotionInfo[]
      file_info_list?: {
        file_ids: string[]
      }
      website_info_list?: {
        provider: string
        job_id: string
        urls: string[]
      }
    }
  }
  /** Notion pages grouped under a workspace id. */
  export type NotionInfo = {
    workspace_id: string
    pages: DataSourceNotionPage[]
  }
  /** Reference to a Notion page by id and type. */
  export type NotionPage = {
    page_id: string
    type: string
  }
  /** A process rule: the processing mode and its rule set. */
  export type ProcessRule = {
    mode: ProcessMode
    rules: Rules
  }
  /**
   * Result of creating documents: the batch id, the created documents and,
   * optionally, the dataset.
   *
   * NOTE(review): the name is lower camelCase, unlike the other types in this
   * file; it is left unchanged because renaming would break existing importers.
   */
  export type createDocumentResponse = {
    dataset?: DataSet
    batch: string
    documents: InitialDocumentDetail[]
  }
  /**
   * Misspelled duplicate of `ProcessRule`, kept as an alias so existing
   * references keep compiling. The two shapes were structurally identical.
   *
   * @deprecated Use `ProcessRule` instead.
   */
  export type PrecessRule = ProcessRule
  /**
   * `SimpleDocumentDetail` extended with processing timestamps, pause /
   * disable / archive bookkeeping, optional document type and metadata, and
   * the dataset-level and document-level process rules.
   *
   * The trailing `[key: string]: any` index signature allows arbitrary extra
   * properties to be read without type checking.
   */
  export type FullDocumentDetail = SimpleDocumentDetail & {
    batch: string
    created_api_request_id: string
    processing_started_at: number
    parsing_completed_at: number
    cleaning_completed_at: number
    splitting_completed_at: number
    tokens: number
    indexing_latency: number
    completed_at: number
    paused_by: string
    paused_at: number
    stopped_at: number
    // Intersected with InitialDocumentDetail's DocumentIndexingStatus, so the
    // effective type of this property stays the narrower status union.
    indexing_status: string
    disabled_at: number
    disabled_by: string
    archived_reason: 'rule_modified' | 're_upload'
    archived_by: string
    archived_at: number
    doc_type?: DocType | null | 'others'
    doc_metadata?: DocMetadata | null
    segment_count: number
    dataset_process_rule: PrecessRule
    document_process_rule: ProcessRule
    [key: string]: any
  }
  /**
   * Document metadata. A fixed set of named string fields plus an index
   * signature that admits any further string-valued key.
   */
  export type DocMetadata = {
    title: string
    language: string
    author: string
    publisher: string
    publicationDate: string
    ISBN: string
    category: string
    [key: string]: string
  }
  /** The first of two document-type groups, as a readonly tuple of string literals. */
  export const CUSTOMIZABLE_DOC_TYPES = [
    'book',
    'web_page',
    'paper',
    'social_media_post',
    'personal_document',
    'business_document',
    'im_chat_log',
  ] as const

  /** The second document-type group, as a readonly tuple of string literals. */
  export const FIXED_DOC_TYPES = ['synced_from_github', 'synced_from_notion', 'wikipedia_entry'] as const

  /** Union of the literals in `CUSTOMIZABLE_DOC_TYPES`. */
  export type CustomizableDocType = typeof CUSTOMIZABLE_DOC_TYPES[number]

  /** Union of the literals in `FIXED_DOC_TYPES`. */
  export type FixedDocType = typeof FIXED_DOC_TYPES[number]

  /** Any document type from either group. */
  export type DocType = CustomizableDocType | FixedDocType
  /** Alias of `FullDocumentDetail`. */
  export type DocumentDetailResponse = FullDocumentDetail
  /**
   * All segment status values, as a readonly tuple of string literals.
   * `as const` keeps the literals so `SegmentStatus` is a union rather than
   * plain `string`, matching the other status lists in this file.
   */
  export const SEGMENT_STATUS_LIST = ['waiting', 'completed', 'error', 'indexing'] as const

  /** Union of the literals in `SEGMENT_STATUS_LIST`. */
  export type SegmentStatus = typeof SEGMENT_STATUS_LIST[number]
  /**
   * Query parameters for listing segments. Only `limit` is required.
   * `enabled` accepts a boolean or the literal `'all'`.
   */
  export type SegmentsQuery = {
    page?: string
    limit: number
    // status?: SegmentStatus
    hit_count_gte?: number
    keyword?: string
    enabled?: boolean | 'all'
  }
  /**
   * Full segment record: content, counters, index node identifiers, status
   * and lifecycle timestamps. `answer` and `child_chunks` are optional.
   */
  export type SegmentDetailModel = {
    id: string
    position: number
    document_id: string
    content: string
    word_count: number
    tokens: number
    keywords: string[]
    index_node_id: string
    index_node_hash: string
    hit_count: number
    enabled: boolean
    disabled_at: number
    disabled_by: string
    status: SegmentStatus
    created_by: string
    created_at: number
    indexing_at: number
    completed_at: number
    error: string | null
    stopped_at: number
    answer?: string
    child_chunks?: ChildChunkDetail[]
    updated_at: number
  }
  /** Paginated list of `SegmentDetailModel` records, including a total page count. */
  export type SegmentsResponse = {
    data: SegmentDetailModel[]
    has_more: boolean
    limit: number
    total: number
    total_pages: number
    page: number
  }
  /**
   * One hit-testing record: the content, where it originated (`source`),
   * and who created it.
   */
  export type HitTestingRecord = {
    id: string
    content: string
    source: 'app' | 'hit_testing' | 'plugin'
    source_app_id: string
    created_by_role: 'account' | 'end_user'
    created_by: string
    created_at: number
  }
  /** A child chunk in a hit-testing result, with its position and score. */
  export type HitTestingChildChunk = {
    id: string
    content: string
    position: number
    score: number
  }
  /**
   * One hit-testing result: the matched segment, its score and t-SNE
   * position, and optionally the matched child chunks.
   */
  export type HitTesting = {
    segment: Segment
    // NOTE(review): `content` is typed as Segment, the same as `segment`;
    // confirm this is intended rather than a string payload.
    content: Segment
    score: number
    tsne_position: TsnePosition
    child_chunks?: HitTestingChildChunk[] | null
  }
  /**
   * One hit-testing result from an external knowledge base. `metadata`
   * carries two `x-amz-bedrock-kb-*` string entries.
   */
  export type ExternalKnowledgeBaseHitTesting = {
    content: string
    title: string
    score: number
    metadata: {
      'x-amz-bedrock-kb-source-uri': string
      'x-amz-bedrock-kb-data-source-id': string
    }
  }
  /** A segment as embedded in hit-testing results, including its parent `document`. */
  export type Segment = {
    id: string
    document: Document
    content: string
    position: number
    word_count: number
    tokens: number
    keywords: string[]
    hit_count: number
    index_node_hash: string
  }
  /**
   * Minimal document descriptor referenced by `Segment.document`.
   * Within this module the name shadows the global DOM `Document` type.
   */
  export type Document = {
    id: string
    data_source_type: string
    name: string
    doc_type: DocType
  }
  /** Paginated list of `HitTestingRecord` entries. */
  export type HitTestingRecordsResponse = {
    data: HitTestingRecord[]
    has_more: boolean
    limit: number
    total: number
    page: number
  }
  /** A 2D point (`x`, `y`) used as a t-SNE position. */
  export type TsnePosition = {
    x: number
    y: number
  }
  /** Hit-testing result: the query (content and t-SNE position) and its matching records. */
  export type HitTestingResponse = {
    query: {
      content: string
      tsne_position: TsnePosition
    }
    records: Array<HitTesting>
  }
  /** Hit-testing result for an external knowledge base: the query content and its matching records. */
  export type ExternalKnowledgeBaseHitTestingResponse = {
    query: {
      content: string
    }
    records: Array<ExternalKnowledgeBaseHitTesting>
  }
  /** An app related to a dataset: identity, mode and icon fields. `icon_type` may be `null`. */
  export type RelatedApp = {
    id: string
    name: string
    mode: AppMode
    icon_type: AppIconType | null
    icon: string
    icon_background: string
    icon_url: string
  }
  /** List of `RelatedApp` entries with a total count. */
  export type RelatedAppResponse = {
    data: Array<RelatedApp>
    total: number
  }
  /** Payload for updating a segment. Only `content` is required. */
  export type SegmentUpdater = {
    content: string
    answer?: string
    keywords?: string[]
    regenerate_child_chunks?: boolean
  }
  /** List of `IndexingStatusResponse` entries with a total count. */
  export type ErrorDocsResponse = {
    data: IndexingStatusResponse[]
    total: number
  }
  /**
   * Boolean flags summarising a selection of datasets: by indexing quality,
   * by internal vs. external origin, and whether their embedding models are
   * inconsistent.
   */
  export type SelectedDatasetsMode = {
    allHighQuality: boolean
    allHighQualityVectorSearch: boolean
    allHighQualityFullTextSearch: boolean
    allEconomic: boolean
    mixtureHighQualityAndEconomic: boolean
    allInternal: boolean
    allExternal: boolean
    mixtureInternalAndExternal: boolean
    inconsistentEmbeddingModel: boolean
  }
  /** Weighted-score presets, mapped to their string identifiers. */
  export enum WeightedScoreEnum {
    SemanticFirst = 'semantic_first',
    KeywordFirst = 'keyword_first',
    Customized = 'customized',
  }
  /** Reranking modes, mapped to their string identifiers. */
  export enum RerankingModeEnum {
    RerankingModel = 'reranking_model',
    WeightedScore = 'weighted_score',
  }
  /**
   * Default semantic/keyword weights, in three presets. The first two preset
   * names match flags of `SelectedDatasetsMode`; `other` is the remaining
   * preset. In every preset the two weights sum to 1.
   */
  export const DEFAULT_WEIGHTED_SCORE = {
    allHighQualityVectorSearch: {
      semantic: 1.0,
      keyword: 0,
    },
    allHighQualityFullTextSearch: {
      semantic: 0,
      keyword: 1.0,
    },
    other: {
      semantic: 0.7,
      keyword: 0.3,
    },
  }
  /** Type tag of a child chunk: `'automatic'` or `'customized'`. */
  export type ChildChunkType = 'automatic' | 'customized'
  /** A child chunk belonging to a segment (`segment_id`), with content, counters and timestamps. */
  export type ChildChunkDetail = {
    id: string
    position: number
    segment_id: string
    content: string
    word_count: number
    created_at: number
    updated_at: number
    type: ChildChunkType
  }
  /** Paginated list of `ChildChunkDetail` records. */
  export type ChildSegmentsResponse = {
    data: ChildChunkDetail[]
    total: number
    total_pages: number
    page: number
    limit: number
  }
  /** Identifies a single document within a dataset. */
  export type UpdateDocumentParams = {
    datasetId: string
    documentId: string
  }
  /**
   * Actions that can be applied to a document.
   * Used in api url, so the values must stay exactly as written.
   */
  export enum DocumentActionType {
    enable = 'enable',
    disable = 'disable',
    archive = 'archive',
    unArchive = 'un_archive',
    delete = 'delete',
  }
  /**
   * Identifies one or several documents within a dataset.
   * `documentIds` accepts either an array of ids or a single string.
   */
  export type UpdateDocumentBatchParams = {
    datasetId: string
    documentId?: string
    documentIds?: string[] | string
  }
  /** Result of starting a batch import: the job id and its status string. */
  export type BatchImportResponse = {
    job_id: string
    job_status: string
  }