datasets.ts 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664
  1. import type { DataSourceNotionPage, DataSourceProvider } from './common'
  2. import type { AppIconType, AppMode, RetrievalConfig } from '@/types/app'
  3. import type { Tag } from '@/app/components/base/tag-management/constant'
  4. import type { IndexingType } from '@/app/components/datasets/create/step-two'
  5. export enum DataSourceType {
  6. FILE = 'upload_file',
  7. NOTION = 'notion_import',
  8. WEB = 'website_crawl',
  9. }
  10. export type DatasetPermission = 'only_me' | 'all_team_members' | 'partial_members'
  11. export enum ChunkingMode {
  12. 'text' = 'text_model', // General text
  13. 'qa' = 'qa_model', // General QA
  14. 'parentChild' = 'hierarchical_model', // Parent-Child
  15. }
  16. export type DataSet = {
  17. id: string
  18. name: string
  19. icon: string
  20. icon_background: string
  21. description: string
  22. permission: DatasetPermission
  23. data_source_type: DataSourceType
  24. indexing_technique: IndexingType
  25. created_by: string
  26. updated_by: string
  27. updated_at: number
  28. app_count: number
  29. doc_form: ChunkingMode
  30. document_count: number
  31. word_count: number
  32. provider: string
  33. embedding_model: string
  34. embedding_model_provider: string
  35. embedding_available: boolean
  36. retrieval_model_dict: RetrievalConfig
  37. retrieval_model: RetrievalConfig
  38. tags: Tag[]
  39. partial_member_list?: any[]
  40. external_knowledge_info: {
  41. external_knowledge_id: string
  42. external_knowledge_api_id: string
  43. external_knowledge_api_name: string
  44. external_knowledge_api_endpoint: string
  45. }
  46. external_retrieval_model: {
  47. top_k: number
  48. score_threshold: number
  49. score_threshold_enabled: boolean
  50. }
  51. }
  52. export type ExternalAPIItem = {
  53. id: string
  54. tenant_id: string
  55. name: string
  56. description: string
  57. settings: {
  58. endpoint: string
  59. api_key: string
  60. }
  61. dataset_bindings: { id: string; name: string }[]
  62. created_by: string
  63. created_at: string
  64. }
  65. export type ExternalKnowledgeItem = {
  66. id: string
  67. name: string
  68. description: string | null
  69. provider: 'external'
  70. permission: DatasetPermission
  71. data_source_type: null
  72. indexing_technique: null
  73. app_count: number
  74. document_count: number
  75. word_count: number
  76. created_by: string
  77. created_at: string
  78. updated_by: string
  79. updated_at: string
  80. tags: Tag[]
  81. }
  82. export type ExternalAPIDeleteResponse = {
  83. result: 'success' | 'error'
  84. }
  85. export type ExternalAPIUsage = {
  86. is_using: boolean
  87. count: number
  88. }
  89. export type CustomFile = File & {
  90. id?: string
  91. extension?: string
  92. mime_type?: string
  93. created_by?: string
  94. created_at?: number
  95. }
  96. export type DocumentItem = {
  97. id: string
  98. name: string
  99. extension: string
  100. }
  101. export type CrawlOptions = {
  102. crawl_sub_pages: boolean
  103. only_main_content: boolean
  104. includes: string
  105. excludes: string
  106. limit: number | string
  107. max_depth: number | string
  108. use_sitemap: boolean
  109. }
  110. export type CrawlResultItem = {
  111. title: string
  112. markdown: string
  113. description: string
  114. source_url: string
  115. }
  116. export type FileItem = {
  117. fileID: string
  118. file: CustomFile
  119. progress: number
  120. }
  121. export type DataSetListResponse = {
  122. data: DataSet[]
  123. has_more: boolean
  124. limit: number
  125. page: number
  126. total: number
  127. }
  128. export type ExternalAPIListResponse = {
  129. data: ExternalAPIItem[]
  130. has_more: boolean
  131. limit: number
  132. page: number
  133. total: number
  134. }
  135. export type QA = {
  136. question: string
  137. answer: string
  138. }
  139. export type IndexingEstimateResponse = {
  140. tokens: number
  141. total_price: number
  142. currency: string
  143. total_segments: number
  144. preview: Array<{ content: string; child_chunks: string[] }>
  145. qa_preview?: QA[]
  146. }
  147. export type FileIndexingEstimateResponse = {
  148. total_nodes: number
  149. } & IndexingEstimateResponse
  150. export type IndexingStatusResponse = {
  151. id: string
  152. indexing_status: DocumentIndexingStatus
  153. processing_started_at: number
  154. parsing_completed_at: number
  155. cleaning_completed_at: number
  156. splitting_completed_at: number
  157. completed_at: any
  158. paused_at: any
  159. error: any
  160. stopped_at: any
  161. completed_segments: number
  162. total_segments: number
  163. }
  164. export type IndexingStatusBatchResponse = {
  165. data: IndexingStatusResponse[]
  166. }
  167. export enum ProcessMode {
  168. general = 'custom',
  169. parentChild = 'hierarchical',
  170. }
  171. export type ParentMode = 'full-doc' | 'paragraph'
  172. export type ProcessRuleResponse = {
  173. mode: ProcessMode
  174. rules: Rules
  175. limits: Limits
  176. }
  177. export type Rules = {
  178. pre_processing_rules: PreProcessingRule[]
  179. segmentation: Segmentation
  180. parent_mode: ParentMode
  181. subchunk_segmentation: Segmentation
  182. }
  183. export type Limits = {
  184. indexing_max_segmentation_tokens_length: number
  185. }
  186. export type PreProcessingRule = {
  187. id: string
  188. enabled: boolean
  189. }
  190. export type Segmentation = {
  191. separator: string
  192. max_tokens: number
  193. chunk_overlap?: number
  194. }
  195. export const DocumentIndexingStatusList = [
  196. 'waiting',
  197. 'parsing',
  198. 'cleaning',
  199. 'splitting',
  200. 'indexing',
  201. 'paused',
  202. 'error',
  203. 'completed',
  204. ] as const
  205. export type DocumentIndexingStatus = typeof DocumentIndexingStatusList[number]
  206. export const DisplayStatusList = [
  207. 'queuing',
  208. 'indexing',
  209. 'paused',
  210. 'error',
  211. 'available',
  212. 'enabled',
  213. 'disabled',
  214. 'archived',
  215. ] as const
  216. export type DocumentDisplayStatus = typeof DisplayStatusList[number]
  217. export type DataSourceInfo = {
  218. upload_file: {
  219. id: string
  220. name: string
  221. size: number
  222. mime_type: string
  223. created_at: number
  224. created_by: string
  225. extension: string
  226. }
  227. notion_page_icon?: string
  228. notion_workspace_id?: string
  229. notion_page_id?: string
  230. provider?: DataSourceProvider
  231. job_id: string
  232. url: string
  233. }
  234. export type InitialDocumentDetail = {
  235. id: string
  236. batch: string
  237. position: number
  238. dataset_id: string
  239. data_source_type: DataSourceType
  240. data_source_info: DataSourceInfo
  241. dataset_process_rule_id: string
  242. name: string
  243. created_from: 'api' | 'web'
  244. created_by: string
  245. created_at: number
  246. indexing_status: DocumentIndexingStatus
  247. display_status: DocumentDisplayStatus
  248. completed_segments?: number
  249. total_segments?: number
  250. doc_form: ChunkingMode
  251. doc_language: string
  252. }
  253. export type SimpleDocumentDetail = InitialDocumentDetail & {
  254. enabled: boolean
  255. word_count: number
  256. is_qa: boolean // TODO waiting for backend to add this field
  257. error?: string | null
  258. archived: boolean
  259. updated_at: number
  260. hit_count: number
  261. dataset_process_rule_id?: string
  262. data_source_detail_dict?: {
  263. upload_file: {
  264. name: string
  265. extension: string
  266. }
  267. }
  268. }
  269. export type DocumentListResponse = {
  270. data: SimpleDocumentDetail[]
  271. has_more: boolean
  272. total: number
  273. page: number
  274. limit: number
  275. }
  276. export type DocumentReq = {
  277. original_document_id?: string
  278. indexing_technique?: string
  279. doc_form: ChunkingMode
  280. doc_language: string
  281. process_rule: ProcessRule
  282. }
  283. export type CreateDocumentReq = DocumentReq & {
  284. data_source: DataSource
  285. retrieval_model: RetrievalConfig
  286. embedding_model: string
  287. embedding_model_provider: string
  288. }
  289. export type IndexingEstimateParams = DocumentReq & Partial<DataSource> & {
  290. dataset_id: string
  291. }
  292. export type DataSource = {
  293. type: DataSourceType
  294. info_list: {
  295. data_source_type: DataSourceType
  296. notion_info_list?: NotionInfo[]
  297. file_info_list?: {
  298. file_ids: string[]
  299. }
  300. website_info_list?: {
  301. provider: string
  302. job_id: string
  303. urls: string[]
  304. }
  305. }
  306. }
  307. export type NotionInfo = {
  308. workspace_id: string
  309. pages: DataSourceNotionPage[]
  310. }
  311. export type NotionPage = {
  312. page_id: string
  313. type: string
  314. }
  315. export type ProcessRule = {
  316. mode: ProcessMode
  317. rules: Rules
  318. }
  319. export type createDocumentResponse = {
  320. dataset?: DataSet
  321. batch: string
  322. documents: InitialDocumentDetail[]
  323. }
  324. export type PrecessRule = {
  325. mode: ProcessMode
  326. rules: Rules
  327. }
  328. export type FullDocumentDetail = SimpleDocumentDetail & {
  329. batch: string
  330. created_api_request_id: string
  331. processing_started_at: number
  332. parsing_completed_at: number
  333. cleaning_completed_at: number
  334. splitting_completed_at: number
  335. tokens: number
  336. indexing_latency: number
  337. completed_at: number
  338. paused_by: string
  339. paused_at: number
  340. stopped_at: number
  341. indexing_status: string
  342. disabled_at: number
  343. disabled_by: string
  344. archived_reason: 'rule_modified' | 're_upload'
  345. archived_by: string
  346. archived_at: number
  347. doc_type?: DocType | null | 'others'
  348. doc_metadata?: DocMetadata | null
  349. segment_count: number
  350. dataset_process_rule: PrecessRule
  351. document_process_rule: ProcessRule
  352. [key: string]: any
  353. }
  354. export type DocMetadata = {
  355. title: string
  356. language: string
  357. author: string
  358. publisher: string
  359. publicationDate: string
  360. ISBN: string
  361. category: string
  362. [key: string]: string
  363. }
  364. export const CUSTOMIZABLE_DOC_TYPES = [
  365. 'book',
  366. 'web_page',
  367. 'paper',
  368. 'social_media_post',
  369. 'personal_document',
  370. 'business_document',
  371. 'im_chat_log',
  372. ] as const
  373. export const FIXED_DOC_TYPES = ['synced_from_github', 'synced_from_notion', 'wikipedia_entry'] as const
  374. export type CustomizableDocType = typeof CUSTOMIZABLE_DOC_TYPES[number]
  375. export type FixedDocType = typeof FIXED_DOC_TYPES[number]
  376. export type DocType = CustomizableDocType | FixedDocType
  377. export type DocumentDetailResponse = FullDocumentDetail
  378. export const SEGMENT_STATUS_LIST = ['waiting', 'completed', 'error', 'indexing']
  379. export type SegmentStatus = typeof SEGMENT_STATUS_LIST[number]
  380. export type SegmentsQuery = {
  381. page?: string
  382. limit: number
  383. // status?: SegmentStatus
  384. hit_count_gte?: number
  385. keyword?: string
  386. enabled?: boolean | 'all'
  387. }
  388. export type SegmentDetailModel = {
  389. id: string
  390. position: number
  391. document_id: string
  392. content: string
  393. word_count: number
  394. tokens: number
  395. keywords: string[]
  396. index_node_id: string
  397. index_node_hash: string
  398. hit_count: number
  399. enabled: boolean
  400. disabled_at: number
  401. disabled_by: string
  402. status: SegmentStatus
  403. created_by: string
  404. created_at: number
  405. indexing_at: number
  406. completed_at: number
  407. error: string | null
  408. stopped_at: number
  409. answer?: string
  410. child_chunks?: ChildChunkDetail[]
  411. updated_at: number
  412. }
  413. export type SegmentsResponse = {
  414. data: SegmentDetailModel[]
  415. has_more: boolean
  416. limit: number
  417. total: number
  418. total_pages: number
  419. page: number
  420. }
  421. export type HitTestingRecord = {
  422. id: string
  423. content: string
  424. source: 'app' | 'hit_testing' | 'plugin'
  425. source_app_id: string
  426. created_by_role: 'account' | 'end_user'
  427. created_by: string
  428. created_at: number
  429. }
  430. export type HitTestingChildChunk = {
  431. id: string
  432. content: string
  433. position: number
  434. score: number
  435. }
  436. export type HitTesting = {
  437. segment: Segment
  438. content: Segment
  439. score: number
  440. tsne_position: TsnePosition
  441. child_chunks?: HitTestingChildChunk[] | null
  442. }
  443. export type ExternalKnowledgeBaseHitTesting = {
  444. content: string
  445. title: string
  446. score: number
  447. metadata: {
  448. 'x-amz-bedrock-kb-source-uri': string
  449. 'x-amz-bedrock-kb-data-source-id': string
  450. }
  451. }
  452. export type Segment = {
  453. id: string
  454. document: Document
  455. content: string
  456. position: number
  457. word_count: number
  458. tokens: number
  459. keywords: string[]
  460. hit_count: number
  461. index_node_hash: string
  462. }
  463. export type Document = {
  464. id: string
  465. data_source_type: string
  466. name: string
  467. doc_type: DocType
  468. }
  469. export type HitTestingRecordsResponse = {
  470. data: HitTestingRecord[]
  471. has_more: boolean
  472. limit: number
  473. total: number
  474. page: number
  475. }
  476. export type TsnePosition = {
  477. x: number
  478. y: number
  479. }
  480. export type HitTestingResponse = {
  481. query: {
  482. content: string
  483. tsne_position: TsnePosition
  484. }
  485. records: Array<HitTesting>
  486. }
  487. export type ExternalKnowledgeBaseHitTestingResponse = {
  488. query: {
  489. content: string
  490. }
  491. records: Array<ExternalKnowledgeBaseHitTesting>
  492. }
  493. export type RelatedApp = {
  494. id: string
  495. name: string
  496. mode: AppMode
  497. icon_type: AppIconType | null
  498. icon: string
  499. icon_background: string
  500. icon_url: string
  501. }
  502. export type RelatedAppResponse = {
  503. data: Array<RelatedApp>
  504. total: number
  505. }
  506. export type SegmentUpdater = {
  507. content: string
  508. answer?: string
  509. keywords?: string[]
  510. regenerate_child_chunks?: boolean
  511. }
  512. export type ErrorDocsResponse = {
  513. data: IndexingStatusResponse[]
  514. total: number
  515. }
  516. export type SelectedDatasetsMode = {
  517. allHighQuality: boolean
  518. allHighQualityVectorSearch: boolean
  519. allHighQualityFullTextSearch: boolean
  520. allEconomic: boolean
  521. mixtureHighQualityAndEconomic: boolean
  522. allInternal: boolean
  523. allExternal: boolean
  524. mixtureInternalAndExternal: boolean
  525. inconsistentEmbeddingModel: boolean
  526. }
  527. export enum WeightedScoreEnum {
  528. SemanticFirst = 'semantic_first',
  529. KeywordFirst = 'keyword_first',
  530. Customized = 'customized',
  531. }
  532. export enum RerankingModeEnum {
  533. RerankingModel = 'reranking_model',
  534. WeightedScore = 'weighted_score',
  535. }
  536. export const DEFAULT_WEIGHTED_SCORE = {
  537. allHighQualityVectorSearch: {
  538. semantic: 1.0,
  539. keyword: 0,
  540. },
  541. allHighQualityFullTextSearch: {
  542. semantic: 0,
  543. keyword: 1.0,
  544. },
  545. other: {
  546. semantic: 0.7,
  547. keyword: 0.3,
  548. },
  549. }
  550. export type ChildChunkType = 'automatic' | 'customized'
  551. export type ChildChunkDetail = {
  552. id: string
  553. position: number
  554. segment_id: string
  555. content: string
  556. word_count: number
  557. created_at: number
  558. updated_at: number
  559. type: ChildChunkType
  560. }
  561. export type ChildSegmentsResponse = {
  562. data: ChildChunkDetail[]
  563. total: number
  564. total_pages: number
  565. page: number
  566. limit: number
  567. }
  568. export type UpdateDocumentParams = {
  569. datasetId: string
  570. documentId: string
  571. }
  572. // Used in api url
  573. export enum DocumentActionType {
  574. enable = 'enable',
  575. disable = 'disable',
  576. archive = 'archive',
  577. unArchive = 'un_archive',
  578. delete = 'delete',
  579. }
  580. export type UpdateDocumentBatchParams = {
  581. datasetId: string
  582. documentId?: string
  583. documentIds?: string[] | string
  584. }
  585. export type BatchImportResponse = {
  586. job_id: string
  587. job_status: string
  588. }