// datasets.ts — type definitions for datasets, documents, segments and hit testing.

  1. import type { DataSourceNotionPage, DataSourceProvider } from './common'
  2. import type { AppIconType, AppMode, RetrievalConfig } from '@/types/app'
  3. import type { Tag } from '@/app/components/base/tag-management/constant'
  4. import type { IndexingType } from '@/app/components/datasets/create/step-two'
  /**
   * Kinds of data source a dataset or document can originate from.
   * Members are string enums, so the right-hand literal is the runtime value.
   */
  export enum DataSourceType {
    /** Runtime value `'upload_file'`. */
    FILE = 'upload_file',
    /** Runtime value `'notion_import'`. */
    NOTION = 'notion_import',
    /** Runtime value `'website_crawl'`. */
    WEB = 'website_crawl',
  }
  /**
   * Values accepted by a dataset's `permission` field.
   */
  export enum DatasetPermission {
    /** Runtime value `'only_me'`. */
    onlyMe = 'only_me',
    /** Runtime value `'all_team_members'`. */
    allTeamMembers = 'all_team_members',
    /** Runtime value `'partial_members'`. */
    partialMembers = 'partial_members',
  }
  /**
   * Chunking mode of a dataset or document; used as the type of `doc_form`.
   */
  export enum ChunkingMode {
    /** General text. */
    text = 'text_model',
    /** General QA. */
    qa = 'qa_model',
    /** Parent-Child. */
    parentChild = 'hierarchical_model',
  }
  /**
   * Shape of a single dataset record.
   *
   * NOTE(review): `retrieval_model_dict` and `retrieval_model` are both typed
   * as `RetrievalConfig`; which of the two a given endpoint fills in cannot be
   * determined from this file.
   */
  export type DataSet = {
    // --- identity and presentation ---
    id: string
    name: string
    icon: string
    icon_background: string
    description: string

    // --- access, origin and indexing ---
    permission: DatasetPermission
    data_source_type: DataSourceType
    indexing_technique: IndexingType

    // --- audit fields ---
    created_by: string
    updated_by: string
    updated_at: number

    // --- counters and chunking ---
    app_count: number
    doc_form: ChunkingMode
    document_count: number
    word_count: number

    // --- provider and embedding model ---
    provider: string
    embedding_model: string
    embedding_model_provider: string
    embedding_available: boolean

    // --- retrieval configuration ---
    retrieval_model_dict: RetrievalConfig
    retrieval_model: RetrievalConfig

    tags: Tag[]
    /** Optional list of strings; presumably member ids for `partial_members` permission — TODO confirm. */
    partial_member_list?: string[]

    /** Identifiers and endpoint of the linked external knowledge API. */
    external_knowledge_info: {
      external_knowledge_id: string
      external_knowledge_api_id: string
      external_knowledge_api_name: string
      external_knowledge_api_endpoint: string
    }
    /** Retrieval settings used for external knowledge. */
    external_retrieval_model: {
      top_k: number
      score_threshold: number
      score_threshold_enabled: boolean
    }
  }
  /**
   * One external knowledge API entry, including its connection settings and
   * the datasets bound to it.
   */
  export type ExternalAPIItem = {
    id: string
    tenant_id: string
    name: string
    description: string
    /** Connection settings: endpoint plus API key. */
    settings: {
      endpoint: string
      api_key: string
    }
    /** `{ id, name }` pairs of the datasets bound to this API. */
    dataset_bindings: Array<{ id: string; name: string }>
    created_by: string
    /** Creation time as a string (format not specified in this file). */
    created_at: string
  }
  /**
   * A dataset-like record whose `provider` is always the literal `'external'`.
   * `data_source_type` and `indexing_technique` are fixed to `null` for this shape.
   */
  export type ExternalKnowledgeItem = {
    id: string
    name: string
    description: string | null
    /** Literal discriminator. */
    provider: 'external'
    permission: DatasetPermission
    data_source_type: null
    indexing_technique: null

    // --- counters ---
    app_count: number
    document_count: number
    word_count: number

    // --- audit fields (timestamps are strings in this shape) ---
    created_by: string
    created_at: string
    updated_by: string
    updated_at: string

    tags: Tag[]
  }
  /**
   * Result payload of deleting an external API: either `'success'` or `'error'`.
   */
  export type ExternalAPIDeleteResponse = {
    result:
      | 'success'
      | 'error'
  }
  /**
   * Usage summary for an external API: an in-use flag plus a count.
   */
  export type ExternalAPIUsage = {
    /** Whether the API is currently in use. */
    is_using: boolean
    /** Numeric count; presumably the number of users of the API — TODO confirm. */
    count: number
  }
  /**
   * A `File` extended with optional metadata fields.
   * Every added field is optional, so a plain `File` is assignable to this type.
   */
  export type CustomFile = {
    id?: string
    extension?: string
    mime_type?: string
    created_by?: string
    created_at?: number
  } & File
  /**
   * Minimal document descriptor: id, name and file extension.
   */
  export type DocumentItem = {
    id: string
    name: string
    /** File extension as a string. */
    extension: string
  }
  /**
   * Options for a website crawl.
   * `limit` and `max_depth` accept either a number or a string.
   */
  export type CrawlOptions = {
    crawl_sub_pages: boolean
    only_main_content: boolean
    /** Include patterns as a single string (format not specified in this file). */
    includes: string
    /** Exclude patterns as a single string (format not specified in this file). */
    excludes: string
    limit: string | number
    max_depth: string | number
    use_sitemap: boolean
  }
  /**
   * One crawled page: title, markdown body, description and source URL.
   */
  export type CrawlResultItem = {
    title: string
    /** Page content as a markdown string. */
    markdown: string
    description: string
    source_url: string
  }
  /**
   * Pairs a `CustomFile` with an id and a numeric progress value.
   */
  export type FileItem = {
    fileID: string
    file: CustomFile
    /** Numeric progress; range and units are not specified in this file. */
    progress: number
  }
  /**
   * Arguments for fetching a page of datasets: the request URL plus query params.
   * Only `page` is mandatory inside `params`.
   */
  export type FetchDatasetsParams = {
    url: string
    params: {
      page: number
      // Optional filters
      ids?: string[]
      tag_ids?: string[]
      limit?: number
      include_all?: boolean
      keyword?: string
    }
  }
  /**
   * Paginated list of `DataSet` records.
   */
  export type DataSetListResponse = {
    data: Array<DataSet>
    // Pagination fields
    has_more: boolean
    limit: number
    page: number
    total: number
  }
  /**
   * Paginated list of `ExternalAPIItem` records.
   */
  export type ExternalAPIListResponse = {
    data: Array<ExternalAPIItem>
    // Pagination fields
    has_more: boolean
    limit: number
    page: number
    total: number
  }
  /**
   * A question/answer pair.
   */
  export type QA = {
    question: string
    answer: string
  }
  /**
   * Result of an indexing estimate: token count, price, segment count and a
   * preview of the resulting chunks.
   */
  export type IndexingEstimateResponse = {
    tokens: number
    total_price: number
    currency: string
    total_segments: number
    /** Preview entries, each with its content and child chunk strings. */
    preview: { content: string; child_chunks: string[] }[]
    /** Optional QA preview. */
    qa_preview?: Array<QA>
  }
  /**
   * `IndexingEstimateResponse` extended with a `total_nodes` count.
   */
  export type FileIndexingEstimateResponse = IndexingEstimateResponse & {
    total_nodes: number
  }
  /**
   * Indexing progress of one document: current status, per-stage timestamps
   * and segment counters.
   *
   * NOTE(review): `completed_at`, `paused_at`, `error` and `stopped_at` are
   * typed `any`; their real shapes are not visible in this file, so they are
   * left untouched here.
   */
  export type IndexingStatusResponse = {
    id: string
    indexing_status: DocumentIndexingStatus

    // --- stage timestamps ---
    processing_started_at: number
    parsing_completed_at: number
    cleaning_completed_at: number
    splitting_completed_at: number

    // --- loosely typed fields ---
    completed_at: any
    paused_at: any
    error: any
    stopped_at: any

    // --- segment progress ---
    completed_segments: number
    total_segments: number
  }
  /**
   * Wrapper around a list of `IndexingStatusResponse` entries.
   */
  export type IndexingStatusBatchResponse = {
    data: Array<IndexingStatusResponse>
  }
  /**
   * Processing mode of a process rule.
   * Note that the member names differ from their runtime values.
   */
  export enum ProcessMode {
    /** Runtime value `'custom'`. */
    general = 'custom',
    /** Runtime value `'hierarchical'`. */
    parentChild = 'hierarchical',
  }
  /** Parent mode of a rule set: `'full-doc'` or `'paragraph'`. */
  export type ParentMode =
    | 'full-doc'
    | 'paragraph'
  /**
   * A process rule (`mode` + `rules`) together with its `limits`.
   */
  export type ProcessRuleResponse = {
    mode: ProcessMode
    rules: Rules
    /** Limits that apply to the rules. */
    limits: Limits
  }
  /**
   * Rule set: pre-processing rules, segmentation settings, the parent mode and
   * a second `Segmentation` for sub-chunks.
   */
  export type Rules = {
    pre_processing_rules: Array<PreProcessingRule>
    segmentation: Segmentation
    parent_mode: ParentMode
    /** Segmentation settings for sub-chunks. */
    subchunk_segmentation: Segmentation
  }
  /**
   * Limits attached to a process rule.
   */
  export type Limits = {
    /** Maximum segmentation token length for indexing. */
    indexing_max_segmentation_tokens_length: number
  }
  /**
   * A single pre-processing rule: its id and whether it is enabled.
   */
  export type PreProcessingRule = {
    id: string
    enabled: boolean
  }
  /**
   * Segmentation settings: separator, maximum tokens and an optional overlap.
   */
  export type Segmentation = {
    separator: string
    max_tokens: number
    /** Optional chunk overlap. */
    chunk_overlap?: number
  }
  /**
   * Literal tuple of every document indexing status.
   * Declared `as const` so the derived type below is a union of these literals.
   */
  export const DocumentIndexingStatusList = [
    'waiting',
    'parsing',
    'cleaning',
    'splitting',
    'indexing',
    'paused',
    'error',
    'completed',
  ] as const

  /** Union of the literals in `DocumentIndexingStatusList`. */
  export type DocumentIndexingStatus = (typeof DocumentIndexingStatusList)[number]
  /**
   * Literal tuple of every document display status.
   * Declared `as const` so the derived type below is a union of these literals.
   */
  export const DisplayStatusList = [
    'queuing',
    'indexing',
    'paused',
    'error',
    'available',
    'enabled',
    'disabled',
    'archived',
  ] as const

  /** Union of the literals in `DisplayStatusList`. */
  export type DocumentDisplayStatus = (typeof DisplayStatusList)[number]
  /**
   * Source details attached to a document.
   *
   * NOTE(review): `upload_file`, `job_id` and `url` are all required here
   * while the Notion fields and `provider` are optional; whether every source
   * kind really populates all required fields should be confirmed.
   */
  export type DataSourceInfo = {
    /** Metadata of the uploaded file. */
    upload_file: {
      id: string
      name: string
      size: number
      mime_type: string
      created_at: number
      created_by: string
      extension: string
    }

    // --- Notion fields (optional) ---
    notion_page_icon?: string
    notion_workspace_id?: string
    notion_page_id?: string

    provider?: DataSourceProvider
    job_id: string
    url: string
  }
  /**
   * Base document shape; `SimpleDocumentDetail` extends it, and it is the
   * element type of `createDocumentResponse.documents`.
   */
  export type InitialDocumentDetail = {
    id: string
    batch: string
    position: number
    dataset_id: string

    // --- source ---
    data_source_type: DataSourceType
    data_source_info: DataSourceInfo
    dataset_process_rule_id: string
    name: string

    // --- creation ---
    created_from: 'api' | 'web'
    created_by: string
    created_at: number

    // --- status ---
    indexing_status: DocumentIndexingStatus
    display_status: DocumentDisplayStatus
    completed_segments?: number
    total_segments?: number

    // --- form and language ---
    doc_form: ChunkingMode
    doc_language: string
  }
  /**
   * `InitialDocumentDetail` plus enablement, counters and archive state.
   *
   * `dataset_process_rule_id` is re-declared as optional here, but the base
   * type declares it required, so the intersection still requires it.
   */
  export type SimpleDocumentDetail = {
    enabled: boolean
    word_count: number
    is_qa: boolean // TODO waiting for backend to add this field
    error?: string | null
    archived: boolean
    updated_at: number
    hit_count: number
    dataset_process_rule_id?: string
    /** Optional name and extension of the uploaded file. */
    data_source_detail_dict?: {
      upload_file: {
        name: string
        extension: string
      }
    }
  } & InitialDocumentDetail
  /**
   * Paginated list of `SimpleDocumentDetail` records.
   */
  export type DocumentListResponse = {
    data: Array<SimpleDocumentDetail>
    // Pagination fields
    has_more: boolean
    total: number
    page: number
    limit: number
  }
  /**
   * Fields shared by `CreateDocumentReq` and `IndexingEstimateParams`.
   */
  export type DocumentReq = {
    /** Optional id of an original document. */
    original_document_id?: string
    /** Optional indexing technique, typed as a plain string here. */
    indexing_technique?: string
    doc_form: ChunkingMode
    doc_language: string
    process_rule: ProcessRule
  }
  /**
   * `DocumentReq` plus the data source, retrieval config and embedding model
   * selection.
   */
  export type CreateDocumentReq = {
    data_source: DataSource
    retrieval_model: RetrievalConfig
    embedding_model: string
    embedding_model_provider: string
  } & DocumentReq
  /**
   * `DocumentReq` combined with an all-optional `DataSource` and a required
   * `dataset_id`.
   */
  export type IndexingEstimateParams =
    & DocumentReq
    & Partial<DataSource>
    & { dataset_id: string }
  /**
   * Data source descriptor: a type plus an `info_list` that carries one
   * optional sub-list per source kind (Notion, file, website).
   */
  export type DataSource = {
    type: DataSourceType
    info_list: {
      data_source_type: DataSourceType
      /** Optional Notion workspaces and pages. */
      notion_info_list?: Array<NotionInfo>
      /** Optional file ids. */
      file_info_list?: {
        file_ids: string[]
      }
      /** Optional website crawl info: provider, job id and URLs. */
      website_info_list?: {
        provider: string
        job_id: string
        urls: string[]
      }
    }
  }
  /**
   * A Notion workspace id together with a list of its pages.
   */
  export type NotionInfo = {
    workspace_id: string
    pages: Array<DataSourceNotionPage>
  }
  /**
   * Minimal Notion page reference: a page id and a type string.
   */
  export type NotionPage = {
    page_id: string
    type: string
  }
  /**
   * A processing rule: the mode plus the rule set applied in that mode.
   */
  export type ProcessRule = {
    mode: ProcessMode
    rules: Rules
  }
  /**
   * Response shape for document creation: an optional dataset, a batch string
   * and the list of documents.
   *
   * NOTE(review): the lower-camel-case type name departs from the PascalCase
   * used by the other types in this file; it is kept because it is exported.
   */
  export type createDocumentResponse = {
    dataset?: DataSet
    batch: string
    documents: Array<InitialDocumentDetail>
  }
  /**
   * Same shape as `ProcessRule` (`mode` + `rules`).
   *
   * NOTE(review): the name looks like a misspelling of `ProcessRule`. It is
   * exported and used by `FullDocumentDetail.dataset_process_rule`, so the
   * name is left unchanged here.
   */
  export type PrecessRule = {
    mode: ProcessMode
    rules: Rules
  }
  /**
   * `SimpleDocumentDetail` plus processing timestamps, pause/disable/archive
   * audit fields, document metadata and process rules.
   *
   * Notes:
   * - `indexing_status` is declared `string` here, but the base type declares
   *   the narrower `DocumentIndexingStatus`, so the intersection keeps the
   *   narrower type.
   * - The `[key: string]: any` index signature allows arbitrary extra keys.
   */
  export type FullDocumentDetail = SimpleDocumentDetail & {
    batch: string
    created_api_request_id: string

    // --- processing timeline ---
    processing_started_at: number
    parsing_completed_at: number
    cleaning_completed_at: number
    splitting_completed_at: number
    tokens: number
    indexing_latency: number
    completed_at: number

    // --- pause / stop ---
    paused_by: string
    paused_at: number
    stopped_at: number
    indexing_status: string

    // --- disable / archive ---
    disabled_at: number
    disabled_by: string
    archived_reason: 'rule_modified' | 're_upload'
    archived_by: string
    archived_at: number

    // --- metadata ---
    doc_type?: DocType | null | 'others'
    doc_metadata?: DocMetadata | null
    segment_count: number

    // --- process rules ---
    dataset_process_rule: PrecessRule
    document_process_rule: ProcessRule

    [key: string]: any
  }
  /**
   * Document metadata: a fixed set of string fields plus any number of
   * additional string-valued keys through the index signature.
   */
  export type DocMetadata = {
    title: string
    language: string
    author: string
    publisher: string
    publicationDate: string
    ISBN: string
    category: string
    /** Any other key must also map to a string. */
    [key: string]: string
  }
  /** Literal tuple of the customizable document types. */
  export const CUSTOMIZABLE_DOC_TYPES = [
    'book',
    'web_page',
    'paper',
    'social_media_post',
    'personal_document',
    'business_document',
    'im_chat_log',
  ] as const

  /** Literal tuple of the fixed document types. */
  export const FIXED_DOC_TYPES = [
    'synced_from_github',
    'synced_from_notion',
    'wikipedia_entry',
  ] as const

  /** Union of the literals in `CUSTOMIZABLE_DOC_TYPES`. */
  export type CustomizableDocType = (typeof CUSTOMIZABLE_DOC_TYPES)[number]
  /** Union of the literals in `FIXED_DOC_TYPES`. */
  export type FixedDocType = (typeof FIXED_DOC_TYPES)[number]
  /** Any document type, customizable or fixed. */
  export type DocType = CustomizableDocType | FixedDocType
  /** Alias of `FullDocumentDetail`. */
  export type DocumentDetailResponse = FullDocumentDetail
  /**
   * Literal tuple of the status values a segment may carry
   * (see `SegmentDetailModel.status`).
   *
   * Declared `as const`, like `DocumentIndexingStatusList` and
   * `DisplayStatusList` in this file. Without the assertion the array is
   * inferred as `string[]`, which makes `SegmentStatus` plain `string` and
   * removes all checking of status literals.
   */
  export const SEGMENT_STATUS_LIST = ['waiting', 'completed', 'error', 'indexing'] as const

  /** Union of the literals in `SEGMENT_STATUS_LIST`. */
  export type SegmentStatus = (typeof SEGMENT_STATUS_LIST)[number]
  /**
   * Query parameters for listing segments. Only `limit` is required.
   *
   * NOTE(review): `page` is typed `string` while `limit` is a `number`;
   * confirm this asymmetry is intended.
   */
  export type SegmentsQuery = {
    page?: string
    limit: number
    // status?: SegmentStatus
    hit_count_gte?: number
    keyword?: string
    /** Either a boolean or the literal `'all'`. */
    enabled?: 'all' | boolean
  }
  /**
   * Full record of one segment: content, index-node info, enablement, status
   * timeline and optional QA answer / child chunks.
   */
  export type SegmentDetailModel = {
    id: string
    position: number
    document_id: string

    // --- content ---
    content: string
    sign_content: string
    word_count: number
    tokens: number
    keywords: string[]

    // --- index node ---
    index_node_id: string
    index_node_hash: string
    hit_count: number

    // --- enablement ---
    enabled: boolean
    disabled_at: number
    disabled_by: string

    // --- status timeline ---
    status: SegmentStatus
    created_by: string
    created_at: number
    indexing_at: number
    completed_at: number
    error: string | null
    stopped_at: number

    // --- optional extras ---
    answer?: string
    child_chunks?: Array<ChildChunkDetail>

    updated_at: number
  }
  /**
   * Paginated list of `SegmentDetailModel` records, including `total_pages`.
   */
  export type SegmentsResponse = {
    data: Array<SegmentDetailModel>
    // Pagination fields
    has_more: boolean
    limit: number
    total: number
    total_pages: number
    page: number
  }
  /**
   * One hit-testing history record: the content, where it came from and who
   * created it.
   */
  export type HitTestingRecord = {
    id: string
    content: string
    /** Origin of the record. */
    source:
      | 'app'
      | 'hit_testing'
      | 'plugin'
    source_app_id: string
    /** Role of the creator. */
    created_by_role:
      | 'account'
      | 'end_user'
    created_by: string
    created_at: number
  }
  /**
   * A child chunk returned by hit testing, with its position and score.
   */
  export type HitTestingChildChunk = {
    id: string
    content: string
    position: number
    score: number
  }
  /**
   * One hit-testing result: the matched segment, its score, a t-SNE position
   * and optional child chunks.
   *
   * NOTE(review): `segment` and `content` are both typed `Segment`; confirm
   * whether both are populated by the backend or one is legacy.
   */
  export type HitTesting = {
    segment: Segment
    content: Segment
    score: number
    tsne_position: TsnePosition
    /** May be absent, `null`, or a list of child chunks. */
    child_chunks?: Array<HitTestingChildChunk> | null
  }
  /**
   * One hit-testing result from an external knowledge base.
   * `metadata` carries two `x-amz-bedrock-kb-*` keys.
   */
  export type ExternalKnowledgeBaseHitTesting = {
    content: string
    title: string
    score: number
    metadata: {
      /** Source URI key. */
      'x-amz-bedrock-kb-source-uri': string
      /** Data source id key. */
      'x-amz-bedrock-kb-data-source-id': string
    }
  }
  /**
   * Segment as embedded in hit-testing results, including a summary of its
   * parent document.
   */
  export type Segment = {
    id: string
    /** Summary of the owning document (the `Document` type of this module). */
    document: Document

    // --- content ---
    content: string
    sign_content: string
    position: number
    word_count: number
    tokens: number
    keywords: string[]

    hit_count: number
    index_node_hash: string
  }
  /**
   * Document summary referenced by `Segment.document`.
   *
   * NOTE(review): inside this module the name shadows the DOM global
   * `Document` type.
   */
  export type Document = {
    id: string
    /** Typed as a plain string here rather than `DataSourceType`. */
    data_source_type: string
    name: string
    doc_type: DocType
  }
  /**
   * Paginated list of `HitTestingRecord` entries.
   */
  export type HitTestingRecordsResponse = {
    data: Array<HitTestingRecord>
    // Pagination fields
    has_more: boolean
    limit: number
    total: number
    page: number
  }
  /**
   * A 2-D point with `x` and `y` coordinates.
   */
  export type TsnePosition = {
    x: number
    y: number
  }
  /**
   * Hit-testing result: the query (content plus t-SNE position) and the
   * matching records.
   */
  export type HitTestingResponse = {
    query: {
      content: string
      tsne_position: TsnePosition
    }
    records: HitTesting[]
  }
  /**
   * Hit-testing result for an external knowledge base: the query content and
   * the matching records.
   */
  export type ExternalKnowledgeBaseHitTestingResponse = {
    query: {
      content: string
    }
    records: ExternalKnowledgeBaseHitTesting[]
  }
  /**
   * Summary of an app: identity, mode and icon details.
   */
  export type RelatedApp = {
    id: string
    name: string
    mode: AppMode

    // --- icon ---
    icon_type: AppIconType | null
    icon: string
    icon_background: string
    icon_url: string
  }
  /**
   * List of `RelatedApp` entries with a total count.
   */
  export type RelatedAppResponse = {
    data: RelatedApp[]
    total: number
  }
  /**
   * Payload for updating a segment. Only `content` is required.
   */
  export type SegmentUpdater = {
    content: string
    /** Optional answer text. */
    answer?: string
    keywords?: string[]
    /** Optional flag to regenerate child chunks. */
    regenerate_child_chunks?: boolean
  }
  /**
   * List of `IndexingStatusResponse` entries with a total count.
   */
  export type ErrorDocsResponse = {
    data: Array<IndexingStatusResponse>
    total: number
  }
  /**
   * Boolean flags describing a selection of datasets by indexing quality,
   * internal/external origin and embedding-model consistency.
   */
  export type SelectedDatasetsMode = {
    // --- indexing quality ---
    allHighQuality: boolean
    allHighQualityVectorSearch: boolean
    allHighQualityFullTextSearch: boolean
    allEconomic: boolean
    mixtureHighQualityAndEconomic: boolean

    // --- internal vs external ---
    allInternal: boolean
    allExternal: boolean
    mixtureInternalAndExternal: boolean

    // --- embedding model ---
    inconsistentEmbeddingModel: boolean
  }
  /**
   * Weighted-score presets.
   */
  export enum WeightedScoreEnum {
    /** Runtime value `'semantic_first'`. */
    SemanticFirst = 'semantic_first',
    /** Runtime value `'keyword_first'`. */
    KeywordFirst = 'keyword_first',
    /** Runtime value `'customized'`. */
    Customized = 'customized',
  }
  /**
   * Reranking modes: a reranking model or a weighted score.
   */
  export enum RerankingModeEnum {
    /** Runtime value `'reranking_model'`. */
    RerankingModel = 'reranking_model',
    /** Runtime value `'weighted_score'`. */
    WeightedScore = 'weighted_score',
  }
  /**
   * Default semantic/keyword weights per dataset-selection case.
   * The two weights in each entry sum to 1.
   */
  export const DEFAULT_WEIGHTED_SCORE = {
    // Vector search only: semantic weight only.
    allHighQualityVectorSearch: { semantic: 1.0, keyword: 0 },
    // Full-text search only: keyword weight only.
    allHighQualityFullTextSearch: { semantic: 0, keyword: 1.0 },
    // Every other case.
    other: { semantic: 0.7, keyword: 0.3 },
  }
  /** Type of a child chunk: `'automatic'` or `'customized'`. */
  export type ChildChunkType =
    | 'automatic'
    | 'customized'
  /**
   * One child chunk of a segment, linked to it through `segment_id`.
   */
  export type ChildChunkDetail = {
    id: string
    position: number
    segment_id: string
    content: string
    word_count: number
    // Timestamps
    created_at: number
    updated_at: number
    type: ChildChunkType
  }
  /**
   * Paginated list of `ChildChunkDetail` entries.
   */
  export type ChildSegmentsResponse = {
    data: Array<ChildChunkDetail>
    // Pagination fields
    total: number
    total_pages: number
    page: number
    limit: number
  }
  /**
   * Identifies a single document by dataset id and document id.
   */
  export type UpdateDocumentParams = {
    datasetId: string
    documentId: string
  }
  /**
   * Document actions. Used in api url.
   */
  export enum DocumentActionType {
    /** Runtime value `'enable'`. */
    enable = 'enable',
    /** Runtime value `'disable'`. */
    disable = 'disable',
    /** Runtime value `'archive'`. */
    archive = 'archive',
    /** Runtime value `'un_archive'` (differs from the member name). */
    unArchive = 'un_archive',
    /** Runtime value `'delete'`. */
    delete = 'delete',
  }
  /**
   * Identifies one or more documents within a dataset.
   * Only `datasetId` is required.
   */
  export type UpdateDocumentBatchParams = {
    datasetId: string
    /** Optional single document id. */
    documentId?: string
    /** Optional ids, either as an array of strings or a single string. */
    documentIds?: string | string[]
  }
  /**
   * Response of a batch import: the job id and its status string.
   */
  export type BatchImportResponse = {
    job_id: string
    /** Status as a free-form string; allowed values are not listed in this file. */
    job_status: string
  }