// datasets.ts — shared types, enums and constants for datasets, documents, segments and hit testing.
import type { DataSourceNotionPage, DataSourceProvider } from './common'
import type { AppIconType, AppMode, RetrievalConfig } from '@/types/app'
import type { Tag } from '@/app/components/base/tag-management/constant'
import type { IndexingType } from '@/app/components/datasets/create/step-two'
import type { MetadataFilteringVariableType } from '@/app/components/workflow/nodes/knowledge-retrieval/types'
import type { MetadataItemWithValue } from '@/app/components/datasets/metadata/types'
  7. export enum DataSourceType {
  8. FILE = 'upload_file',
  9. NOTION = 'notion_import',
  10. WEB = 'website_crawl',
  11. }
  12. export enum DatasetPermission {
  13. onlyMe = 'only_me',
  14. allTeamMembers = 'all_team_members',
  15. partialMembers = 'partial_members',
  16. }
  17. export enum ChunkingMode {
  18. text = 'text_model', // General text
  19. qa = 'qa_model', // General QA
  20. parentChild = 'hierarchical_model', // Parent-Child
  21. }
  22. export type MetadataInDoc = {
  23. value: string
  24. id: string
  25. type: MetadataFilteringVariableType
  26. name: string
  27. }
  28. export type DataSet = {
  29. id: string
  30. name: string
  31. icon: string
  32. icon_background: string
  33. description: string
  34. permission: DatasetPermission
  35. data_source_type: DataSourceType
  36. indexing_technique: IndexingType
  37. created_by: string
  38. updated_by: string
  39. updated_at: number
  40. app_count: number
  41. doc_form: ChunkingMode
  42. document_count: number
  43. word_count: number
  44. provider: string
  45. embedding_model: string
  46. embedding_model_provider: string
  47. embedding_available: boolean
  48. retrieval_model_dict: RetrievalConfig
  49. retrieval_model: RetrievalConfig
  50. tags: Tag[]
  51. partial_member_list?: string[]
  52. external_knowledge_info: {
  53. external_knowledge_id: string
  54. external_knowledge_api_id: string
  55. external_knowledge_api_name: string
  56. external_knowledge_api_endpoint: string
  57. }
  58. external_retrieval_model: {
  59. top_k: number
  60. score_threshold: number
  61. score_threshold_enabled: boolean
  62. }
  63. built_in_field_enabled: boolean
  64. doc_metadata?: MetadataInDoc[]
  65. }
  66. export type ExternalAPIItem = {
  67. id: string
  68. tenant_id: string
  69. name: string
  70. description: string
  71. settings: {
  72. endpoint: string
  73. api_key: string
  74. }
  75. dataset_bindings: { id: string; name: string }[]
  76. created_by: string
  77. created_at: string
  78. }
  79. export type ExternalKnowledgeItem = {
  80. id: string
  81. name: string
  82. description: string | null
  83. provider: 'external'
  84. permission: DatasetPermission
  85. data_source_type: null
  86. indexing_technique: null
  87. app_count: number
  88. document_count: number
  89. word_count: number
  90. created_by: string
  91. created_at: string
  92. updated_by: string
  93. updated_at: string
  94. tags: Tag[]
  95. }
  96. export type ExternalAPIDeleteResponse = {
  97. result: 'success' | 'error'
  98. }
  99. export type ExternalAPIUsage = {
  100. is_using: boolean
  101. count: number
  102. }
  103. export type CustomFile = File & {
  104. id?: string
  105. extension?: string
  106. mime_type?: string
  107. created_by?: string
  108. created_at?: number
  109. }
  110. export type DocumentItem = {
  111. id: string
  112. name: string
  113. extension: string
  114. }
  115. export type CrawlOptions = {
  116. crawl_sub_pages: boolean
  117. only_main_content: boolean
  118. includes: string
  119. excludes: string
  120. limit: number | string
  121. max_depth: number | string
  122. use_sitemap: boolean
  123. }
  124. export type CrawlResultItem = {
  125. title: string
  126. markdown: string
  127. description: string
  128. source_url: string
  129. }
  130. export type FileItem = {
  131. fileID: string
  132. file: CustomFile
  133. progress: number
  134. }
  135. export type FetchDatasetsParams = {
  136. url: string
  137. params: {
  138. page: number
  139. ids?: string[]
  140. tag_ids?: string[]
  141. limit?: number
  142. include_all?: boolean
  143. keyword?: string
  144. }
  145. }
  146. export type DataSetListResponse = {
  147. data: DataSet[]
  148. has_more: boolean
  149. limit: number
  150. page: number
  151. total: number
  152. }
  153. export type ExternalAPIListResponse = {
  154. data: ExternalAPIItem[]
  155. has_more: boolean
  156. limit: number
  157. page: number
  158. total: number
  159. }
  160. export type QA = {
  161. question: string
  162. answer: string
  163. }
  164. export type IndexingEstimateResponse = {
  165. tokens: number
  166. total_price: number
  167. currency: string
  168. total_segments: number
  169. preview: Array<{ content: string; child_chunks: string[] }>
  170. qa_preview?: QA[]
  171. }
  172. export type FileIndexingEstimateResponse = {
  173. total_nodes: number
  174. } & IndexingEstimateResponse
  175. export type IndexingStatusResponse = {
  176. id: string
  177. indexing_status: DocumentIndexingStatus
  178. processing_started_at: number
  179. parsing_completed_at: number
  180. cleaning_completed_at: number
  181. splitting_completed_at: number
  182. completed_at: any
  183. paused_at: any
  184. error: any
  185. stopped_at: any
  186. completed_segments: number
  187. total_segments: number
  188. }
  189. export type IndexingStatusBatchResponse = {
  190. data: IndexingStatusResponse[]
  191. }
  192. export enum ProcessMode {
  193. general = 'custom',
  194. parentChild = 'hierarchical',
  195. }
  196. export type ParentMode = 'full-doc' | 'paragraph'
  197. export type ProcessRuleResponse = {
  198. mode: ProcessMode
  199. rules: Rules
  200. limits: Limits
  201. }
  202. export type Rules = {
  203. pre_processing_rules: PreProcessingRule[]
  204. segmentation: Segmentation
  205. parent_mode: ParentMode
  206. subchunk_segmentation: Segmentation
  207. }
  208. export type Limits = {
  209. indexing_max_segmentation_tokens_length: number
  210. }
  211. export type PreProcessingRule = {
  212. id: string
  213. enabled: boolean
  214. }
  215. export type Segmentation = {
  216. separator: string
  217. max_tokens: number
  218. chunk_overlap?: number
  219. }
  220. export const DocumentIndexingStatusList = [
  221. 'waiting',
  222. 'parsing',
  223. 'cleaning',
  224. 'splitting',
  225. 'indexing',
  226. 'paused',
  227. 'error',
  228. 'completed',
  229. ] as const
  230. export type DocumentIndexingStatus = typeof DocumentIndexingStatusList[number]
  231. export const DisplayStatusList = [
  232. 'queuing',
  233. 'indexing',
  234. 'paused',
  235. 'error',
  236. 'available',
  237. 'enabled',
  238. 'disabled',
  239. 'archived',
  240. ] as const
  241. export type DocumentDisplayStatus = typeof DisplayStatusList[number]
  242. export type DataSourceInfo = {
  243. upload_file: {
  244. id: string
  245. name: string
  246. size: number
  247. mime_type: string
  248. created_at: number
  249. created_by: string
  250. extension: string
  251. }
  252. notion_page_icon?: string
  253. notion_workspace_id?: string
  254. notion_page_id?: string
  255. provider?: DataSourceProvider
  256. job_id: string
  257. url: string
  258. }
  259. export type InitialDocumentDetail = {
  260. id: string
  261. batch: string
  262. position: number
  263. dataset_id: string
  264. data_source_type: DataSourceType
  265. data_source_info: DataSourceInfo
  266. dataset_process_rule_id: string
  267. name: string
  268. created_from: 'api' | 'web'
  269. created_by: string
  270. created_at: number
  271. indexing_status: DocumentIndexingStatus
  272. display_status: DocumentDisplayStatus
  273. completed_segments?: number
  274. total_segments?: number
  275. doc_form: ChunkingMode
  276. doc_language: string
  277. }
  278. export type SimpleDocumentDetail = InitialDocumentDetail & {
  279. enabled: boolean
  280. word_count: number
  281. is_qa: boolean // TODO waiting for backend to add this field
  282. error?: string | null
  283. archived: boolean
  284. updated_at: number
  285. hit_count: number
  286. dataset_process_rule_id?: string
  287. data_source_detail_dict?: {
  288. upload_file: {
  289. name: string
  290. extension: string
  291. }
  292. }
  293. doc_metadata?: MetadataItemWithValue[]
  294. }
  295. export type DocumentListResponse = {
  296. data: SimpleDocumentDetail[]
  297. has_more: boolean
  298. total: number
  299. page: number
  300. limit: number
  301. }
  302. export type DocumentReq = {
  303. original_document_id?: string
  304. indexing_technique?: string
  305. doc_form: ChunkingMode
  306. doc_language: string
  307. process_rule: ProcessRule
  308. }
  309. export type CreateDocumentReq = DocumentReq & {
  310. data_source: DataSource
  311. retrieval_model: RetrievalConfig
  312. embedding_model: string
  313. embedding_model_provider: string
  314. }
  315. export type IndexingEstimateParams = DocumentReq & Partial<DataSource> & {
  316. dataset_id: string
  317. }
  318. export type DataSource = {
  319. type: DataSourceType
  320. info_list: {
  321. data_source_type: DataSourceType
  322. notion_info_list?: NotionInfo[]
  323. file_info_list?: {
  324. file_ids: string[]
  325. }
  326. website_info_list?: {
  327. provider: string
  328. job_id: string
  329. urls: string[]
  330. }
  331. }
  332. }
  333. export type NotionInfo = {
  334. workspace_id: string
  335. pages: DataSourceNotionPage[]
  336. }
  337. export type NotionPage = {
  338. page_id: string
  339. type: string
  340. }
  341. export type ProcessRule = {
  342. mode: ProcessMode
  343. rules: Rules
  344. }
  345. export type createDocumentResponse = {
  346. dataset?: DataSet
  347. batch: string
  348. documents: InitialDocumentDetail[]
  349. }
  350. export type PrecessRule = {
  351. mode: ProcessMode
  352. rules: Rules
  353. }
  354. export type FullDocumentDetail = SimpleDocumentDetail & {
  355. batch: string
  356. created_api_request_id: string
  357. processing_started_at: number
  358. parsing_completed_at: number
  359. cleaning_completed_at: number
  360. splitting_completed_at: number
  361. tokens: number
  362. indexing_latency: number
  363. completed_at: number
  364. paused_by: string
  365. paused_at: number
  366. stopped_at: number
  367. indexing_status: string
  368. disabled_at: number
  369. disabled_by: string
  370. archived_reason: 'rule_modified' | 're_upload'
  371. archived_by: string
  372. archived_at: number
  373. doc_type?: DocType | null | 'others'
  374. doc_metadata?: DocMetadata | null
  375. segment_count: number
  376. dataset_process_rule: PrecessRule
  377. document_process_rule: ProcessRule
  378. [key: string]: any
  379. }
  380. export type DocMetadata = {
  381. title: string
  382. language: string
  383. author: string
  384. publisher: string
  385. publicationDate: string
  386. ISBN: string
  387. category: string
  388. [key: string]: string
  389. }
  390. export const CUSTOMIZABLE_DOC_TYPES = [
  391. 'book',
  392. 'web_page',
  393. 'paper',
  394. 'social_media_post',
  395. 'personal_document',
  396. 'business_document',
  397. 'im_chat_log',
  398. ] as const
  399. export const FIXED_DOC_TYPES = ['synced_from_github', 'synced_from_notion', 'wikipedia_entry'] as const
  400. export type CustomizableDocType = typeof CUSTOMIZABLE_DOC_TYPES[number]
  401. export type FixedDocType = typeof FIXED_DOC_TYPES[number]
  402. export type DocType = CustomizableDocType | FixedDocType
  403. export type DocumentDetailResponse = FullDocumentDetail
  404. export const SEGMENT_STATUS_LIST = ['waiting', 'completed', 'error', 'indexing']
  405. export type SegmentStatus = typeof SEGMENT_STATUS_LIST[number]
  406. export type SegmentsQuery = {
  407. page?: string
  408. limit: number
  409. // status?: SegmentStatus
  410. hit_count_gte?: number
  411. keyword?: string
  412. enabled?: boolean | 'all'
  413. }
  414. export type SegmentDetailModel = {
  415. id: string
  416. position: number
  417. document_id: string
  418. content: string
  419. sign_content: string
  420. word_count: number
  421. tokens: number
  422. keywords: string[]
  423. index_node_id: string
  424. index_node_hash: string
  425. hit_count: number
  426. enabled: boolean
  427. disabled_at: number
  428. disabled_by: string
  429. status: SegmentStatus
  430. created_by: string
  431. created_at: number
  432. indexing_at: number
  433. completed_at: number
  434. error: string | null
  435. stopped_at: number
  436. answer?: string
  437. child_chunks?: ChildChunkDetail[]
  438. updated_at: number
  439. }
  440. export type SegmentsResponse = {
  441. data: SegmentDetailModel[]
  442. has_more: boolean
  443. limit: number
  444. total: number
  445. total_pages: number
  446. page: number
  447. }
  448. export type HitTestingRecord = {
  449. id: string
  450. content: string
  451. source: 'app' | 'hit_testing' | 'plugin'
  452. source_app_id: string
  453. created_by_role: 'account' | 'end_user'
  454. created_by: string
  455. created_at: number
  456. }
  457. export type HitTestingChildChunk = {
  458. id: string
  459. content: string
  460. position: number
  461. score: number
  462. }
  463. export type HitTesting = {
  464. segment: Segment
  465. content: Segment
  466. score: number
  467. tsne_position: TsnePosition
  468. child_chunks?: HitTestingChildChunk[] | null
  469. }
  470. export type ExternalKnowledgeBaseHitTesting = {
  471. content: string
  472. title: string
  473. score: number
  474. metadata: {
  475. 'x-amz-bedrock-kb-source-uri': string
  476. 'x-amz-bedrock-kb-data-source-id': string
  477. }
  478. }
  479. export type Segment = {
  480. id: string
  481. document: Document
  482. content: string
  483. sign_content: string
  484. position: number
  485. word_count: number
  486. tokens: number
  487. keywords: string[]
  488. hit_count: number
  489. index_node_hash: string
  490. }
  491. export type Document = {
  492. id: string
  493. data_source_type: string
  494. name: string
  495. doc_type: DocType
  496. }
  497. export type HitTestingRecordsResponse = {
  498. data: HitTestingRecord[]
  499. has_more: boolean
  500. limit: number
  501. total: number
  502. page: number
  503. }
  504. export type TsnePosition = {
  505. x: number
  506. y: number
  507. }
  508. export type HitTestingResponse = {
  509. query: {
  510. content: string
  511. tsne_position: TsnePosition
  512. }
  513. records: Array<HitTesting>
  514. }
  515. export type ExternalKnowledgeBaseHitTestingResponse = {
  516. query: {
  517. content: string
  518. }
  519. records: Array<ExternalKnowledgeBaseHitTesting>
  520. }
  521. export type RelatedApp = {
  522. id: string
  523. name: string
  524. mode: AppMode
  525. icon_type: AppIconType | null
  526. icon: string
  527. icon_background: string
  528. icon_url: string
  529. }
  530. export type RelatedAppResponse = {
  531. data: Array<RelatedApp>
  532. total: number
  533. }
  534. export type SegmentUpdater = {
  535. content: string
  536. answer?: string
  537. keywords?: string[]
  538. regenerate_child_chunks?: boolean
  539. }
  540. export type ErrorDocsResponse = {
  541. data: IndexingStatusResponse[]
  542. total: number
  543. }
  544. export type SelectedDatasetsMode = {
  545. allHighQuality: boolean
  546. allHighQualityVectorSearch: boolean
  547. allHighQualityFullTextSearch: boolean
  548. allEconomic: boolean
  549. mixtureHighQualityAndEconomic: boolean
  550. allInternal: boolean
  551. allExternal: boolean
  552. mixtureInternalAndExternal: boolean
  553. inconsistentEmbeddingModel: boolean
  554. }
  555. export enum WeightedScoreEnum {
  556. SemanticFirst = 'semantic_first',
  557. KeywordFirst = 'keyword_first',
  558. Customized = 'customized',
  559. }
  560. export enum RerankingModeEnum {
  561. RerankingModel = 'reranking_model',
  562. WeightedScore = 'weighted_score',
  563. }
  564. export const DEFAULT_WEIGHTED_SCORE = {
  565. allHighQualityVectorSearch: {
  566. semantic: 1.0,
  567. keyword: 0,
  568. },
  569. allHighQualityFullTextSearch: {
  570. semantic: 0,
  571. keyword: 1.0,
  572. },
  573. other: {
  574. semantic: 0.7,
  575. keyword: 0.3,
  576. },
  577. }
  578. export type ChildChunkType = 'automatic' | 'customized'
  579. export type ChildChunkDetail = {
  580. id: string
  581. position: number
  582. segment_id: string
  583. content: string
  584. word_count: number
  585. created_at: number
  586. updated_at: number
  587. type: ChildChunkType
  588. }
  589. export type ChildSegmentsResponse = {
  590. data: ChildChunkDetail[]
  591. total: number
  592. total_pages: number
  593. page: number
  594. limit: number
  595. }
  596. export type UpdateDocumentParams = {
  597. datasetId: string
  598. documentId: string
  599. }
  600. // Used in api url
  601. export enum DocumentActionType {
  602. enable = 'enable',
  603. disable = 'disable',
  604. archive = 'archive',
  605. unArchive = 'un_archive',
  606. delete = 'delete',
  607. }
  608. export type UpdateDocumentBatchParams = {
  609. datasetId: string
  610. documentId?: string
  611. documentIds?: string[] | string
  612. }
  613. export type BatchImportResponse = {
  614. job_id: string
  615. job_status: string
  616. }