// datasets.ts — type definitions for datasets, documents, segments and hit testing.
  1. import type { DataSourceNotionPage } from './common'
  2. import type { AppIconType, AppMode, RetrievalConfig } from '@/types/app'
  3. import type { Tag } from '@/app/components/base/tag-management/constant'
  /**
   * Kinds of data source. The string values are the identifiers carried by
   * `data_source_type` fields elsewhere in this file.
   */
  export enum DataSourceType {
    FILE = 'upload_file',
    NOTION = 'notion_import',
    WEB = 'website_crawl',
  }
  /** Permission value of a dataset: one of three string literals. */
  export type DatasetPermission = 'only_me' | 'all_team_members' | 'partial_members'
  /**
   * A dataset record: identity and presentation fields, permission,
   * indexing/embedding/retrieval configuration, counters and tags.
   */
  export type DataSet = {
    id: string
    name: string
    icon: string
    icon_background: string
    description: string
    permission: DatasetPermission
    data_source_type: DataSourceType
    indexing_technique: 'high_quality' | 'economy'
    created_by: string
    updated_by: string
    updated_at: number
    app_count: number
    document_count: number
    word_count: number
    embedding_model: string
    embedding_model_provider: string
    embedding_available: boolean
    // NOTE(review): both retrieval fields share one type; which of them
    // callers should prefer is not visible from this file.
    retrieval_model_dict: RetrievalConfig
    retrieval_model: RetrievalConfig
    tags: Tag[]
    // NOTE(review): element shape is untyped; presumably only meaningful when
    // `permission` is 'partial_members' — confirm and replace `any`.
    partial_member_list?: any[]
  }
  /**
   * A `File` extended with optional metadata fields.
   * NOTE(review): presumably these are filled in once the file has been
   * uploaded — confirm against the upload code.
   */
  export type CustomFile = File & {
    id?: string
    extension?: string
    mime_type?: string
    created_by?: string
    created_at?: number
  }
  /**
   * Options for a website crawl.
   * `limit` and `max_depth` accept either a number or a string
   * (NOTE(review): presumably raw form-input values — confirm).
   */
  export type CrawlOptions = {
    crawl_sub_pages: boolean
    only_main_content: boolean
    includes: string
    excludes: string
    limit: number | string
    max_depth: number | string
    use_sitemap: boolean
  }
  /** One crawl result: title, description, source URL and `markdown` text. */
  export type CrawlResultItem = {
    title: string
    markdown: string
    description: string
    source_url: string
  }
  /**
   * A file paired with an id and a progress value.
   * NOTE(review): the range/meaning of `progress` is not visible here — confirm.
   */
  export type FileItem = {
    fileID: string
    file: CustomFile
    progress: number
  }
  /** Paginated list of datasets. */
  export type DataSetListResponse = {
    data: DataSet[]
    has_more: boolean
    limit: number
    page: number
    total: number
  }
  /** A question/answer pair. */
  export type QA = {
    question: string
    answer: string
  }
  /**
   * Result of an indexing estimate: token count, price with currency,
   * segment count and preview strings, plus optional Q&A previews.
   */
  export type IndexingEstimateResponse = {
    tokens: number
    total_price: number
    currency: string
    total_segments: number
    preview: string[]
    qa_preview?: QA[]
  }
  /** `IndexingEstimateResponse` with an additional `total_nodes` count. */
  export type FileIndexingEstimateResponse = {
    total_nodes: number
  } & IndexingEstimateResponse
  /**
   * Indexing status of one item: current status, per-stage timestamps and
   * segment progress counters.
   */
  export type IndexingStatusResponse = {
    id: string
    indexing_status: DocumentIndexingStatus
    processing_started_at: number
    parsing_completed_at: number
    cleaning_completed_at: number
    splitting_completed_at: number
    // NOTE(review): the four fields below are typed `any`; their real value
    // types are not visible from this file — tighten once confirmed.
    completed_at: any
    paused_at: any
    error: any
    stopped_at: any
    completed_segments: number
    total_segments: number
  }
  /** A list of indexing statuses wrapped in a `data` field. */
  export type IndexingStatusBatchResponse = {
    data: IndexingStatusResponse[]
  }
  /** Processing mode: one of two string literals. */
  export type ProcessMode = 'automatic' | 'custom'
  /** A process rule: its mode and the rules that apply. */
  export type ProcessRuleResponse = {
    mode: ProcessMode
    rules: Rules
  }
  /** Processing rules: a list of pre-processing rules and a segmentation config. */
  export type Rules = {
    pre_processing_rules: PreProcessingRule[]
    segmentation: Segmentation
  }
  /** A pre-processing rule identified by `id`, with an on/off flag. */
  export type PreProcessingRule = {
    id: string
    enabled: boolean
  }
  /** Segmentation settings: separator string, token limit and chunk overlap. */
  export type Segmentation = {
    separator: string
    max_tokens: number
    chunk_overlap: number
  }
  /** All indexing status values, as a readonly tuple of string literals. */
  export const DocumentIndexingStatusList = [
    'waiting',
    'parsing',
    'cleaning',
    'splitting',
    'indexing',
    'paused',
    'error',
    'completed',
  ] as const
  /** Union of the literals in `DocumentIndexingStatusList`. */
  export type DocumentIndexingStatus = typeof DocumentIndexingStatusList[number]
  /** All display status values, as a readonly tuple of string literals. */
  export const DisplayStatusList = [
    'queuing',
    'indexing',
    'paused',
    'error',
    'available',
    'enabled',
    'disabled',
    'archived',
  ] as const
  /** Union of the literals in `DisplayStatusList`. */
  export type DocumentDisplayStatus = typeof DisplayStatusList[number]
  /**
   * Source details attached to a document: uploaded-file metadata, an optional
   * Notion page icon, and a job id / URL.
   * NOTE(review): all fields except `notion_page_icon` are required regardless
   * of the source kind — confirm this matches the actual payloads.
   */
  export type DataSourceInfo = {
    upload_file: {
      id: string
      name: string
      size: number
      mime_type: string
      created_at: number
      created_by: string
      extension: string
    }
    notion_page_icon?: string
    job_id: string
    url: string
  }
  /**
   * Base document fields: identity, owning dataset and batch, data source,
   * creation info, indexing/display status and document form/language.
   */
  export type InitialDocumentDetail = {
    id: string
    batch: string
    position: number
    dataset_id: string
    data_source_type: DataSourceType
    data_source_info: DataSourceInfo
    dataset_process_rule_id: string
    name: string
    created_from: 'api' | 'web'
    created_by: string
    created_at: number
    indexing_status: DocumentIndexingStatus
    display_status: DocumentDisplayStatus
    completed_segments?: number
    total_segments?: number
    doc_form: 'text_model' | 'qa_model'
    doc_language: string
  }
  /**
   * `InitialDocumentDetail` plus enabled/archived flags, counters, an optional
   * error and optional uploaded-file name/extension details.
   */
  export type SimpleDocumentDetail = InitialDocumentDetail & {
    enabled: boolean
    word_count: number
    error?: string | null
    archived: boolean
    updated_at: number
    hit_count: number
    // Re-declared as optional here, but `InitialDocumentDetail` declares it as
    // required, so the property stays required on the intersection.
    dataset_process_rule_id?: string
    data_source_detail_dict?: {
      upload_file: {
        name: string
        extension: string
      }
    }
  }
  /** Paginated list of documents. */
  export type DocumentListResponse = {
    data: SimpleDocumentDetail[]
    has_more: boolean
    total: number
    page: number
    limit: number
  }
  /**
   * Fields shared by document requests: optional original document id and
   * indexing technique, document form/language, and the process rule.
   */
  export type DocumentReq = {
    original_document_id?: string
    // NOTE(review): typed as plain `string`, whereas `DataSet.indexing_technique`
    // is 'high_quality' | 'economy' — confirm whether other values are valid.
    indexing_technique?: string
    doc_form: 'text_model' | 'qa_model'
    doc_language: string
    process_rule: ProcessRule
  }
  /** `DocumentReq` plus the data source, retrieval config and embedding model. */
  export type CreateDocumentReq = DocumentReq & {
    data_source: DataSource
    retrieval_model: RetrievalConfig
    embedding_model: string
    embedding_model_provider: string
  }
  /**
   * Parameters for an indexing estimate: `DocumentReq`, any subset of the
   * top-level `DataSource` fields, and the target dataset id.
   */
  export type IndexingEstimateParams = DocumentReq & Partial<DataSource> & {
    dataset_id: string
  }
  /**
   * A data source: its type plus an `info_list` that repeats the type and
   * carries optional Notion, file and website sub-lists.
   * NOTE(review): presumably exactly one sub-list is set, matching
   * `data_source_type` — the type does not enforce this; confirm.
   */
  export type DataSource = {
    type: DataSourceType
    info_list: {
      data_source_type: DataSourceType
      notion_info_list?: NotionInfo[]
      file_info_list?: {
        file_ids: string[]
      }
      website_info_list?: {
        provider: string
        job_id: string
        urls: string[]
      }
    }
  }
  /** Notion pages grouped under a workspace id. */
  export type NotionInfo = {
    workspace_id: string
    pages: DataSourceNotionPage[]
  }
  /** Minimal Notion page reference: a page id and a type string. */
  export type NotionPage = {
    page_id: string
    type: string
  }
  /**
   * Process rule as sent in requests (see `DocumentReq.process_rule`).
   */
  export type ProcessRule = {
    // NOTE(review): typed as plain `string`, while `ProcessRuleResponse.mode`
    // uses `ProcessMode`. Left wide so existing callers keep compiling —
    // confirm whether it can be narrowed to `ProcessMode`.
    mode: string
    rules: Rules
  }
  /**
   * Result of creating documents: an optional dataset, the batch id and the
   * created documents.
   * NOTE(review): the type name is lowerCamelCase, unlike every other type in
   * this file; it is kept as-is because renaming would break importers.
   */
  export type createDocumentResponse = {
    dataset?: DataSet
    batch: string
    documents: InitialDocumentDetail[]
  }
  /**
   * `SimpleDocumentDetail` plus per-stage timestamps, pause/stop/disable/archive
   * info, token and latency figures, document type/metadata and segment count.
   */
  export type FullDocumentDetail = SimpleDocumentDetail & {
    // Already declared with the same type on `InitialDocumentDetail`.
    batch: string
    created_api_request_id: string
    processing_started_at: number
    parsing_completed_at: number
    cleaning_completed_at: number
    splitting_completed_at: number
    tokens: number
    indexing_latency: number
    completed_at: number
    paused_by: string
    paused_at: number
    stopped_at: number
    // Declared as `string` here, but the intersection with
    // `InitialDocumentDetail` keeps it as `DocumentIndexingStatus`.
    indexing_status: string
    disabled_at: number
    disabled_by: string
    archived_reason: 'rule_modified' | 're_upload'
    archived_by: string
    archived_at: number
    doc_type?: DocType | null | 'others'
    doc_metadata?: DocMetadata | null
    segment_count: number
    // Allows arbitrary extra keys; accessing an undeclared key yields `any`.
    [key: string]: any
  }
  /**
   * Document metadata: a fixed set of string fields, plus an index signature
   * that allows any additional string-valued key.
   */
  export type DocMetadata = {
    title: string
    language: string
    author: string
    publisher: string
    publicationDate: string
    ISBN: string
    category: string
    [key: string]: string
  }
  /** Document types listed as customizable, as a readonly tuple of literals. */
  export const CUSTOMIZABLE_DOC_TYPES = [
    'book',
    'web_page',
    'paper',
    'social_media_post',
    'personal_document',
    'business_document',
    'im_chat_log',
  ] as const
  /** Document types listed as fixed, as a readonly tuple of literals. */
  export const FIXED_DOC_TYPES = ['synced_from_github', 'synced_from_notion', 'wikipedia_entry'] as const
  /** Union of the literals in `CUSTOMIZABLE_DOC_TYPES`. */
  export type CustomizableDocType = typeof CUSTOMIZABLE_DOC_TYPES[number]
  /** Union of the literals in `FIXED_DOC_TYPES`. */
  export type FixedDocType = typeof FIXED_DOC_TYPES[number]
  /** Any document type, customizable or fixed. */
  export type DocType = CustomizableDocType | FixedDocType
  /** Alias of `FullDocumentDetail`. */
  export type DocumentDetailResponse = FullDocumentDetail
  /**
   * All segment status values, as a readonly tuple of string literals.
   *
   * `as const` is required: without it the array is inferred as `string[]`,
   * which makes `SegmentStatus` plain `string` instead of a literal union.
   * This matches the other list/union pairs in this file.
   */
  export const SEGMENT_STATUS_LIST = ['waiting', 'completed', 'error', 'indexing'] as const
  /** Union of the literals in `SEGMENT_STATUS_LIST`. */
  export type SegmentStatus = typeof SEGMENT_STATUS_LIST[number]
  /**
   * Query parameters for listing segments: a required `limit` plus optional
   * cursor and filters.
   */
  export type SegmentsQuery = {
    last_id?: string
    limit: number
    // Status filter is commented out in the original and kept disabled here.
    // status?: SegmentStatus
    hit_count_gte?: number
    keyword?: string
    enabled?: boolean
  }
  /**
   * Full segment record: content and counters, index node id/hash,
   * enabled/disabled state, status with timestamps, and an optional answer.
   */
  export type SegmentDetailModel = {
    id: string
    position: number
    document_id: string
    content: string
    word_count: number
    tokens: number
    keywords: string[]
    index_node_id: string
    index_node_hash: string
    hit_count: number
    enabled: boolean
    disabled_at: number
    disabled_by: string
    status: SegmentStatus
    created_by: string
    created_at: number
    indexing_at: number
    completed_at: number
    error: string | null
    stopped_at: number
    // NOTE(review): presumably only set for Q&A-form documents — confirm.
    answer?: string
  }
  /** List of segments with `has_more`, `limit` and `total`. */
  export type SegmentsResponse = {
    data: SegmentDetailModel[]
    has_more: boolean
    limit: number
    total: number
  }
  /** A hit-testing record: content, its source and who created it. */
  export type HitTestingRecord = {
    id: string
    content: string
    source: 'app' | 'hit_testing' | 'plugin'
    source_app_id: string
    created_by_role: 'account' | 'end_user'
    created_by: string
    created_at: number
  }
  /** One hit-testing result: a segment, its score and a t-SNE position. */
  export type HitTesting = {
    segment: Segment
    score: number
    tsne_position: TsnePosition
  }
  /** Segment as embedded in a hit-testing result, including its document. */
  export type Segment = {
    id: string
    document: Document
    content: string
    position: number
    word_count: number
    tokens: number
    keywords: string[]
    hit_count: number
    index_node_hash: string
  }
  /**
   * Document summary referenced from `Segment`.
   * NOTE(review): within this module the name shadows the global DOM
   * `Document` type; kept because renaming would break importers.
   */
  export type Document = {
    id: string
    data_source_type: string
    name: string
    doc_type: DocType
  }
  /** Paginated list of hit-testing records. */
  export type HitTestingRecordsResponse = {
    data: HitTestingRecord[]
    has_more: boolean
    limit: number
    total: number
    page: number
  }
  /** A 2-D position given by `x` and `y`. */
  export type TsnePosition = {
    x: number
    y: number
  }
  /** Hit-testing result: the query (content and position) and matching records. */
  export type HitTestingResponse = {
    query: {
      content: string
      tsne_position: TsnePosition
    }
    records: Array<HitTesting>
  }
  /** An app related to a dataset: id, name, mode and icon fields. */
  export type RelatedApp = {
    id: string
    name: string
    mode: AppMode
    icon_type: AppIconType | null
    icon: string
    icon_background: string
    icon_url: string
  }
  /** List of related apps with a total count. */
  export type RelatedAppResponse = {
    data: Array<RelatedApp>
    total: number
  }
  /** Fields for updating a segment: required content, optional answer and keywords. */
  export type SegmentUpdater = {
    content: string
    answer?: string
    keywords?: string[]
  }
  /**
   * Document form. The values are the same strings used by the `doc_form`
   * literal unions elsewhere in this file.
   */
  export enum DocForm {
    TEXT = 'text_model',
    QA = 'qa_model',
  }
  /** List of indexing statuses with a total count. */
  export type ErrorDocsResponse = {
    data: IndexingStatusResponse[]
    total: number
  }
  /**
   * Boolean flags describing a selection of datasets.
   * NOTE(review): presumably derived from the indexing technique, search
   * method and embedding model of the selected datasets — confirm in the
   * code that computes them.
   */
  export type SelectedDatasetsMode = {
    allHighQuality: boolean
    allHighQualityVectorSearch: boolean
    allHighQualityFullTextSearch: boolean
    allEconomic: boolean
    mixtureHighQualityAndEconomic: boolean
    inconsistentEmbeddingModel: boolean
  }
  /** Weighted-score presets, as string identifiers. */
  export enum WeightedScoreEnum {
    SemanticFirst = 'semantic_first',
    KeywordFirst = 'keyword_first',
    Customized = 'customized',
  }
  /** Reranking modes, as string identifiers. */
  export enum RerankingModeEnum {
    RerankingModel = 'reranking_model',
    WeightedScore = 'weighted_score',
  }
  /**
   * Default semantic/keyword weight pairs, keyed by scenario.
   * `semanticFirst` and `other` carry the same weights.
   */
  export const DEFAULT_WEIGHTED_SCORE = {
    allHighQualityVectorSearch: {
      semantic: 1.0,
      keyword: 0,
    },
    allHighQualityFullTextSearch: {
      semantic: 0,
      keyword: 1.0,
    },
    semanticFirst: {
      semantic: 0.7,
      keyword: 0.3,
    },
    keywordFirst: {
      semantic: 0.3,
      keyword: 0.7,
    },
    other: {
      semantic: 0.7,
      keyword: 0.3,
    },
  }