datasets_document.py

import logging
from argparse import ArgumentTypeError
from datetime import datetime, timezone

from flask import request
from flask_login import current_user
from flask_restful import Resource, fields, marshal, marshal_with, reqparse
from sqlalchemy import asc, desc
from transformers.hf_argparser import string_to_bool
from werkzeug.exceptions import Forbidden, NotFound

import services
from controllers.console import api
from controllers.console.app.error import (
    ProviderModelCurrentlyNotSupportError,
    ProviderNotInitializeError,
    ProviderQuotaExceededError,
)
from controllers.console.datasets.error import (
    ArchivedDocumentImmutableError,
    DocumentAlreadyFinishedError,
    DocumentIndexingError,
    InvalidActionError,
    InvalidMetadataError,
)
from controllers.console.setup import setup_required
from controllers.console.wraps import account_initialization_required, cloud_edition_billing_resource_check
from core.errors.error import (
    LLMBadRequestError,
    ModelCurrentlyNotSupportError,
    ProviderTokenNotInitError,
    QuotaExceededError,
)
from core.indexing_runner import IndexingRunner
from core.model_manager import ModelManager
from core.model_runtime.entities.model_entities import ModelType
from core.model_runtime.errors.invoke import InvokeAuthorizationError
from core.rag.extractor.entity.extract_setting import ExtractSetting
from extensions.ext_database import db
from extensions.ext_redis import redis_client
from fields.document_fields import (
    dataset_and_document_fields,
    document_fields,
    document_status_fields,
    document_with_segments_fields,
)
from libs.login import login_required
from models.dataset import Dataset, DatasetProcessRule, Document, DocumentSegment
from models.model import UploadFile
from services.dataset_service import DatasetService, DocumentService
from tasks.add_document_to_index_task import add_document_to_index_task
from tasks.remove_document_from_index_task import remove_document_from_index_task


class DocumentResource(Resource):
    def get_document(self, dataset_id: str, document_id: str) -> Document:
        dataset = DatasetService.get_dataset(dataset_id)
        if not dataset:
            raise NotFound('Dataset not found.')

        try:
            DatasetService.check_dataset_permission(dataset, current_user)
        except services.errors.account.NoPermissionError as e:
            raise Forbidden(str(e))

        document = DocumentService.get_document(dataset_id, document_id)

        if not document:
            raise NotFound('Document not found.')

        if document.tenant_id != current_user.current_tenant_id:
            raise Forbidden('No permission.')

        return document

    def get_batch_documents(self, dataset_id: str, batch: str) -> list[Document]:
        dataset = DatasetService.get_dataset(dataset_id)
        if not dataset:
            raise NotFound('Dataset not found.')

        try:
            DatasetService.check_dataset_permission(dataset, current_user)
        except services.errors.account.NoPermissionError as e:
            raise Forbidden(str(e))

        documents = DocumentService.get_batch_documents(dataset_id, batch)

        if not documents:
            raise NotFound('Documents not found.')

        return documents


class GetProcessRuleApi(Resource):
    @setup_required
    @login_required
    @account_initialization_required
    def get(self):
        req_data = request.args

        document_id = req_data.get('document_id')

        # get default rules
        mode = DocumentService.DEFAULT_RULES['mode']
        rules = DocumentService.DEFAULT_RULES['rules']
        if document_id:
            # get the latest process rule
            document = Document.query.get_or_404(document_id)

            dataset = DatasetService.get_dataset(document.dataset_id)

            if not dataset:
                raise NotFound('Dataset not found.')

            try:
                DatasetService.check_dataset_permission(dataset, current_user)
            except services.errors.account.NoPermissionError as e:
                raise Forbidden(str(e))

            # get the latest process rule
            dataset_process_rule = db.session.query(DatasetProcessRule). \
                filter(DatasetProcessRule.dataset_id == document.dataset_id). \
                order_by(DatasetProcessRule.created_at.desc()). \
                limit(1). \
                one_or_none()
            if dataset_process_rule:
                mode = dataset_process_rule.mode
                rules = dataset_process_rule.rules_dict

        return {
            'mode': mode,
            'rules': rules
        }


class DatasetDocumentListApi(Resource):
    @setup_required
    @login_required
    @account_initialization_required
    def get(self, dataset_id):
        dataset_id = str(dataset_id)
        page = request.args.get('page', default=1, type=int)
        limit = request.args.get('limit', default=20, type=int)
        search = request.args.get('keyword', default=None, type=str)
        sort = request.args.get('sort', default='-created_at', type=str)
        # "yes", "true", "t", "y" and "1" are converted to True; anything else to False.
        try:
            fetch = string_to_bool(request.args.get('fetch', default='false'))
        except (ArgumentTypeError, ValueError):
            fetch = False

        dataset = DatasetService.get_dataset(dataset_id)
        if not dataset:
            raise NotFound('Dataset not found.')

        try:
            DatasetService.check_dataset_permission(dataset, current_user)
        except services.errors.account.NoPermissionError as e:
            raise Forbidden(str(e))

        query = Document.query.filter_by(
            dataset_id=str(dataset_id), tenant_id=current_user.current_tenant_id)

        if search:
            search = f'%{search}%'
            query = query.filter(Document.name.like(search))

        if sort.startswith('-'):
            sort_logic = desc
            sort = sort[1:]
        else:
            sort_logic = asc

        if sort == 'hit_count':
            sub_query = db.select(DocumentSegment.document_id,
                                  db.func.sum(DocumentSegment.hit_count).label("total_hit_count")) \
                .group_by(DocumentSegment.document_id) \
                .subquery()

            query = query.outerjoin(sub_query, sub_query.c.document_id == Document.id) \
                .order_by(sort_logic(db.func.coalesce(sub_query.c.total_hit_count, 0)))
        elif sort == 'created_at':
            query = query.order_by(sort_logic(Document.created_at))
        else:
            query = query.order_by(desc(Document.created_at))

        paginated_documents = query.paginate(
            page=page, per_page=limit, max_per_page=100, error_out=False)
        documents = paginated_documents.items
        if fetch:
            for document in documents:
                completed_segments = DocumentSegment.query.filter(DocumentSegment.completed_at.isnot(None),
                                                                  DocumentSegment.document_id == str(document.id),
                                                                  DocumentSegment.status != 're_segment').count()
                total_segments = DocumentSegment.query.filter(DocumentSegment.document_id == str(document.id),
                                                              DocumentSegment.status != 're_segment').count()
                document.completed_segments = completed_segments
                document.total_segments = total_segments
            data = marshal(documents, document_with_segments_fields)
        else:
            data = marshal(documents, document_fields)
        response = {
            'data': data,
            'has_more': len(documents) == limit,
            'limit': limit,
            'total': paginated_documents.total,
            'page': page
        }

        return response
    documents_and_batch_fields = {
        'documents': fields.List(fields.Nested(document_fields)),
        'batch': fields.String
    }

    @setup_required
    @login_required
    @account_initialization_required
    @marshal_with(documents_and_batch_fields)
    @cloud_edition_billing_resource_check('vector_space')
    def post(self, dataset_id):
        dataset_id = str(dataset_id)

        dataset = DatasetService.get_dataset(dataset_id)

        if not dataset:
            raise NotFound('Dataset not found.')

        # The current user's role in the tenant-account (ta) table must be admin, owner, or editor
        if not current_user.is_editor:
            raise Forbidden()

        try:
            DatasetService.check_dataset_permission(dataset, current_user)
        except services.errors.account.NoPermissionError as e:
            raise Forbidden(str(e))

        parser = reqparse.RequestParser()
        parser.add_argument('indexing_technique', type=str, choices=Dataset.INDEXING_TECHNIQUE_LIST, nullable=False,
                            location='json')
        parser.add_argument('data_source', type=dict, required=False, location='json')
        parser.add_argument('process_rule', type=dict, required=False, location='json')
        parser.add_argument('duplicate', type=bool, default=True, nullable=False, location='json')
        parser.add_argument('original_document_id', type=str, required=False, location='json')
        parser.add_argument('doc_form', type=str, default='text_model', required=False, nullable=False, location='json')
        parser.add_argument('doc_language', type=str, default='English', required=False, nullable=False,
                            location='json')
        parser.add_argument('retrieval_model', type=dict, required=False, nullable=False,
                            location='json')
        args = parser.parse_args()

        if not dataset.indexing_technique and not args['indexing_technique']:
            raise ValueError('indexing_technique is required.')

        # validate args
        DocumentService.document_create_args_validate(args)

        try:
            documents, batch = DocumentService.save_document_with_dataset_id(dataset, args, current_user)
        except ProviderTokenNotInitError as ex:
            raise ProviderNotInitializeError(ex.description)
        except QuotaExceededError:
            raise ProviderQuotaExceededError()
        except ModelCurrentlyNotSupportError:
            raise ProviderModelCurrentlyNotSupportError()

        return {
            'documents': documents,
            'batch': batch
        }
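

# Illustrative request body for the document-creation endpoint above (DatasetDocumentListApi.post),
# using only the parameters its parser declares. 'high_quality' is one valid indexing technique
# (see its use in DatasetInitApi below); the exact shapes of 'data_source' and 'process_rule' are
# validated by DocumentService.document_create_args_validate and are not defined in this module,
# so the ellipses below are placeholders, not real payloads.
#
#   POST /datasets/<dataset_id>/documents
#   {
#       "indexing_technique": "high_quality",
#       "doc_form": "text_model",
#       "doc_language": "English",
#       "duplicate": true,
#       "data_source": {...},
#       "process_rule": {...}
#   }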


class DatasetInitApi(Resource):
    @setup_required
    @login_required
    @account_initialization_required
    @marshal_with(dataset_and_document_fields)
    @cloud_edition_billing_resource_check('vector_space')
    def post(self):
        # The current user's role in the tenant-account (ta) table must be admin, owner, or editor
        if not current_user.is_editor:
            raise Forbidden()

        parser = reqparse.RequestParser()
        parser.add_argument('indexing_technique', type=str, choices=Dataset.INDEXING_TECHNIQUE_LIST, required=True,
                            nullable=False, location='json')
        parser.add_argument('data_source', type=dict, required=True, nullable=True, location='json')
        parser.add_argument('process_rule', type=dict, required=True, nullable=True, location='json')
        parser.add_argument('doc_form', type=str, default='text_model', required=False, nullable=False, location='json')
        parser.add_argument('doc_language', type=str, default='English', required=False, nullable=False,
                            location='json')
        parser.add_argument('retrieval_model', type=dict, required=False, nullable=False,
                            location='json')
        args = parser.parse_args()

        if args['indexing_technique'] == 'high_quality':
            try:
                model_manager = ModelManager()
                model_manager.get_default_model_instance(
                    tenant_id=current_user.current_tenant_id,
                    model_type=ModelType.TEXT_EMBEDDING
                )
            except InvokeAuthorizationError:
                raise ProviderNotInitializeError(
                    "No Embedding Model available. Please configure a valid provider "
                    "in the Settings -> Model Provider.")
            except ProviderTokenNotInitError as ex:
                raise ProviderNotInitializeError(ex.description)

        # validate args
        DocumentService.document_create_args_validate(args)

        try:
            dataset, documents, batch = DocumentService.save_document_without_dataset_id(
                tenant_id=current_user.current_tenant_id,
                document_data=args,
                account=current_user
            )
        except ProviderTokenNotInitError as ex:
            raise ProviderNotInitializeError(ex.description)
        except QuotaExceededError:
            raise ProviderQuotaExceededError()
        except ModelCurrentlyNotSupportError:
            raise ProviderModelCurrentlyNotSupportError()

        response = {
            'dataset': dataset,
            'documents': documents,
            'batch': batch
        }

        return response


class DocumentIndexingEstimateApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    def get(self, dataset_id, document_id):
        dataset_id = str(dataset_id)
        document_id = str(document_id)
        document = self.get_document(dataset_id, document_id)

        if document.indexing_status in ['completed', 'error']:
            raise DocumentAlreadyFinishedError()

        data_process_rule = document.dataset_process_rule
        data_process_rule_dict = data_process_rule.to_dict()

        response = {
            "tokens": 0,
            "total_price": 0,
            "currency": "USD",
            "total_segments": 0,
            "preview": []
        }

        if document.data_source_type == 'upload_file':
            data_source_info = document.data_source_info_dict
            if data_source_info and 'upload_file_id' in data_source_info:
                file_id = data_source_info['upload_file_id']

                file = db.session.query(UploadFile).filter(
                    UploadFile.tenant_id == document.tenant_id,
                    UploadFile.id == file_id
                ).first()

                # raise error if file not found
                if not file:
                    raise NotFound('File not found.')

                extract_setting = ExtractSetting(
                    datasource_type="upload_file",
                    upload_file=file,
                    document_model=document.doc_form
                )

                indexing_runner = IndexingRunner()

                try:
                    response = indexing_runner.indexing_estimate(current_user.current_tenant_id, [extract_setting],
                                                                 data_process_rule_dict, document.doc_form,
                                                                 'English', dataset_id)
                except LLMBadRequestError:
                    raise ProviderNotInitializeError(
                        "No Embedding Model available. Please configure a valid provider "
                        "in the Settings -> Model Provider.")
                except ProviderTokenNotInitError as ex:
                    raise ProviderNotInitializeError(ex.description)

        return response


class DocumentBatchIndexingEstimateApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    def get(self, dataset_id, batch):
        dataset_id = str(dataset_id)
        batch = str(batch)
        documents = self.get_batch_documents(dataset_id, batch)
        response = {
            "tokens": 0,
            "total_price": 0,
            "currency": "USD",
            "total_segments": 0,
            "preview": []
        }
        if not documents:
            return response

        data_process_rule = documents[0].dataset_process_rule
        data_process_rule_dict = data_process_rule.to_dict()
        info_list = []
        extract_settings = []
        for document in documents:
            if document.indexing_status in ['completed', 'error']:
                raise DocumentAlreadyFinishedError()
            data_source_info = document.data_source_info_dict
            # format document files info
            if data_source_info and 'upload_file_id' in data_source_info:
                file_id = data_source_info['upload_file_id']
                info_list.append(file_id)
            # format document notion info
            elif data_source_info and 'notion_workspace_id' in data_source_info and 'notion_page_id' in data_source_info:
                pages = []
                page = {
                    'page_id': data_source_info['notion_page_id'],
                    'type': data_source_info['type']
                }
                pages.append(page)
                notion_info = {
                    'workspace_id': data_source_info['notion_workspace_id'],
                    'pages': pages
                }
                info_list.append(notion_info)

            if document.data_source_type == 'upload_file':
                file_id = data_source_info['upload_file_id']
                file_detail = db.session.query(UploadFile).filter(
                    UploadFile.tenant_id == current_user.current_tenant_id,
                    UploadFile.id == file_id
                ).first()

                if file_detail is None:
                    raise NotFound("File not found.")

                extract_setting = ExtractSetting(
                    datasource_type="upload_file",
                    upload_file=file_detail,
                    document_model=document.doc_form
                )
                extract_settings.append(extract_setting)
            elif document.data_source_type == 'notion_import':
                extract_setting = ExtractSetting(
                    datasource_type="notion_import",
                    notion_info={
                        "notion_workspace_id": data_source_info['notion_workspace_id'],
                        "notion_obj_id": data_source_info['notion_page_id'],
                        "notion_page_type": data_source_info['type'],
                        "tenant_id": current_user.current_tenant_id
                    },
                    document_model=document.doc_form
                )
                extract_settings.append(extract_setting)
            elif document.data_source_type == 'website_crawl':
                extract_setting = ExtractSetting(
                    datasource_type="website_crawl",
                    website_info={
                        "provider": data_source_info['provider'],
                        "job_id": data_source_info['job_id'],
                        "url": data_source_info['url'],
                        "tenant_id": current_user.current_tenant_id,
                        "mode": data_source_info['mode'],
                        "only_main_content": data_source_info['only_main_content']
                    },
                    document_model=document.doc_form
                )
                extract_settings.append(extract_setting)
            else:
                raise ValueError('Data source type is not supported.')

        indexing_runner = IndexingRunner()
        try:
            response = indexing_runner.indexing_estimate(current_user.current_tenant_id, extract_settings,
                                                         data_process_rule_dict, document.doc_form,
                                                         'English', dataset_id)
        except LLMBadRequestError:
            raise ProviderNotInitializeError(
                "No Embedding Model available. Please configure a valid provider "
                "in the Settings -> Model Provider.")
        except ProviderTokenNotInitError as ex:
            raise ProviderNotInitializeError(ex.description)

        return response


class DocumentBatchIndexingStatusApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    def get(self, dataset_id, batch):
        dataset_id = str(dataset_id)
        batch = str(batch)
        documents = self.get_batch_documents(dataset_id, batch)
        documents_status = []
        for document in documents:
            completed_segments = DocumentSegment.query.filter(DocumentSegment.completed_at.isnot(None),
                                                              DocumentSegment.document_id == str(document.id),
                                                              DocumentSegment.status != 're_segment').count()
            total_segments = DocumentSegment.query.filter(DocumentSegment.document_id == str(document.id),
                                                          DocumentSegment.status != 're_segment').count()
            document.completed_segments = completed_segments
            document.total_segments = total_segments
            if document.is_paused:
                document.indexing_status = 'paused'
            documents_status.append(marshal(document, document_status_fields))
        data = {
            'data': documents_status
        }
        return data


class DocumentIndexingStatusApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    def get(self, dataset_id, document_id):
        dataset_id = str(dataset_id)
        document_id = str(document_id)
        document = self.get_document(dataset_id, document_id)

        completed_segments = DocumentSegment.query \
            .filter(DocumentSegment.completed_at.isnot(None),
                    DocumentSegment.document_id == str(document_id),
                    DocumentSegment.status != 're_segment') \
            .count()
        total_segments = DocumentSegment.query \
            .filter(DocumentSegment.document_id == str(document_id),
                    DocumentSegment.status != 're_segment') \
            .count()

        document.completed_segments = completed_segments
        document.total_segments = total_segments
        if document.is_paused:
            document.indexing_status = 'paused'
        return marshal(document, document_status_fields)
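

# A small derivation clients may apply to the status payload above: indexing progress can be
# computed from the two counters this API attaches to the document. This is a suggestion for
# consumers, not something the API returns itself.
#
#   progress = completed_segments / total_segments if total_segments else 0.0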


class DocumentDetailApi(DocumentResource):
    METADATA_CHOICES = {'all', 'only', 'without'}

    @setup_required
    @login_required
    @account_initialization_required
    def get(self, dataset_id, document_id):
        dataset_id = str(dataset_id)
        document_id = str(document_id)
        document = self.get_document(dataset_id, document_id)

        metadata = request.args.get('metadata', 'all')
        if metadata not in self.METADATA_CHOICES:
            raise InvalidMetadataError(f'Invalid metadata value: {metadata}')

        if metadata == 'only':
            response = {
                'id': document.id,
                'doc_type': document.doc_type,
                'doc_metadata': document.doc_metadata
            }
        elif metadata == 'without':
            process_rules = DatasetService.get_process_rules(dataset_id)
            data_source_info = document.data_source_detail_dict
            response = {
                'id': document.id,
                'position': document.position,
                'data_source_type': document.data_source_type,
                'data_source_info': data_source_info,
                'dataset_process_rule_id': document.dataset_process_rule_id,
                'dataset_process_rule': process_rules,
                'name': document.name,
                'created_from': document.created_from,
                'created_by': document.created_by,
                'created_at': document.created_at.timestamp(),
                'tokens': document.tokens,
                'indexing_status': document.indexing_status,
                'completed_at': int(document.completed_at.timestamp()) if document.completed_at else None,
                'updated_at': int(document.updated_at.timestamp()) if document.updated_at else None,
                'indexing_latency': document.indexing_latency,
                'error': document.error,
                'enabled': document.enabled,
                'disabled_at': int(document.disabled_at.timestamp()) if document.disabled_at else None,
                'disabled_by': document.disabled_by,
                'archived': document.archived,
                'segment_count': document.segment_count,
                'average_segment_length': document.average_segment_length,
                'hit_count': document.hit_count,
                'display_status': document.display_status,
                'doc_form': document.doc_form
            }
        else:
            process_rules = DatasetService.get_process_rules(dataset_id)
            data_source_info = document.data_source_detail_dict
            response = {
                'id': document.id,
                'position': document.position,
                'data_source_type': document.data_source_type,
                'data_source_info': data_source_info,
                'dataset_process_rule_id': document.dataset_process_rule_id,
                'dataset_process_rule': process_rules,
                'name': document.name,
                'created_from': document.created_from,
                'created_by': document.created_by,
                'created_at': document.created_at.timestamp(),
                'tokens': document.tokens,
                'indexing_status': document.indexing_status,
                'completed_at': int(document.completed_at.timestamp()) if document.completed_at else None,
                'updated_at': int(document.updated_at.timestamp()) if document.updated_at else None,
                'indexing_latency': document.indexing_latency,
                'error': document.error,
                'enabled': document.enabled,
                'disabled_at': int(document.disabled_at.timestamp()) if document.disabled_at else None,
                'disabled_by': document.disabled_by,
                'archived': document.archived,
                'doc_type': document.doc_type,
                'doc_metadata': document.doc_metadata,
                'segment_count': document.segment_count,
                'average_segment_length': document.average_segment_length,
                'hit_count': document.hit_count,
                'display_status': document.display_status,
                'doc_form': document.doc_form
            }

        return response, 200


class DocumentProcessingApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    def patch(self, dataset_id, document_id, action):
        dataset_id = str(dataset_id)
        document_id = str(document_id)
        document = self.get_document(dataset_id, document_id)

        # The current user's role in the tenant-account (ta) table must be admin, owner, or editor
        if not current_user.is_editor:
            raise Forbidden()

        if action == "pause":
            if document.indexing_status != "indexing":
                raise InvalidActionError('Document not in indexing state.')

            document.paused_by = current_user.id
            document.paused_at = datetime.now(timezone.utc).replace(tzinfo=None)
            document.is_paused = True
            db.session.commit()
        elif action == "resume":
            if document.indexing_status not in ["paused", "error"]:
                raise InvalidActionError('Document not in paused or error state.')

            document.paused_by = None
            document.paused_at = None
            document.is_paused = False
            db.session.commit()
        else:
            raise InvalidActionError()

        return {'result': 'success'}, 200


class DocumentDeleteApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    def delete(self, dataset_id, document_id):
        dataset_id = str(dataset_id)
        document_id = str(document_id)
        dataset = DatasetService.get_dataset(dataset_id)
        if dataset is None:
            raise NotFound("Dataset not found.")
        # check user's model setting
        DatasetService.check_dataset_model_setting(dataset)

        document = self.get_document(dataset_id, document_id)

        try:
            DocumentService.delete_document(document)
        except services.errors.document.DocumentIndexingError:
            raise DocumentIndexingError('Cannot delete document during indexing.')

        return {'result': 'success'}, 204


class DocumentMetadataApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    def put(self, dataset_id, document_id):
        dataset_id = str(dataset_id)
        document_id = str(document_id)
        document = self.get_document(dataset_id, document_id)

        req_data = request.get_json()

        doc_type = req_data.get('doc_type')
        doc_metadata = req_data.get('doc_metadata')

        # The current user's role in the tenant-account (ta) table must be admin, owner, or editor
        if not current_user.is_editor:
            raise Forbidden()

        if doc_type is None or doc_metadata is None:
            raise ValueError('Both doc_type and doc_metadata must be provided.')

        if doc_type not in DocumentService.DOCUMENT_METADATA_SCHEMA:
            raise ValueError('Invalid doc_type.')

        if not isinstance(doc_metadata, dict):
            raise ValueError('doc_metadata must be a dictionary.')

        metadata_schema = DocumentService.DOCUMENT_METADATA_SCHEMA[doc_type]

        document.doc_metadata = {}
        if doc_type == 'others':
            document.doc_metadata = doc_metadata
        else:
            for key, value_type in metadata_schema.items():
                value = doc_metadata.get(key)
                if value is not None and isinstance(value, value_type):
                    document.doc_metadata[key] = value

        document.doc_type = doc_type
        document.updated_at = datetime.now(timezone.utc).replace(tzinfo=None)
        db.session.commit()

        return {'result': 'success', 'message': 'Document metadata updated.'}, 200
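

# A minimal sketch of a metadata update against DocumentMetadataApi. The route and the
# special-cased 'others' doc_type are taken from this module; any other doc_type must be a key
# of DocumentService.DOCUMENT_METADATA_SCHEMA, whose keys and value types are not shown here,
# so the field names below are illustrative assumptions.
#
#   PUT /datasets/<dataset_id>/documents/<document_id>/metadata
#   {
#       "doc_type": "others",
#       "doc_metadata": {"author": "Jane Doe", "source": "internal wiki"}
#   }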


class DocumentStatusApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    @cloud_edition_billing_resource_check('vector_space')
    def patch(self, dataset_id, document_id, action):
        dataset_id = str(dataset_id)
        document_id = str(document_id)
        dataset = DatasetService.get_dataset(dataset_id)
        if dataset is None:
            raise NotFound("Dataset not found.")
        # check user's model setting
        DatasetService.check_dataset_model_setting(dataset)

        document = self.get_document(dataset_id, document_id)

        # The current user's role in the tenant-account (ta) table must be admin, owner, or editor
        if not current_user.is_editor:
            raise Forbidden()

        indexing_cache_key = 'document_{}_indexing'.format(document.id)
        cache_result = redis_client.get(indexing_cache_key)
        if cache_result is not None:
            raise InvalidActionError("Document is being indexed, please try again later.")

        if action == "enable":
            if document.enabled:
                raise InvalidActionError('Document already enabled.')

            document.enabled = True
            document.disabled_at = None
            document.disabled_by = None
            document.updated_at = datetime.now(timezone.utc).replace(tzinfo=None)
            db.session.commit()

            # Set cache to prevent indexing the same document multiple times
            redis_client.setex(indexing_cache_key, 600, 1)

            add_document_to_index_task.delay(document_id)

            return {'result': 'success'}, 200
        elif action == "disable":
            if not document.completed_at or document.indexing_status != 'completed':
                raise InvalidActionError('Document is not completed.')
            if not document.enabled:
                raise InvalidActionError('Document already disabled.')

            document.enabled = False
            document.disabled_at = datetime.now(timezone.utc).replace(tzinfo=None)
            document.disabled_by = current_user.id
            document.updated_at = datetime.now(timezone.utc).replace(tzinfo=None)
            db.session.commit()

            # Set cache to prevent indexing the same document multiple times
            redis_client.setex(indexing_cache_key, 600, 1)

            remove_document_from_index_task.delay(document_id)

            return {'result': 'success'}, 200
        elif action == "archive":
            if document.archived:
                raise InvalidActionError('Document already archived.')

            document.archived = True
            document.archived_at = datetime.now(timezone.utc).replace(tzinfo=None)
            document.archived_by = current_user.id
            document.updated_at = datetime.now(timezone.utc).replace(tzinfo=None)
            db.session.commit()

            if document.enabled:
                # Set cache to prevent indexing the same document multiple times
                redis_client.setex(indexing_cache_key, 600, 1)

                remove_document_from_index_task.delay(document_id)

            return {'result': 'success'}, 200
        elif action == "un_archive":
            if not document.archived:
                raise InvalidActionError('Document is not archived.')

            document.archived = False
            document.archived_at = None
            document.archived_by = None
            document.updated_at = datetime.now(timezone.utc).replace(tzinfo=None)
            db.session.commit()

            # Set cache to prevent indexing the same document multiple times
            redis_client.setex(indexing_cache_key, 600, 1)

            add_document_to_index_task.delay(document_id)

            return {'result': 'success'}, 200
        else:
            raise InvalidActionError()
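

# A hedged usage sketch for DocumentStatusApi: the trailing action segment must be one of
# 'enable', 'disable', 'archive' or 'un_archive' (the branches above). The '/console/api'
# prefix and bearer-token header are assumptions about where this console blueprint is
# mounted and how it is authenticated; neither is defined in this file.
#
#   curl -X PATCH \
#        -H 'Authorization: Bearer <console-token>' \
#        'https://<host>/console/api/datasets/<dataset_id>/documents/<document_id>/status/disable'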


class DocumentPauseApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    def patch(self, dataset_id, document_id):
        """pause document."""
        dataset_id = str(dataset_id)
        document_id = str(document_id)

        dataset = DatasetService.get_dataset(dataset_id)
        if not dataset:
            raise NotFound('Dataset not found.')

        document = DocumentService.get_document(dataset.id, document_id)

        # 404 if document not found
        if document is None:
            raise NotFound("Document Not Exists.")

        # 403 if document is archived
        if DocumentService.check_archived(document):
            raise ArchivedDocumentImmutableError()

        try:
            # pause document
            DocumentService.pause_document(document)
        except services.errors.document.DocumentIndexingError:
            raise DocumentIndexingError('Cannot pause completed document.')

        return {'result': 'success'}, 204


class DocumentRecoverApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    def patch(self, dataset_id, document_id):
        """recover document."""
        dataset_id = str(dataset_id)
        document_id = str(document_id)

        dataset = DatasetService.get_dataset(dataset_id)
        if not dataset:
            raise NotFound('Dataset not found.')

        document = DocumentService.get_document(dataset.id, document_id)

        # 404 if document not found
        if document is None:
            raise NotFound("Document Not Exists.")

        # 403 if document is archived
        if DocumentService.check_archived(document):
            raise ArchivedDocumentImmutableError()

        try:
            # recover document
            DocumentService.recover_document(document)
        except services.errors.document.DocumentIndexingError:
            raise DocumentIndexingError('Document is not in paused status.')

        return {'result': 'success'}, 204


class DocumentRetryApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    def post(self, dataset_id):
        """retry document."""
        parser = reqparse.RequestParser()
        parser.add_argument('document_ids', type=list, required=True, nullable=False,
                            location='json')
        args = parser.parse_args()
        dataset_id = str(dataset_id)
        dataset = DatasetService.get_dataset(dataset_id)
        retry_documents = []
        if not dataset:
            raise NotFound('Dataset not found.')
        for document_id in args['document_ids']:
            try:
                document_id = str(document_id)

                document = DocumentService.get_document(dataset.id, document_id)

                # 404 if document not found
                if document is None:
                    raise NotFound("Document Not Exists.")

                # 403 if document is archived
                if DocumentService.check_archived(document):
                    raise ArchivedDocumentImmutableError()

                # 400 if document is completed
                if document.indexing_status == 'completed':
                    raise DocumentAlreadyFinishedError()

                retry_documents.append(document)
            except Exception as e:
                logging.error(f"Document {document_id} retry failed: {str(e)}")
                continue

        # retry document
        DocumentService.retry_document(dataset_id, retry_documents)

        return {'result': 'success'}, 204


class DocumentRenameApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    @marshal_with(document_fields)
    def post(self, dataset_id, document_id):
        # The current user's role in the tenant-account (ta) table must be admin or owner
        if not current_user.is_admin_or_owner:
            raise Forbidden()
        parser = reqparse.RequestParser()
        parser.add_argument('name', type=str, required=True, nullable=False, location='json')
        args = parser.parse_args()

        try:
            document = DocumentService.rename_document(dataset_id, document_id, args['name'])
        except services.errors.document.DocumentIndexingError:
            raise DocumentIndexingError('Cannot rename document during indexing.')

        return document


class WebsiteDocumentSyncApi(DocumentResource):
    @setup_required
    @login_required
    @account_initialization_required
    def get(self, dataset_id, document_id):
        """sync website document."""
        dataset_id = str(dataset_id)
        dataset = DatasetService.get_dataset(dataset_id)
        if not dataset:
            raise NotFound('Dataset not found.')
        document_id = str(document_id)
        document = DocumentService.get_document(dataset.id, document_id)
        if not document:
            raise NotFound('Document not found.')
        if document.tenant_id != current_user.current_tenant_id:
            raise Forbidden('No permission.')
        if document.data_source_type != 'website_crawl':
            raise ValueError('Document is not a website document.')

        # 403 if document is archived
        if DocumentService.check_archived(document):
            raise ArchivedDocumentImmutableError()

        # sync document
        DocumentService.sync_website_document(dataset_id, document)

        return {'result': 'success'}, 200


api.add_resource(GetProcessRuleApi, '/datasets/process-rule')
api.add_resource(DatasetDocumentListApi,
                 '/datasets/<uuid:dataset_id>/documents')
api.add_resource(DatasetInitApi,
                 '/datasets/init')
api.add_resource(DocumentIndexingEstimateApi,
                 '/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/indexing-estimate')
api.add_resource(DocumentBatchIndexingEstimateApi,
                 '/datasets/<uuid:dataset_id>/batch/<string:batch>/indexing-estimate')
api.add_resource(DocumentBatchIndexingStatusApi,
                 '/datasets/<uuid:dataset_id>/batch/<string:batch>/indexing-status')
api.add_resource(DocumentIndexingStatusApi,
                 '/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/indexing-status')
api.add_resource(DocumentDetailApi,
                 '/datasets/<uuid:dataset_id>/documents/<uuid:document_id>')
api.add_resource(DocumentProcessingApi,
                 '/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/processing/<string:action>')
api.add_resource(DocumentDeleteApi,
                 '/datasets/<uuid:dataset_id>/documents/<uuid:document_id>')
api.add_resource(DocumentMetadataApi,
                 '/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/metadata')
api.add_resource(DocumentStatusApi,
                 '/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/status/<string:action>')
api.add_resource(DocumentPauseApi, '/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/processing/pause')
api.add_resource(DocumentRecoverApi, '/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/processing/resume')
api.add_resource(DocumentRetryApi, '/datasets/<uuid:dataset_id>/retry')
api.add_resource(DocumentRenameApi,
                 '/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/rename')
api.add_resource(WebsiteDocumentSyncApi, '/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/website-sync')
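

# A minimal client-side sketch (not part of this module) for the document list endpoint
# registered above. The '/console/api' prefix and bearer-token auth are assumptions about the
# surrounding application; the query parameters mirror those read in DatasetDocumentListApi.get.
#
#   import requests
#
#   resp = requests.get(
#       'https://<host>/console/api/datasets/<dataset_id>/documents',
#       params={'page': 1, 'limit': 20, 'keyword': 'handbook', 'sort': '-created_at', 'fetch': 'true'},
#       headers={'Authorization': 'Bearer <console-token>'},
#   )
#   payload = resp.json()  # {'data': [...], 'has_more': ..., 'limit': 20, 'total': ..., 'page': 1}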