segment.py 8.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204
  1. from flask_login import current_user
  2. from flask_restful import reqparse, marshal
  3. from werkzeug.exceptions import NotFound
  4. from controllers.service_api import api
  5. from controllers.service_api.app.error import ProviderNotInitializeError
  6. from controllers.service_api.wraps import DatasetApiResource, cloud_edition_billing_resource_check
  7. from core.model_providers.error import ProviderTokenNotInitError, LLMBadRequestError
  8. from core.model_providers.model_factory import ModelFactory
  9. from extensions.ext_database import db
  10. from fields.segment_fields import segment_fields
  11. from models.dataset import Dataset, DocumentSegment
  12. from services.dataset_service import DatasetService, DocumentService, SegmentService
  13. class SegmentApi(DatasetApiResource):
  14. """Resource for segments."""
  15. @cloud_edition_billing_resource_check('vector_space', 'dataset')
  16. def post(self, tenant_id, dataset_id, document_id):
  17. """Create single segment."""
  18. # check dataset
  19. dataset_id = str(dataset_id)
  20. tenant_id = str(tenant_id)
  21. dataset = db.session.query(Dataset).filter(
  22. Dataset.tenant_id == tenant_id,
  23. Dataset.id == dataset_id
  24. ).first()
  25. if not dataset:
  26. raise NotFound('Dataset not found.')
  27. # check document
  28. document_id = str(document_id)
  29. document = DocumentService.get_document(dataset.id, document_id)
  30. if not document:
  31. raise NotFound('Document not found.')
  32. # check embedding model setting
  33. if dataset.indexing_technique == 'high_quality':
  34. try:
  35. ModelFactory.get_embedding_model(
  36. tenant_id=current_user.current_tenant_id,
  37. model_provider_name=dataset.embedding_model_provider,
  38. model_name=dataset.embedding_model
  39. )
  40. except LLMBadRequestError:
  41. raise ProviderNotInitializeError(
  42. f"No Embedding Model available. Please configure a valid provider "
  43. f"in the Settings -> Model Provider.")
  44. except ProviderTokenNotInitError as ex:
  45. raise ProviderNotInitializeError(ex.description)
  46. # validate args
  47. parser = reqparse.RequestParser()
  48. parser.add_argument('segments', type=list, required=False, nullable=True, location='json')
  49. args = parser.parse_args()
  50. for args_item in args['segments']:
  51. SegmentService.segment_create_args_validate(args_item, document)
  52. segments = SegmentService.multi_create_segment(args['segments'], document, dataset)
  53. return {
  54. 'data': marshal(segments, segment_fields),
  55. 'doc_form': document.doc_form
  56. }, 200
  57. def get(self, tenant_id, dataset_id, document_id):
  58. """Create single segment."""
  59. # check dataset
  60. dataset_id = str(dataset_id)
  61. tenant_id = str(tenant_id)
  62. dataset = db.session.query(Dataset).filter(
  63. Dataset.tenant_id == tenant_id,
  64. Dataset.id == dataset_id
  65. ).first()
  66. if not dataset:
  67. raise NotFound('Dataset not found.')
  68. # check document
  69. document_id = str(document_id)
  70. document = DocumentService.get_document(dataset.id, document_id)
  71. if not document:
  72. raise NotFound('Document not found.')
  73. # check embedding model setting
  74. if dataset.indexing_technique == 'high_quality':
  75. try:
  76. ModelFactory.get_embedding_model(
  77. tenant_id=current_user.current_tenant_id,
  78. model_provider_name=dataset.embedding_model_provider,
  79. model_name=dataset.embedding_model
  80. )
  81. except LLMBadRequestError:
  82. raise ProviderNotInitializeError(
  83. f"No Embedding Model available. Please configure a valid provider "
  84. f"in the Settings -> Model Provider.")
  85. except ProviderTokenNotInitError as ex:
  86. raise ProviderNotInitializeError(ex.description)
  87. parser = reqparse.RequestParser()
  88. parser.add_argument('status', type=str,
  89. action='append', default=[], location='args')
  90. parser.add_argument('keyword', type=str, default=None, location='args')
  91. args = parser.parse_args()
  92. status_list = args['status']
  93. keyword = args['keyword']
  94. query = DocumentSegment.query.filter(
  95. DocumentSegment.document_id == str(document_id),
  96. DocumentSegment.tenant_id == current_user.current_tenant_id
  97. )
  98. if status_list:
  99. query = query.filter(DocumentSegment.status.in_(status_list))
  100. if keyword:
  101. query = query.where(DocumentSegment.content.ilike(f'%{keyword}%'))
  102. total = query.count()
  103. segments = query.order_by(DocumentSegment.position).all()
  104. return {
  105. 'data': marshal(segments, segment_fields),
  106. 'doc_form': document.doc_form,
  107. 'total': total
  108. }, 200
  109. class DatasetSegmentApi(DatasetApiResource):
  110. def delete(self, tenant_id, dataset_id, document_id, segment_id):
  111. # check dataset
  112. dataset_id = str(dataset_id)
  113. tenant_id = str(tenant_id)
  114. dataset = db.session.query(Dataset).filter(
  115. Dataset.tenant_id == tenant_id,
  116. Dataset.id == dataset_id
  117. ).first()
  118. if not dataset:
  119. raise NotFound('Dataset not found.')
  120. # check user's model setting
  121. DatasetService.check_dataset_model_setting(dataset)
  122. # check document
  123. document_id = str(document_id)
  124. document = DocumentService.get_document(dataset_id, document_id)
  125. if not document:
  126. raise NotFound('Document not found.')
  127. # check segment
  128. segment = DocumentSegment.query.filter(
  129. DocumentSegment.id == str(segment_id),
  130. DocumentSegment.tenant_id == current_user.current_tenant_id
  131. ).first()
  132. if not segment:
  133. raise NotFound('Segment not found.')
  134. SegmentService.delete_segment(segment, document, dataset)
  135. return {'result': 'success'}, 200
  136. @cloud_edition_billing_resource_check('vector_space', 'dataset')
  137. def post(self, tenant_id, dataset_id, document_id, segment_id):
  138. # check dataset
  139. dataset_id = str(dataset_id)
  140. tenant_id = str(tenant_id)
  141. dataset = db.session.query(Dataset).filter(
  142. Dataset.tenant_id == tenant_id,
  143. Dataset.id == dataset_id
  144. ).first()
  145. if not dataset:
  146. raise NotFound('Dataset not found.')
  147. # check user's model setting
  148. DatasetService.check_dataset_model_setting(dataset)
  149. # check document
  150. document_id = str(document_id)
  151. document = DocumentService.get_document(dataset_id, document_id)
  152. if not document:
  153. raise NotFound('Document not found.')
  154. if dataset.indexing_technique == 'high_quality':
  155. # check embedding model setting
  156. try:
  157. ModelFactory.get_embedding_model(
  158. tenant_id=current_user.current_tenant_id,
  159. model_provider_name=dataset.embedding_model_provider,
  160. model_name=dataset.embedding_model
  161. )
  162. except LLMBadRequestError:
  163. raise ProviderNotInitializeError(
  164. f"No Embedding Model available. Please configure a valid provider "
  165. f"in the Settings -> Model Provider.")
  166. except ProviderTokenNotInitError as ex:
  167. raise ProviderNotInitializeError(ex.description)
  168. # check segment
  169. segment_id = str(segment_id)
  170. segment = DocumentSegment.query.filter(
  171. DocumentSegment.id == str(segment_id),
  172. DocumentSegment.tenant_id == current_user.current_tenant_id
  173. ).first()
  174. if not segment:
  175. raise NotFound('Segment not found.')
  176. # validate args
  177. parser = reqparse.RequestParser()
  178. parser.add_argument('segments', type=dict, required=False, nullable=True, location='json')
  179. args = parser.parse_args()
  180. SegmentService.segment_create_args_validate(args['segments'], document)
  181. segment = SegmentService.update_segment(args['segments'], segment, document, dataset)
  182. return {
  183. 'data': marshal(segment, segment_fields),
  184. 'doc_form': document.doc_form
  185. }, 200
  186. api.add_resource(SegmentApi, '/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/segments')
  187. api.add_resource(DatasetSegmentApi, '/datasets/<uuid:dataset_id>/documents/<uuid:document_id>/segments/<uuid:segment_id>')