dataset.py 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534
  1. import json
  2. import logging
  3. import pickle
  4. from json import JSONDecodeError
  5. from sqlalchemy import func
  6. from sqlalchemy.dialects.postgresql import JSONB
  7. from extensions.ext_database import db
  8. from extensions.ext_storage import storage
  9. from models import StringUUID
  10. from models.account import Account
  11. from models.model import App, Tag, TagBinding, UploadFile
  12. class Dataset(db.Model):
  13. __tablename__ = 'datasets'
  14. __table_args__ = (
  15. db.PrimaryKeyConstraint('id', name='dataset_pkey'),
  16. db.Index('dataset_tenant_idx', 'tenant_id'),
  17. db.Index('retrieval_model_idx', "retrieval_model", postgresql_using='gin')
  18. )
  19. INDEXING_TECHNIQUE_LIST = ['high_quality', 'economy', None]
  20. id = db.Column(StringUUID, server_default=db.text('uuid_generate_v4()'))
  21. tenant_id = db.Column(StringUUID, nullable=False)
  22. name = db.Column(db.String(255), nullable=False)
  23. description = db.Column(db.Text, nullable=True)
  24. provider = db.Column(db.String(255), nullable=False,
  25. server_default=db.text("'vendor'::character varying"))
  26. permission = db.Column(db.String(255), nullable=False,
  27. server_default=db.text("'only_me'::character varying"))
  28. data_source_type = db.Column(db.String(255))
  29. indexing_technique = db.Column(db.String(255), nullable=True)
  30. index_struct = db.Column(db.Text, nullable=True)
  31. created_by = db.Column(StringUUID, nullable=False)
  32. created_at = db.Column(db.DateTime, nullable=False,
  33. server_default=db.text('CURRENT_TIMESTAMP(0)'))
  34. updated_by = db.Column(StringUUID, nullable=True)
  35. updated_at = db.Column(db.DateTime, nullable=False,
  36. server_default=db.text('CURRENT_TIMESTAMP(0)'))
  37. embedding_model = db.Column(db.String(255), nullable=True)
  38. embedding_model_provider = db.Column(db.String(255), nullable=True)
  39. collection_binding_id = db.Column(StringUUID, nullable=True)
  40. retrieval_model = db.Column(JSONB, nullable=True)
  41. @property
  42. def dataset_keyword_table(self):
  43. dataset_keyword_table = db.session.query(DatasetKeywordTable).filter(
  44. DatasetKeywordTable.dataset_id == self.id).first()
  45. if dataset_keyword_table:
  46. return dataset_keyword_table
  47. return None
  48. @property
  49. def index_struct_dict(self):
  50. return json.loads(self.index_struct) if self.index_struct else None
  51. @property
  52. def created_by_account(self):
  53. return Account.query.get(self.created_by)
  54. @property
  55. def latest_process_rule(self):
  56. return DatasetProcessRule.query.filter(DatasetProcessRule.dataset_id == self.id) \
  57. .order_by(DatasetProcessRule.created_at.desc()).first()
  58. @property
  59. def app_count(self):
  60. return db.session.query(func.count(AppDatasetJoin.id)).filter(AppDatasetJoin.dataset_id == self.id).scalar()
  61. @property
  62. def document_count(self):
  63. return db.session.query(func.count(Document.id)).filter(Document.dataset_id == self.id).scalar()
  64. @property
  65. def available_document_count(self):
  66. return db.session.query(func.count(Document.id)).filter(
  67. Document.dataset_id == self.id,
  68. Document.indexing_status == 'completed',
  69. Document.enabled == True,
  70. Document.archived == False
  71. ).scalar()
  72. @property
  73. def available_segment_count(self):
  74. return db.session.query(func.count(DocumentSegment.id)).filter(
  75. DocumentSegment.dataset_id == self.id,
  76. DocumentSegment.status == 'completed',
  77. DocumentSegment.enabled == True
  78. ).scalar()
  79. @property
  80. def word_count(self):
  81. return Document.query.with_entities(func.coalesce(func.sum(Document.word_count))) \
  82. .filter(Document.dataset_id == self.id).scalar()
  83. @property
  84. def doc_form(self):
  85. document = db.session.query(Document).filter(
  86. Document.dataset_id == self.id).first()
  87. if document:
  88. return document.doc_form
  89. return None
  90. @property
  91. def retrieval_model_dict(self):
  92. default_retrieval_model = {
  93. 'search_method': 'semantic_search',
  94. 'reranking_enable': False,
  95. 'reranking_model': {
  96. 'reranking_provider_name': '',
  97. 'reranking_model_name': ''
  98. },
  99. 'top_k': 2,
  100. 'score_threshold_enabled': False
  101. }
  102. return self.retrieval_model if self.retrieval_model else default_retrieval_model
  103. @property
  104. def tags(self):
  105. tags = db.session.query(Tag).join(
  106. TagBinding,
  107. Tag.id == TagBinding.tag_id
  108. ).filter(
  109. TagBinding.target_id == self.id,
  110. TagBinding.tenant_id == self.tenant_id,
  111. Tag.tenant_id == self.tenant_id,
  112. Tag.type == 'knowledge'
  113. ).all()
  114. return tags if tags else []
  115. @staticmethod
  116. def gen_collection_name_by_id(dataset_id: str) -> str:
  117. normalized_dataset_id = dataset_id.replace("-", "_")
  118. return f'Vector_index_{normalized_dataset_id}_Node'
  119. class DatasetProcessRule(db.Model):
  120. __tablename__ = 'dataset_process_rules'
  121. __table_args__ = (
  122. db.PrimaryKeyConstraint('id', name='dataset_process_rule_pkey'),
  123. db.Index('dataset_process_rule_dataset_id_idx', 'dataset_id'),
  124. )
  125. id = db.Column(StringUUID, nullable=False,
  126. server_default=db.text('uuid_generate_v4()'))
  127. dataset_id = db.Column(StringUUID, nullable=False)
  128. mode = db.Column(db.String(255), nullable=False,
  129. server_default=db.text("'automatic'::character varying"))
  130. rules = db.Column(db.Text, nullable=True)
  131. created_by = db.Column(StringUUID, nullable=False)
  132. created_at = db.Column(db.DateTime, nullable=False,
  133. server_default=db.text('CURRENT_TIMESTAMP(0)'))
  134. MODES = ['automatic', 'custom']
  135. PRE_PROCESSING_RULES = ['remove_stopwords', 'remove_extra_spaces', 'remove_urls_emails']
  136. AUTOMATIC_RULES = {
  137. 'pre_processing_rules': [
  138. {'id': 'remove_extra_spaces', 'enabled': True},
  139. {'id': 'remove_urls_emails', 'enabled': False}
  140. ],
  141. 'segmentation': {
  142. 'delimiter': '\n',
  143. 'max_tokens': 500,
  144. 'chunk_overlap': 50
  145. }
  146. }
  147. def to_dict(self):
  148. return {
  149. 'id': self.id,
  150. 'dataset_id': self.dataset_id,
  151. 'mode': self.mode,
  152. 'rules': self.rules_dict,
  153. 'created_by': self.created_by,
  154. 'created_at': self.created_at,
  155. }
  156. @property
  157. def rules_dict(self):
  158. try:
  159. return json.loads(self.rules) if self.rules else None
  160. except JSONDecodeError:
  161. return None
  162. class Document(db.Model):
  163. __tablename__ = 'documents'
  164. __table_args__ = (
  165. db.PrimaryKeyConstraint('id', name='document_pkey'),
  166. db.Index('document_dataset_id_idx', 'dataset_id'),
  167. db.Index('document_is_paused_idx', 'is_paused'),
  168. db.Index('document_tenant_idx', 'tenant_id'),
  169. )
  170. # initial fields
  171. id = db.Column(StringUUID, nullable=False,
  172. server_default=db.text('uuid_generate_v4()'))
  173. tenant_id = db.Column(StringUUID, nullable=False)
  174. dataset_id = db.Column(StringUUID, nullable=False)
  175. position = db.Column(db.Integer, nullable=False)
  176. data_source_type = db.Column(db.String(255), nullable=False)
  177. data_source_info = db.Column(db.Text, nullable=True)
  178. dataset_process_rule_id = db.Column(StringUUID, nullable=True)
  179. batch = db.Column(db.String(255), nullable=False)
  180. name = db.Column(db.String(255), nullable=False)
  181. created_from = db.Column(db.String(255), nullable=False)
  182. created_by = db.Column(StringUUID, nullable=False)
  183. created_api_request_id = db.Column(StringUUID, nullable=True)
  184. created_at = db.Column(db.DateTime, nullable=False,
  185. server_default=db.text('CURRENT_TIMESTAMP(0)'))
  186. # start processing
  187. processing_started_at = db.Column(db.DateTime, nullable=True)
  188. # parsing
  189. file_id = db.Column(db.Text, nullable=True)
  190. word_count = db.Column(db.Integer, nullable=True)
  191. parsing_completed_at = db.Column(db.DateTime, nullable=True)
  192. # cleaning
  193. cleaning_completed_at = db.Column(db.DateTime, nullable=True)
  194. # split
  195. splitting_completed_at = db.Column(db.DateTime, nullable=True)
  196. # indexing
  197. tokens = db.Column(db.Integer, nullable=True)
  198. indexing_latency = db.Column(db.Float, nullable=True)
  199. completed_at = db.Column(db.DateTime, nullable=True)
  200. # pause
  201. is_paused = db.Column(db.Boolean, nullable=True, server_default=db.text('false'))
  202. paused_by = db.Column(StringUUID, nullable=True)
  203. paused_at = db.Column(db.DateTime, nullable=True)
  204. # error
  205. error = db.Column(db.Text, nullable=True)
  206. stopped_at = db.Column(db.DateTime, nullable=True)
  207. # basic fields
  208. indexing_status = db.Column(db.String(
  209. 255), nullable=False, server_default=db.text("'waiting'::character varying"))
  210. enabled = db.Column(db.Boolean, nullable=False,
  211. server_default=db.text('true'))
  212. disabled_at = db.Column(db.DateTime, nullable=True)
  213. disabled_by = db.Column(StringUUID, nullable=True)
  214. archived = db.Column(db.Boolean, nullable=False,
  215. server_default=db.text('false'))
  216. archived_reason = db.Column(db.String(255), nullable=True)
  217. archived_by = db.Column(StringUUID, nullable=True)
  218. archived_at = db.Column(db.DateTime, nullable=True)
  219. updated_at = db.Column(db.DateTime, nullable=False,
  220. server_default=db.text('CURRENT_TIMESTAMP(0)'))
  221. doc_type = db.Column(db.String(40), nullable=True)
  222. doc_metadata = db.Column(db.JSON, nullable=True)
  223. doc_form = db.Column(db.String(
  224. 255), nullable=False, server_default=db.text("'text_model'::character varying"))
  225. doc_language = db.Column(db.String(255), nullable=True)
  226. DATA_SOURCES = ['upload_file', 'notion_import']
  227. @property
  228. def display_status(self):
  229. status = None
  230. if self.indexing_status == 'waiting':
  231. status = 'queuing'
  232. elif self.indexing_status not in ['completed', 'error', 'waiting'] and self.is_paused:
  233. status = 'paused'
  234. elif self.indexing_status in ['parsing', 'cleaning', 'splitting', 'indexing']:
  235. status = 'indexing'
  236. elif self.indexing_status == 'error':
  237. status = 'error'
  238. elif self.indexing_status == 'completed' and not self.archived and self.enabled:
  239. status = 'available'
  240. elif self.indexing_status == 'completed' and not self.archived and not self.enabled:
  241. status = 'disabled'
  242. elif self.indexing_status == 'completed' and self.archived:
  243. status = 'archived'
  244. return status
  245. @property
  246. def data_source_info_dict(self):
  247. if self.data_source_info:
  248. try:
  249. data_source_info_dict = json.loads(self.data_source_info)
  250. except JSONDecodeError:
  251. data_source_info_dict = {}
  252. return data_source_info_dict
  253. return None
  254. @property
  255. def data_source_detail_dict(self):
  256. if self.data_source_info:
  257. if self.data_source_type == 'upload_file':
  258. data_source_info_dict = json.loads(self.data_source_info)
  259. file_detail = db.session.query(UploadFile). \
  260. filter(UploadFile.id == data_source_info_dict['upload_file_id']). \
  261. one_or_none()
  262. if file_detail:
  263. return {
  264. 'upload_file': {
  265. 'id': file_detail.id,
  266. 'name': file_detail.name,
  267. 'size': file_detail.size,
  268. 'extension': file_detail.extension,
  269. 'mime_type': file_detail.mime_type,
  270. 'created_by': file_detail.created_by,
  271. 'created_at': file_detail.created_at.timestamp()
  272. }
  273. }
  274. elif self.data_source_type == 'notion_import':
  275. return json.loads(self.data_source_info)
  276. return {}
  277. @property
  278. def average_segment_length(self):
  279. if self.word_count and self.word_count != 0 and self.segment_count and self.segment_count != 0:
  280. return self.word_count // self.segment_count
  281. return 0
  282. @property
  283. def dataset_process_rule(self):
  284. if self.dataset_process_rule_id:
  285. return DatasetProcessRule.query.get(self.dataset_process_rule_id)
  286. return None
  287. @property
  288. def dataset(self):
  289. return db.session.query(Dataset).filter(Dataset.id == self.dataset_id).one_or_none()
  290. @property
  291. def segment_count(self):
  292. return DocumentSegment.query.filter(DocumentSegment.document_id == self.id).count()
  293. @property
  294. def hit_count(self):
  295. return DocumentSegment.query.with_entities(func.coalesce(func.sum(DocumentSegment.hit_count))) \
  296. .filter(DocumentSegment.document_id == self.id).scalar()
  297. class DocumentSegment(db.Model):
  298. __tablename__ = 'document_segments'
  299. __table_args__ = (
  300. db.PrimaryKeyConstraint('id', name='document_segment_pkey'),
  301. db.Index('document_segment_dataset_id_idx', 'dataset_id'),
  302. db.Index('document_segment_document_id_idx', 'document_id'),
  303. db.Index('document_segment_tenant_dataset_idx', 'dataset_id', 'tenant_id'),
  304. db.Index('document_segment_tenant_document_idx', 'document_id', 'tenant_id'),
  305. db.Index('document_segment_dataset_node_idx', 'dataset_id', 'index_node_id'),
  306. db.Index('document_segment_tenant_idx', 'tenant_id'),
  307. )
  308. # initial fields
  309. id = db.Column(StringUUID, nullable=False,
  310. server_default=db.text('uuid_generate_v4()'))
  311. tenant_id = db.Column(StringUUID, nullable=False)
  312. dataset_id = db.Column(StringUUID, nullable=False)
  313. document_id = db.Column(StringUUID, nullable=False)
  314. position = db.Column(db.Integer, nullable=False)
  315. content = db.Column(db.Text, nullable=False)
  316. answer = db.Column(db.Text, nullable=True)
  317. word_count = db.Column(db.Integer, nullable=False)
  318. tokens = db.Column(db.Integer, nullable=False)
  319. # indexing fields
  320. keywords = db.Column(db.JSON, nullable=True)
  321. index_node_id = db.Column(db.String(255), nullable=True)
  322. index_node_hash = db.Column(db.String(255), nullable=True)
  323. # basic fields
  324. hit_count = db.Column(db.Integer, nullable=False, default=0)
  325. enabled = db.Column(db.Boolean, nullable=False,
  326. server_default=db.text('true'))
  327. disabled_at = db.Column(db.DateTime, nullable=True)
  328. disabled_by = db.Column(StringUUID, nullable=True)
  329. status = db.Column(db.String(255), nullable=False,
  330. server_default=db.text("'waiting'::character varying"))
  331. created_by = db.Column(StringUUID, nullable=False)
  332. created_at = db.Column(db.DateTime, nullable=False,
  333. server_default=db.text('CURRENT_TIMESTAMP(0)'))
  334. updated_by = db.Column(StringUUID, nullable=True)
  335. updated_at = db.Column(db.DateTime, nullable=False,
  336. server_default=db.text('CURRENT_TIMESTAMP(0)'))
  337. indexing_at = db.Column(db.DateTime, nullable=True)
  338. completed_at = db.Column(db.DateTime, nullable=True)
  339. error = db.Column(db.Text, nullable=True)
  340. stopped_at = db.Column(db.DateTime, nullable=True)
  341. @property
  342. def dataset(self):
  343. return db.session.query(Dataset).filter(Dataset.id == self.dataset_id).first()
  344. @property
  345. def document(self):
  346. return db.session.query(Document).filter(Document.id == self.document_id).first()
  347. @property
  348. def previous_segment(self):
  349. return db.session.query(DocumentSegment).filter(
  350. DocumentSegment.document_id == self.document_id,
  351. DocumentSegment.position == self.position - 1
  352. ).first()
  353. @property
  354. def next_segment(self):
  355. return db.session.query(DocumentSegment).filter(
  356. DocumentSegment.document_id == self.document_id,
  357. DocumentSegment.position == self.position + 1
  358. ).first()
  359. class AppDatasetJoin(db.Model):
  360. __tablename__ = 'app_dataset_joins'
  361. __table_args__ = (
  362. db.PrimaryKeyConstraint('id', name='app_dataset_join_pkey'),
  363. db.Index('app_dataset_join_app_dataset_idx', 'dataset_id', 'app_id'),
  364. )
  365. id = db.Column(StringUUID, primary_key=True, nullable=False, server_default=db.text('uuid_generate_v4()'))
  366. app_id = db.Column(StringUUID, nullable=False)
  367. dataset_id = db.Column(StringUUID, nullable=False)
  368. created_at = db.Column(db.DateTime, nullable=False, server_default=db.func.current_timestamp())
  369. @property
  370. def app(self):
  371. return App.query.get(self.app_id)
  372. class DatasetQuery(db.Model):
  373. __tablename__ = 'dataset_queries'
  374. __table_args__ = (
  375. db.PrimaryKeyConstraint('id', name='dataset_query_pkey'),
  376. db.Index('dataset_query_dataset_id_idx', 'dataset_id'),
  377. )
  378. id = db.Column(StringUUID, primary_key=True, nullable=False, server_default=db.text('uuid_generate_v4()'))
  379. dataset_id = db.Column(StringUUID, nullable=False)
  380. content = db.Column(db.Text, nullable=False)
  381. source = db.Column(db.String(255), nullable=False)
  382. source_app_id = db.Column(StringUUID, nullable=True)
  383. created_by_role = db.Column(db.String, nullable=False)
  384. created_by = db.Column(StringUUID, nullable=False)
  385. created_at = db.Column(db.DateTime, nullable=False, server_default=db.func.current_timestamp())
  386. class DatasetKeywordTable(db.Model):
  387. __tablename__ = 'dataset_keyword_tables'
  388. __table_args__ = (
  389. db.PrimaryKeyConstraint('id', name='dataset_keyword_table_pkey'),
  390. db.Index('dataset_keyword_table_dataset_id_idx', 'dataset_id'),
  391. )
  392. id = db.Column(StringUUID, primary_key=True, server_default=db.text('uuid_generate_v4()'))
  393. dataset_id = db.Column(StringUUID, nullable=False, unique=True)
  394. keyword_table = db.Column(db.Text, nullable=False)
  395. data_source_type = db.Column(db.String(255), nullable=False,
  396. server_default=db.text("'database'::character varying"))
  397. @property
  398. def keyword_table_dict(self):
  399. class SetDecoder(json.JSONDecoder):
  400. def __init__(self, *args, **kwargs):
  401. super().__init__(object_hook=self.object_hook, *args, **kwargs)
  402. def object_hook(self, dct):
  403. if isinstance(dct, dict):
  404. for keyword, node_idxs in dct.items():
  405. if isinstance(node_idxs, list):
  406. dct[keyword] = set(node_idxs)
  407. return dct
  408. # get dataset
  409. dataset = Dataset.query.filter_by(
  410. id=self.dataset_id
  411. ).first()
  412. if not dataset:
  413. return None
  414. if self.data_source_type == 'database':
  415. return json.loads(self.keyword_table, cls=SetDecoder) if self.keyword_table else None
  416. else:
  417. file_key = 'keyword_files/' + dataset.tenant_id + '/' + self.dataset_id + '.txt'
  418. try:
  419. keyword_table_text = storage.load_once(file_key)
  420. if keyword_table_text:
  421. return json.loads(keyword_table_text.decode('utf-8'), cls=SetDecoder)
  422. return None
  423. except Exception as e:
  424. logging.exception(str(e))
  425. return None
  426. class Embedding(db.Model):
  427. __tablename__ = 'embeddings'
  428. __table_args__ = (
  429. db.PrimaryKeyConstraint('id', name='embedding_pkey'),
  430. db.UniqueConstraint('model_name', 'hash', 'provider_name', name='embedding_hash_idx')
  431. )
  432. id = db.Column(StringUUID, primary_key=True, server_default=db.text('uuid_generate_v4()'))
  433. model_name = db.Column(db.String(40), nullable=False,
  434. server_default=db.text("'text-embedding-ada-002'::character varying"))
  435. hash = db.Column(db.String(64), nullable=False)
  436. embedding = db.Column(db.LargeBinary, nullable=False)
  437. created_at = db.Column(db.DateTime, nullable=False, server_default=db.text('CURRENT_TIMESTAMP(0)'))
  438. provider_name = db.Column(db.String(40), nullable=False,
  439. server_default=db.text("''::character varying"))
  440. def set_embedding(self, embedding_data: list[float]):
  441. self.embedding = pickle.dumps(embedding_data, protocol=pickle.HIGHEST_PROTOCOL)
  442. def get_embedding(self) -> list[float]:
  443. return pickle.loads(self.embedding)
  444. class DatasetCollectionBinding(db.Model):
  445. __tablename__ = 'dataset_collection_bindings'
  446. __table_args__ = (
  447. db.PrimaryKeyConstraint('id', name='dataset_collection_bindings_pkey'),
  448. db.Index('provider_model_name_idx', 'provider_name', 'model_name')
  449. )
  450. id = db.Column(StringUUID, primary_key=True, server_default=db.text('uuid_generate_v4()'))
  451. provider_name = db.Column(db.String(40), nullable=False)
  452. model_name = db.Column(db.String(40), nullable=False)
  453. type = db.Column(db.String(40), server_default=db.text("'dataset'::character varying"), nullable=False)
  454. collection_name = db.Column(db.String(64), nullable=False)
  455. created_at = db.Column(db.DateTime, nullable=False, server_default=db.text('CURRENT_TIMESTAMP(0)'))