dataset.py 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417
  1. import json
  2. import pickle
  3. from json import JSONDecodeError
  4. from sqlalchemy import func
  5. from sqlalchemy.dialects.postgresql import UUID
  6. from extensions.ext_database import db
  7. from models.account import Account
  8. from models.model import App, UploadFile
  class Dataset(db.Model):
      """ORM model mapped to the ``datasets`` table.

      Besides the stored columns, the class exposes read-only properties that
      run additional queries against related tables (keyword table, process
      rules, app joins and documents) each time they are accessed.
      """

      __tablename__ = 'datasets'
      __table_args__ = (
          db.PrimaryKeyConstraint('id', name='dataset_pkey'),
          db.Index('dataset_tenant_idx', 'tenant_id'),
      )

      # Values listed for ``indexing_technique``.
      INDEXING_TECHNIQUE_LIST = ['high_quality', 'economy']

      id = db.Column(UUID, server_default=db.text('uuid_generate_v4()'))
      tenant_id = db.Column(UUID, nullable=False)
      name = db.Column(db.String(255), nullable=False)
      description = db.Column(db.Text, nullable=True)
      provider = db.Column(db.String(255), nullable=False,
                           server_default=db.text("'vendor'::character varying"))
      permission = db.Column(db.String(255), nullable=False,
                             server_default=db.text("'only_me'::character varying"))
      data_source_type = db.Column(db.String(255))
      indexing_technique = db.Column(db.String(255), nullable=True)
      # JSON text; decoded by ``index_struct_dict``.
      index_struct = db.Column(db.Text, nullable=True)
      created_by = db.Column(UUID, nullable=False)
      created_at = db.Column(db.DateTime, nullable=False,
                             server_default=db.text('CURRENT_TIMESTAMP(0)'))
      updated_by = db.Column(UUID, nullable=True)
      updated_at = db.Column(db.DateTime, nullable=False,
                             server_default=db.text('CURRENT_TIMESTAMP(0)'))

      @property
      def dataset_keyword_table(self):
          """Return the ``DatasetKeywordTable`` row for this dataset, or ``None``."""
          # ``first()`` already yields None when nothing matches, so no extra
          # truthiness check is needed.
          return db.session.query(DatasetKeywordTable).filter(
              DatasetKeywordTable.dataset_id == self.id).first()

      @property
      def index_struct_dict(self):
          """Return ``index_struct`` decoded from JSON, or ``None`` when it is empty.

          Raises:
              json.JSONDecodeError: if the stored text is not valid JSON.
          """
          return json.loads(self.index_struct) if self.index_struct else None

      @property
      def created_by_account(self):
          """Return the ``Account`` whose primary key equals ``created_by``."""
          return Account.query.get(self.created_by)

      @property
      def latest_process_rule(self):
          """Return the most recently created ``DatasetProcessRule`` for this dataset, or ``None``."""
          return DatasetProcessRule.query.filter(DatasetProcessRule.dataset_id == self.id) \
              .order_by(DatasetProcessRule.created_at.desc()).first()

      @property
      def app_count(self):
          """Return the number of ``AppDatasetJoin`` rows referencing this dataset."""
          return db.session.query(func.count(AppDatasetJoin.id)).filter(
              AppDatasetJoin.dataset_id == self.id).scalar()

      @property
      def document_count(self):
          """Return the number of ``Document`` rows belonging to this dataset."""
          return db.session.query(func.count(Document.id)).filter(
              Document.dataset_id == self.id).scalar()

      @property
      def word_count(self):
          """Return the summed ``word_count`` of this dataset's documents.

          Returns 0 when the dataset has no documents or every document has a
          NULL word count.
          """
          # SUM() over zero rows (or only NULLs) is NULL; COALESCE needs an
          # explicit fallback to turn that into 0. With a single argument it
          # is a no-op and the property would return None.
          return Document.query.with_entities(func.coalesce(func.sum(Document.word_count), 0)) \
              .filter(Document.dataset_id == self.id).scalar()
  class DatasetProcessRule(db.Model):
      """ORM model mapped to the ``dataset_process_rules`` table.

      ``rules`` holds JSON text; use ``rules_dict`` for the decoded form.
      """

      __tablename__ = 'dataset_process_rules'
      __table_args__ = (
          db.PrimaryKeyConstraint('id', name='dataset_process_rule_pkey'),
          db.Index('dataset_process_rule_dataset_id_idx', 'dataset_id'),
      )

      id = db.Column(
          UUID,
          nullable=False,
          server_default=db.text('uuid_generate_v4()'),
      )
      dataset_id = db.Column(UUID, nullable=False)
      mode = db.Column(
          db.String(255),
          nullable=False,
          server_default=db.text("'automatic'::character varying"),
      )
      rules = db.Column(db.Text, nullable=True)
      created_by = db.Column(UUID, nullable=False)
      created_at = db.Column(
          db.DateTime,
          nullable=False,
          server_default=db.text('CURRENT_TIMESTAMP(0)'),
      )

      # Values listed for ``mode``.
      MODES = ['automatic', 'custom']
      # Identifiers of the known pre-processing rules.
      PRE_PROCESSING_RULES = ['remove_stopwords', 'remove_extra_spaces', 'remove_urls_emails']
      # Rule set associated with the 'automatic' mode.
      AUTOMATIC_RULES = {
          'pre_processing_rules': [
              {'id': 'remove_extra_spaces', 'enabled': True},
              {'id': 'remove_urls_emails', 'enabled': False}
          ],
          'segmentation': {
              'delimiter': '\n',
              'max_tokens': 1000
          }
      }

      def to_dict(self):
          """Return this rule as a plain dict, with ``rules`` decoded via ``rules_dict``."""
          payload = {
              'id': self.id,
              'dataset_id': self.dataset_id,
              'mode': self.mode,
              'rules': self.rules_dict,
              'created_by': self.created_by,
              'created_at': self.created_at,
          }
          return payload

      @property
      def rules_dict(self):
          """Return ``rules`` decoded from JSON.

          Yields ``None`` when ``rules`` is empty or is not valid JSON.
          """
          if not self.rules:
              return None
          try:
              return json.loads(self.rules)
          except JSONDecodeError:
              return None
  class Document(db.Model):
      """ORM model mapped to the ``documents`` table.

      Columns are grouped by the processing stage that fills them in. The
      properties below derive display values and look up related rows; most
      of them issue a database query on every access.
      """

      __tablename__ = 'documents'
      __table_args__ = (
          db.PrimaryKeyConstraint('id', name='document_pkey'),
          db.Index('document_dataset_id_idx', 'dataset_id'),
          db.Index('document_is_paused_idx', 'is_paused'),
      )

      # initial fields
      id = db.Column(UUID, nullable=False,
                     server_default=db.text('uuid_generate_v4()'))
      tenant_id = db.Column(UUID, nullable=False)
      dataset_id = db.Column(UUID, nullable=False)
      position = db.Column(db.Integer, nullable=False)
      data_source_type = db.Column(db.String(255), nullable=False)
      # JSON text; see ``data_source_info_dict`` / ``data_source_detail_dict``.
      data_source_info = db.Column(db.Text, nullable=True)
      dataset_process_rule_id = db.Column(UUID, nullable=True)
      batch = db.Column(db.String(255), nullable=False)
      name = db.Column(db.String(255), nullable=False)
      created_from = db.Column(db.String(255), nullable=False)
      created_by = db.Column(UUID, nullable=False)
      created_api_request_id = db.Column(UUID, nullable=True)
      created_at = db.Column(db.DateTime, nullable=False,
                             server_default=db.text('CURRENT_TIMESTAMP(0)'))

      # start processing
      processing_started_at = db.Column(db.DateTime, nullable=True)

      # parsing
      file_id = db.Column(db.Text, nullable=True)
      word_count = db.Column(db.Integer, nullable=True)
      parsing_completed_at = db.Column(db.DateTime, nullable=True)

      # cleaning
      cleaning_completed_at = db.Column(db.DateTime, nullable=True)

      # split
      splitting_completed_at = db.Column(db.DateTime, nullable=True)

      # indexing
      tokens = db.Column(db.Integer, nullable=True)
      indexing_latency = db.Column(db.Float, nullable=True)
      completed_at = db.Column(db.DateTime, nullable=True)

      # pause
      is_paused = db.Column(db.Boolean, nullable=True, server_default=db.text('false'))
      paused_by = db.Column(UUID, nullable=True)
      paused_at = db.Column(db.DateTime, nullable=True)

      # error
      error = db.Column(db.Text, nullable=True)
      stopped_at = db.Column(db.DateTime, nullable=True)

      # basic fields
      indexing_status = db.Column(db.String(
          255), nullable=False, server_default=db.text("'waiting'::character varying"))
      enabled = db.Column(db.Boolean, nullable=False,
                          server_default=db.text('true'))
      disabled_at = db.Column(db.DateTime, nullable=True)
      disabled_by = db.Column(UUID, nullable=True)
      archived = db.Column(db.Boolean, nullable=False,
                           server_default=db.text('false'))
      archived_reason = db.Column(db.String(255), nullable=True)
      archived_by = db.Column(UUID, nullable=True)
      archived_at = db.Column(db.DateTime, nullable=True)
      updated_at = db.Column(db.DateTime, nullable=False,
                             server_default=db.text('CURRENT_TIMESTAMP(0)'))
      doc_type = db.Column(db.String(40), nullable=True)
      doc_metadata = db.Column(db.JSON, nullable=True)

      # Values of ``data_source_type`` handled by ``data_source_detail_dict``.
      DATA_SOURCES = ['upload_file', 'notion_import']

      @property
      def display_status(self):
          """Map ``indexing_status`` plus the pause/enable/archive flags to a display label.

          Returns one of 'queuing', 'paused', 'indexing', 'error', 'available',
          'disabled' or 'archived', or ``None`` when no branch matches.
          """
          status = None
          if self.indexing_status == 'waiting':
              status = 'queuing'
          elif self.indexing_status not in ['completed', 'error', 'waiting'] and self.is_paused:
              # Checked before the 'indexing' branch so a paused in-progress
              # document reports 'paused'.
              status = 'paused'
          elif self.indexing_status in ['parsing', 'cleaning', 'splitting', 'indexing']:
              status = 'indexing'
          elif self.indexing_status == 'error':
              status = 'error'
          elif self.indexing_status == 'completed' and not self.archived and self.enabled:
              status = 'available'
          elif self.indexing_status == 'completed' and not self.archived and not self.enabled:
              status = 'disabled'
          elif self.indexing_status == 'completed' and self.archived:
              status = 'archived'
          return status

      @property
      def data_source_info_dict(self):
          """Return ``data_source_info`` decoded from JSON.

          Yields ``None`` when the column is empty and ``{}`` when it holds
          text that is not valid JSON.
          """
          if self.data_source_info:
              try:
                  data_source_info_dict = json.loads(self.data_source_info)
              except JSONDecodeError:
                  data_source_info_dict = {}
              return data_source_info_dict
          return None

      @property
      def data_source_detail_dict(self):
          """Return a detail dict describing where this document came from.

          * ``upload_file``: looks up the ``UploadFile`` named by
            ``upload_file_id`` and returns its fields under ``'upload_file'``;
            ``{}`` if that row does not exist.
          * ``notion_import``: returns the decoded ``data_source_info``.
          * anything else, or an empty ``data_source_info``: ``{}``.

          Raises:
              json.JSONDecodeError: if ``data_source_info`` is not valid JSON
                  for one of the two handled source types.
              KeyError: if an ``upload_file`` payload lacks ``upload_file_id``.
          """
          if not self.data_source_info:
              return {}

          source_type = self.data_source_type
          if source_type not in ('upload_file', 'notion_import'):
              return {}

          # Parse only for the handled source types, so other types never
          # raise on malformed JSON.
          info = json.loads(self.data_source_info)
          if source_type == 'notion_import':
              return info

          file_detail = db.session.query(UploadFile). \
              filter(UploadFile.id == info['upload_file_id']). \
              one_or_none()
          if not file_detail:
              return {}
          return {
              'upload_file': {
                  'id': file_detail.id,
                  'name': file_detail.name,
                  'size': file_detail.size,
                  'extension': file_detail.extension,
                  'mime_type': file_detail.mime_type,
                  'created_by': file_detail.created_by,
                  'created_at': file_detail.created_at.timestamp()
              }
          }

      @property
      def average_segment_length(self):
          """Return ``word_count // segment_count``, or 0 when either is missing or zero."""
          word_count = self.word_count
          if not word_count:
              return 0
          # ``segment_count`` runs a COUNT query on every access; read it once
          # so the value checked is the value divided by.
          segment_count = self.segment_count
          if not segment_count:
              return 0
          return word_count // segment_count

      @property
      def dataset_process_rule(self):
          """Return the ``DatasetProcessRule`` referenced by ``dataset_process_rule_id``, or ``None``."""
          if self.dataset_process_rule_id:
              return DatasetProcessRule.query.get(self.dataset_process_rule_id)
          return None

      @property
      def dataset(self):
          """Return the ``Dataset`` whose primary key equals ``dataset_id``."""
          return Dataset.query.get(self.dataset_id)

      @property
      def segment_count(self):
          """Return the number of ``DocumentSegment`` rows for this document (one query per access)."""
          return DocumentSegment.query.filter(DocumentSegment.document_id == self.id).count()

      @property
      def hit_count(self):
          """Return the summed ``hit_count`` of this document's segments, 0 when there are none."""
          # SUM() over zero rows is NULL; the explicit 0 fallback makes
          # COALESCE effective. With a single argument it is a no-op.
          return DocumentSegment.query.with_entities(func.coalesce(func.sum(DocumentSegment.hit_count), 0)) \
              .filter(DocumentSegment.document_id == self.id).scalar()
  class DocumentSegment(db.Model):
      """ORM model mapped to the ``document_segments`` table.

      The properties look up the owning dataset and document, the stored
      embedding matching ``index_node_hash``, and the neighbouring segments
      by ``position``.
      """

      __tablename__ = 'document_segments'
      __table_args__ = (
          db.PrimaryKeyConstraint('id', name='document_segment_pkey'),
          db.Index('document_segment_dataset_id_idx', 'dataset_id'),
          db.Index('document_segment_document_id_idx', 'document_id'),
          db.Index('document_segment_tenant_dataset_idx', 'dataset_id', 'tenant_id'),
          db.Index('document_segment_tenant_document_idx', 'document_id', 'tenant_id'),
          db.Index('document_segment_dataset_node_idx', 'dataset_id', 'index_node_id'),
      )

      # initial fields
      id = db.Column(
          UUID,
          nullable=False,
          server_default=db.text('uuid_generate_v4()'),
      )
      tenant_id = db.Column(UUID, nullable=False)
      dataset_id = db.Column(UUID, nullable=False)
      document_id = db.Column(UUID, nullable=False)
      position = db.Column(db.Integer, nullable=False)
      content = db.Column(db.Text, nullable=False)
      word_count = db.Column(db.Integer, nullable=False)
      tokens = db.Column(db.Integer, nullable=False)

      # indexing fields
      keywords = db.Column(db.JSON, nullable=True)
      index_node_id = db.Column(db.String(255), nullable=True)
      index_node_hash = db.Column(db.String(255), nullable=True)

      # basic fields
      hit_count = db.Column(db.Integer, nullable=False, default=0)
      enabled = db.Column(
          db.Boolean,
          nullable=False,
          server_default=db.text('true'),
      )
      disabled_at = db.Column(db.DateTime, nullable=True)
      disabled_by = db.Column(UUID, nullable=True)
      status = db.Column(
          db.String(255),
          nullable=False,
          server_default=db.text("'waiting'::character varying"),
      )
      created_by = db.Column(UUID, nullable=False)
      created_at = db.Column(
          db.DateTime,
          nullable=False,
          server_default=db.text('CURRENT_TIMESTAMP(0)'),
      )
      indexing_at = db.Column(db.DateTime, nullable=True)
      completed_at = db.Column(db.DateTime, nullable=True)
      error = db.Column(db.Text, nullable=True)
      stopped_at = db.Column(db.DateTime, nullable=True)

      @property
      def dataset(self):
          """Return the ``Dataset`` row with id ``dataset_id``, or ``None``."""
          dataset_query = db.session.query(Dataset).filter(Dataset.id == self.dataset_id)
          return dataset_query.first()

      @property
      def document(self):
          """Return the ``Document`` row with id ``document_id``, or ``None``."""
          document_query = db.session.query(Document).filter(Document.id == self.document_id)
          return document_query.first()

      @property
      def embedding(self):
          """Return the stored ``Embedding.embedding`` value for ``index_node_hash``.

          Yields ``None`` when the segment has no hash or no embedding row
          matches it. The value is returned exactly as stored in the column.
          """
          if not self.index_node_hash:
              return None
          record = db.session.query(Embedding).filter(
              Embedding.hash == self.index_node_hash).first()
          return record.embedding if record else None

      @property
      def previous_segment(self):
          """Return the segment of the same document at ``position - 1``, or ``None``."""
          same_document = DocumentSegment.document_id == self.document_id
          one_before = DocumentSegment.position == self.position - 1
          return db.session.query(DocumentSegment).filter(same_document, one_before).first()

      @property
      def next_segment(self):
          """Return the segment of the same document at ``position + 1``, or ``None``."""
          same_document = DocumentSegment.document_id == self.document_id
          one_after = DocumentSegment.position == self.position + 1
          return db.session.query(DocumentSegment).filter(same_document, one_after).first()
  class AppDatasetJoin(db.Model):
      """ORM model mapped to ``app_dataset_joins``: pairs an ``app_id`` with a ``dataset_id``."""

      __tablename__ = 'app_dataset_joins'
      __table_args__ = (
          db.PrimaryKeyConstraint('id', name='app_dataset_join_pkey'),
          db.Index('app_dataset_join_app_dataset_idx', 'dataset_id', 'app_id'),
      )

      id = db.Column(
          UUID,
          primary_key=True,
          nullable=False,
          server_default=db.text('uuid_generate_v4()'),
      )
      app_id = db.Column(UUID, nullable=False)
      dataset_id = db.Column(UUID, nullable=False)
      created_at = db.Column(
          db.DateTime,
          nullable=False,
          server_default=db.func.current_timestamp(),
      )

      @property
      def app(self):
          """Return the ``App`` whose primary key equals ``app_id``."""
          linked_app = App.query.get(self.app_id)
          return linked_app
  class DatasetQuery(db.Model):
      """ORM model mapped to the ``dataset_queries`` table.

      Declares columns only; the class defines no behaviour of its own.
      """

      __tablename__ = 'dataset_queries'
      __table_args__ = (
          db.PrimaryKeyConstraint('id', name='dataset_query_pkey'),
          db.Index('dataset_query_dataset_id_idx', 'dataset_id'),
      )

      id = db.Column(
          UUID,
          primary_key=True,
          nullable=False,
          server_default=db.text('uuid_generate_v4()'),
      )
      dataset_id = db.Column(UUID, nullable=False)
      content = db.Column(db.Text, nullable=False)
      source = db.Column(db.String(255), nullable=False)
      source_app_id = db.Column(UUID, nullable=True)
      created_by_role = db.Column(db.String, nullable=False)
      created_by = db.Column(UUID, nullable=False)
      created_at = db.Column(
          db.DateTime,
          nullable=False,
          server_default=db.func.current_timestamp(),
      )
  class DatasetKeywordTable(db.Model):
      """ORM model mapped to ``dataset_keyword_tables``.

      ``dataset_id`` is declared unique, so a dataset has at most one row.
      ``keyword_table`` holds JSON text.
      """

      __tablename__ = 'dataset_keyword_tables'
      __table_args__ = (
          db.PrimaryKeyConstraint('id', name='dataset_keyword_table_pkey'),
          db.Index('dataset_keyword_table_dataset_id_idx', 'dataset_id'),
      )

      id = db.Column(
          UUID,
          primary_key=True,
          server_default=db.text('uuid_generate_v4()'),
      )
      dataset_id = db.Column(UUID, nullable=False, unique=True)
      keyword_table = db.Column(db.Text, nullable=False)

      @property
      def keyword_table_dict(self):
          """Return ``keyword_table`` decoded from JSON, or ``None`` when it is empty.

          Raises:
              json.JSONDecodeError: if the stored text is not valid JSON.
          """
          if not self.keyword_table:
              return None
          return json.loads(self.keyword_table)
  class Embedding(db.Model):
      """ORM model mapped to the ``embeddings`` table.

      ``hash`` is unique per row. ``embedding`` stores the pickled payload
      written by ``set_embedding`` and read back by ``get_embedding``.
      """

      __tablename__ = 'embeddings'
      __table_args__ = (
          db.PrimaryKeyConstraint('id', name='embedding_pkey'),
          db.UniqueConstraint('hash', name='embedding_hash_idx')
      )

      id = db.Column(
          UUID,
          primary_key=True,
          server_default=db.text('uuid_generate_v4()'),
      )
      hash = db.Column(db.String(64), nullable=False)
      embedding = db.Column(db.LargeBinary, nullable=False)
      created_at = db.Column(
          db.DateTime,
          nullable=False,
          server_default=db.text('CURRENT_TIMESTAMP(0)'),
      )

      def set_embedding(self, embedding_data: list[float]):
          """Pickle ``embedding_data`` and store the bytes in ``embedding``."""
          serialized = pickle.dumps(embedding_data, protocol=pickle.HIGHEST_PROTOCOL)
          self.embedding = serialized

      def get_embedding(self) -> list[float]:
          """Unpickle and return the value stored in ``embedding``."""
          # NOTE(review): pickle.loads executes arbitrary code if the stored
          # bytes are attacker-controlled; this assumes only set_embedding
          # writes the column - confirm.
          stored_bytes = self.embedding
          return pickle.loads(stored_bytes)