|
@@ -24,56 +24,64 @@ class Jieba(BaseKeyword):
|
|
|
self._config = KeywordTableConfig()
|
|
|
|
|
|
def create(self, texts: list[Document], **kwargs) -> BaseKeyword:
|
|
|
- keyword_table_handler = JiebaKeywordTableHandler()
|
|
|
- keyword_table = self._get_dataset_keyword_table()
|
|
|
- for text in texts:
|
|
|
- keywords = keyword_table_handler.extract_keywords(text.page_content, self._config.max_keywords_per_chunk)
|
|
|
- self._update_segment_keywords(self.dataset.id, text.metadata['doc_id'], list(keywords))
|
|
|
- keyword_table = self._add_text_to_keyword_table(keyword_table, text.metadata['doc_id'], list(keywords))
|
|
|
+ lock_name = 'keyword_indexing_lock_{}'.format(self.dataset.id)
|
|
|
+ with redis_client.lock(lock_name, timeout=600):
|
|
|
+ keyword_table_handler = JiebaKeywordTableHandler()
|
|
|
+ keyword_table = self._get_dataset_keyword_table()
|
|
|
+ for text in texts:
|
|
|
+ keywords = keyword_table_handler.extract_keywords(text.page_content, self._config.max_keywords_per_chunk)
|
|
|
+ self._update_segment_keywords(self.dataset.id, text.metadata['doc_id'], list(keywords))
|
|
|
+ keyword_table = self._add_text_to_keyword_table(keyword_table, text.metadata['doc_id'], list(keywords))
|
|
|
|
|
|
- self._save_dataset_keyword_table(keyword_table)
|
|
|
+ self._save_dataset_keyword_table(keyword_table)
|
|
|
|
|
|
- return self
|
|
|
+ return self
|
|
|
|
|
|
def add_texts(self, texts: list[Document], **kwargs):
|
|
|
- keyword_table_handler = JiebaKeywordTableHandler()
|
|
|
-
|
|
|
- keyword_table = self._get_dataset_keyword_table()
|
|
|
- keywords_list = kwargs.get('keywords_list', None)
|
|
|
- for i in range(len(texts)):
|
|
|
- text = texts[i]
|
|
|
- if keywords_list:
|
|
|
- keywords = keywords_list[i]
|
|
|
- else:
|
|
|
- keywords = keyword_table_handler.extract_keywords(text.page_content, self._config.max_keywords_per_chunk)
|
|
|
- self._update_segment_keywords(self.dataset.id, text.metadata['doc_id'], list(keywords))
|
|
|
- keyword_table = self._add_text_to_keyword_table(keyword_table, text.metadata['doc_id'], list(keywords))
|
|
|
-
|
|
|
- self._save_dataset_keyword_table(keyword_table)
|
|
|
+ lock_name = 'keyword_indexing_lock_{}'.format(self.dataset.id)
|
|
|
+ with redis_client.lock(lock_name, timeout=600):
|
|
|
+ keyword_table_handler = JiebaKeywordTableHandler()
|
|
|
+
|
|
|
+ keyword_table = self._get_dataset_keyword_table()
|
|
|
+ keywords_list = kwargs.get('keywords_list', None)
|
|
|
+ for i in range(len(texts)):
|
|
|
+ text = texts[i]
|
|
|
+ if keywords_list:
|
|
|
+ keywords = keywords_list[i]
|
|
|
+ else:
|
|
|
+ keywords = keyword_table_handler.extract_keywords(text.page_content, self._config.max_keywords_per_chunk)
|
|
|
+ self._update_segment_keywords(self.dataset.id, text.metadata['doc_id'], list(keywords))
|
|
|
+ keyword_table = self._add_text_to_keyword_table(keyword_table, text.metadata['doc_id'], list(keywords))
|
|
|
+
|
|
|
+ self._save_dataset_keyword_table(keyword_table)
|
|
|
|
|
|
def text_exists(self, id: str) -> bool:
|
|
|
keyword_table = self._get_dataset_keyword_table()
|
|
|
return id in set.union(*keyword_table.values())
|
|
|
|
|
|
def delete_by_ids(self, ids: list[str]) -> None:
|
|
|
- keyword_table = self._get_dataset_keyword_table()
|
|
|
- keyword_table = self._delete_ids_from_keyword_table(keyword_table, ids)
|
|
|
+ lock_name = 'keyword_indexing_lock_{}'.format(self.dataset.id)
|
|
|
+ with redis_client.lock(lock_name, timeout=600):
|
|
|
+ keyword_table = self._get_dataset_keyword_table()
|
|
|
+ keyword_table = self._delete_ids_from_keyword_table(keyword_table, ids)
|
|
|
|
|
|
- self._save_dataset_keyword_table(keyword_table)
|
|
|
+ self._save_dataset_keyword_table(keyword_table)
|
|
|
|
|
|
def delete_by_document_id(self, document_id: str):
|
|
|
- # get segment ids by document_id
|
|
|
- segments = db.session.query(DocumentSegment).filter(
|
|
|
- DocumentSegment.dataset_id == self.dataset.id,
|
|
|
- DocumentSegment.document_id == document_id
|
|
|
- ).all()
|
|
|
+ lock_name = 'keyword_indexing_lock_{}'.format(self.dataset.id)
|
|
|
+ with redis_client.lock(lock_name, timeout=600):
|
|
|
+ # get segment ids by document_id
|
|
|
+ segments = db.session.query(DocumentSegment).filter(
|
|
|
+ DocumentSegment.dataset_id == self.dataset.id,
|
|
|
+ DocumentSegment.document_id == document_id
|
|
|
+ ).all()
|
|
|
|
|
|
- ids = [segment.index_node_id for segment in segments]
|
|
|
+ ids = [segment.index_node_id for segment in segments]
|
|
|
|
|
|
- keyword_table = self._get_dataset_keyword_table()
|
|
|
- keyword_table = self._delete_ids_from_keyword_table(keyword_table, ids)
|
|
|
+ keyword_table = self._get_dataset_keyword_table()
|
|
|
+ keyword_table = self._delete_ids_from_keyword_table(keyword_table, ids)
|
|
|
|
|
|
- self._save_dataset_keyword_table(keyword_table)
|
|
|
+ self._save_dataset_keyword_table(keyword_table)
|
|
|
|
|
|
def search(
|
|
|
self, query: str,
|
|
@@ -106,13 +114,15 @@ class Jieba(BaseKeyword):
|
|
|
return documents
|
|
|
|
|
|
def delete(self) -> None:
|
|
|
- dataset_keyword_table = self.dataset.dataset_keyword_table
|
|
|
- if dataset_keyword_table:
|
|
|
- db.session.delete(dataset_keyword_table)
|
|
|
- db.session.commit()
|
|
|
- if dataset_keyword_table.data_source_type != 'database':
|
|
|
- file_key = 'keyword_files/' + self.dataset.tenant_id + '/' + self.dataset.id + '.txt'
|
|
|
- storage.delete(file_key)
|
|
|
+ lock_name = 'keyword_indexing_lock_{}'.format(self.dataset.id)
|
|
|
+ with redis_client.lock(lock_name, timeout=600):
|
|
|
+ dataset_keyword_table = self.dataset.dataset_keyword_table
|
|
|
+ if dataset_keyword_table:
|
|
|
+ db.session.delete(dataset_keyword_table)
|
|
|
+ db.session.commit()
|
|
|
+ if dataset_keyword_table.data_source_type != 'database':
|
|
|
+ file_key = 'keyword_files/' + self.dataset.tenant_id + '/' + self.dataset.id + '.txt'
|
|
|
+ storage.delete(file_key)
|
|
|
|
|
|
def _save_dataset_keyword_table(self, keyword_table):
|
|
|
keyword_table_dict = {
|
|
@@ -135,33 +145,31 @@ class Jieba(BaseKeyword):
|
|
|
storage.save(file_key, json.dumps(keyword_table_dict, cls=SetEncoder).encode('utf-8'))
|
|
|
|
|
|
def _get_dataset_keyword_table(self) -> Optional[dict]:
|
|
|
- lock_name = 'keyword_indexing_lock_{}'.format(self.dataset.id)
|
|
|
- with redis_client.lock(lock_name, timeout=20):
|
|
|
- dataset_keyword_table = self.dataset.dataset_keyword_table
|
|
|
- if dataset_keyword_table:
|
|
|
- keyword_table_dict = dataset_keyword_table.keyword_table_dict
|
|
|
- if keyword_table_dict:
|
|
|
- return keyword_table_dict['__data__']['table']
|
|
|
- else:
|
|
|
- keyword_data_source_type = current_app.config['KEYWORD_DATA_SOURCE_TYPE']
|
|
|
- dataset_keyword_table = DatasetKeywordTable(
|
|
|
- dataset_id=self.dataset.id,
|
|
|
- keyword_table='',
|
|
|
- data_source_type=keyword_data_source_type,
|
|
|
- )
|
|
|
- if keyword_data_source_type == 'database':
|
|
|
- dataset_keyword_table.keyword_table = json.dumps({
|
|
|
- '__type__': 'keyword_table',
|
|
|
- '__data__': {
|
|
|
- "index_id": self.dataset.id,
|
|
|
- "summary": None,
|
|
|
- "table": {}
|
|
|
- }
|
|
|
- }, cls=SetEncoder)
|
|
|
- db.session.add(dataset_keyword_table)
|
|
|
- db.session.commit()
|
|
|
+ dataset_keyword_table = self.dataset.dataset_keyword_table
|
|
|
+ if dataset_keyword_table:
|
|
|
+ keyword_table_dict = dataset_keyword_table.keyword_table_dict
|
|
|
+ if keyword_table_dict:
|
|
|
+ return keyword_table_dict['__data__']['table']
|
|
|
+ else:
|
|
|
+ keyword_data_source_type = current_app.config['KEYWORD_DATA_SOURCE_TYPE']
|
|
|
+ dataset_keyword_table = DatasetKeywordTable(
|
|
|
+ dataset_id=self.dataset.id,
|
|
|
+ keyword_table='',
|
|
|
+ data_source_type=keyword_data_source_type,
|
|
|
+ )
|
|
|
+ if keyword_data_source_type == 'database':
|
|
|
+ dataset_keyword_table.keyword_table = json.dumps({
|
|
|
+ '__type__': 'keyword_table',
|
|
|
+ '__data__': {
|
|
|
+ "index_id": self.dataset.id,
|
|
|
+ "summary": None,
|
|
|
+ "table": {}
|
|
|
+ }
|
|
|
+ }, cls=SetEncoder)
|
|
|
+ db.session.add(dataset_keyword_table)
|
|
|
+ db.session.commit()
|
|
|
|
|
|
- return {}
|
|
|
+ return {}
|
|
|
|
|
|
def _add_text_to_keyword_table(self, keyword_table: dict, id: str, keywords: list[str]) -> dict:
|
|
|
for keyword in keywords:
|