@@ -1046,73 +1046,11 @@ class SegmentService:
                 credentials=embedding_model.credentials,
                 texts=[content]
             )
-        max_position = db.session.query(func.max(DocumentSegment.position)).filter(
-            DocumentSegment.document_id == document.id
-        ).scalar()
-        segment_document = DocumentSegment(
-            tenant_id=current_user.current_tenant_id,
-            dataset_id=document.dataset_id,
-            document_id=document.id,
-            index_node_id=doc_id,
-            index_node_hash=segment_hash,
-            position=max_position + 1 if max_position else 1,
-            content=content,
-            word_count=len(content),
-            tokens=tokens,
-            status='completed',
-            indexing_at=datetime.datetime.utcnow(),
-            completed_at=datetime.datetime.utcnow(),
-            created_by=current_user.id
-        )
-        if document.doc_form == 'qa_model':
-            segment_document.answer = args['answer']
-
-        db.session.add(segment_document)
-        db.session.commit()
-
-        # save vector index
-        try:
-            VectorService.create_segments_vector([args['keywords']], [segment_document], dataset)
-        except Exception as e:
-            logging.exception("create segment index failed")
-            segment_document.enabled = False
-            segment_document.disabled_at = datetime.datetime.utcnow()
-            segment_document.status = 'error'
-            segment_document.error = str(e)
-            db.session.commit()
-        segment = db.session.query(DocumentSegment).filter(DocumentSegment.id == segment_document.id).first()
-        return segment
-
-    @classmethod
-    def multi_create_segment(cls, segments: list, document: Document, dataset: Dataset):
-        embedding_model = None
-        if dataset.indexing_technique == 'high_quality':
-            model_manager = ModelManager()
-            embedding_model = model_manager.get_model_instance(
-                tenant_id=current_user.current_tenant_id,
-                provider=dataset.embedding_model_provider,
-                model_type=ModelType.TEXT_EMBEDDING,
-                model=dataset.embedding_model
-            )
-        max_position = db.session.query(func.max(DocumentSegment.position)).filter(
-            DocumentSegment.document_id == document.id
-        ).scalar()
-        pre_segment_data_list = []
-        segment_data_list = []
-        keywords_list = []
-        for segment_item in segments:
-            content = segment_item['content']
-            doc_id = str(uuid.uuid4())
-            segment_hash = helper.generate_text_hash(content)
-            tokens = 0
-            if dataset.indexing_technique == 'high_quality' and embedding_model:
-                # calc embedding use tokens
-                model_type_instance = cast(TextEmbeddingModel, embedding_model.model_type_instance)
-                tokens = model_type_instance.get_num_tokens(
-                    model=embedding_model.model,
-                    credentials=embedding_model.credentials,
-                    texts=[content]
-                )
+        lock_name = 'add_segment_lock_document_id_{}'.format(document.id)
+        with redis_client.lock(lock_name, timeout=600):
+            max_position = db.session.query(func.max(DocumentSegment.position)).filter(
+                DocumentSegment.document_id == document.id
+            ).scalar()
             segment_document = DocumentSegment(
                 tenant_id=current_user.current_tenant_id,
                 dataset_id=document.dataset_id,
@@ -1129,25 +1067,91 @@ class SegmentService:
                 created_by=current_user.id
             )
             if document.doc_form == 'qa_model':
-                segment_document.answer = segment_item['answer']
-            db.session.add(segment_document)
-            segment_data_list.append(segment_document)
+                segment_document.answer = args['answer']

-            pre_segment_data_list.append(segment_document)
-            keywords_list.append(segment_item['keywords'])
+            db.session.add(segment_document)
+            db.session.commit()

-        try:
             # save vector index
-            VectorService.create_segments_vector(keywords_list, pre_segment_data_list, dataset)
-        except Exception as e:
-            logging.exception("create segment index failed")
-            for segment_document in segment_data_list:
+            try:
+                VectorService.create_segments_vector([args['keywords']], [segment_document], dataset)
+            except Exception as e:
+                logging.exception("create segment index failed")
                 segment_document.enabled = False
                 segment_document.disabled_at = datetime.datetime.utcnow()
                 segment_document.status = 'error'
                 segment_document.error = str(e)
-        db.session.commit()
-        return segment_data_list
+                db.session.commit()
+            segment = db.session.query(DocumentSegment).filter(DocumentSegment.id == segment_document.id).first()
+            return segment
+
+    @classmethod
+    def multi_create_segment(cls, segments: list, document: Document, dataset: Dataset):
+        lock_name = 'multi_add_segment_lock_document_id_{}'.format(document.id)
+        with redis_client.lock(lock_name, timeout=600):
+            embedding_model = None
+            if dataset.indexing_technique == 'high_quality':
+                model_manager = ModelManager()
+                embedding_model = model_manager.get_model_instance(
+                    tenant_id=current_user.current_tenant_id,
+                    provider=dataset.embedding_model_provider,
+                    model_type=ModelType.TEXT_EMBEDDING,
+                    model=dataset.embedding_model
+                )
+            max_position = db.session.query(func.max(DocumentSegment.position)).filter(
+                DocumentSegment.document_id == document.id
+            ).scalar()
+            pre_segment_data_list = []
+            segment_data_list = []
+            keywords_list = []
+            for segment_item in segments:
+                content = segment_item['content']
+                doc_id = str(uuid.uuid4())
+                segment_hash = helper.generate_text_hash(content)
+                tokens = 0
+                if dataset.indexing_technique == 'high_quality' and embedding_model:
+                    # calc embedding use tokens
+                    model_type_instance = cast(TextEmbeddingModel, embedding_model.model_type_instance)
+                    tokens = model_type_instance.get_num_tokens(
+                        model=embedding_model.model,
+                        credentials=embedding_model.credentials,
+                        texts=[content]
+                    )
+                segment_document = DocumentSegment(
+                    tenant_id=current_user.current_tenant_id,
+                    dataset_id=document.dataset_id,
+                    document_id=document.id,
+                    index_node_id=doc_id,
+                    index_node_hash=segment_hash,
+                    position=max_position + 1 if max_position else 1,
+                    content=content,
+                    word_count=len(content),
+                    tokens=tokens,
+                    status='completed',
+                    indexing_at=datetime.datetime.utcnow(),
+                    completed_at=datetime.datetime.utcnow(),
+                    created_by=current_user.id
+                )
+                if document.doc_form == 'qa_model':
+                    segment_document.answer = segment_item['answer']
+                db.session.add(segment_document)
+                segment_data_list.append(segment_document)
+
+                pre_segment_data_list.append(segment_document)
+                keywords_list.append(segment_item['keywords'])
+
+            try:
+                # save vector index
+                VectorService.create_segments_vector(keywords_list, pre_segment_data_list, dataset)
+            except Exception as e:
+                logging.exception("create segment index failed")
+                for segment_document in segment_data_list:
+                    segment_document.enabled = False
+                    segment_document.disabled_at = datetime.datetime.utcnow()
+                    segment_document.status = 'error'
+                    segment_document.error = str(e)
+            db.session.commit()
+            return segment_data_list

     @classmethod
     def update_segment(cls, args: dict, segment: DocumentSegment, document: Document, dataset: Dataset):
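Side note on the change above: both hunks wrap the "read max(position), then insert" sequence in a per-document Redis lock, so two concurrent requests can no longer read the same maximum and allocate duplicate positions. Below is a minimal, self-contained sketch of that pattern using plain redis-py; the in-memory positions dict stands in for the DocumentSegment table, and everything except the lock call itself is illustrative, not part of the actual codebase.

    from collections import defaultdict

    import redis

    # assumed local Redis instance; Dify's shared redis_client plays this role
    redis_client = redis.Redis(host="localhost", port=6379)

    # document_id -> list of allocated positions (stand-in for the DB table)
    positions = defaultdict(list)

    def add_segment(document_id: str) -> int:
        lock_name = 'add_segment_lock_document_id_{}'.format(document_id)
        # blocking lock with a 600s safety timeout, mirroring the diff; the
        # timeout releases the lock if a worker dies before finishing
        with redis_client.lock(lock_name, timeout=600):
            current = positions[document_id]
            max_position = max(current) if current else None
            # same allocation expression as the service code
            new_position = max_position + 1 if max_position else 1
            current.append(new_position)
            return new_position

Without the lock, two workers interleaving between the max() read and the append could both return the same position; with it, allocation is serialized per document while writes to different documents still proceed in parallel.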