7 meses atrás · d489b8b3e0
--- a/api/core/rag/datasource/vdb/vector_factory.py
+++ b/api/core/rag/datasource/vdb/vector_factory.py
@@ -30,7 +30,7 @@ class AbstractVectorFactory(ABC):
 
				 class Vector:
			
 
				     def __init__(self, dataset: Dataset, attributes: list = None):
			
 
				         if attributes is None:
			
 
				-            attributes = ['doc_id', 'dataset_id', 'document_id', 'doc_hash']
			
 
				+            attributes = ['doc_id', 'dataset_id', 'document_id', 'doc_hash', 'page']
			
 
				         self._dataset = dataset
			
 
				         self._embeddings = self._get_embeddings()
			
 
				         self._attributes = attributes
			
@@ -107,6 +107,7 @@ class Vector:
 
				     def add_texts(self, documents: list[Document], **kwargs):
			
 
				         if kwargs.get('duplicate_check', False):
			
 
				             documents = self._filter_duplicate_texts(documents)
			
 
				+
			
 
				         embeddings = self._embeddings.embed_documents([document.page_content for document in documents])
			
 
				         self._vector_processor.create(
			
 
				             texts=documents,
			
--- a/api/core/workflow/nodes/knowledge_retrieval/knowledge_retrieval_node.py
+++ b/api/core/workflow/nodes/knowledge_retrieval/knowledge_retrieval_node.py
@@ -173,9 +173,13 @@ class KnowledgeRetrievalNode(BaseNode):
 
				         context_list = []
			
 
				         if all_documents:
			
 
				             document_score_list = {}
			
 
				+            page_number_list = {}
			
 
				             for item in all_documents:
			
 
				                 if item.metadata.get('score'):
			
 
				                     document_score_list[item.metadata['doc_id']] = item.metadata['score']
			
 
				+                # both 'page' and 'score' are metadata fields
			
 
				+                if item.metadata.get('page'):
			
 
				+                    page_number_list[item.metadata['doc_id']] = item.metadata['page']
			
 
				 
			
 
				             index_node_ids = [document.metadata['doc_id'] for document in all_documents]
			
 
				             segments = DocumentSegment.query.filter(
			
@@ -199,9 +203,9 @@ class KnowledgeRetrievalNode(BaseNode):
 
				                                                      Document.enabled == True,
			
 
				                                                      Document.archived == False,
			
 
				                                                      ).first()
			
 
				+
			
 
				                     resource_number = 1
			
 
				                     if dataset and document:
			
 
				-
			
 
				                         source = {
			
 
				                             'metadata': {
			
 
				                                 '_source': 'knowledge',
			
@@ -211,6 +215,7 @@ class KnowledgeRetrievalNode(BaseNode):
 
				                                 'document_id': document.id,
			
 
				                                 'document_name': document.name,
			
 
				                                 'document_data_source_type': document.data_source_type,
			
 
				+                                'page': page_number_list.get(segment.index_node_id, None),
			
 
				                                 'segment_id': segment.id,
			
 
				                                 'retriever_from': 'workflow',
			
 
				                                 'score': document_score_list.get(segment.index_node_id, None),
			
--- a/api/core/workflow/nodes/llm/llm_node.py
+++ b/api/core/workflow/nodes/llm/llm_node.py
@@ -402,6 +402,7 @@ class LLMNode(BaseNode):
 
				         if ('metadata' in context_dict and '_source' in context_dict['metadata']
			
 
				                 and context_dict['metadata']['_source'] == 'knowledge'):
			
 
				             metadata = context_dict.get('metadata', {})
			
 
				+
			
 
				             source = {
			
 
				                 'position': metadata.get('position'),
			
 
				                 'dataset_id': metadata.get('dataset_id'),
			
@@ -417,6 +418,7 @@ class LLMNode(BaseNode):
 
				                 'segment_position': metadata.get('segment_position'),
			
 
				                 'index_node_hash': metadata.get('segment_index_node_hash'),
			
 
				                 'content': context_dict.get('content'),
			
 
				+                'page': metadata.get('page'),
			
 
				             }
			
 
				 
			
 
				             return source