Jelajahi Sumber

Fix/filter empty segment (#1004)

Co-authored-by: jyong <jyong@dify.ai>
Jyong 1 tahun lalu
induk
komit
2d604d9330
2 file diubah dengan 10 tambahan dan 5 penghapusan
  1. 6 5
      api/core/indexing_runner.py
  2. 4 0
      api/services/dataset_service.py

+ 6 - 5
api/core/indexing_runner.py

@@ -525,12 +525,13 @@ class IndexingRunner:
             documents = splitter.split_documents([text_doc])
             split_documents = []
             for document_node in documents:
-                doc_id = str(uuid.uuid4())
-                hash = helper.generate_text_hash(document_node.page_content)
-                document_node.metadata['doc_id'] = doc_id
-                document_node.metadata['doc_hash'] = hash
 
-                split_documents.append(document_node)
+                if document_node.page_content.strip():
+                    doc_id = str(uuid.uuid4())
+                    hash = helper.generate_text_hash(document_node.page_content)
+                    document_node.metadata['doc_id'] = doc_id
+                    document_node.metadata['doc_hash'] = hash
+                    split_documents.append(document_node)
             all_documents.extend(split_documents)
         # processing qa document
         if document_form == 'qa_model':

+ 4 - 0
api/services/dataset_service.py

@@ -891,6 +891,10 @@ class SegmentService:
         if document.doc_form == 'qa_model':
             if 'answer' not in args or not args['answer']:
                 raise ValueError("Answer is required")
+            if not args['answer'].strip():
+                raise ValueError("Answer is empty")
+        if 'content' not in args or not args['content'] or not args['content'].strip():
+            raise ValueError("Content is empty")
 
     @classmethod
     def create_segment(cls, args: dict, document: Document, dataset: Dataset):