Browse Source

fix: Ignore empty page_content when appending to split_documents (#2898)

listeng 1 năm trước cách đây
mục cha
commit
696efe494e

+ 4 - 3
api/core/rag/index_processor/processor/paragraph_index_processor.py

@@ -45,11 +45,12 @@ class ParagraphIndexProcessor(BaseIndexProcessor):
                     # delete Spliter character
                     page_content = document_node.page_content
                     if page_content.startswith(".") or page_content.startswith("。"):
-                        page_content = page_content[1:]
+                        page_content = page_content[1:].strip()
                     else:
                         page_content = page_content
-                    document_node.page_content = page_content
-                    split_documents.append(document_node)
+                    if len(page_content) > 0:
+                        document_node.page_content = page_content
+                        split_documents.append(document_node)
             all_documents.extend(split_documents)
         return all_documents