1 месяц назад · 1789437cc5
--- a/api/core/workflow/nodes/document_extractor/node.py
+++ b/api/core/workflow/nodes/document_extractor/node.py
@@ -2,7 +2,6 @@ import csv
 
				 import io
			
 
				 import json
			
 
				 import logging
			
 
				-import operator
			
 
				 import os
			
 
				 import tempfile
			
 
				 from collections.abc import Mapping, Sequence
			
@@ -12,6 +11,9 @@ import docx
 
				 import pandas as pd
			
 
				 import pypdfium2  # type: ignore
			
 
				 import yaml  # type: ignore
			
 
				+from docx.document import Document
			
 
				+from docx.oxml.table import CT_Tbl
			
 
				+from docx.oxml.text.paragraph import CT_P
			
 
				 from docx.table import Table
			
 
				 from docx.text.paragraph import Paragraph
			
 
				 
			
@@ -231,6 +233,13 @@ def _extract_text_from_doc(file_content: bytes) -> str:
 
				         raise TextExtractionError(f"Failed to extract text from DOC: {str(e)}") from e
			
 
				 
			
 
				 
			
 
				+def paser_docx_part(block, doc: Document, content_items, i):
			
 
				+    """
			
 
				+    Wrap one XML element and record it in ``content_items``.
			
 
				+
			
 
				+    :param block: element to classify; only ``CT_P`` and ``CT_Tbl``
			
 
				+        instances are recorded, anything else is ignored.
			
 
				+    :param doc: passed as the second argument to the ``Paragraph`` /
			
 
				+        ``Table`` constructor.
			
 
				+    :param content_items: list that is appended to in place with
			
 
				+        ``(i, "paragraph" | "table", wrapper)`` tuples.
			
 
				+    :param i: index stored as the first element of the appended tuple.
			
 
				+    :return: None; the result is the mutation of ``content_items``.
			
 
				+    """
			
 
				+    # Paragraph element: wrap it so callers can use the Paragraph API.
			
 
				+    if isinstance(block, CT_P):
			
 
				+        content_items.append((i, "paragraph", Paragraph(block, doc)))
			
 
				+    # Table element: wrap it the same way, tagged as "table".
			
 
				+    elif isinstance(block, CT_Tbl):
			
 
				+        content_items.append((i, "table", Table(block, doc)))
			
 
				+
			
 
				+
			
 
				 def _extract_text_from_docx(file_content: bytes) -> str:
			
 
				     """
			
 
				     Extract text from a DOCX file.
			
@@ -244,16 +253,13 @@ def _extract_text_from_docx(file_content: bytes) -> str:
 
				         # Keep track of paragraph and table positions
			
 
				         content_items: list[tuple[int, str, Table | Paragraph]] = []
			
 
				 
			
 
				-        # Process paragraphs and tables
			
 
				-        for i, paragraph in enumerate(doc.paragraphs):
			
 
				-            if paragraph.text.strip():
			
 
				-                content_items.append((i, "paragraph", paragraph))
			
 
				-
			
 
				-        for i, table in enumerate(doc.tables):
			
 
				-            content_items.append((i, "table", table))
			
 
				-
			
 
				-        # Sort content items based on their original position
			
 
				-        content_items.sort(key=operator.itemgetter(0))
			
 
				+        it = iter(doc.element.body)
			
 
				+        part = next(it, None)
			
 
				+        i = 0
			
 
				+        while part is not None:
			
 
				+            paser_docx_part(part, doc, content_items, i)
			
 
				+            i = i + 1
			
 
				+            part = next(it, None)
			
 
				 
			
 
				         # Process sorted content
			
 
				         for _, item_type, item in content_items:
			
--- a/api/tests/unit_tests/core/workflow/nodes/test_document_extractor_node.py
+++ b/api/tests/unit_tests/core/workflow/nodes/test_document_extractor_node.py
@@ -1,6 +1,7 @@
 
				 from unittest.mock import Mock, patch
			
 
				 
			
 
				 import pytest
			
 
				+from docx.oxml.text.paragraph import CT_P
			
 
				 
			
 
				 from core.file import File, FileTransferMethod
			
 
				 from core.variables import ArrayFileSegment
			
@@ -169,7 +170,12 @@ def test_extract_text_from_docx(mock_document):
 
				     mock_paragraph2 = Mock()
			
 
				     mock_paragraph2.text = "Paragraph 2"
			
 
				     mock_document.return_value.paragraphs = [mock_paragraph1, mock_paragraph2]
			
 
				-
			
 
				+    mock_ct_p1 = Mock(spec=CT_P)
			
 
				+    mock_ct_p1.text = "Paragraph 1"
			
 
				+    mock_ct_p2 = Mock(spec=CT_P)
			
 
				+    mock_ct_p2.text = "Paragraph 2"
			
 
				+    mock_element = Mock(body=[mock_ct_p1, mock_ct_p2])
			
 
				+    mock_document.return_value.element = mock_element
			
 
				     text = _extract_text_from_docx(b"PK\x03\x04")
			
 
				     assert text == "Paragraph 1\nParagraph 2"