il y a 3 mois · 41f39bf3fc
--- a/api/core/workflow/nodes/document_extractor/node.py
+++ b/api/core/workflow/nodes/document_extractor/node.py
@@ -2,6 +2,7 @@ import csv
 
				 import io
			
 
				 import json
			
 
				 import logging
			
 
				+import operator
			
 
				 import os
			
 
				 import tempfile
			
 
				 from typing import cast
			
@@ -10,6 +11,8 @@ import docx
 
				 import pandas as pd
			
 
				 import pypdfium2  # type: ignore
			
 
				 import yaml  # type: ignore
			
 
				+from docx.table import Table
			
 
				+from docx.text.paragraph import Paragraph
			
 
				 
			
 
				 from configs import dify_config
			
 
				 from core.file import File, FileTransferMethod, file_manager
			
@@ -189,35 +192,56 @@ def _extract_text_from_doc(file_content: bytes) -> str:
 
				         doc_file = io.BytesIO(file_content)
			
 
				         doc = docx.Document(doc_file)
			
 
				         text = []
			
 
				-        # Process paragraphs
			
 
				-        for paragraph in doc.paragraphs:
			
 
				-            if paragraph.text.strip():
			
 
				-                text.append(paragraph.text)
			
 
				 
			
 
				-        # Process tables
			
 
				-        for table in doc.tables:
			
 
				-            # Table header
			
 
				-            try:
			
 
				-                # table maybe cause errors so ignore it.
			
 
				-                if len(table.rows) > 0 and table.rows[0].cells is not None:
			
 
				+        # Keep track of paragraph and table positions
			
 
				+        content_items: list[tuple[int, str, Table | Paragraph]] = []
			
 
				+
			
 
				+        # Process paragraphs and tables
			
 
				+        for i, paragraph in enumerate(doc.paragraphs):
			
 
				+            if paragraph.text.strip():
			
 
				+                content_items.append((i, "paragraph", paragraph))
			
 
				+
			
 
				+        for i, table in enumerate(doc.tables):
			
 
				+            content_items.append((i, "table", table))
			
 
				+
			
 
				+        # Sort content items based on their original position
			
 
				+        content_items.sort(key=operator.itemgetter(0))
			
 
				+
			
 
				+        # Process sorted content
			
 
				+        for _, item_type, item in content_items:
			
 
				+            if item_type == "paragraph":
			
 
				+                if isinstance(item, Table):
			
 
				+                    continue
			
 
				+                text.append(item.text)
			
 
				+            elif item_type == "table":
			
 
				+                # Process tables
			
 
				+                if not isinstance(item, Table):
			
 
				+                    continue
			
 
				+                try:
			
 
				                     # Check if any cell in the table has text
			
 
				                     has_content = False
			
 
				-                    for row in table.rows:
			
 
				+                    for row in item.rows:
			
 
				                         if any(cell.text.strip() for cell in row.cells):
			
 
				                             has_content = True
			
 
				                             break
			
 
				 
			
 
				                     if has_content:
			
 
				-                        markdown_table = "| " + " | ".join(cell.text for cell in table.rows[0].cells) + " |\n"
			
 
				-                        markdown_table += "| " + " | ".join(["---"] * len(table.rows[0].cells)) + " |\n"
			
 
				-                        for row in table.rows[1:]:
			
 
				-                            markdown_table += "| " + " | ".join(cell.text for cell in row.cells) + " |\n"
			
 
				+                        cell_texts = [cell.text.replace("\n", "<br>") for cell in item.rows[0].cells]
			
 
				+                        markdown_table = f"| {' | '.join(cell_texts)} |\n"
			
 
				+                        markdown_table += f"| {' | '.join(['---'] * len(item.rows[0].cells))} |\n"
			
 
				+
			
 
				+                        for row in item.rows[1:]:
			
 
				+                            # Replace newlines with <br> in each cell
			
 
				+                            row_cells = [cell.text.replace("\n", "<br>") for cell in row.cells]
			
 
				+                            markdown_table += "| " + " | ".join(row_cells) + " |\n"
			
 
				+
			
 
				                         text.append(markdown_table)
			
 
				-            except Exception as e:
			
 
				-                logger.warning(f"Failed to extract table from DOC/DOCX: {e}")
			
 
				-                continue
			
 
				+                except Exception as e:
			
 
				+                    logger.warning(f"Failed to extract table from DOC/DOCX: {e}")
			
 
				+                    continue
			
 
				 
			
 
				         return "\n".join(text)
			
 
				+
			
 
				     except Exception as e:
			
 
				         raise TextExtractionError(f"Failed to extract text from DOC/DOCX: {str(e)}") from e