před 5 měsíci · 216442ddc1
--- a/api/core/workflow/nodes/document_extractor/node.py
+++ b/api/core/workflow/nodes/document_extractor/node.py
@@ -1,5 +1,6 @@
 
				 import csv
			
 
				 import io
			
 
				+import json
			
 
				 
			
 
				 import docx
			
 
				 import pandas as pd
			
@@ -77,34 +78,31 @@ class DocumentExtractorNode(BaseNode[DocumentExtractorNodeData]):
 
				 
			
 
				 def _extract_text_by_mime_type(*, file_content: bytes, mime_type: str) -> str:
			
 
				     """Extract text from a file based on its MIME type."""
			
 
				-    if mime_type.startswith("text/plain") or mime_type in {"text/html", "text/htm", "text/markdown", "text/xml"}:
			
 
				-        return _extract_text_from_plain_text(file_content)
			
 
				-    elif mime_type == "application/pdf":
			
 
				-        return _extract_text_from_pdf(file_content)
			
 
				-    elif mime_type in {
			
 
				-        "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
			
 
				-        "application/msword",
			
 
				-    }:
			
 
				-        return _extract_text_from_doc(file_content)
			
 
				-    elif mime_type == "text/csv":
			
 
				-        return _extract_text_from_csv(file_content)
			
 
				-    elif mime_type in {
			
 
				-        "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
			
 
				-        "application/vnd.ms-excel",
			
 
				-    }:
			
 
				-        return _extract_text_from_excel(file_content)
			
 
				-    elif mime_type == "application/vnd.ms-powerpoint":
			
 
				-        return _extract_text_from_ppt(file_content)
			
 
				-    elif mime_type == "application/vnd.openxmlformats-officedocument.presentationml.presentation":
			
 
				-        return _extract_text_from_pptx(file_content)
			
 
				-    elif mime_type == "application/epub+zip":
			
 
				-        return _extract_text_from_epub(file_content)
			
 
				-    elif mime_type == "message/rfc822":
			
 
				-        return _extract_text_from_eml(file_content)
			
 
				-    elif mime_type == "application/vnd.ms-outlook":
			
 
				-        return _extract_text_from_msg(file_content)
			
 
				-    else:
			
 
				-        raise UnsupportedFileTypeError(f"Unsupported MIME type: {mime_type}")
			
 
				+    match mime_type:
			
 
				+        case "text/plain" | "text/html" | "text/htm" | "text/markdown" | "text/xml":
			
 
				+            return _extract_text_from_plain_text(file_content)
			
 
				+        case "application/pdf":
			
 
				+            return _extract_text_from_pdf(file_content)
			
 
				+        case "application/vnd.openxmlformats-officedocument.wordprocessingml.document" | "application/msword":
			
 
				+            return _extract_text_from_doc(file_content)
			
 
				+        case "text/csv":
			
 
				+            return _extract_text_from_csv(file_content)
			
 
				+        case "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet" | "application/vnd.ms-excel":
			
 
				+            return _extract_text_from_excel(file_content)
			
 
				+        case "application/vnd.ms-powerpoint":
			
 
				+            return _extract_text_from_ppt(file_content)
			
 
				+        case "application/vnd.openxmlformats-officedocument.presentationml.presentation":
			
 
				+            return _extract_text_from_pptx(file_content)
			
 
				+        case "application/epub+zip":
			
 
				+            return _extract_text_from_epub(file_content)
			
 
				+        case "message/rfc822":
			
 
				+            return _extract_text_from_eml(file_content)
			
 
				+        case "application/vnd.ms-outlook":
			
 
				+            return _extract_text_from_msg(file_content)
			
 
				+        case "application/json":
			
 
				+            return _extract_text_from_json(file_content)
			
 
				+        case _:
			
 
				+            raise UnsupportedFileTypeError(f"Unsupported MIME type: {mime_type}")
			
 
				 
			
 
				 
			
 
				 def _extract_text_by_file_extension(*, file_content: bytes, file_extension: str) -> str:
			
@@ -112,6 +110,8 @@ def _extract_text_by_file_extension(*, file_content: bytes, file_extension: str)
 
				     match file_extension:
			
 
				         case ".txt" | ".markdown" | ".md" | ".html" | ".htm" | ".xml":
			
 
				             return _extract_text_from_plain_text(file_content)
			
 
				+        case ".json":
			
 
				+            return _extract_text_from_json(file_content)
			
 
				         case ".pdf":
			
 
				             return _extract_text_from_pdf(file_content)
			
 
				         case ".doc" | ".docx":
			
@@ -141,6 +141,14 @@ def _extract_text_from_plain_text(file_content: bytes) -> str:
 
				         raise TextExtractionError("Failed to decode plain text file") from e
			
 
				 
			
 
				 
			
 
				+def _extract_text_from_json(file_content: bytes) -> str:
			
 
				+    try:
			
 
				+        json_data = json.loads(file_content.decode("utf-8"))
			
 
				+        return json.dumps(json_data, indent=2, ensure_ascii=False)
			
 
				+    except (UnicodeDecodeError, json.JSONDecodeError) as e:
			
 
				+        raise TextExtractionError(f"Failed to decode or parse JSON file: {e}") from e
			
 
				+
			
 
				+
			
 
				 def _extract_text_from_pdf(file_content: bytes) -> str:
			
 
				     try:
			
 
				         pdf_file = io.BytesIO(file_content)
			
@@ -183,13 +191,13 @@ def _download_file_content(file: File) -> bytes:
 
				 
			
 
				 
			
 
				 def _extract_text_from_file(file: File):
			
 
				-    if file.mime_type is None:
			
 
				-        raise UnsupportedFileTypeError("Unable to determine file type: MIME type is missing")
			
 
				     file_content = _download_file_content(file)
			
 
				-    if file.transfer_method == FileTransferMethod.REMOTE_URL:
			
 
				+    if file.extension:
			
 
				+        extracted_text = _extract_text_by_file_extension(file_content=file_content, file_extension=file.extension)
			
 
				+    elif file.mime_type:
			
 
				         extracted_text = _extract_text_by_mime_type(file_content=file_content, mime_type=file.mime_type)
			
 
				     else:
			
 
				-        extracted_text = _extract_text_by_file_extension(file_content=file_content, file_extension=file.extension)
			
 
				+        raise UnsupportedFileTypeError("Unable to determine file type: MIME type or file extension is missing")
			
 
				     return extracted_text
			
 
				 
			
 
				 
			
--- a/api/tests/unit_tests/core/workflow/nodes/test_document_extractor_node.py
+++ b/api/tests/unit_tests/core/workflow/nodes/test_document_extractor_node.py
@@ -125,7 +125,7 @@ def test_run_extract_text(
 
				     result = document_extractor_node._run()
			
 
				 
			
 
				     assert isinstance(result, NodeRunResult)
			
 
				-    assert result.status == WorkflowNodeExecutionStatus.SUCCEEDED
			
 
				+    assert result.status == WorkflowNodeExecutionStatus.SUCCEEDED, result.error
			
 
				     assert result.outputs is not None
			
 
				     assert result.outputs["text"] == expected_text