Explorar o código

feat(document_extractor): integrate unstructured API for PPTX extraction (#10180)

-LAN- hai 5 meses
pai
achega
53a7cb0e9d
Modificouse 1 ficheiro con 10 adicións e 1 borrado
  1. 10 1
      api/core/workflow/nodes/document_extractor/node.py

+ 10 - 1
api/core/workflow/nodes/document_extractor/node.py

@@ -6,12 +6,14 @@ import docx
 import pandas as pd
 import pypdfium2
 import yaml
+from unstructured.partition.api import partition_via_api
 from unstructured.partition.email import partition_email
 from unstructured.partition.epub import partition_epub
 from unstructured.partition.msg import partition_msg
 from unstructured.partition.ppt import partition_ppt
 from unstructured.partition.pptx import partition_pptx
 
+from configs import dify_config
 from core.file import File, FileTransferMethod, file_manager
 from core.helper import ssrf_proxy
 from core.variables import ArrayFileSegment
@@ -263,7 +265,14 @@ def _extract_text_from_ppt(file_content: bytes) -> str:
 def _extract_text_from_pptx(file_content: bytes) -> str:
     try:
         with io.BytesIO(file_content) as file:
-            elements = partition_pptx(file=file)
+            if dify_config.UNSTRUCTURED_API_URL and dify_config.UNSTRUCTURED_API_KEY:
+                elements = partition_via_api(
+                    file=file,
+                    api_url=dify_config.UNSTRUCTURED_API_URL,
+                    api_key=dify_config.UNSTRUCTURED_API_KEY,
+                )
+            else:
+                elements = partition_pptx(file=file)
         return "\n".join([getattr(element, "text", "") for element in elements])
     except Exception as e:
         raise TextExtractionError(f"Failed to extract text from PPTX: {str(e)}") from e