Explorar o código

fix(document_extractor): pptx file type and missing metadata_filename UnstructuredIO (#11364)

Co-authored-by: Julian Huynh <julian.huynh@immersio.io>
Huỳnh Gia Bôi hai 4 meses
pai
achega
9277156b6c
Modificouse 1 ficheiro con 16 adicións e 8 borrados
  1. 16 8
      api/core/workflow/nodes/document_extractor/node.py

+ 16 - 8
api/core/workflow/nodes/document_extractor/node.py

@@ -1,6 +1,8 @@
 import csv
 import io
 import json
+import os
+import tempfile
 
 import docx
 import pandas as pd
@@ -264,14 +266,20 @@ def _extract_text_from_ppt(file_content: bytes) -> str:
 
 def _extract_text_from_pptx(file_content: bytes) -> str:
     try:
-        with io.BytesIO(file_content) as file:
-            if dify_config.UNSTRUCTURED_API_URL and dify_config.UNSTRUCTURED_API_KEY:
-                elements = partition_via_api(
-                    file=file,
-                    api_url=dify_config.UNSTRUCTURED_API_URL,
-                    api_key=dify_config.UNSTRUCTURED_API_KEY,
-                )
-            else:
+        if dify_config.UNSTRUCTURED_API_URL and dify_config.UNSTRUCTURED_API_KEY:
+            with tempfile.NamedTemporaryFile(suffix=".pptx", delete=False) as temp_file:
+                temp_file.write(file_content)
+                temp_file.flush()
+                with open(temp_file.name, "rb") as file:
+                    elements = partition_via_api(
+                        file=file,
+                        metadata_filename=temp_file.name,
+                        api_url=dify_config.UNSTRUCTURED_API_URL,
+                        api_key=dify_config.UNSTRUCTURED_API_KEY,
+                    )
+                os.unlink(temp_file.name)
+        else:
+            with io.BytesIO(file_content) as file:
                 elements = partition_pptx(file=file)
         return "\n".join([getattr(element, "text", "") for element in elements])
     except Exception as e: