Procházet zdrojové kódy

Add UNSTRUCTURED_API_KEY env support (#4369)

majian před 11 měsíci
rodič
revize
b5204111da

+ 1 - 0
api/.env.example

@@ -144,6 +144,7 @@ NOTION_INTERNAL_SECRET=you-internal-secret
 
 ETL_TYPE=dify
 UNSTRUCTURED_API_URL=
+UNSTRUCTURED_API_KEY=
 
 SSRF_PROXY_HTTP_URL=
 SSRF_PROXY_HTTPS_URL=

+ 1 - 0
api/config.py

@@ -365,6 +365,7 @@ class Config:
 
         self.ETL_TYPE = get_env('ETL_TYPE')
         self.UNSTRUCTURED_API_URL = get_env('UNSTRUCTURED_API_URL')
+        self.UNSTRUCTURED_API_KEY = get_env('UNSTRUCTURED_API_KEY')
         self.BILLING_ENABLED = get_bool_env('BILLING_ENABLED')
         self.CAN_REPLACE_LOGO = get_bool_env('CAN_REPLACE_LOGO')
 

+ 2 - 1
api/core/rag/extractor/extract_processor.py

@@ -96,6 +96,7 @@ class ExtractProcessor:
                 file_extension = input_file.suffix.lower()
                 etl_type = current_app.config['ETL_TYPE']
                 unstructured_api_url = current_app.config['UNSTRUCTURED_API_URL']
+                unstructured_api_key = current_app.config['UNSTRUCTURED_API_KEY']
                 if etl_type == 'Unstructured':
                     if file_extension == '.xlsx' or file_extension == '.xls':
                         extractor = ExcelExtractor(file_path)
@@ -115,7 +116,7 @@ class ExtractProcessor:
                     elif file_extension == '.eml':
                         extractor = UnstructuredEmailExtractor(file_path, unstructured_api_url)
                     elif file_extension == '.ppt':
-                        extractor = UnstructuredPPTExtractor(file_path, unstructured_api_url)
+                        extractor = UnstructuredPPTExtractor(file_path, unstructured_api_url, unstructured_api_key)
                     elif file_extension == '.pptx':
                         extractor = UnstructuredPPTXExtractor(file_path, unstructured_api_url)
                     elif file_extension == '.xml':

+ 4 - 2
api/core/rag/extractor/unstructured/unstructured_ppt_extractor.py

@@ -17,16 +17,18 @@ class UnstructuredPPTExtractor(BaseExtractor):
     def __init__(
             self,
             file_path: str,
-            api_url: str
+            api_url: str,
+            api_key: str
     ):
         """Initialize with file path."""
         self._file_path = file_path
         self._api_url = api_url
+        self._api_key = api_key
 
     def extract(self) -> list[Document]:
         from unstructured.partition.api import partition_via_api
 
-        elements = partition_via_api(filename=self._file_path, api_url=self._api_url)
+        elements = partition_via_api(filename=self._file_path, api_url=self._api_url, api_key=self._api_key)
         text_by_page = {}
         for element in elements:
             page = element.metadata.page_number