1 år sedan · e4f686deb7
--- a/api/core/rag/extractor/unstructured/unstructured_eml_extractor.py
+++ b/api/core/rag/extractor/unstructured/unstructured_eml_extractor.py
@@ -26,7 +26,7 @@ class UnstructuredEmailExtractor(BaseExtractor):
 
				 
			
 
				     def extract(self) -> list[Document]:
			
 
				         from unstructured.partition.email import partition_email
			
 
				-        elements = partition_email(filename=self._file_path, api_url=self._api_url)
			
 
				+        elements = partition_email(filename=self._file_path)
			
 
				 
			
 
				         # noinspection PyBroadException
			
 
				         try:
			
--- a/api/core/rag/extractor/unstructured/unstructured_markdown_extractor.py
+++ b/api/core/rag/extractor/unstructured/unstructured_markdown_extractor.py
@@ -36,7 +36,7 @@ class UnstructuredMarkdownExtractor(BaseExtractor):
 
				     def extract(self) -> list[Document]:
			
 
				         from unstructured.partition.md import partition_md
			
 
				 
			
 
				-        elements = partition_md(filename=self._file_path, api_url=self._api_url)
			
 
				+        elements = partition_md(filename=self._file_path)
			
 
				         from unstructured.chunking.title import chunk_by_title
			
 
				         chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000)
			
 
				         documents = []
			
--- a/api/core/rag/extractor/unstructured/unstructured_msg_extractor.py
+++ b/api/core/rag/extractor/unstructured/unstructured_msg_extractor.py
@@ -26,7 +26,7 @@ class UnstructuredMsgExtractor(BaseExtractor):
 
				     def extract(self) -> list[Document]:
			
 
				         from unstructured.partition.msg import partition_msg
			
 
				 
			
 
				-        elements = partition_msg(filename=self._file_path, api_url=self._api_url)
			
 
				+        elements = partition_msg(filename=self._file_path)
			
 
				         from unstructured.chunking.title import chunk_by_title
			
 
				         chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000)
			
 
				         documents = []
			
--- a/api/core/rag/extractor/unstructured/unstructured_ppt_extractor.py
+++ b/api/core/rag/extractor/unstructured/unstructured_ppt_extractor.py
@@ -24,9 +24,9 @@ class UnstructuredPPTExtractor(BaseExtractor):
 
				         self._api_url = api_url
			
 
				 
			
 
				     def extract(self) -> list[Document]:
			
 
				-        from unstructured.partition.ppt import partition_ppt
			
 
				+        from unstructured.partition.api import partition_via_api
			
 
				 
			
 
				-        elements = partition_ppt(filename=self._file_path, api_url=self._api_url)
			
 
				+        elements = partition_via_api(filename=self._file_path, api_url=self._api_url)
			
 
				         text_by_page = {}
			
 
				         for element in elements:
			
 
				             page = element.metadata.page_number
			
--- a/api/core/rag/extractor/unstructured/unstructured_pptx_extractor.py
+++ b/api/core/rag/extractor/unstructured/unstructured_pptx_extractor.py
@@ -26,7 +26,7 @@ class UnstructuredPPTXExtractor(BaseExtractor):
 
				     def extract(self) -> list[Document]:
			
 
				         from unstructured.partition.pptx import partition_pptx
			
 
				 
			
 
				-        elements = partition_pptx(filename=self._file_path, api_url=self._api_url)
			
 
				+        elements = partition_pptx(filename=self._file_path)
			
 
				         text_by_page = {}
			
 
				         for element in elements:
			
 
				             page = element.metadata.page_number
			
--- a/api/core/rag/extractor/unstructured/unstructured_text_extractor.py
+++ b/api/core/rag/extractor/unstructured/unstructured_text_extractor.py
@@ -26,7 +26,7 @@ class UnstructuredTextExtractor(BaseExtractor):
 
				     def extract(self) -> list[Document]:
			
 
				         from unstructured.partition.text import partition_text
			
 
				 
			
 
				-        elements = partition_text(filename=self._file_path, api_url=self._api_url)
			
 
				+        elements = partition_text(filename=self._file_path)
			
 
				         from unstructured.chunking.title import chunk_by_title
			
 
				         chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000)
			
 
				         documents = []
			
--- a/api/core/rag/extractor/unstructured/unstructured_xml_extractor.py
+++ b/api/core/rag/extractor/unstructured/unstructured_xml_extractor.py
@@ -26,7 +26,7 @@ class UnstructuredXmlExtractor(BaseExtractor):
 
				     def extract(self) -> list[Document]:
			
 
				         from unstructured.partition.xml import partition_xml
			
 
				 
			
 
				-        elements = partition_xml(filename=self._file_path, xml_keep_tags=True, api_url=self._api_url)
			
 
				+        elements = partition_xml(filename=self._file_path, xml_keep_tags=True)
			
 
				         from unstructured.chunking.title import chunk_by_title
			
 
				         chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000)
			
 
				         documents = []