|
@@ -3,7 +3,8 @@ from pathlib import Path
|
|
|
from typing import List, Union, Optional
|
|
|
|
|
|
import requests
|
|
|
-from langchain.document_loaders import TextLoader, Docx2txtLoader, UnstructuredFileLoader, UnstructuredAPIFileLoader
|
|
|
+from flask import current_app
|
|
|
+from langchain.document_loaders import TextLoader, Docx2txtLoader
|
|
|
from langchain.schema import Document
|
|
|
|
|
|
from core.data_loader.loader.csv_loader import CSVLoader
|
|
@@ -11,6 +12,13 @@ from core.data_loader.loader.excel import ExcelLoader
|
|
|
from core.data_loader.loader.html import HTMLLoader
|
|
|
from core.data_loader.loader.markdown import MarkdownLoader
|
|
|
from core.data_loader.loader.pdf import PdfLoader
|
|
|
+from core.data_loader.loader.unstructured.unstructured_eml import UnstructuredEmailLoader
|
|
|
+from core.data_loader.loader.unstructured.unstructured_markdown import UnstructuredMarkdownLoader
|
|
|
+from core.data_loader.loader.unstructured.unstructured_msg import UnstructuredMsgLoader
|
|
|
+from core.data_loader.loader.unstructured.unstructured_ppt import UnstructuredPPTLoader
|
|
|
+from core.data_loader.loader.unstructured.unstructured_pptx import UnstructuredPPTXLoader
|
|
|
+from core.data_loader.loader.unstructured.unstructured_text import UnstructuredTextLoader
|
|
|
+from core.data_loader.loader.unstructured.unstructured_xml import UnstructuredXmlLoader
|
|
|
from extensions.ext_storage import storage
|
|
|
from models.model import UploadFile
|
|
|
|
|
@@ -49,14 +57,34 @@ class FileExtractor:
|
|
|
input_file = Path(file_path)
|
|
|
delimiter = '\n'
|
|
|
file_extension = input_file.suffix.lower()
|
|
|
- if is_automatic:
|
|
|
- loader = UnstructuredFileLoader(
|
|
|
- file_path, strategy="hi_res", mode="elements"
|
|
|
- )
|
|
|
- # loader = UnstructuredAPIFileLoader(
|
|
|
- # file_path=filenames[0],
|
|
|
- # api_key="FAKE_API_KEY",
|
|
|
- # )
|
|
|
+ etl_type = current_app.config['ETL_TYPE']
|
|
|
+ unstructured_api_url = current_app.config['UNSTRUCTURED_API_URL']
|
|
|
+ if etl_type == 'Unstructured':
|
|
|
+ if file_extension == '.xlsx':
|
|
|
+ loader = ExcelLoader(file_path)
|
|
|
+ elif file_extension == '.pdf':
|
|
|
+ loader = PdfLoader(file_path, upload_file=upload_file)
|
|
|
+ elif file_extension in ['.md', '.markdown']:
|
|
|
+ loader = UnstructuredMarkdownLoader(file_path, unstructured_api_url)
|
|
|
+ elif file_extension in ['.htm', '.html']:
|
|
|
+ loader = HTMLLoader(file_path)
|
|
|
+ elif file_extension == '.docx':
|
|
|
+ loader = Docx2txtLoader(file_path)
|
|
|
+ elif file_extension == '.csv':
|
|
|
+ loader = CSVLoader(file_path, autodetect_encoding=True)
|
|
|
+ elif file_extension == '.msg':
|
|
|
+ loader = UnstructuredMsgLoader(file_path, unstructured_api_url)
|
|
|
+ elif file_extension == '.eml':
|
|
|
+ loader = UnstructuredEmailLoader(file_path, unstructured_api_url)
|
|
|
+ elif file_extension == '.ppt':
|
|
|
+ loader = UnstructuredPPTLoader(file_path, unstructured_api_url)
|
|
|
+ elif file_extension == '.pptx':
|
|
|
+ loader = UnstructuredPPTXLoader(file_path, unstructured_api_url)
|
|
|
+ elif file_extension == '.xml':
|
|
|
+ loader = UnstructuredXmlLoader(file_path, unstructured_api_url)
|
|
|
+ else:
|
|
|
+ # txt, and any other extension not matched above
|
|
|
+ loader = UnstructuredTextLoader(file_path, unstructured_api_url)
|
|
|
else:
|
|
|
if file_extension == '.xlsx':
|
|
|
loader = ExcelLoader(file_path)
|