|
@@ -16,6 +16,7 @@ from core.rag.extractor.pdf_extractor import PdfExtractor
|
|
|
from core.rag.extractor.text_extractor import TextExtractor
|
|
|
from core.rag.extractor.unstructured.unstructured_doc_extractor import UnstructuredWordExtractor
|
|
|
from core.rag.extractor.unstructured.unstructured_eml_extractor import UnstructuredEmailExtractor
|
|
|
+from core.rag.extractor.unstructured.unstructured_epub_extractor import UnstructuredEpubExtractor
|
|
|
from core.rag.extractor.unstructured.unstructured_markdown_extractor import UnstructuredMarkdownExtractor
|
|
|
from core.rag.extractor.unstructured.unstructured_msg_extractor import UnstructuredMsgExtractor
|
|
|
from core.rag.extractor.unstructured.unstructured_ppt_extractor import UnstructuredPPTExtractor
|
|
@@ -106,6 +107,8 @@ class ExtractProcessor:
|
|
|
extractor = UnstructuredPPTXExtractor(file_path, unstructured_api_url)
|
|
|
elif file_extension == '.xml':
|
|
|
extractor = UnstructuredXmlExtractor(file_path, unstructured_api_url)
|
|
|
+ elif file_extension == '.epub':
|
|
|
+ extractor = UnstructuredEpubExtractor(file_path, unstructured_api_url)
|
|
|
else:
|
|
|
# txt
|
|
|
extractor = UnstructuredTextExtractor(file_path, unstructured_api_url) if is_automatic \
|
|
@@ -123,6 +126,8 @@ class ExtractProcessor:
|
|
|
extractor = WordExtractor(file_path)
|
|
|
elif file_extension == '.csv':
|
|
|
extractor = CSVExtractor(file_path, autodetect_encoding=True)
|
|
|
+ elif file_extension == '.epub':
|
|
|
+ extractor = UnstructuredEpubExtractor(file_path)
|
|
|
else:
|
|
|
# txt
|
|
|
extractor = TextExtractor(file_path, autodetect_encoding=True)
|