@@ -1,12 +1,20 @@
 """Abstract interface for document loader implementations."""
+import datetime
+import mimetypes
 import os
 import tempfile
+import uuid
 from urllib.parse import urlparse

 import requests
+from docx import Document as DocxDocument
+from flask import current_app

 from core.rag.extractor.extractor_base import BaseExtractor
 from core.rag.models.document import Document
+from extensions.ext_database import db
+from extensions.ext_storage import storage
+from models.model import UploadFile


 class WordExtractor(BaseExtractor):
@@ -17,9 +25,12 @@ class WordExtractor(BaseExtractor):
         file_path: Path to the file to load.
     """

-    def __init__(self, file_path: str):
+    def __init__(self, file_path: str, tenant_id: str, user_id: str):
         """Initialize with file path."""
         self.file_path = file_path
+        self.tenant_id = tenant_id
+        self.user_id = user_id
+
         if "~" in self.file_path:
             self.file_path = os.path.expanduser(self.file_path)

@@ -45,12 +56,7 @@ class WordExtractor(BaseExtractor):

     def extract(self) -> list[Document]:
         """Load given path as single page."""
-        from docx import Document as docx_Document
-
-        document = docx_Document(self.file_path)
-        doc_texts = [paragraph.text for paragraph in document.paragraphs]
-        content = '\n'.join(doc_texts)
-
+        content = self.parse_docx(self.file_path, 'storage')
         return [Document(
             page_content=content,
             metadata={"source": self.file_path},
@@ -61,3 +67,111 @@ class WordExtractor(BaseExtractor):
         """Check if the url is valid."""
         parsed = urlparse(url)
         return bool(parsed.netloc) and bool(parsed.scheme)
+
+    def _extract_images_from_docx(self, doc, image_folder):
+        os.makedirs(image_folder, exist_ok=True)
+        image_count = 0
+        image_map = {}
+
+        for rel in doc.part.rels.values():
+            if "image" in rel.target_ref:
+                image_count += 1
+                image_ext = rel.target_ref.split('.')[-1]
+                # use uuid as file name
+                file_uuid = str(uuid.uuid4())
+                file_key = 'image_files/' + self.tenant_id + '/' + file_uuid + '.' + image_ext
+                mime_type, _ = mimetypes.guess_type(file_key)
+
+                storage.save(file_key, rel.target_part.blob)
+                # save file record to db
+                config = current_app.config
+                upload_file = UploadFile(
+                    tenant_id=self.tenant_id,
+                    storage_type=config['STORAGE_TYPE'],
+                    key=file_key,
+                    name=file_key,
+                    size=0,
+                    extension=image_ext,
+                    mime_type=mime_type,
+                    created_by=self.user_id,
+                    created_at=datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None),
+                    used=True,
+                    used_by=self.user_id,
+                    used_at=datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
+                )
+
+                db.session.add(upload_file)
+                db.session.commit()
+                image_map[rel.target_part] = f"![image]({config['CONSOLE_API_URL']}/files/{upload_file.id}/image-preview)"
+
+        return image_map
+
+    def _table_to_markdown(self, table):
+        markdown = ""
+        # deal with table headers
+        header_row = table.rows[0]
+        headers = [cell.text for cell in header_row.cells]
+        markdown += "| " + " | ".join(headers) + " |\n"
+        markdown += "| " + " | ".join(["---"] * len(headers)) + " |\n"
+        # deal with table rows
+        for row in table.rows[1:]:
+            row_cells = [cell.text for cell in row.cells]
+            markdown += "| " + " | ".join(row_cells) + " |\n"
+
+        return markdown
+
+    def _parse_paragraph(self, paragraph, image_map):
+        paragraph_content = []
+        for run in paragraph.runs:
+            if run.element.xpath('.//a:blip'):
+                for blip in run.element.xpath('.//a:blip'):
+                    embed_id = blip.get('{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed')
+                    if embed_id:
+                        image_part = run.part.rels[embed_id].target_part
+                        if image_part in image_map:
+                            paragraph_content.append(image_map[image_part])
+            if run.text.strip():
+                paragraph_content.append(run.text.strip())
+        return ' '.join(paragraph_content) if paragraph_content else ''
+
+    def parse_docx(self, docx_path, image_folder):
+        doc = DocxDocument(docx_path)
+        os.makedirs(image_folder, exist_ok=True)
+
+        content = []
+
+        image_map = self._extract_images_from_docx(doc, image_folder)
+
+        def parse_paragraph(paragraph):
+            paragraph_content = []
+            for run in paragraph.runs:
+                if run.element.tag.endswith('r'):
+                    drawing_elements = run.element.findall(
+                        './/{http://schemas.openxmlformats.org/wordprocessingml/2006/main}drawing')
+                    for drawing in drawing_elements:
+                        blip_elements = drawing.findall(
+                            './/{http://schemas.openxmlformats.org/drawingml/2006/main}blip')
+                        for blip in blip_elements:
+                            embed_id = blip.get(
+                                '{http://schemas.openxmlformats.org/officeDocument/2006/relationships}embed')
+                            if embed_id:
+                                image_part = doc.part.related_parts.get(embed_id)
+                                if image_part in image_map:
+                                    paragraph_content.append(image_map[image_part])
+                if run.text.strip():
+                    paragraph_content.append(run.text.strip())
+            return ''.join(paragraph_content) if paragraph_content else ''
+
+        paragraphs = doc.paragraphs.copy()
+        tables = doc.tables.copy()
+        for element in doc.element.body:
+            if element.tag.endswith('p'):  # paragraph
+                para = paragraphs.pop(0)
+                parsed_paragraph = parse_paragraph(para)
+                if parsed_paragraph:
+                    content.append(parsed_paragraph)
+            elif element.tag.endswith('tbl'):  # table
+                table = tables.pop(0)
+                content.append(self._table_to_markdown(table))
+        return '\n'.join(content)
+
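
Minimal usage sketch (not part of the diff) of the changed constructor: tenant_id and user_id are now required so extracted images can be written through storage and recorded as UploadFile rows. The import path, placeholder IDs, and the assumption that a Flask app context with storage and the database configured is active are mine, not taken from this patch.

# Hedged sketch: assumes the module lives at core.rag.extractor.word_extractor
# and that a Flask app context, storage backend, and db session are available.
from core.rag.extractor.word_extractor import WordExtractor

extractor = WordExtractor(
    file_path="/path/to/example.docx",  # local .docx to parse
    tenant_id="<tenant-uuid>",          # owner of the extracted image files
    user_id="<user-uuid>",              # recorded as created_by/used_by on UploadFile
)
docs = extractor.extract()  # one Document mixing text, markdown tables, and image links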