il y a 1 an · ad65c891e7
--- a/api/app.py
+++ b/api/app.py
@@ -4,12 +4,15 @@ from werkzeug.exceptions import Unauthorized
 
				 
			
 
				 if not os.environ.get("DEBUG") or os.environ.get("DEBUG").lower() != 'true':
			
 
				     from gevent import monkey
			
 
				+
			
 
				     monkey.patch_all()
			
 
				     # if os.environ.get("VECTOR_STORE") == 'milvus':
			
 
				     import grpc.experimental.gevent
			
 
				+
			
 
				     grpc.experimental.gevent.init_gevent()
			
 
				 
			
 
				     import langchain
			
 
				+
			
 
				     langchain.verbose = True
			
 
				 
			
 
				 import json
			
@@ -44,6 +47,7 @@ from services.account_service import AccountService
 
				 # DO NOT REMOVE BELOW
			
 
				 from events import event_handlers
			
 
				 from models import account, dataset, model, source, task, tool, tools, web
			
 
				+
			
 
				 # DO NOT REMOVE ABOVE
			
 
				 
			
 
				 
			
@@ -51,7 +55,7 @@ warnings.simplefilter("ignore", ResourceWarning)
 
				 
			
 
				 # fix windows platform
			
 
				 if os.name == "nt":
			
 
				-    os.system('tzutil /s "UTC"')    
			
 
				+    os.system('tzutil /s "UTC"')
			
 
				 else:
			
 
				     os.environ['TZ'] = 'UTC'
			
 
				     time.tzset()
			
@@ -60,6 +64,7 @@ else:
 
class DifyApp(Flask):
    """Flask subclass with an empty body; it adds no attributes or overrides."""
    pass
			
 
				 
			
 
				+
			
 
				 # -------------
			
 
				 # Configuration
			
 
				 # -------------
			
@@ -67,6 +72,7 @@ class DifyApp(Flask):
 
				 
			
 
				 config_type = os.getenv('EDITION', default='SELF_HOSTED')  # ce edition first
			
 
				 
			
 
				+
			
 
				 # ----------------------------
			
 
				 # Application Factory Function
			
 
				 # ----------------------------
			
@@ -192,7 +198,6 @@ def register_blueprints(app):
 
				 app = create_app()
			
 
				 celery = app.extensions["celery"]
			
 
				 
			
 
				-
			
 
				 if app.config['TESTING']:
			
 
				     print("App is running in TESTING mode")
			
 
				 
			
--- a/api/core/rag/extractor/excel_extractor.py
+++ b/api/core/rag/extractor/excel_extractor.py
@@ -2,6 +2,7 @@
 
				 from typing import Optional
			
 
				 
			
 
				 import pandas as pd
			
 
				+import xlrd
			
 
				 
			
 
				 from core.rag.extractor.extractor_base import BaseExtractor
			
 
				 from core.rag.models.document import Document
			
@@ -27,10 +28,37 @@ class ExcelExtractor(BaseExtractor):
 
				         self._autodetect_encoding = autodetect_encoding
			
 
				 
			
 
				     def extract(self) -> list[Document]:
			
 
				+        """ parse excel file"""
			
 
				+        if self._file_path.endswith('.xls'):
			
 
				+            return self._extract4xls()
			
 
				+        elif self._file_path.endswith('.xlsx'):
			
 
				+            return self._extract4xlsx()
			
 
				+
			
 
    def _extract4xls(self) -> list[Document]:
        """Build one Document per data row of a legacy ``.xls`` workbook.

        The workbook is opened with ``xlrd`` and every sheet is visited. Rows
        for which ``is_blank_row`` is true are skipped. Within each sheet the
        first non-blank row is kept as the header, and each later non-blank
        row becomes a Document whose content is one ``header:value`` line per
        cell, joined with newlines.

        :return: list of Documents, each with ``{'source': self._file_path}``
            as metadata; empty when no sheet has a data row after its header.
        """
        wb = xlrd.open_workbook(filename=self._file_path)
        documents = []
        # loop over all sheets
        for sheet in wb.sheets():
            # The header has to survive across the rows of a sheet. Resetting
            # it for every row would treat each row as a header and never
            # emit a document.
            row_header = None
            for row in sheet.get_rows():
                if self.is_blank_row(row):
                    continue
                if row_header is None:
                    row_header = row
                    continue
                # zip stops at the shorter row, so a data row longer than the
                # header cannot index past the header's end.
                item_arr = [
                    f'{header_cell.value}:{str(cell.value)}'
                    for header_cell, cell in zip(row_header, row)
                ]
                item_str = "\n".join(item_arr)
                documents.append(Document(page_content=item_str, metadata={'source': self._file_path}))
        return documents
			
 
				+
			
 
				+    def _extract4xlsx(self) -> list[Document]:
			
 
				         """Load from file path using Pandas."""
			
 
				         data = []
			
 
				-
			
 
				-        # 使用 Pandas 读取 Excel 文件的每个工作表
			
 
				+        # Read each worksheet of an Excel file using Pandas
			
 
				         xls = pd.ExcelFile(self._file_path)
			
 
				         for sheet_name in xls.sheet_names:
			
 
				             df = pd.read_excel(xls, sheet_name=sheet_name)
			
@@ -43,5 +71,18 @@ class ExcelExtractor(BaseExtractor):
 
				                 item = ';'.join(f'{k}:{v}' for k, v in row.items() if pd.notna(v))
			
 
				                 document = Document(page_content=item, metadata={'source': self._file_path})
			
 
				                 data.append(document)
			
 
				-
			
 
				         return data
			
 
				+
			
 
				+    @staticmethod
			
 
				+    def is_blank_row(row):
			
 
				+        """
			
 
				+
			
 
				+        Determine whether the specified line is a blank line.
			
 
				+        :param row: row object。
			
 
				+        :return: Returns True if the row is blank, False otherwise.
			
 
				+        """
			
 
				+        # Iterates through the cells and returns False if a non-empty cell is found
			
 
				+        for cell in row:
			
 
				+            if cell.value is not None and cell.value != '':
			
 
				+                return False
			
 
				+        return True
			
--- a/api/core/rag/extractor/extract_processor.py
+++ b/api/core/rag/extractor/extract_processor.py
@@ -84,7 +84,7 @@ class ExtractProcessor:
 
				                 etl_type = current_app.config['ETL_TYPE']
			
 
				                 unstructured_api_url = current_app.config['UNSTRUCTURED_API_URL']
			
 
				                 if etl_type == 'Unstructured':
			
 
				-                    if file_extension == '.xlsx':
			
 
				+                    if file_extension == '.xlsx' or file_extension == '.xls':
			
 
				                         extractor = ExcelExtractor(file_path)
			
 
				                     elif file_extension == '.pdf':
			
 
				                         extractor = PdfExtractor(file_path)
			
@@ -114,7 +114,7 @@ class ExtractProcessor:
 
				                         extractor = UnstructuredTextExtractor(file_path, unstructured_api_url) if is_automatic \
			
 
				                             else TextExtractor(file_path, autodetect_encoding=True)
			
 
				                 else:
			
 
				-                    if file_extension == '.xlsx':
			
 
				+                    if file_extension == '.xlsx' or file_extension == '.xls':
			
 
				                         extractor = ExcelExtractor(file_path)
			
 
				                     elif file_extension == '.pdf':
			
 
				                         extractor = PdfExtractor(file_path)
			
--- a/api/requirements.txt
+++ b/api/requirements.txt
@@ -82,3 +82,4 @@ qrcode~=7.4.2
 
				 azure-storage-blob==12.9.0
			
 
				 azure-identity==1.15.0
			
 
				 lxml==5.1.0
			
 
				+xlrd~=2.0.1
			
--- a/api/services/file_service.py
+++ b/api/services/file_service.py
@@ -20,9 +20,10 @@ from services.errors.file import FileTooLargeError, UnsupportedFileTypeError
 
				 IMAGE_EXTENSIONS = ['jpg', 'jpeg', 'png', 'webp', 'gif', 'svg']
			
 
				 IMAGE_EXTENSIONS.extend([ext.upper() for ext in IMAGE_EXTENSIONS])
			
 
				 
			
 
				-ALLOWED_EXTENSIONS = ['txt', 'markdown', 'md', 'pdf', 'html', 'htm', 'xlsx', 'docx', 'csv']
			
 
				-UNSTRUSTURED_ALLOWED_EXTENSIONS = ['txt', 'markdown', 'md', 'pdf', 'html', 'htm', 'xlsx',
			
 
				+ALLOWED_EXTENSIONS = ['txt', 'markdown', 'md', 'pdf', 'html', 'htm', 'xlsx', 'xls', 'docx', 'csv']
			
 
				+UNSTRUSTURED_ALLOWED_EXTENSIONS = ['txt', 'markdown', 'md', 'pdf', 'html', 'htm', 'xlsx', 'xls',
			
 
				                                    'docx', 'csv', 'eml', 'msg', 'pptx', 'ppt', 'xml', 'epub']
			
 
				+
			
 
				 PREVIEW_WORDS_LIMIT = 3000