|
@@ -2,6 +2,7 @@
|
|
|
from typing import Optional
|
|
|
|
|
|
import pandas as pd
|
|
|
+import xlrd
|
|
|
|
|
|
from core.rag.extractor.extractor_base import BaseExtractor
|
|
|
from core.rag.models.document import Document
|
|
@@ -27,10 +28,37 @@ class ExcelExtractor(BaseExtractor):
|
|
|
self._autodetect_encoding = autodetect_encoding
|
|
|
|
|
|
def extract(self) -> list[Document]:
    """Parse the Excel file at ``self._file_path`` into documents.

    The parser is chosen from the file extension, compared
    case-insensitively so that names such as ``REPORT.XLS`` are handled
    the same way as ``report.xls``.

    :return: the documents produced by ``_extract4xls`` for ``.xls``
        files or by ``_extract4xlsx`` for ``.xlsx`` files.
    :raises ValueError: if the path ends in neither ``.xls`` nor ``.xlsx``.
    """
    # Normalise once so upper- or mixed-case extensions still dispatch.
    lowered_path = self._file_path.lower()
    if lowered_path.endswith('.xls'):
        return self._extract4xls()
    if lowered_path.endswith('.xlsx'):
        return self._extract4xlsx()
    # Fail loudly instead of implicitly returning None, which would break
    # callers that expect a list.
    raise ValueError(f'Unsupported Excel file extension: {self._file_path}')
|
|
|
+
|
|
|
def _extract4xls(self) -> list[Document]:
    """Build one document per data row of every sheet, read with xlrd.

    In each sheet the first non-blank row is taken as the header row.
    Every later non-blank row becomes a ``Document`` whose content is one
    ``header:value`` line per cell, joined with newlines. Blank rows (as
    judged by ``is_blank_row``) are skipped.

    :return: the documents of all sheets, in sheet and row order.
    """
    wb = xlrd.open_workbook(filename=self._file_path)
    documents = []
    # loop over all sheets
    for sheet in wb.sheets():
        # The header must be tracked per sheet, outside the row loop.
        # Resetting it for every row would make each row look like a
        # header and no document would ever be produced.
        row_header = None
        for row in sheet.get_rows():
            if self.is_blank_row(row):
                continue
            if row_header is None:
                row_header = row
                continue
            # Pair each data cell with the header cell in the same column.
            item_str = "\n".join(
                f'{header_cell.value}:{str(cell.value)}'
                for header_cell, cell in zip(row_header, row)
            )
            documents.append(
                Document(page_content=item_str, metadata={'source': self._file_path})
            )
    return documents
|
|
|
+
|
|
|
+ def _extract4xlsx(self) -> list[Document]:
|
|
|
"""Load from file path using Pandas."""
|
|
|
data = []
|
|
|
-
|
|
|
- # 使用 Pandas 读取 Excel 文件的每个工作表
|
|
|
+ # Read each worksheet of an Excel file using Pandas
|
|
|
xls = pd.ExcelFile(self._file_path)
|
|
|
for sheet_name in xls.sheet_names:
|
|
|
df = pd.read_excel(xls, sheet_name=sheet_name)
|
|
@@ -43,5 +71,18 @@ class ExcelExtractor(BaseExtractor):
|
|
|
item = ';'.join(f'{k}:{v}' for k, v in row.items() if pd.notna(v))
|
|
|
document = Document(page_content=item, metadata={'source': self._file_path})
|
|
|
data.append(document)
|
|
|
-
|
|
|
return data
|
|
|
+
|
|
|
+ @staticmethod
|
|
|
+ def is_blank_row(row):
|
|
|
+ """
|
|
|
+
|
|
|
+ Determine whether the specified line is a blank line.
|
|
|
+ :param row: row object。
|
|
|
+ :return: Returns True if the row is blank, False otherwise.
|
|
|
+ """
|
|
|
+ # Iterates through the cells and returns False if a non-empty cell is found
|
|
|
+ for cell in row:
|
|
|
+ if cell.value is not None and cell.value != '':
|
|
|
+ return False
|
|
|
+ return True
|