|
@@ -39,8 +39,8 @@ class ExcelExtractor(BaseExtractor):
|
|
documents = []
|
|
documents = []
|
|
# loop over all sheets
|
|
# loop over all sheets
|
|
for sheet in wb.sheets():
|
|
for sheet in wb.sheets():
|
|
- for row_index, row in enumerate(sheet.get_rows(), start=1):
|
|
|
|
- row_header = None
|
|
|
|
|
|
+ row_header = None
|
|
|
|
+ for row_index, row in enumerate(sheet.get_rows(), start=1):
|
|
if self.is_blank_row(row):
|
|
if self.is_blank_row(row):
|
|
continue
|
|
continue
|
|
if row_header is None:
|
|
if row_header is None:
|
|
@@ -49,8 +49,8 @@ class ExcelExtractor(BaseExtractor):
|
|
item_arr = []
|
|
item_arr = []
|
|
for index, cell in enumerate(row):
|
|
for index, cell in enumerate(row):
|
|
txt_value = str(cell.value)
|
|
txt_value = str(cell.value)
|
|
- item_arr.append(f'{row_header[index].value}:{txt_value}')
|
|
|
|
- item_str = "\n".join(item_arr)
|
|
|
|
|
|
+ item_arr.append(f'"{row_header[index].value}":"{txt_value}"')
|
|
|
|
+ item_str = ",".join(item_arr)
|
|
document = Document(page_content=item_str, metadata={'source': self._file_path})
|
|
document = Document(page_content=item_str, metadata={'source': self._file_path})
|
|
documents.append(document)
|
|
documents.append(document)
|
|
return documents
|
|
return documents
|
|
@@ -68,7 +68,7 @@ class ExcelExtractor(BaseExtractor):
|
|
|
|
|
|
# transform each row into a Document
|
|
# transform each row into a Document
|
|
for _, row in df.iterrows():
|
|
for _, row in df.iterrows():
|
|
- item = ';'.join(f'{k}:{v}' for k, v in row.items() if pd.notna(v))
|
|
|
|
|
|
+ item = ';'.join(f'"{k}":"{v}"' for k, v in row.items() if pd.notna(v))
|
|
document = Document(page_content=item, metadata={'source': self._file_path})
|
|
document = Document(page_content=item, metadata={'source': self._file_path})
|
|
data.append(document)
|
|
data.append(document)
|
|
return data
|
|
return data
|