Browse Source

Fix a bug in header-row handling when parsing xls files, and make the xls/xlsx parsing output more structured (#3600)

YC 10 months ago
parent
commit
9f8ca75a81
1 changed file with 5 additions and 5 deletions
  1. 5 5
      api/core/rag/extractor/excel_extractor.py

+ 5 - 5
api/core/rag/extractor/excel_extractor.py

@@ -39,8 +39,8 @@ class ExcelExtractor(BaseExtractor):
         documents = []
         # loop over all sheets
         for sheet in wb.sheets():
-            for row_index, row in enumerate(sheet.get_rows(), start=1):
-                row_header = None
+            row_header = None
+            for row_index, row in enumerate(sheet.get_rows(), start=1):                
                 if self.is_blank_row(row):
                     continue
                 if row_header is None:
@@ -49,8 +49,8 @@ class ExcelExtractor(BaseExtractor):
                 item_arr = []
                 for index, cell in enumerate(row):
                     txt_value = str(cell.value)
-                    item_arr.append(f'{row_header[index].value}:{txt_value}')
-                item_str = "\n".join(item_arr)
+                    item_arr.append(f'"{row_header[index].value}":"{txt_value}"')
+                item_str = ",".join(item_arr)
                 document = Document(page_content=item_str, metadata={'source': self._file_path})
                 documents.append(document)
         return documents
@@ -68,7 +68,7 @@ class ExcelExtractor(BaseExtractor):
 
             # transform each row into a Document
             for _, row in df.iterrows():
-                item = ';'.join(f'{k}:{v}' for k, v in row.items() if pd.notna(v))
+                item = ';'.join(f'"{k}":"{v}"' for k, v in row.items() if pd.notna(v))
                 document = Document(page_content=item, metadata={'source': self._file_path})
                 data.append(document)
         return data