Ver Fonte

Fix a bug in header-row handling when parsing xls files, and tune the xls/xlsx parsing output to be more structured (#3600)

YC há 10 meses atrás
pai
commit
9f8ca75a81
1 ficheiro alterado com 5 adições e 5 exclusões
  1. 5 5
      api/core/rag/extractor/excel_extractor.py

+ 5 - 5
api/core/rag/extractor/excel_extractor.py

@@ -39,8 +39,8 @@ class ExcelExtractor(BaseExtractor):
         documents = []
         documents = []
         # loop over all sheets
         # loop over all sheets
         for sheet in wb.sheets():
         for sheet in wb.sheets():
-            for row_index, row in enumerate(sheet.get_rows(), start=1):
-                row_header = None
+            row_header = None
+            for row_index, row in enumerate(sheet.get_rows(), start=1):                
                 if self.is_blank_row(row):
                 if self.is_blank_row(row):
                     continue
                     continue
                 if row_header is None:
                 if row_header is None:
@@ -49,8 +49,8 @@ class ExcelExtractor(BaseExtractor):
                 item_arr = []
                 item_arr = []
                 for index, cell in enumerate(row):
                 for index, cell in enumerate(row):
                     txt_value = str(cell.value)
                     txt_value = str(cell.value)
-                    item_arr.append(f'{row_header[index].value}:{txt_value}')
-                item_str = "\n".join(item_arr)
+                    item_arr.append(f'"{row_header[index].value}":"{txt_value}"')
+                item_str = ",".join(item_arr)
                 document = Document(page_content=item_str, metadata={'source': self._file_path})
                 document = Document(page_content=item_str, metadata={'source': self._file_path})
                 documents.append(document)
                 documents.append(document)
         return documents
         return documents
@@ -68,7 +68,7 @@ class ExcelExtractor(BaseExtractor):
 
 
             # transform each row into a Document
             # transform each row into a Document
             for _, row in df.iterrows():
             for _, row in df.iterrows():
-                item = ';'.join(f'{k}:{v}' for k, v in row.items() if pd.notna(v))
+                item = ';'.join(f'"{k}":"{v}"' for k, v in row.items() if pd.notna(v))
                 document = Document(page_content=item, metadata={'source': self._file_path})
                 document = Document(page_content=item, metadata={'source': self._file_path})
                 data.append(document)
                 data.append(document)
         return data
         return data