xlsx_parser.py 1.1 KB

123456789101112131415161718192021222324252627282930313233
  1. from pathlib import Path
  2. import json
  3. from typing import Dict
  4. from openpyxl import load_workbook
  5. from llama_index.readers.file.base_parser import BaseParser
  6. from flask import current_app
  7. class XLSXParser(BaseParser):
  8. """XLSX parser."""
  9. def _init_parser(self) -> Dict:
  10. """Init parser"""
  11. return {}
  12. def parse_file(self, file: Path, errors: str = "ignore") -> str:
  13. data = []
  14. keys = []
  15. with open(file, "r") as fp:
  16. wb = load_workbook(filename=file, read_only=True)
  17. # loop over all sheets
  18. for sheet in wb:
  19. for row in sheet.iter_rows(values_only=True):
  20. if all(v is None for v in row):
  21. continue
  22. if keys == []:
  23. keys = list(map(str, row))
  24. else:
  25. row_dict = dict(zip(keys, row))
  26. row_dict = {k: v for k, v in row_dict.items() if v}
  27. data.append(json.dumps(row_dict, ensure_ascii=False))
  28. return '\n\n'.join(data)