excel.py 1.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445
  1. import json
  2. import logging
  3. from typing import List
  4. from langchain.document_loaders.base import BaseLoader
  5. from langchain.schema import Document
  6. from openpyxl.reader.excel import load_workbook
  7. logger = logging.getLogger(__name__)
  8. class ExcelLoader(BaseLoader):
  9. """Load xlxs files.
  10. Args:
  11. file_path: Path to the file to load.
  12. """
  13. def __init__(
  14. self,
  15. file_path: str
  16. ):
  17. """Initialize with file path."""
  18. self._file_path = file_path
  19. def load(self) -> List[Document]:
  20. data = []
  21. keys = []
  22. wb = load_workbook(filename=self._file_path, read_only=True)
  23. # loop over all sheets
  24. for sheet in wb:
  25. for row in sheet.iter_rows(values_only=True):
  26. if all(v is None for v in row):
  27. continue
  28. if keys == []:
  29. keys = list(map(str, row))
  30. else:
  31. row_dict = dict(zip(keys, list(map(str, row))))
  32. row_dict = {k: v for k, v in row_dict.items() if v}
  33. item = ''.join(f'{k}:{v}\n' for k, v in row_dict.items())
  34. document = Document(page_content=item)
  35. data.append(document)
  36. return data