excel.py 1.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445
  1. import logging
  2. from langchain.document_loaders.base import BaseLoader
  3. from langchain.schema import Document
  4. from openpyxl.reader.excel import load_workbook
  5. logger = logging.getLogger(__name__)
  6. class ExcelLoader(BaseLoader):
  7. """Load xlxs files.
  8. Args:
  9. file_path: Path to the file to load.
  10. """
  11. def __init__(
  12. self,
  13. file_path: str
  14. ):
  15. """Initialize with file path."""
  16. self._file_path = file_path
  17. def load(self) -> list[Document]:
  18. data = []
  19. keys = []
  20. wb = load_workbook(filename=self._file_path, read_only=True)
  21. # loop over all sheets
  22. for sheet in wb:
  23. if 'A1:A1' == sheet.calculate_dimension():
  24. sheet.reset_dimensions()
  25. for row in sheet.iter_rows(values_only=True):
  26. if all(v is None for v in row):
  27. continue
  28. if keys == []:
  29. keys = list(map(str, row))
  30. else:
  31. row_dict = dict(zip(keys, list(map(str, row))))
  32. row_dict = {k: v for k, v in row_dict.items() if v}
  33. item = ''.join(f'{k}:{v};' for k, v in row_dict.items())
  34. document = Document(page_content=item, metadata={'source': self._file_path})
  35. data.append(document)
  36. return data