html.py 779 B

12345678910111213141516171819202122232425262728293031323334
  1. import logging
  2. from bs4 import BeautifulSoup
  3. from langchain.document_loaders.base import BaseLoader
  4. from langchain.schema import Document
  5. logger = logging.getLogger(__name__)
  6. class HTMLLoader(BaseLoader):
  7. """Load html files.
  8. Args:
  9. file_path: Path to the file to load.
  10. """
  11. def __init__(
  12. self,
  13. file_path: str
  14. ):
  15. """Initialize with file path."""
  16. self._file_path = file_path
  17. def load(self) -> list[Document]:
  18. return [Document(page_content=self._load_as_text())]
  19. def _load_as_text(self) -> str:
  20. with open(self._file_path, "rb") as fp:
  21. soup = BeautifulSoup(fp, 'html.parser')
  22. text = soup.get_text()
  23. text = text.strip() if text else ''
  24. return text