12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667 |
import csv
import logging
from typing import Optional, Dict, List

from langchain.document_loaders import CSVLoader as LCCSVLoader
from langchain.document_loaders.helpers import detect_file_encodings

from models.dataset import Document
- logger = logging.getLogger(__name__)  # module-level logger; CSVLoader.load emits debug messages through it
class CSVLoader(LCCSVLoader):
    """Load a CSV file into one ``Document`` per data row.

    Every row is rendered as newline-separated ``"<header>: <value>"`` pairs.
    The metadata of each document holds the zero-based row index under
    ``"row"`` and, under ``"source"``, the row's value in ``source_column``
    (an empty string when no source column is configured).

    NOTE(review): ``__init__`` assigns its attributes directly and does not
    call the base-class initializer, as before - confirm the base class needs
    no additional set-up.
    """

    def __init__(
        self,
        file_path: str,
        source_column: Optional[str] = None,
        csv_args: Optional[Dict] = None,
        encoding: Optional[str] = None,
        autodetect_encoding: bool = True,
    ):
        """Store the loader configuration.

        Args:
            file_path: Path of the CSV file to read.
            source_column: Header of the column whose value becomes the
                ``"source"`` metadata entry; ``None`` yields an empty source.
            csv_args: Extra keyword arguments forwarded to
                ``csv.DictReader``; ``None`` means no extra arguments.
            encoding: Encoding passed to ``open`` for the first read attempt.
            autodetect_encoding: When true, a ``UnicodeDecodeError`` on the
                first attempt triggers retries with detected encodings.
        """
        self.file_path = file_path
        self.source_column = source_column
        self.encoding = encoding
        self.csv_args = csv_args or {}
        self.autodetect_encoding = autodetect_encoding

    def load(self) -> List[Document]:
        """Load data into document objects.

        The file is first opened with ``self.encoding``. If decoding fails
        and ``autodetect_encoding`` is enabled, each encoding reported by
        ``detect_file_encodings`` is tried in order and the first one that
        decodes the whole file wins.

        Returns:
            One ``Document`` per CSV data row.

        Raises:
            RuntimeError: If the file cannot be decoded, either because
                autodetection is disabled or because no detected encoding
                worked.
            ValueError: If ``source_column`` is not a column of the file.
        """
        try:
            with open(self.file_path, newline="", encoding=self.encoding) as csvfile:
                return self._read_from_file(csvfile)
        except UnicodeDecodeError as e:
            if not self.autodetect_encoding:
                raise RuntimeError(f"Error loading {self.file_path}") from e

            for candidate in detect_file_encodings(self.file_path):
                # Lazy %-formatting: the placeholder is required, otherwise
                # logging reports a formatting error instead of the message.
                logger.debug("Trying encoding: %s", candidate.encoding)
                try:
                    with open(
                        self.file_path, newline="", encoding=candidate.encoding
                    ) as csvfile:
                        return self._read_from_file(csvfile)
                except UnicodeDecodeError:
                    continue

            # No candidate decoded the file: fail explicitly rather than
            # falling through to a return of an unassigned variable.
            raise RuntimeError(f"Error loading {self.file_path}") from e

    @staticmethod
    def _clean_cell(value) -> str:
        """Return a header or cell value as stripped text.

        ``csv.DictReader`` fills cells missing from a short row with ``None``
        and collects surplus cells of a long row into a list stored under a
        ``None`` key (unless ``restval``/``restkey`` are overridden), so
        neither keys nor values are guaranteed to be strings.
        """
        if value is None:
            return ""
        if isinstance(value, list):
            return ",".join(str(item).strip() for item in value)
        return str(value).strip()

    def _read_from_file(self, csvfile) -> List[Document]:
        """Convert every row of an open CSV file into a ``Document``.

        Args:
            csvfile: An open text-mode file object positioned at the header.

        Returns:
            The documents, in file order; empty when the file has no rows.

        Raises:
            ValueError: If ``source_column`` is set but absent from a row.
        """
        docs = []
        csv_reader = csv.DictReader(csvfile, **self.csv_args)  # type: ignore
        for i, row in enumerate(csv_reader):
            content = "\n".join(
                f"{self._clean_cell(k)}: {self._clean_cell(v)}"
                for k, v in row.items()
            )
            if self.source_column is None:
                source = ""
            else:
                try:
                    source = row[self.source_column]
                except KeyError as err:
                    raise ValueError(
                        f"Source column '{self.source_column}' not found in CSV file."
                    ) from err
            metadata = {"source": source, "row": i}
            docs.append(Document(page_content=content, metadata=metadata))
        return docs
|