document.py 2.9 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394
  1. from abc import ABC, abstractmethod
  2. from collections.abc import Sequence
  3. from typing import Any, Optional
  4. from pydantic import BaseModel
  5. class ChildDocument(BaseModel):
  6. """Class for storing a piece of text and associated metadata."""
  7. page_content: str
  8. vector: Optional[list[float]] = None
  9. """Arbitrary metadata about the page content (e.g., source, relationships to other
  10. documents, etc.).
  11. """
  12. metadata: dict = {}
  13. class Document(BaseModel):
  14. """Class for storing a piece of text and associated metadata."""
  15. page_content: str
  16. vector: Optional[list[float]] = None
  17. """Arbitrary metadata about the page content (e.g., source, relationships to other
  18. documents, etc.).
  19. """
  20. metadata: dict = {}
  21. provider: Optional[str] = "dify"
  22. children: Optional[list[ChildDocument]] = None
  23. class BaseDocumentTransformer(ABC):
  24. """Abstract base class for document transformation systems.
  25. A document transformation system takes a sequence of Documents and returns a
  26. sequence of transformed Documents.
  27. Example:
  28. .. code-block:: python
  29. class EmbeddingsRedundantFilter(BaseDocumentTransformer, BaseModel):
  30. embeddings: Embeddings
  31. similarity_fn: Callable = cosine_similarity
  32. similarity_threshold: float = 0.95
  33. class Config:
  34. arbitrary_types_allowed = True
  35. def transform_documents(
  36. self, documents: Sequence[Document], **kwargs: Any
  37. ) -> Sequence[Document]:
  38. stateful_documents = get_stateful_documents(documents)
  39. embedded_documents = _get_embeddings_from_stateful_docs(
  40. self.embeddings, stateful_documents
  41. )
  42. included_idxs = _filter_similar_embeddings(
  43. embedded_documents, self.similarity_fn, self.similarity_threshold
  44. )
  45. return [stateful_documents[i] for i in sorted(included_idxs)]
  46. async def atransform_documents(
  47. self, documents: Sequence[Document], **kwargs: Any
  48. ) -> Sequence[Document]:
  49. raise NotImplementedError
  50. """
  51. @abstractmethod
  52. def transform_documents(self, documents: Sequence[Document], **kwargs: Any) -> Sequence[Document]:
  53. """Transform a list of documents.
  54. Args:
  55. documents: A sequence of Documents to be transformed.
  56. Returns:
  57. A list of transformed Documents.
  58. """
  59. @abstractmethod
  60. async def atransform_documents(self, documents: Sequence[Document], **kwargs: Any) -> Sequence[Document]:
  61. """Asynchronously transform a list of documents.
  62. Args:
  63. documents: A sequence of Documents to be transformed.
  64. Returns:
  65. A list of transformed Documents.
  66. """