|
@@ -6,7 +6,6 @@ import re
|
|
|
from abc import ABC, abstractmethod
|
|
|
from collections.abc import Callable, Collection, Iterable, Sequence, Set
|
|
|
from dataclasses import dataclass
|
|
|
-from enum import Enum
|
|
|
from typing import (
|
|
|
Any,
|
|
|
Literal,
|
|
@@ -477,27 +476,6 @@ class TokenTextSplitter(TextSplitter):
|
|
|
return split_text_on_tokens(text=text, tokenizer=tokenizer)
|
|
|
|
|
|
|
|
|
-class Language(str, Enum):
|
|
|
- """Enum of the programming languages."""
|
|
|
-
|
|
|
- CPP = "cpp"
|
|
|
- GO = "go"
|
|
|
- JAVA = "java"
|
|
|
- JS = "js"
|
|
|
- PHP = "php"
|
|
|
- PROTO = "proto"
|
|
|
- PYTHON = "python"
|
|
|
- RST = "rst"
|
|
|
- RUBY = "ruby"
|
|
|
- RUST = "rust"
|
|
|
- SCALA = "scala"
|
|
|
- SWIFT = "swift"
|
|
|
- MARKDOWN = "markdown"
|
|
|
- LATEX = "latex"
|
|
|
- HTML = "html"
|
|
|
- SOL = "sol"
|
|
|
-
|
|
|
-
|
|
|
class RecursiveCharacterTextSplitter(TextSplitter):
|
|
|
"""Splitting text by recursively look at characters.
|
|
|
|
|
@@ -554,350 +532,3 @@ class RecursiveCharacterTextSplitter(TextSplitter):
|
|
|
|
|
|
def split_text(self, text: str) -> list[str]:
|
|
|
return self._split_text(text, self._separators)
|
|
|
-
|
|
|
- @classmethod
|
|
|
- def from_language(
|
|
|
- cls, language: Language, **kwargs: Any
|
|
|
- ) -> RecursiveCharacterTextSplitter:
|
|
|
- separators = cls.get_separators_for_language(language)
|
|
|
- return cls(separators=separators, **kwargs)
|
|
|
-
|
|
|
- @staticmethod
|
|
|
- def get_separators_for_language(language: Language) -> list[str]:
|
|
|
- if language == Language.CPP:
|
|
|
- return [
|
|
|
- # Split along class definitions
|
|
|
- "\nclass ",
|
|
|
- # Split along function definitions
|
|
|
- "\nvoid ",
|
|
|
- "\nint ",
|
|
|
- "\nfloat ",
|
|
|
- "\ndouble ",
|
|
|
- # Split along control flow statements
|
|
|
- "\nif ",
|
|
|
- "\nfor ",
|
|
|
- "\nwhile ",
|
|
|
- "\nswitch ",
|
|
|
- "\ncase ",
|
|
|
- # Split by the normal type of lines
|
|
|
- "\n\n",
|
|
|
- "\n",
|
|
|
- " ",
|
|
|
- "",
|
|
|
- ]
|
|
|
- elif language == Language.GO:
|
|
|
- return [
|
|
|
- # Split along function definitions
|
|
|
- "\nfunc ",
|
|
|
- "\nvar ",
|
|
|
- "\nconst ",
|
|
|
- "\ntype ",
|
|
|
- # Split along control flow statements
|
|
|
- "\nif ",
|
|
|
- "\nfor ",
|
|
|
- "\nswitch ",
|
|
|
- "\ncase ",
|
|
|
- # Split by the normal type of lines
|
|
|
- "\n\n",
|
|
|
- "\n",
|
|
|
- " ",
|
|
|
- "",
|
|
|
- ]
|
|
|
- elif language == Language.JAVA:
|
|
|
- return [
|
|
|
- # Split along class definitions
|
|
|
- "\nclass ",
|
|
|
- # Split along method definitions
|
|
|
- "\npublic ",
|
|
|
- "\nprotected ",
|
|
|
- "\nprivate ",
|
|
|
- "\nstatic ",
|
|
|
- # Split along control flow statements
|
|
|
- "\nif ",
|
|
|
- "\nfor ",
|
|
|
- "\nwhile ",
|
|
|
- "\nswitch ",
|
|
|
- "\ncase ",
|
|
|
- # Split by the normal type of lines
|
|
|
- "\n\n",
|
|
|
- "\n",
|
|
|
- " ",
|
|
|
- "",
|
|
|
- ]
|
|
|
- elif language == Language.JS:
|
|
|
- return [
|
|
|
- # Split along function definitions
|
|
|
- "\nfunction ",
|
|
|
- "\nconst ",
|
|
|
- "\nlet ",
|
|
|
- "\nvar ",
|
|
|
- "\nclass ",
|
|
|
- # Split along control flow statements
|
|
|
- "\nif ",
|
|
|
- "\nfor ",
|
|
|
- "\nwhile ",
|
|
|
- "\nswitch ",
|
|
|
- "\ncase ",
|
|
|
- "\ndefault ",
|
|
|
- # Split by the normal type of lines
|
|
|
- "\n\n",
|
|
|
- "\n",
|
|
|
- " ",
|
|
|
- "",
|
|
|
- ]
|
|
|
- elif language == Language.PHP:
|
|
|
- return [
|
|
|
- # Split along function definitions
|
|
|
- "\nfunction ",
|
|
|
- # Split along class definitions
|
|
|
- "\nclass ",
|
|
|
- # Split along control flow statements
|
|
|
- "\nif ",
|
|
|
- "\nforeach ",
|
|
|
- "\nwhile ",
|
|
|
- "\ndo ",
|
|
|
- "\nswitch ",
|
|
|
- "\ncase ",
|
|
|
- # Split by the normal type of lines
|
|
|
- "\n\n",
|
|
|
- "\n",
|
|
|
- " ",
|
|
|
- "",
|
|
|
- ]
|
|
|
- elif language == Language.PROTO:
|
|
|
- return [
|
|
|
- # Split along message definitions
|
|
|
- "\nmessage ",
|
|
|
- # Split along service definitions
|
|
|
- "\nservice ",
|
|
|
- # Split along enum definitions
|
|
|
- "\nenum ",
|
|
|
- # Split along option definitions
|
|
|
- "\noption ",
|
|
|
- # Split along import statements
|
|
|
- "\nimport ",
|
|
|
- # Split along syntax declarations
|
|
|
- "\nsyntax ",
|
|
|
- # Split by the normal type of lines
|
|
|
- "\n\n",
|
|
|
- "\n",
|
|
|
- " ",
|
|
|
- "",
|
|
|
- ]
|
|
|
- elif language == Language.PYTHON:
|
|
|
- return [
|
|
|
- # First, try to split along class definitions
|
|
|
- "\nclass ",
|
|
|
- "\ndef ",
|
|
|
- "\n\tdef ",
|
|
|
- # Now split by the normal type of lines
|
|
|
- "\n\n",
|
|
|
- "\n",
|
|
|
- " ",
|
|
|
- "",
|
|
|
- ]
|
|
|
- elif language == Language.RST:
|
|
|
- return [
|
|
|
- # Split along section titles
|
|
|
- "\n=+\n",
|
|
|
- "\n-+\n",
|
|
|
- "\n\\*+\n",
|
|
|
- # Split along directive markers
|
|
|
- "\n\n.. *\n\n",
|
|
|
- # Split by the normal type of lines
|
|
|
- "\n\n",
|
|
|
- "\n",
|
|
|
- " ",
|
|
|
- "",
|
|
|
- ]
|
|
|
- elif language == Language.RUBY:
|
|
|
- return [
|
|
|
- # Split along method definitions
|
|
|
- "\ndef ",
|
|
|
- "\nclass ",
|
|
|
- # Split along control flow statements
|
|
|
- "\nif ",
|
|
|
- "\nunless ",
|
|
|
- "\nwhile ",
|
|
|
- "\nfor ",
|
|
|
- "\ndo ",
|
|
|
- "\nbegin ",
|
|
|
- "\nrescue ",
|
|
|
- # Split by the normal type of lines
|
|
|
- "\n\n",
|
|
|
- "\n",
|
|
|
- " ",
|
|
|
- "",
|
|
|
- ]
|
|
|
- elif language == Language.RUST:
|
|
|
- return [
|
|
|
- # Split along function definitions
|
|
|
- "\nfn ",
|
|
|
- "\nconst ",
|
|
|
- "\nlet ",
|
|
|
- # Split along control flow statements
|
|
|
- "\nif ",
|
|
|
- "\nwhile ",
|
|
|
- "\nfor ",
|
|
|
- "\nloop ",
|
|
|
- "\nmatch ",
|
|
|
- "\nconst ",
|
|
|
- # Split by the normal type of lines
|
|
|
- "\n\n",
|
|
|
- "\n",
|
|
|
- " ",
|
|
|
- "",
|
|
|
- ]
|
|
|
- elif language == Language.SCALA:
|
|
|
- return [
|
|
|
- # Split along class definitions
|
|
|
- "\nclass ",
|
|
|
- "\nobject ",
|
|
|
- # Split along method definitions
|
|
|
- "\ndef ",
|
|
|
- "\nval ",
|
|
|
- "\nvar ",
|
|
|
- # Split along control flow statements
|
|
|
- "\nif ",
|
|
|
- "\nfor ",
|
|
|
- "\nwhile ",
|
|
|
- "\nmatch ",
|
|
|
- "\ncase ",
|
|
|
- # Split by the normal type of lines
|
|
|
- "\n\n",
|
|
|
- "\n",
|
|
|
- " ",
|
|
|
- "",
|
|
|
- ]
|
|
|
- elif language == Language.SWIFT:
|
|
|
- return [
|
|
|
- # Split along function definitions
|
|
|
- "\nfunc ",
|
|
|
- # Split along class definitions
|
|
|
- "\nclass ",
|
|
|
- "\nstruct ",
|
|
|
- "\nenum ",
|
|
|
- # Split along control flow statements
|
|
|
- "\nif ",
|
|
|
- "\nfor ",
|
|
|
- "\nwhile ",
|
|
|
- "\ndo ",
|
|
|
- "\nswitch ",
|
|
|
- "\ncase ",
|
|
|
- # Split by the normal type of lines
|
|
|
- "\n\n",
|
|
|
- "\n",
|
|
|
- " ",
|
|
|
- "",
|
|
|
- ]
|
|
|
- elif language == Language.MARKDOWN:
|
|
|
- return [
|
|
|
- # First, try to split along Markdown headings (starting with level 2)
|
|
|
- "\n#{1,6} ",
|
|
|
- # Note the alternative syntax for headings (below) is not handled here
|
|
|
- # Heading level 2
|
|
|
- # ---------------
|
|
|
- # End of code block
|
|
|
- "```\n",
|
|
|
- # Horizontal lines
|
|
|
- "\n\\*\\*\\*+\n",
|
|
|
- "\n---+\n",
|
|
|
- "\n___+\n",
|
|
|
- # Note that this splitter doesn't handle horizontal lines defined
|
|
|
- # by *three or more* of ***, ---, or ___, but this is not handled
|
|
|
- "\n\n",
|
|
|
- "\n",
|
|
|
- " ",
|
|
|
- "",
|
|
|
- ]
|
|
|
- elif language == Language.LATEX:
|
|
|
- return [
|
|
|
- # First, try to split along Latex sections
|
|
|
- "\n\\\\chapter{",
|
|
|
- "\n\\\\section{",
|
|
|
- "\n\\\\subsection{",
|
|
|
- "\n\\\\subsubsection{",
|
|
|
- # Now split by environments
|
|
|
- "\n\\\begin{enumerate}",
|
|
|
- "\n\\\begin{itemize}",
|
|
|
- "\n\\\begin{description}",
|
|
|
- "\n\\\begin{list}",
|
|
|
- "\n\\\begin{quote}",
|
|
|
- "\n\\\begin{quotation}",
|
|
|
- "\n\\\begin{verse}",
|
|
|
- "\n\\\begin{verbatim}",
|
|
|
- # Now split by math environments
|
|
|
- "\n\\\begin{align}",
|
|
|
- "$$",
|
|
|
- "$",
|
|
|
- # Now split by the normal type of lines
|
|
|
- " ",
|
|
|
- "",
|
|
|
- ]
|
|
|
- elif language == Language.HTML:
|
|
|
- return [
|
|
|
- # First, try to split along HTML tags
|
|
|
- "<body",
|
|
|
- "<div",
|
|
|
- "<p",
|
|
|
- "<br",
|
|
|
- "<li",
|
|
|
- "<h1",
|
|
|
- "<h2",
|
|
|
- "<h3",
|
|
|
- "<h4",
|
|
|
- "<h5",
|
|
|
- "<h6",
|
|
|
- "<span",
|
|
|
- "<table",
|
|
|
- "<tr",
|
|
|
- "<td",
|
|
|
- "<th",
|
|
|
- "<ul",
|
|
|
- "<ol",
|
|
|
- "<header",
|
|
|
- "<footer",
|
|
|
- "<nav",
|
|
|
- # Head
|
|
|
- "<head",
|
|
|
- "<style",
|
|
|
- "<script",
|
|
|
- "<meta",
|
|
|
- "<title",
|
|
|
- "",
|
|
|
- ]
|
|
|
- elif language == Language.SOL:
|
|
|
- return [
|
|
|
- # Split along compiler information definitions
|
|
|
- "\npragma ",
|
|
|
- "\nusing ",
|
|
|
- # Split along contract definitions
|
|
|
- "\ncontract ",
|
|
|
- "\ninterface ",
|
|
|
- "\nlibrary ",
|
|
|
- # Split along method definitions
|
|
|
- "\nconstructor ",
|
|
|
- "\ntype ",
|
|
|
- "\nfunction ",
|
|
|
- "\nevent ",
|
|
|
- "\nmodifier ",
|
|
|
- "\nerror ",
|
|
|
- "\nstruct ",
|
|
|
- "\nenum ",
|
|
|
- # Split along control flow statements
|
|
|
- "\nif ",
|
|
|
- "\nfor ",
|
|
|
- "\nwhile ",
|
|
|
- "\ndo while ",
|
|
|
- "\nassembly ",
|
|
|
- # Split by the normal type of lines
|
|
|
- "\n\n",
|
|
|
- "\n",
|
|
|
- " ",
|
|
|
- "",
|
|
|
- ]
|
|
|
- else:
|
|
|
- raise ValueError(
|
|
|
- f"Language {language} is not supported! "
|
|
|
- f"Please choose from {list(Language)}"
|
|
|
- )
|