瀏覽代碼

chore: remove unused code and class in text splitter (#4864)

Bowen Liang 11 月之前
父節點
當前提交
5d15aca85f
共有 1 個文件被更改,包括 0 次插入和 369 次删除
  1. 0 369
      api/core/splitter/text_splitter.py

+ 0 - 369
api/core/splitter/text_splitter.py

@@ -6,7 +6,6 @@ import re
 from abc import ABC, abstractmethod
 from collections.abc import Callable, Collection, Iterable, Sequence, Set
 from dataclasses import dataclass
-from enum import Enum
 from typing import (
     Any,
     Literal,
@@ -477,27 +476,6 @@ class TokenTextSplitter(TextSplitter):
         return split_text_on_tokens(text=text, tokenizer=tokenizer)
 
 
-class Language(str, Enum):
-    """Enum of the programming languages."""
-
-    CPP = "cpp"
-    GO = "go"
-    JAVA = "java"
-    JS = "js"
-    PHP = "php"
-    PROTO = "proto"
-    PYTHON = "python"
-    RST = "rst"
-    RUBY = "ruby"
-    RUST = "rust"
-    SCALA = "scala"
-    SWIFT = "swift"
-    MARKDOWN = "markdown"
-    LATEX = "latex"
-    HTML = "html"
-    SOL = "sol"
-
-
 class RecursiveCharacterTextSplitter(TextSplitter):
     """Splitting text by recursively look at characters.
 
@@ -554,350 +532,3 @@ class RecursiveCharacterTextSplitter(TextSplitter):
 
     def split_text(self, text: str) -> list[str]:
         return self._split_text(text, self._separators)
-
-    @classmethod
-    def from_language(
-            cls, language: Language, **kwargs: Any
-    ) -> RecursiveCharacterTextSplitter:
-        separators = cls.get_separators_for_language(language)
-        return cls(separators=separators, **kwargs)
-
-    @staticmethod
-    def get_separators_for_language(language: Language) -> list[str]:
-        if language == Language.CPP:
-            return [
-                # Split along class definitions
-                "\nclass ",
-                # Split along function definitions
-                "\nvoid ",
-                "\nint ",
-                "\nfloat ",
-                "\ndouble ",
-                # Split along control flow statements
-                "\nif ",
-                "\nfor ",
-                "\nwhile ",
-                "\nswitch ",
-                "\ncase ",
-                # Split by the normal type of lines
-                "\n\n",
-                "\n",
-                " ",
-                "",
-            ]
-        elif language == Language.GO:
-            return [
-                # Split along function definitions
-                "\nfunc ",
-                "\nvar ",
-                "\nconst ",
-                "\ntype ",
-                # Split along control flow statements
-                "\nif ",
-                "\nfor ",
-                "\nswitch ",
-                "\ncase ",
-                # Split by the normal type of lines
-                "\n\n",
-                "\n",
-                " ",
-                "",
-            ]
-        elif language == Language.JAVA:
-            return [
-                # Split along class definitions
-                "\nclass ",
-                # Split along method definitions
-                "\npublic ",
-                "\nprotected ",
-                "\nprivate ",
-                "\nstatic ",
-                # Split along control flow statements
-                "\nif ",
-                "\nfor ",
-                "\nwhile ",
-                "\nswitch ",
-                "\ncase ",
-                # Split by the normal type of lines
-                "\n\n",
-                "\n",
-                " ",
-                "",
-            ]
-        elif language == Language.JS:
-            return [
-                # Split along function definitions
-                "\nfunction ",
-                "\nconst ",
-                "\nlet ",
-                "\nvar ",
-                "\nclass ",
-                # Split along control flow statements
-                "\nif ",
-                "\nfor ",
-                "\nwhile ",
-                "\nswitch ",
-                "\ncase ",
-                "\ndefault ",
-                # Split by the normal type of lines
-                "\n\n",
-                "\n",
-                " ",
-                "",
-            ]
-        elif language == Language.PHP:
-            return [
-                # Split along function definitions
-                "\nfunction ",
-                # Split along class definitions
-                "\nclass ",
-                # Split along control flow statements
-                "\nif ",
-                "\nforeach ",
-                "\nwhile ",
-                "\ndo ",
-                "\nswitch ",
-                "\ncase ",
-                # Split by the normal type of lines
-                "\n\n",
-                "\n",
-                " ",
-                "",
-            ]
-        elif language == Language.PROTO:
-            return [
-                # Split along message definitions
-                "\nmessage ",
-                # Split along service definitions
-                "\nservice ",
-                # Split along enum definitions
-                "\nenum ",
-                # Split along option definitions
-                "\noption ",
-                # Split along import statements
-                "\nimport ",
-                # Split along syntax declarations
-                "\nsyntax ",
-                # Split by the normal type of lines
-                "\n\n",
-                "\n",
-                " ",
-                "",
-            ]
-        elif language == Language.PYTHON:
-            return [
-                # First, try to split along class definitions
-                "\nclass ",
-                "\ndef ",
-                "\n\tdef ",
-                # Now split by the normal type of lines
-                "\n\n",
-                "\n",
-                " ",
-                "",
-            ]
-        elif language == Language.RST:
-            return [
-                # Split along section titles
-                "\n=+\n",
-                "\n-+\n",
-                "\n\\*+\n",
-                # Split along directive markers
-                "\n\n.. *\n\n",
-                # Split by the normal type of lines
-                "\n\n",
-                "\n",
-                " ",
-                "",
-            ]
-        elif language == Language.RUBY:
-            return [
-                # Split along method definitions
-                "\ndef ",
-                "\nclass ",
-                # Split along control flow statements
-                "\nif ",
-                "\nunless ",
-                "\nwhile ",
-                "\nfor ",
-                "\ndo ",
-                "\nbegin ",
-                "\nrescue ",
-                # Split by the normal type of lines
-                "\n\n",
-                "\n",
-                " ",
-                "",
-            ]
-        elif language == Language.RUST:
-            return [
-                # Split along function definitions
-                "\nfn ",
-                "\nconst ",
-                "\nlet ",
-                # Split along control flow statements
-                "\nif ",
-                "\nwhile ",
-                "\nfor ",
-                "\nloop ",
-                "\nmatch ",
-                "\nconst ",
-                # Split by the normal type of lines
-                "\n\n",
-                "\n",
-                " ",
-                "",
-            ]
-        elif language == Language.SCALA:
-            return [
-                # Split along class definitions
-                "\nclass ",
-                "\nobject ",
-                # Split along method definitions
-                "\ndef ",
-                "\nval ",
-                "\nvar ",
-                # Split along control flow statements
-                "\nif ",
-                "\nfor ",
-                "\nwhile ",
-                "\nmatch ",
-                "\ncase ",
-                # Split by the normal type of lines
-                "\n\n",
-                "\n",
-                " ",
-                "",
-            ]
-        elif language == Language.SWIFT:
-            return [
-                # Split along function definitions
-                "\nfunc ",
-                # Split along class definitions
-                "\nclass ",
-                "\nstruct ",
-                "\nenum ",
-                # Split along control flow statements
-                "\nif ",
-                "\nfor ",
-                "\nwhile ",
-                "\ndo ",
-                "\nswitch ",
-                "\ncase ",
-                # Split by the normal type of lines
-                "\n\n",
-                "\n",
-                " ",
-                "",
-            ]
-        elif language == Language.MARKDOWN:
-            return [
-                # First, try to split along Markdown headings (starting with level 2)
-                "\n#{1,6} ",
-                # Note the alternative syntax for headings (below) is not handled here
-                # Heading level 2
-                # ---------------
-                # End of code block
-                "```\n",
-                # Horizontal lines
-                "\n\\*\\*\\*+\n",
-                "\n---+\n",
-                "\n___+\n",
-                # Note that this splitter doesn't handle horizontal lines defined
-                # by *three or more* of ***, ---, or ___, but this is not handled
-                "\n\n",
-                "\n",
-                " ",
-                "",
-            ]
-        elif language == Language.LATEX:
-            return [
-                # First, try to split along Latex sections
-                "\n\\\\chapter{",
-                "\n\\\\section{",
-                "\n\\\\subsection{",
-                "\n\\\\subsubsection{",
-                # Now split by environments
-                "\n\\\begin{enumerate}",
-                "\n\\\begin{itemize}",
-                "\n\\\begin{description}",
-                "\n\\\begin{list}",
-                "\n\\\begin{quote}",
-                "\n\\\begin{quotation}",
-                "\n\\\begin{verse}",
-                "\n\\\begin{verbatim}",
-                # Now split by math environments
-                "\n\\\begin{align}",
-                "$$",
-                "$",
-                # Now split by the normal type of lines
-                " ",
-                "",
-            ]
-        elif language == Language.HTML:
-            return [
-                # First, try to split along HTML tags
-                "<body",
-                "<div",
-                "<p",
-                "<br",
-                "<li",
-                "<h1",
-                "<h2",
-                "<h3",
-                "<h4",
-                "<h5",
-                "<h6",
-                "<span",
-                "<table",
-                "<tr",
-                "<td",
-                "<th",
-                "<ul",
-                "<ol",
-                "<header",
-                "<footer",
-                "<nav",
-                # Head
-                "<head",
-                "<style",
-                "<script",
-                "<meta",
-                "<title",
-                "",
-            ]
-        elif language == Language.SOL:
-            return [
-                # Split along compiler information definitions
-                "\npragma ",
-                "\nusing ",
-                # Split along contract definitions
-                "\ncontract ",
-                "\ninterface ",
-                "\nlibrary ",
-                # Split along method definitions
-                "\nconstructor ",
-                "\ntype ",
-                "\nfunction ",
-                "\nevent ",
-                "\nmodifier ",
-                "\nerror ",
-                "\nstruct ",
-                "\nenum ",
-                # Split along control flow statements
-                "\nif ",
-                "\nfor ",
-                "\nwhile ",
-                "\ndo while ",
-                "\nassembly ",
-                # Split by the normal type of lines
-                "\n\n",
-                "\n",
-                " ",
-                "",
-            ]
-        else:
-            raise ValueError(
-                f"Language {language} is not supported! "
-                f"Please choose from {list(Language)}"
-            )