瀏覽代碼

fix: split chunks return empty strings (#2197)

takatost 1 年之前
父節點
當前提交
6cf93379b3

+ 3 - 1
api/core/indexing_runner.py

@@ -655,7 +655,9 @@ class IndexingRunner:
                     else:
                         page_content = page_content
                     document_node.page_content = page_content
-                    split_documents.append(document_node)
+
+                    if document_node.page_content:
+                        split_documents.append(document_node)
             all_documents.extend(split_documents)
         # processing qa document
         if document_form == 'qa_model':

+ 3 - 3
api/core/model_runtime/model_providers/azure_openai/text_embedding/text_embedding.py

@@ -1,7 +1,7 @@
 import base64
 import copy
 import time
-from typing import Optional, Tuple
+from typing import Optional, Tuple, Union
 
 import numpy as np
 import tiktoken
@@ -76,7 +76,7 @@ class AzureOpenAITextEmbeddingModel(_CommonAzureOpenAI, TextEmbeddingModel):
                 embeddings_batch, embedding_used_tokens = self._embedding_invoke(
                     model=model,
                     client=client,
-                    texts=[""],
+                    texts="",
                     extra_model_kwargs=extra_model_kwargs
                 )
 
@@ -147,7 +147,7 @@ class AzureOpenAITextEmbeddingModel(_CommonAzureOpenAI, TextEmbeddingModel):
         return ai_model_entity.entity
 
     @staticmethod
-    def _embedding_invoke(model: str, client: AzureOpenAI, texts: list[str],
+    def _embedding_invoke(model: str, client: AzureOpenAI, texts: Union[list[str], str],
                           extra_model_kwargs: dict) -> Tuple[list[list[float]], int]:
         response = client.embeddings.create(
             input=texts,

+ 4 - 1
api/core/model_runtime/model_providers/cohere/text_embedding/text_embedding.py

@@ -76,7 +76,7 @@ class CohereTextEmbeddingModel(TextEmbeddingModel):
                 embeddings_batch, embedding_used_tokens = self._embedding_invoke(
                     model=model,
                     credentials=credentials,
-                    texts=[""]
+                    texts=[" "]
                 )
 
                 used_tokens += embedding_used_tokens
@@ -131,6 +131,9 @@ class CohereTextEmbeddingModel(TextEmbeddingModel):
         :param text: text to tokenize
         :return:
         """
+        if not text:
+            return Tokens([], [], {})
+
         # initialize client
         client = cohere.Client(credentials.get('api_key'))
 

+ 3 - 3
api/core/model_runtime/model_providers/openai/text_embedding/text_embedding.py

@@ -1,6 +1,6 @@
 import base64
 import time
-from typing import Optional, Tuple
+from typing import Optional, Tuple, Union
 
 import numpy as np
 import tiktoken
@@ -89,7 +89,7 @@ class OpenAITextEmbeddingModel(_CommonOpenAI, TextEmbeddingModel):
                 embeddings_batch, embedding_used_tokens = self._embedding_invoke(
                     model=model,
                     client=client,
-                    texts=[""],
+                    texts="",
                     extra_model_kwargs=extra_model_kwargs
                 )
 
@@ -160,7 +160,7 @@ class OpenAITextEmbeddingModel(_CommonOpenAI, TextEmbeddingModel):
         except Exception as ex:
             raise CredentialsValidateFailedError(str(ex))
 
-    def _embedding_invoke(self, model: str, client: OpenAI, texts: list[str],
+    def _embedding_invoke(self, model: str, client: OpenAI, texts: Union[list[str], str],
                           extra_model_kwargs: dict) -> Tuple[list[list[float]], int]:
         """
         Invoke embedding model