|
@@ -0,0 +1,145 @@
|
|
|
+from collections.abc import Iterable
|
|
|
+from typing import Optional
|
|
|
+from urllib.parse import urljoin
|
|
|
+
|
|
|
+import requests
|
|
|
+
|
|
|
+from core.model_runtime.entities.common_entities import I18nObject
|
|
|
+from core.model_runtime.entities.model_entities import AIModelEntity, FetchFrom, ModelPropertyKey, ModelType
|
|
|
+from core.model_runtime.errors.invoke import InvokeBadRequestError
|
|
|
+from core.model_runtime.errors.validate import CredentialsValidateFailedError
|
|
|
+from core.model_runtime.model_providers.__base.tts_model import TTSModel
|
|
|
+from core.model_runtime.model_providers.openai_api_compatible._common import _CommonOaiApiCompat
|
|
|
+
|
|
|
+
|
|
|
class OAICompatText2SpeechModel(_CommonOaiApiCompat, TTSModel):
    """
    Model class for OpenAI-compatible text2speech model.

    Talks to any endpoint implementing the OpenAI ``/audio/speech`` API shape,
    streaming raw audio bytes back to the caller.
    """

    def _invoke(
        self,
        model: str,
        tenant_id: str,
        credentials: dict,
        content_text: str,
        voice: str,
        user: Optional[str] = None,
    ) -> Iterable[bytes]:
        """
        Invoke TTS model.

        :param model: model name
        :param tenant_id: user tenant id
        :param credentials: model credentials (expects ``endpoint_url``, optionally ``api_key``)
        :param content_text: text content to be translated
        :param voice: model voice/speaker
        :param user: unique user id
        :return: audio data as bytes iterator
        :raises InvokeBadRequestError: if the endpoint URL is missing or the server
            responds with a non-200 status
        """
        # Set up headers with authentication if provided
        headers: dict = {}
        if api_key := credentials.get("api_key"):
            headers["Authorization"] = f"Bearer {api_key}"

        # Construct endpoint URL. A missing endpoint_url previously crashed with
        # AttributeError on .endswith(); fail early with a clear error instead.
        endpoint_url = credentials.get("endpoint_url")
        if not endpoint_url:
            raise InvokeBadRequestError("endpoint_url is required in credentials")
        if not endpoint_url.endswith("/"):
            endpoint_url += "/"
        endpoint_url = urljoin(endpoint_url, "audio/speech")

        # Get audio format from model properties
        audio_format = self._get_model_audio_type(model, credentials)

        # Split text into chunks if needed based on word limit
        word_limit = self._get_model_word_limit(model, credentials)
        sentences = self._split_text_into_sentences(content_text, word_limit)

        for sentence in sentences:
            # Prepare request payload
            payload = {"model": model, "input": sentence, "voice": voice, "response_format": audio_format}

            # Make POST request; without a timeout, a hung connection would
            # block the worker indefinitely (requests has no default timeout).
            response = requests.post(
                endpoint_url, headers=headers, json=payload, stream=True, timeout=(10, 300)
            )

            if response.status_code != 200:
                raise InvokeBadRequestError(response.text)

            # Stream the audio data back to the caller chunk by chunk
            for chunk in response.iter_content(chunk_size=4096):
                if chunk:
                    yield chunk

    def validate_credentials(self, model: str, credentials: dict) -> None:
        """
        Validate model credentials by synthesizing a short test utterance.

        :param model: model name
        :param credentials: model credentials
        :raises CredentialsValidateFailedError: if the test invocation fails
        """
        try:
            # Get default voice for validation
            voice = self._get_model_default_voice(model, credentials)

            # Test with a simple text; next() forces at least one audio chunk
            # to actually be fetched from the endpoint.
            next(
                self._invoke(
                    model=model, tenant_id="validate", credentials=credentials, content_text="Test.", voice=voice
                )
            )
        except Exception as ex:
            # Preserve the original traceback for easier debugging
            raise CredentialsValidateFailedError(str(ex)) from ex

    def get_customizable_model_schema(self, model: str, credentials: dict) -> Optional[AIModelEntity]:
        """
        Get customizable model schema.

        Voices are parsed from the comma-separated ``voices`` credential;
        the first parsed voice becomes the default voice.
        """
        # Parse voices from comma-separated string
        voice_names = credentials.get("voices", "alloy").strip().split(",")
        voices = []

        for voice in voice_names:
            voice = voice.strip()
            if not voice:
                continue

            # Use en-US for all voices
            voices.append(
                {
                    "name": voice,
                    "mode": voice,
                    "language": "en-US",
                }
            )

        # If no voices provided or all voices were empty strings, use 'alloy' as default
        if not voices:
            voices = [{"name": "Alloy", "mode": "alloy", "language": "en-US"}]

        return AIModelEntity(
            model=model,
            label=I18nObject(en_US=model),
            fetch_from=FetchFrom.CUSTOMIZABLE_MODEL,
            model_type=ModelType.TTS,
            model_properties={
                ModelPropertyKey.AUDIO_TYPE: credentials.get("audio_type", "mp3"),
                ModelPropertyKey.WORD_LIMIT: int(credentials.get("word_limit", 4096)),
                ModelPropertyKey.DEFAULT_VOICE: voices[0]["mode"],
                ModelPropertyKey.VOICES: voices,
            },
        )

    def get_tts_model_voices(self, model: str, credentials: dict, language: Optional[str] = None) -> list:
        """
        Override base get_tts_model_voices to handle customizable voices.

        :param language: accepted for interface compatibility but intentionally
            ignored — all configured voices are returned regardless of language
        :raises ValueError: if the model schema declares no voices
        """
        model_schema = self.get_customizable_model_schema(model, credentials)

        if not model_schema or ModelPropertyKey.VOICES not in model_schema.model_properties:
            raise ValueError("this model does not support voice")

        voices = model_schema.model_properties[ModelPropertyKey.VOICES]

        # Always return all voices regardless of language
        return [{"name": d["name"], "value": d["mode"]} for d in voices]
|