Browse Source

tts: add voice selection (#2391)

Co-authored-by: luowei <glpat-EjySCyNjWiLqAED-YmwM>
Co-authored-by: crazywoola <427733928@qq.com>
Co-authored-by: crazywoola <100913391+crazywoola@users.noreply.github.com>
Charlie.Wei 1 year ago
parent
commit
300d9892a5
35 changed files with 746 additions and 92 deletions
  1. api/controllers/console/app/audio.py  +59 -4
  2. api/controllers/console/explore/audio.py  +1 -0
  3. api/controllers/service_api/app/audio.py  +1 -0
  4. api/controllers/web/audio.py  +8 -2
  5. api/core/application_manager.py  +6 -1
  6. api/core/entities/application_entities.py  +19 -2
  7. api/core/model_manager.py  +25 -3
  8. api/core/model_runtime/docs/en_US/schema.md  +4 -0
  9. api/core/model_runtime/docs/zh_Hans/schema.md  +5 -1
  10. api/core/model_runtime/entities/model_entities.py  +1 -0
  11. api/core/model_runtime/model_providers/__base/tts_model.py  +27 -4
  12. api/core/model_runtime/model_providers/openai/tts/tts-1-hd.yaml  +25 -1
  13. api/core/model_runtime/model_providers/openai/tts/tts-1.yaml  +24 -0
  14. api/core/model_runtime/model_providers/openai/tts/tts.py  +41 -29
  15. api/core/model_runtime/model_providers/tongyi/tts/tts-1.yaml  +128 -1
  16. api/core/model_runtime/model_providers/tongyi/tts/tts.py  +43 -31
  17. api/services/app_model_config_service.py  +5 -1
  18. api/services/audio_service.py  +19 -4
  19. api/services/errors/audio.py  +4 -0
  20. web/app/components/app/configuration/base/feature-panel/index.tsx  +10 -1
  21. web/app/components/app/configuration/config-voice/param-config-content.tsx  +187 -0
  22. web/app/components/app/configuration/config-voice/param-config.tsx  +41 -0
  23. web/app/components/app/configuration/config/index.tsx  +3 -0
  24. web/app/components/app/configuration/debug/debug-with-multiple-model/text-generation-item.tsx  +5 -0
  25. web/app/components/app/configuration/debug/index.tsx  +5 -3
  26. web/app/components/app/configuration/features/chat-group/text-to-speech/index.tsx  +1 -0
  27. web/app/components/app/configuration/index.tsx  +6 -1
  28. web/app/components/base/chat/chat/answer/operation.tsx  +2 -1
  29. web/context/debug-configuration.ts  +2 -0
  30. web/i18n/lang/app-debug.en.ts  +11 -0
  31. web/i18n/lang/app-debug.zh.ts  +11 -0
  32. web/models/app.ts  +5 -0
  33. web/models/debug.ts  +5 -1
  34. web/service/apps.ts  +5 -1
  35. web/types/app.ts  +2 -0

+ 59 - 4
api/controllers/console/app/audio.py

@@ -1,7 +1,7 @@
 import logging
 
 from flask import request
-from flask_restful import Resource
+from flask_restful import Resource, reqparse
 from werkzeug.exceptions import InternalServerError
 
 import services
@@ -23,6 +23,7 @@ from controllers.console.wraps import account_initialization_required
 from core.errors.error import ModelCurrentlyNotSupportError, ProviderTokenNotInitError, QuotaExceededError
 from core.model_runtime.errors.invoke import InvokeError
 from libs.login import login_required
+from models.model import AppModelConfig
 from services.audio_service import AudioService
 from services.errors.audio import (
     AudioTooLargeServiceError,
@@ -45,7 +46,9 @@ class ChatMessageAudioApi(Resource):
         try:
             response = AudioService.transcript_asr(
                 tenant_id=app_model.tenant_id,
-                file=file
+                file=file,
+                end_user=None,
+                prompt=app_model.app_model_config.pre_prompt
             )
 
             return response
@@ -71,7 +74,7 @@ class ChatMessageAudioApi(Resource):
         except ValueError as e:
             raise e
         except Exception as e:
-            logging.exception("internal server error.")
+            logging.exception(f"internal server error, {str(e)}.")
             raise InternalServerError()
 
 
@@ -82,10 +85,17 @@ class ChatMessageTextApi(Resource):
     def post(self, app_id):
         app_id = str(app_id)
         app_model = _get_app(app_id, None)
+
+        app_model_config: AppModelConfig = app_model.app_model_config
+
+        if not app_model_config.text_to_speech_dict['enabled']:
+            raise AppUnavailableError()
+
         try:
             response = AudioService.transcript_tts(
                 tenant_id=app_model.tenant_id,
                 text=request.form['text'],
+                voice=app_model.app_model_config.text_to_speech_dict.get('voice'),
                 streaming=False
             )
 
@@ -112,9 +122,54 @@ class ChatMessageTextApi(Resource):
         except ValueError as e:
             raise e
         except Exception as e:
-            logging.exception("internal server error.")
+            logging.exception(f"internal server error, {str(e)}.")
+            raise InternalServerError()
+
+
+class TextModesApi(Resource):
+    def get(self, app_id: str):
+        app_model = _get_app(str(app_id))
+        app_model_config: AppModelConfig = app_model.app_model_config
+
+        if not app_model_config.text_to_speech_dict['enabled']:
+            raise AppUnavailableError()
+
+        try:
+            parser = reqparse.RequestParser()
+            parser.add_argument('language', type=str, required=True, location='args')
+            args = parser.parse_args()
+
+            response = AudioService.transcript_tts_voices(
+                tenant_id=app_model.tenant_id,
+                language=args['language'],
+            )
+
+            return response
+        except services.errors.audio.ProviderNotSupportTextToSpeechLanguageServiceError:
+            raise AppUnavailableError("Text-to-audio voices language parameter is missing.")
+        except NoAudioUploadedServiceError:
+            raise NoAudioUploadedError()
+        except AudioTooLargeServiceError as e:
+            raise AudioTooLargeError(str(e))
+        except UnsupportedAudioTypeServiceError:
+            raise UnsupportedAudioTypeError()
+        except ProviderNotSupportSpeechToTextServiceError:
+            raise ProviderNotSupportSpeechToTextError()
+        except ProviderTokenNotInitError as ex:
+            raise ProviderNotInitializeError(ex.description)
+        except QuotaExceededError:
+            raise ProviderQuotaExceededError()
+        except ModelCurrentlyNotSupportError:
+            raise ProviderModelCurrentlyNotSupportError()
+        except InvokeError as e:
+            raise CompletionRequestError(e.description)
+        except ValueError as e:
+            raise e
+        except Exception as e:
+            logging.exception(f"internal server error, {str(e)}.")
             raise InternalServerError()
 
 
 api.add_resource(ChatMessageAudioApi, '/apps/<uuid:app_id>/audio-to-text')
 api.add_resource(ChatMessageTextApi, '/apps/<uuid:app_id>/text-to-audio')
+api.add_resource(TextModesApi, '/apps/<uuid:app_id>/text-to-audio/voices')
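
For a quick smoke test of the new voices route, a minimal sketch using the requests library. The base URL and app id are placeholders, and the console API also expects an authenticated session, which is omitted here:

    import requests

    # Placeholders: adjust host/prefix/app id to your deployment; console
    # endpoints additionally require a logged-in session (not shown).
    BASE_URL = 'http://localhost:5001/console/api'
    APP_ID = '<app-uuid>'

    # GET /apps/<app_id>/text-to-audio/voices?language=en-US returns the
    # [{'name': ..., 'value': ...}] list built by get_tts_model_voices().
    resp = requests.get(
        f'{BASE_URL}/apps/{APP_ID}/text-to-audio/voices',
        params={'language': 'en-US'},
    )
    print(resp.json())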

+ 1 - 0
api/controllers/console/explore/audio.py

@@ -85,6 +85,7 @@ class ChatTextApi(InstalledAppResource):
             response = AudioService.transcript_tts(
                 tenant_id=app_model.tenant_id,
                 text=request.form['text'],
+                voice=app_model.app_model_config.text_to_speech_dict.get('voice'),
                 streaming=False
             )
             return {'data': response.data.decode('latin1')}

+ 1 - 0
api/controllers/service_api/app/audio.py

@@ -86,6 +86,7 @@ class TextApi(AppApiResource):
                 tenant_id=app_model.tenant_id,
                 text=args['text'],
                 end_user=args['user'],
+                voice=app_model.app_model_config.text_to_speech_dict.get('voice'),
                 streaming=args['streaming']
             )
 

+ 8 - 2
api/controllers/web/audio.py

@@ -68,17 +68,23 @@ class AudioApi(WebApiResource):
         except ValueError as e:
             raise e
         except Exception as e:
-            logging.exception("internal server error.")
+            logging.exception(f"internal server error: {str(e)}")
             raise InternalServerError()
 
 
 class TextApi(WebApiResource):
     def post(self, app_model: App, end_user):
+        app_model_config: AppModelConfig = app_model.app_model_config
+
+        if not app_model_config.text_to_speech_dict['enabled']:
+            raise AppUnavailableError()
+
         try:
             response = AudioService.transcript_tts(
                 tenant_id=app_model.tenant_id,
                 text=request.form['text'],
                 end_user=end_user.external_user_id,
+                voice=app_model.app_model_config.text_to_speech_dict.get('voice'),
                 streaming=False
             )
 
@@ -105,7 +111,7 @@ class TextApi(WebApiResource):
         except ValueError as e:
             raise e
         except Exception as e:
-            logging.exception("internal server error.")
+            logging.exception(f"internal server error: {str(e)}")
             raise InternalServerError()
 
 

+ 6 - 1
api/core/application_manager.py

@@ -28,6 +28,7 @@ from core.entities.application_entities import (
     ModelConfigEntity,
     PromptTemplateEntity,
     SensitiveWordAvoidanceEntity,
+    TextToSpeechEntity,
 )
 from core.entities.model_entities import ModelStatus
 from core.errors.error import ModelCurrentlyNotSupportError, ProviderTokenNotInitError, QuotaExceededError
@@ -572,7 +573,11 @@ class ApplicationManager:
         text_to_speech_dict = copy_app_model_config_dict.get('text_to_speech')
         if text_to_speech_dict:
             if 'enabled' in text_to_speech_dict and text_to_speech_dict['enabled']:
-                properties['text_to_speech'] = True
+                properties['text_to_speech'] = TextToSpeechEntity(
+                    enabled=text_to_speech_dict.get('enabled'),
+                    voice=text_to_speech_dict.get('voice'),
+                    language=text_to_speech_dict.get('language'),
+                )
 
         # sensitive word avoidance
         sensitive_word_avoidance_dict = copy_app_model_config_dict.get('sensitive_word_avoidance')

+ 19 - 2
api/core/entities/application_entities.py

@@ -42,6 +42,7 @@ class AdvancedCompletionPromptTemplateEntity(BaseModel):
     """
     Advanced Completion Prompt Template Entity.
     """
+
     class RolePrefixEntity(BaseModel):
         """
         Role Prefix Entity.
@@ -57,6 +58,7 @@ class PromptTemplateEntity(BaseModel):
     """
     Prompt Template Entity.
     """
+
     class PromptType(Enum):
         """
         Prompt Type.
@@ -97,6 +99,7 @@ class DatasetRetrieveConfigEntity(BaseModel):
     """
     Dataset Retrieve Config Entity.
     """
+
     class RetrieveStrategy(Enum):
         """
         Dataset Retrieve Strategy.
@@ -143,6 +146,15 @@ class SensitiveWordAvoidanceEntity(BaseModel):
     config: dict[str, Any] = {}
 
 
+class TextToSpeechEntity(BaseModel):
+    """
+    Text To Speech Entity.
+    """
+    enabled: bool
+    voice: Optional[str] = None
+    language: Optional[str] = None
+
+
 class FileUploadEntity(BaseModel):
     """
     File Upload Entity.
@@ -159,6 +171,7 @@ class AgentToolEntity(BaseModel):
     tool_name: str
     tool_parameters: dict[str, Any] = {}
 
+
 class AgentPromptEntity(BaseModel):
     """
     Agent Prompt Entity.
@@ -166,6 +179,7 @@ class AgentPromptEntity(BaseModel):
     first_prompt: str
     next_iteration: str
 
+
 class AgentScratchpadUnit(BaseModel):
     """
     Agent First Prompt Entity.
@@ -182,12 +196,14 @@ class AgentScratchpadUnit(BaseModel):
     thought: Optional[str] = None
     action_str: Optional[str] = None
     observation: Optional[str] = None
-    action: Optional[Action] = None    
+    action: Optional[Action] = None
+
 
 class AgentEntity(BaseModel):
     """
     Agent Entity.
     """
+
     class Strategy(Enum):
         """
         Agent Strategy.
@@ -202,6 +218,7 @@ class AgentEntity(BaseModel):
     tools: list[AgentToolEntity] = None
     max_iteration: int = 5
 
+
 class AppOrchestrationConfigEntity(BaseModel):
     """
     App Orchestration Config Entity.
@@ -219,7 +236,7 @@ class AppOrchestrationConfigEntity(BaseModel):
     show_retrieve_source: bool = False
     more_like_this: bool = False
     speech_to_text: bool = False
-    text_to_speech: bool = False
+    text_to_speech: dict = {}
     sensitive_word_avoidance: Optional[SensitiveWordAvoidanceEntity] = None
 
 

+ 25 - 3
api/core/model_manager.py

@@ -99,7 +99,8 @@ class ModelInstance:
             user=user
         )
 
-    def invoke_rerank(self, query: str, docs: list[str], score_threshold: Optional[float] = None, top_n: Optional[int] = None,
+    def invoke_rerank(self, query: str, docs: list[str], score_threshold: Optional[float] = None,
+                      top_n: Optional[int] = None,
                       user: Optional[str] = None) \
             -> RerankResult:
         """
@@ -166,13 +167,15 @@ class ModelInstance:
             user=user
         )
 
-    def invoke_tts(self, content_text: str, streaming: bool, user: Optional[str] = None) \
+    def invoke_tts(self, content_text: str, tenant_id: str, voice: str, streaming: bool, user: Optional[str] = None) \
             -> str:
         """
-        Invoke large language model
+        Invoke TTS model
 
         :param content_text: text content to be translated
+        :param tenant_id: user tenant id
         :param user: unique user id
+        :param voice: model timbre
         :param streaming: output is streaming
         :return: text for given audio file
         """
@@ -185,9 +188,28 @@ class ModelInstance:
             credentials=self.credentials,
             content_text=content_text,
             user=user,
+            tenant_id=tenant_id,
+            voice=voice,
             streaming=streaming
         )
 
+    def get_tts_voices(self, language: str) -> list:
+        """
+        Get voices for the given TTS model
+
+        :param language: tts language
+        :return: tts model voices
+        """
+        if not isinstance(self.model_type_instance, TTSModel):
+            raise Exception("Model type instance is not TTSModel")
+
+        self.model_type_instance = cast(TTSModel, self.model_type_instance)
+        return self.model_type_instance.get_tts_model_voices(
+            model=self.model,
+            credentials=self.credentials,
+            language=language
+        )
+
 
 class ModelManager:
     def __init__(self) -> None:
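
Taken together, a caller resolves a TTS-capable ModelInstance and then uses the two methods above, which is the pattern AudioService follows. A condensed sketch; the tenant id is a placeholder and a configured default TTS model is assumed:

    from core.model_manager import ModelManager
    from core.model_runtime.entities.model_entities import ModelType

    # Placeholder tenant; assumes the tenant has a default TTS model configured.
    instance = ModelManager().get_default_model_instance(
        tenant_id='<tenant-id>',
        model_type=ModelType.TTS,
    )
    voices = instance.get_tts_voices(language='en-US')  # [{'name': ..., 'value': ...}]
    audio = instance.invoke_tts(
        content_text='Hello Dify!',
        tenant_id='<tenant-id>',
        voice=voices[0]['value'] if voices else '',
        streaming=False,
    )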

+ 4 - 0
api/core/model_runtime/docs/en_US/schema.md

@@ -48,6 +48,10 @@
   - `file_upload_limit` (int) Maximum file upload limit, in MB (available for model type `speech2text`)
   - `supported_file_extensions` (string) Supported file extension formats, e.g., mp3, mp4 (available for model type `speech2text`)
   - `default_voice` (string)  default voice, e.g.:alloy,echo,fable,onyx,nova,shimmer(available for model type `tts`)
+  - `voices` (list)  List of available voices. (available for model type `tts`)
+    - `mode` (string)  voice identifier. (available for model type `tts`)
+    - `name` (string)  voice display name. (available for model type `tts`)
+    - `language` (string)  languages supported by the voice. (available for model type `tts`)
   - `word_limit` (int)  Single conversion word limit, paragraphwise by default(available for model type `tts`)
   - `audio_type` (string)  Support audio file extension format, e.g.:mp3,wav(available for model type `tts`)
   - `max_workers` (int)  Number of concurrent workers supporting text and audio conversion(available for model type`tts`)
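
The voices list is what get_tts_model_voices() in tts_model.py filters by language. A small sketch of that transformation, with entries shaped like the YAML files below:

    # Shape of model_properties['voices'] as declared in the tts YAML files.
    voices = [
        {'mode': 'alloy', 'name': 'Alloy', 'language': ['zh-CN', 'en-US']},
        {'mode': 'sambert-waan-v1', 'name': 'Waan', 'language': ['th-TH']},
    ]

    # Equivalent of the filter in TTSModel.get_tts_model_voices():
    language = 'en-US'
    options = [{'name': d['name'], 'value': d['mode']}
               for d in voices
               if language and language in d.get('language')]
    assert options == [{'name': 'Alloy', 'value': 'alloy'}]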

+ 5 - 1
api/core/model_runtime/docs/zh_Hans/schema.md

@@ -48,7 +48,11 @@
   - `max_chunks` (int) 最大分块数量 (模型类型 `text-embedding ` `moderation` 可用)
   - `file_upload_limit` (int) 文件最大上传限制,单位:MB。(模型类型 `speech2text` 可用)
   - `supported_file_extensions` (string)  支持文件扩展格式,如:mp3,mp4(模型类型 `speech2text` 可用)
-  - `default_voice` (string)  缺省音色,可选:alloy,echo,fable,onyx,nova,shimmer(模型类型 `tts` 可用)
+  - `default_voice` (string)  缺省音色,必选:alloy,echo,fable,onyx,nova,shimmer(模型类型 `tts` 可用)
+  - `voices` (list)  可选音色列表。(模型类型 `tts` 可用)
+    - `mode` (string)  音色模型。(模型类型 `tts` 可用)
+    - `name` (string)  音色模型显示名称。(模型类型 `tts` 可用)
+    - `language` (string)  音色模型支持语言。(模型类型 `tts` 可用)
   - `word_limit` (int)  单次转换字数限制,默认按段落分段(模型类型 `tts` 可用)
   - `audio_type` (string)  支持音频文件扩展格式,如:mp3,wav(模型类型 `tts` 可用)
   - `max_workers` (int)  支持文字音频转换并发任务数(模型类型 `tts` 可用)

+ 1 - 0
api/core/model_runtime/entities/model_entities.py

@@ -127,6 +127,7 @@ class ModelPropertyKey(Enum):
     SUPPORTED_FILE_EXTENSIONS = "supported_file_extensions"
     MAX_CHARACTERS_PER_CHUNK = "max_characters_per_chunk"
     DEFAULT_VOICE = "default_voice"
+    VOICES = "voices"
     WORD_LIMIT = "word_limit"
     AUDOI_TYPE = "audio_type"
     MAX_WORKERS = "max_workers"

+ 27 - 4
api/core/model_runtime/model_providers/__base/tts_model.py

@@ -15,29 +15,37 @@ class TTSModel(AIModel):
     """
     model_type: ModelType = ModelType.TTS
 
-    def invoke(self, model: str, credentials: dict, content_text: str, streaming: bool, user: Optional[str] = None):
+    def invoke(self, model: str, tenant_id: str, credentials: dict, content_text: str, voice: str, streaming: bool,
+               user: Optional[str] = None):
         """
         Invoke large language model
 
         :param model: model name
+        :param tenant_id: user tenant id
         :param credentials: model credentials
+        :param voice: model timbre
         :param content_text: text content to be translated
         :param streaming: output is streaming
         :param user: unique user id
         :return: translated audio file
         """
         try:
-            return self._invoke(model=model, credentials=credentials, user=user, streaming=streaming, content_text=content_text)
+            self._is_ffmpeg_installed()
+            return self._invoke(model=model, credentials=credentials, user=user, streaming=streaming,
+                                content_text=content_text, voice=voice, tenant_id=tenant_id)
         except Exception as e:
             raise self._transform_invoke_error(e)
 
     @abstractmethod
-    def _invoke(self, model: str, credentials: dict, content_text: str, streaming: bool, user: Optional[str] = None):
+    def _invoke(self, model: str, tenant_id: str, credentials: dict, content_text: str, voice: str, streaming: bool,
+                user: Optional[str] = None):
         """
         Invoke large language model
 
         :param model: model name
+        :param tenant_id: user tenant id
         :param credentials: model credentials
+        :param voice: model timbre
         :param content_text: text content to be translated
         :param streaming: output is streaming
         :param user: unique user id
@@ -45,7 +53,22 @@ class TTSModel(AIModel):
         """
         raise NotImplementedError
 
-    def _get_model_voice(self, model: str, credentials: dict) -> any:
+    def get_tts_model_voices(self, model: str, credentials: dict, language: str) -> list:
+        """
+        Get the voices available for the given TTS model
+
+        :param language: tts language
+        :param model: model name
+        :param credentials: model credentials
+        :return: voices lists
+        """
+        model_schema = self.get_model_schema(model, credentials)
+
+        if model_schema and ModelPropertyKey.VOICES in model_schema.model_properties:
+            voices = model_schema.model_properties[ModelPropertyKey.VOICES]
+            return [{'name': d['name'], 'value': d['mode']} for d in voices if language and language in d.get('language')]
+
+    def _get_model_default_voice(self, model: str, credentials: dict) -> any:
         """
         Get voice for given tts model
 

+ 25 - 1
api/core/model_runtime/model_providers/openai/tts/tts-1-hd.yaml

@@ -1,7 +1,31 @@
 model: tts-1-hd
 model_type: tts
 model_properties:
   default_voice: 'alloy'
+  voices:
+    - mode: 'alloy'
+      name: 'Alloy'
+      language: ['zh-CN', 'en-US']
+    - mode: 'echo'
+      name: 'Echo'
+      language: ['zh-CN', 'en-US']
+    - mode: 'fable'
+      name: 'Fable'
+      language: ['zh-CN', 'en-US']
+    - mode: 'onyx'
+      name: 'Onyx'
+      language: ['zh-CN', 'en-US']
+    - mode: 'nova'
+      name: 'Nova'
+      language: ['zh-CN', 'en-US']
+    - mode: 'shimmer'
+      name: 'Shimmer'
+      language: ['zh-CN', 'en-US']
   word_limit: 120
   audio_type: 'mp3'
   max_workers: 5
+pricing:
+  input: '0.03'
+  output: '0'
+  unit: '0.001'
+  currency: USD

+ 24 - 0
api/core/model_runtime/model_providers/openai/tts/tts-1.yaml

@@ -2,6 +2,30 @@ model: tts-1
 model_type: tts
 model_properties:
   default_voice: 'alloy'
+  voices:
+    - mode: 'alloy'
+      name: 'Alloy'
+      language: ['zh-CN', 'en-US']
+    - mode: 'echo'
+      name: 'Echo'
+      language: ['zh-CN', 'en-US']
+    - mode: 'fable'
+      name: 'Fable'
+      language: ['zh-CN', 'en-US']
+    - mode: 'onyx'
+      name: 'Onyx'
+      language: ['zh-CN', 'en-US']
+    - mode: 'nova'
+      name: 'Nova'
+      language: ['zh-CN', 'en-US']
+    - mode: 'shimmer'
+      name: 'Shimmer'
+      language: ['zh-CN', 'en-US']
   word_limit: 120
   audio_type: 'mp3'
   max_workers: 5
+pricing:
+  input: '0.015'
+  output: '0'
+  unit: '0.001'
+  currency: USD

+ 41 - 29
api/core/model_runtime/model_providers/openai/tts/tts.py

@@ -11,33 +11,40 @@ from core.model_runtime.errors.invoke import InvokeBadRequestError
 from core.model_runtime.errors.validate import CredentialsValidateFailedError
 from core.model_runtime.model_providers.__base.tts_model import TTSModel
 from core.model_runtime.model_providers.openai._common import _CommonOpenAI
+from extensions.ext_storage import storage
 
 
 class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel):
     """
     Model class for OpenAI text-to-speech model.
     """
-    def _invoke(self, model: str, credentials: dict, content_text: str, streaming: bool, user: Optional[str] = None) -> any:
+
+    def _invoke(self, model: str, tenant_id: str, credentials: dict,
+                content_text: str, voice: str, streaming: bool, user: Optional[str] = None) -> any:
         """
         _invoke text2speech model
 
         :param model: model name
+        :param tenant_id: user tenant id
         :param credentials: model credentials
         :param content_text: text content to be translated
+        :param voice: model timbre
         :param streaming: output is streaming
         :param user: unique user id
         :return: text translated to audio file
         """
-        self._is_ffmpeg_installed()
         audio_type = self._get_model_audio_type(model, credentials)
+        if not voice:
+            voice = self._get_model_default_voice(model, credentials)
         if streaming:
             return Response(stream_with_context(self._tts_invoke_streaming(model=model,
                                                                            credentials=credentials,
                                                                            content_text=content_text,
-                                                                           user=user)),
+                                                                           tenant_id=tenant_id,
+                                                                           voice=voice)),
                             status=200, mimetype=f'audio/{audio_type}')
         else:
-            return self._tts_invoke(model=model, credentials=credentials, content_text=content_text, user=user)
+            return self._tts_invoke(model=model, credentials=credentials, content_text=content_text, voice=voice)
 
     def validate_credentials(self, model: str, credentials: dict, user: Optional[str] = None) -> None:
         """
@@ -52,91 +59,96 @@ class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel):
             self._tts_invoke(
                 model=model,
                 credentials=credentials,
-                content_text='Hello world!',
-                user=user
+                content_text='Hello Dify!',
+                voice=self._get_model_default_voice(model, credentials),
             )
         except Exception as ex:
             raise CredentialsValidateFailedError(str(ex))
 
-    def _tts_invoke(self, model: str, credentials: dict, content_text: str, user: Optional[str] = None) -> Response:
+    def _tts_invoke(self, model: str, credentials: dict, content_text: str, voice: str) -> Response:
         """
         _tts_invoke text2speech model
 
         :param model: model name
         :param credentials: model credentials
         :param content_text: text content to be translated
-        :param user: unique user id
+        :param voice: model timbre
         :return: text translated to audio file
         """
         audio_type = self._get_model_audio_type(model, credentials)
         word_limit = self._get_model_word_limit(model, credentials)
         max_workers = self._get_model_workers_limit(model, credentials)
-
         try:
             sentences = list(self._split_text_into_sentences(text=content_text, limit=word_limit))
             audio_bytes_list = list()
 
             # Create a thread pool and map the function to the list of sentences
             with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
-                futures = [executor.submit(self._process_sentence, sentence, model, credentials) for sentence
-                           in sentences]
+                futures = [executor.submit(self._process_sentence, sentence=sentence, model=model, voice=voice,
+                                           credentials=credentials) for sentence in sentences]
                 for future in futures:
                     try:
-                        audio_bytes_list.append(future.result())
+                        if future.result():
+                            audio_bytes_list.append(future.result())
                     except Exception as ex:
                         raise InvokeBadRequestError(str(ex))
 
-            audio_segments = [AudioSegment.from_file(BytesIO(audio_bytes), format=audio_type) for audio_bytes in
-                              audio_bytes_list if audio_bytes]
-            combined_segment = reduce(lambda x, y: x + y, audio_segments)
-            buffer: BytesIO = BytesIO()
-            combined_segment.export(buffer, format=audio_type)
-            buffer.seek(0)
-            return Response(buffer.read(), status=200, mimetype=f"audio/{audio_type}")
+            if len(audio_bytes_list) > 0:
+                audio_segments = [AudioSegment.from_file(BytesIO(audio_bytes), format=audio_type) for audio_bytes in
+                                  audio_bytes_list if audio_bytes]
+                combined_segment = reduce(lambda x, y: x + y, audio_segments)
+                buffer: BytesIO = BytesIO()
+                combined_segment.export(buffer, format=audio_type)
+                buffer.seek(0)
+                return Response(buffer.read(), status=200, mimetype=f"audio/{audio_type}")
         except Exception as ex:
             raise InvokeBadRequestError(str(ex))
 
     # Todo: To improve the streaming function
-    def _tts_invoke_streaming(self, model: str, credentials: dict, content_text: str, user: Optional[str] = None) -> any:
+    def _tts_invoke_streaming(self, model: str, tenant_id: str, credentials: dict, content_text: str,
+                              voice: str) -> any:
         """
         _tts_invoke_streaming text2speech model
 
         :param model: model name
+        :param tenant_id: user tenant id
         :param credentials: model credentials
         :param content_text: text content to be translated
-        :param user: unique user id
+        :param voice: model timbre
         :return: text translated to audio file
         """
         # transform credentials to kwargs for model instance
         credentials_kwargs = self._to_credential_kwargs(credentials)
-        voice_name = self._get_model_voice(model, credentials)
+        if not voice:
+            voice = self._get_model_default_voice(model, credentials)
         word_limit = self._get_model_word_limit(model, credentials)
         audio_type = self._get_model_audio_type(model, credentials)
         tts_file_id = self._get_file_name(content_text)
-        file_path = f'storage/generate_files/{audio_type}/{tts_file_id}.{audio_type}'
+        file_path = f'generate_files/audio/{tenant_id}/{tts_file_id}.{audio_type}'
         try:
             client = OpenAI(**credentials_kwargs)
             sentences = list(self._split_text_into_sentences(text=content_text, limit=word_limit))
             for sentence in sentences:
-                response = client.audio.speech.create(model=model, voice=voice_name, input=sentence.strip())
-                response.stream_to_file(file_path)
+                response = client.audio.speech.create(model=model, voice=voice, input=sentence.strip())
+                # response.stream_to_file(file_path)
+                storage.save(file_path, response.read())
         except Exception as ex:
             raise InvokeBadRequestError(str(ex))
 
-    def _process_sentence(self, sentence: str, model: str, credentials: dict):
+    def _process_sentence(self, sentence: str, model: str,
+                          voice, credentials: dict):
         """
         _tts_invoke openai text2speech model api
 
         :param model: model name
         :param credentials: model credentials
+        :param voice: model timbre
         :param sentence: text content to be translated
         :return: text translated to audio file
         """
         # transform credentials to kwargs for model instance
         credentials_kwargs = self._to_credential_kwargs(credentials)
-        voice_name = self._get_model_voice(model, credentials)
-
         client = OpenAI(**credentials_kwargs)
-        response = client.audio.speech.create(model=model, voice=voice_name, input=sentence.strip())
+        response = client.audio.speech.create(model=model, voice=voice, input=sentence.strip())
         if isinstance(response.read(), bytes):
             return response.read()

+ 128 - 1
api/core/model_runtime/model_providers/tongyi/tts/tts-1.yaml

@@ -1,7 +1,134 @@
 model: tts-1
 model_type: tts
 model_properties:
-  default_voice: 'sambert-zhiru-v1' # 音色参考 https://help.aliyun.com/zh/dashscope/model-list 配置
+  default_voice: 'sambert-zhiru-v1'
+  voices:
+    - mode: "sambert-zhinan-v1"
+      name: "知楠(广告男声)"
+      language: [ "zh-CN", "en-US" ]
+    - mode: "sambert-zhiqi-v1"
+      name: "知琪(温柔女声)"
+      language: [ "zh-CN", "en-US" ]
+    - mode: "sambert-zhichu-v1"
+      name: "知厨(新闻播报)"
+      language: [ "zh-CN", "en-US" ]
+    - mode: "sambert-zhide-v1"
+      name: "知德(新闻男声)"
+      language: [ "zh-CN", "en-US" ]
+    - mode: "sambert-zhijia-v1"
+      name: "知佳(标准女声)"
+      language: [ "zh-CN", "en-US" ]
+    - mode: "sambert-zhiru-v1"
+      name: "知茹(新闻女声)"
+      language: [ "zh-CN", "en-US" ]
+    - mode: "sambert-zhiqian-v1"
+      name: "知倩(配音解说、新闻播报)"
+      language: [ "zh-CN", "en-US" ]
+    - mode: "sambert-zhixiang-v1"
+      name: "知祥(配音解说)"
+      language: [ "zh-CN", "en-US" ]
+    - mode: "sambert-zhiwei-v1"
+      name: "知薇(萝莉女声)"
+      language: [ "zh-CN", "en-US" ]
+    - mode: "sambert-zhihao-v1"
+      name: "知浩(咨询男声)"
+      language: [ "zh-CN", "en-US" ]
+    - mode: "sambert-zhijing-v1"
+      name: "知婧(严厉女声)"
+      language: [ "zh-CN", "en-US" ]
+    - mode: "sambert-zhiming-v1"
+      name: "知茗(诙谐男声)"
+      language: [ "zh-CN", "en-US" ]
+    - mode: "sambert-zhimo-v1"
+      name: "知墨(情感男声)"
+      language: [ "zh-CN", "en-US" ]
+    - mode: "sambert-zhina-v1"
+      name: "知娜(浙普女声)"
+      language: [ "zh-CN", "en-US" ]
+    - mode: "sambert-zhishu-v1"
+      name: "知树(资讯男声)"
+      language: [ "zh-CN", "en-US" ]
+    - mode: "sambert-zhistella-v1"
+      name: "知莎(知性女声)"
+      language: [ "zh-CN", "en-US" ]
+    - mode: "sambert-zhiting-v1"
+      name: "知婷(电台女声)"
+      language: [ "zh-CN", "en-US" ]
+    - mode: "sambert-zhixiao-v1"
+      name: "知笑(资讯女声)"
+      language: [ "zh-CN", "en-US" ]
+    - mode: "sambert-zhiya-v1"
+      name: "知雅(严厉女声)"
+      language: [ "zh-CN", "en-US" ]
+    - mode: "sambert-zhiye-v1"
+      name: "知晔(青年男声)"
+      language: [ "zh-CN", "en-US" ]
+    - mode: "sambert-zhiying-v1"
+      name: "知颖(软萌童声)"
+      language: [ "zh-CN", "en-US" ]
+    - mode: "sambert-zhiyuan-v1"
+      name: "知媛(知心姐姐)"
+      language: [ "zh-CN", "en-US" ]
+    - mode: "sambert-zhigui-v1"
+      name: "知柜(直播女声)"
+      language: [ "zh-CN", "en-US" ]
+    - mode: "sambert-zhishuo-v1"
+      name: "知硕(自然男声)"
+      language: [ "zh-CN", "en-US" ]
+    - mode: "sambert-zhimiao-emo-v1"
+      name: "知妙(多种情感女声)"
+      language: [ "zh-CN", "en-US" ]
+    - mode: "sambert-zhimao-v1"
+      name: "知猫(直播女声)"
+      language: [ "zh-CN", "en-US" ]
+    - mode: "sambert-zhilun-v1"
+      name: "知伦(悬疑解说)"
+      language: [ "zh-CN", "en-US" ]
+    - mode: "sambert-zhifei-v1"
+      name: "知飞(激昂解说)"
+      language: [ "zh-CN", "en-US" ]
+    - mode: "sambert-zhida-v1"
+      name: "知达(标准男声)"
+      language: [ "zh-CN", "en-US" ]
+    - mode: "sambert-camila-v1"
+      name: "Camila(西班牙语女声)"
+      language: [ "es-ES" ]
+    - mode: "sambert-perla-v1"
+      name: "Perla(意大利语女声)"
+      language: [ "it-IT" ]
+    - mode: "sambert-indah-v1"
+      name: "Indah(印尼语女声)"
+      language: [ "id-ID" ]
+    - mode: "sambert-clara-v1"
+      name: "Clara(法语女声)"
+      language: [ "fr-FR" ]
+    - mode: "sambert-hanna-v1"
+      name: "Hanna(德语女声)"
+      language: [ "de-DE" ]
+    - mode: "sambert-beth-v1"
+      name: "Beth(咨询女声)"
+      language: [ "en-US" ]
+    - mode: "sambert-betty-v1"
+      name: "Betty(客服女声)"
+      language: [ "en-US" ]
+    - mode: "sambert-cally-v1"
+      name: "Cally(自然女声)"
+      language: [ "en-US" ]
+    - mode: "sambert-cindy-v1"
+      name: "Cindy(对话女声)"
+      language: [ "en-US" ]
+    - mode: "sambert-eva-v1"
+      name: "Eva(陪伴女声)"
+      language: [ "en-US" ]
+    - mode: "sambert-donna-v1"
+      name: "Donna(教育女声)"
+      language: [ "en-US" ]
+    - mode: "sambert-brian-v1"
+      name: "Brian(客服男声)"
+      language: [ "en-US" ]
+    - mode: "sambert-waan-v1"
+      name: "Waan(泰语女声)"
+      language: [ "th-TH" ]
   word_limit: 120
   audio_type: 'mp3'
   max_workers: 5

+ 43 - 31
api/core/model_runtime/model_providers/tongyi/tts/tts.py

@@ -11,33 +11,40 @@ from core.model_runtime.errors.invoke import InvokeBadRequestError
 from core.model_runtime.errors.validate import CredentialsValidateFailedError
 from core.model_runtime.model_providers.__base.tts_model import TTSModel
 from core.model_runtime.model_providers.tongyi._common import _CommonTongyi
+from extensions.ext_storage import storage
 
 
 class TongyiText2SpeechModel(_CommonTongyi, TTSModel):
     """
     Model class for Tongyi text-to-speech model.
     """
-    def _invoke(self, model: str, credentials: dict, content_text: str, streaming: bool, user: Optional[str] = None) -> any:
+
+    def _invoke(self, model: str, tenant_id: str, credentials: dict, content_text: str, voice: str, streaming: bool,
+                user: Optional[str] = None) -> any:
         """
         _invoke text2speech model
 
         :param model: model name
+        :param tenant_id: user tenant id
         :param credentials: model credentials
+        :param voice: model timbre
         :param content_text: text content to be translated
         :param streaming: output is streaming
         :param user: unique user id
         :return: text translated to audio file
         """
-        self._is_ffmpeg_installed()
         audio_type = self._get_model_audio_type(model, credentials)
+        if not voice:
+            voice = self._get_model_default_voice(model, credentials)
         if streaming:
             return Response(stream_with_context(self._tts_invoke_streaming(model=model,
                                                                            credentials=credentials,
                                                                            content_text=content_text,
-                                                                           user=user)),
+                                                                           voice=voice,
+                                                                           tenant_id=tenant_id)),
                             status=200, mimetype=f'audio/{audio_type}')
         else:
-            return self._tts_invoke(model=model, credentials=credentials, content_text=content_text, user=user)
+            return self._tts_invoke(model=model, credentials=credentials, content_text=content_text, voice=voice)
 
     def validate_credentials(self, model: str, credentials: dict, user: Optional[str] = None) -> None:
         """
@@ -52,91 +59,96 @@ class TongyiText2SpeechModel(_CommonTongyi, TTSModel):
             self._tts_invoke(
                 model=model,
                 credentials=credentials,
-                content_text='Hello world!',
-                user=user
+                content_text='Hello Dify!',
+                voice=self._get_model_default_voice(model, credentials),
             )
         except Exception as ex:
             raise CredentialsValidateFailedError(str(ex))
 
-    def _tts_invoke(self, model: str, credentials: dict, content_text: str, user: Optional[str] = None) -> Response:
+    def _tts_invoke(self, model: str, credentials: dict, content_text: str, voice: str) -> Response:
         """
         _tts_invoke text2speech model
 
         :param model: model name
         :param credentials: model credentials
+        :param voice: model timbre
         :param content_text: text content to be translated
-        :param user: unique user id
         :return: text translated to audio file
         """
         audio_type = self._get_model_audio_type(model, credentials)
         word_limit = self._get_model_word_limit(model, credentials)
         max_workers = self._get_model_workers_limit(model, credentials)
-
         try:
             sentences = list(self._split_text_into_sentences(text=content_text, limit=word_limit))
             audio_bytes_list = list()
 
             # Create a thread pool and map the function to the list of sentences
             with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
-                futures = [executor.submit(self._process_sentence, model=model, sentence=sentence,
-                                           credentials=credentials, audio_type=audio_type) for sentence in sentences]
+                futures = [executor.submit(self._process_sentence, sentence=sentence,
+                                           credentials=credentials, voice=voice, audio_type=audio_type) for sentence in
+                           sentences]
                 for future in futures:
                     try:
-                        audio_bytes_list.append(future.result())
+                        if future.result():
+                            audio_bytes_list.append(future.result())
                     except Exception as ex:
                         raise InvokeBadRequestError(str(ex))
 
-            audio_segments = [AudioSegment.from_file(BytesIO(audio_bytes), format=audio_type) for audio_bytes in
-                              audio_bytes_list if audio_bytes]
-            combined_segment = reduce(lambda x, y: x + y, audio_segments)
-            buffer: BytesIO = BytesIO()
-            combined_segment.export(buffer, format=audio_type)
-            buffer.seek(0)
-            return Response(buffer.read(), status=200, mimetype=f"audio/{audio_type}")
+            if len(audio_bytes_list) > 0:
+                audio_segments = [AudioSegment.from_file(BytesIO(audio_bytes), format=audio_type) for audio_bytes in
+                                  audio_bytes_list if audio_bytes]
+                combined_segment = reduce(lambda x, y: x + y, audio_segments)
+                buffer: BytesIO = BytesIO()
+                combined_segment.export(buffer, format=audio_type)
+                buffer.seek(0)
+                return Response(buffer.read(), status=200, mimetype=f"audio/{audio_type}")
         except Exception as ex:
             raise InvokeBadRequestError(str(ex))
 
     # Todo: To improve the streaming function
-    def _tts_invoke_streaming(self, model: str, credentials: dict, content_text: str, user: Optional[str] = None) -> any:
+    def _tts_invoke_streaming(self, model: str, tenant_id: str, credentials: dict, content_text: str,
+                              voice: str) -> any:
         """
         _tts_invoke_streaming text2speech model
 
         :param model: model name
+        :param tenant_id: user tenant id
         :param credentials: model credentials
+        :param voice: model timbre
         :param content_text: text content to be translated
-        :param user: unique user id
         :return: text translated to audio file
         """
-        # transform credentials to kwargs for model instance
         dashscope.api_key = credentials.get('dashscope_api_key')
-        voice_name = self._get_model_voice(model, credentials)
         word_limit = self._get_model_word_limit(model, credentials)
         audio_type = self._get_model_audio_type(model, credentials)
+        tts_file_id = self._get_file_name(content_text)
+        file_path = f'generate_files/audio/{tenant_id}/{tts_file_id}.{audio_type}'
         try:
             sentences = list(self._split_text_into_sentences(text=content_text, limit=word_limit))
             for sentence in sentences:
-                response = dashscope.audio.tts.SpeechSynthesizer.call(model=voice_name, sample_rate=48000, text=sentence.strip(),
+                response = dashscope.audio.tts.SpeechSynthesizer.call(model=voice, sample_rate=48000,
+                                                                      text=sentence.strip(),
                                                                       format=audio_type, word_timestamp_enabled=True,
                                                                       phoneme_timestamp_enabled=True)
                 if isinstance(response.get_audio_data(), bytes):
-                    return response.get_audio_data()
+                    storage.save(file_path, response.get_audio_data())
         except Exception as ex:
             raise InvokeBadRequestError(str(ex))
 
-    def _process_sentence(self, sentence: str, model: str, credentials: dict, audio_type: str):
+    @staticmethod
+    def _process_sentence(sentence: str, credentials: dict, voice: str, audio_type: str):
         """
         _tts_invoke Tongyi text2speech model api
 
-        :param model: model name
         :param credentials: model credentials
         :param sentence: text content to be translated
+        :param voice: model timbre
         :param audio_type: audio file type
         :return: text translated to audio file
         """
-        # transform credentials to kwargs for model instance
         dashscope.api_key = credentials.get('dashscope_api_key')
-        voice_name = self._get_model_voice(model, credentials)
-
-        response = dashscope.audio.tts.SpeechSynthesizer.call(model=voice_name, sample_rate=48000, text=sentence.strip(), format=audio_type)
+        response = dashscope.audio.tts.SpeechSynthesizer.call(model=voice, sample_rate=48000,
+                                                              text=sentence.strip(),
+                                                              format=audio_type)
         if isinstance(response.get_audio_data(), bytes):
             return response.get_audio_data()

+ 5 - 1
api/services/app_model_config_service.py

@@ -98,7 +98,9 @@ class AppModelConfigService:
         # text_to_speech
         if 'text_to_speech' not in config or not config["text_to_speech"]:
             config["text_to_speech"] = {
-                "enabled": False
+                "enabled": False,
+                "voice": "",
+                "language": ""
             }
 
         if not isinstance(config["text_to_speech"], dict):
@@ -106,6 +108,8 @@ class AppModelConfigService:
 
         if "enabled" not in config["text_to_speech"] or not config["text_to_speech"]["enabled"]:
             config["text_to_speech"]["enabled"] = False
+            config["text_to_speech"]["voice"] = ""
+            config["text_to_speech"]["language"] = ""
 
         if not isinstance(config["text_to_speech"]["enabled"], bool):
             raise ValueError("enabled in text_to_speech must be of boolean type")

+ 19 - 4
api/services/audio_service.py

@@ -13,14 +13,14 @@ from services.errors.audio import (
     UnsupportedAudioTypeServiceError,
 )
 
-FILE_SIZE = 15
+FILE_SIZE = 30
 FILE_SIZE_LIMIT = FILE_SIZE * 1024 * 1024
 ALLOWED_EXTENSIONS = ['mp3', 'mp4', 'mpeg', 'mpga', 'm4a', 'wav', 'webm', 'amr']
 
 
 class AudioService:
     @classmethod
-    def transcript_asr(cls, tenant_id: str, file: FileStorage, end_user: Optional[str] = None):
+    def transcript_asr(cls, tenant_id: str, file: FileStorage, prompt: str, end_user: Optional[str] = None):
         if file is None:
             raise NoAudioUploadedServiceError()
 
@@ -49,7 +49,7 @@ class AudioService:
         return {"text": model_instance.invoke_speech2text(file=buffer, user=end_user)}
 
     @classmethod
-    def transcript_tts(cls, tenant_id: str, text: str, streaming: bool, end_user: Optional[str] = None):
+    def transcript_tts(cls, tenant_id: str, text: str, voice: str, streaming: bool, end_user: Optional[str] = None):
         model_manager = ModelManager()
         model_instance = model_manager.get_default_model_instance(
             tenant_id=tenant_id,
@@ -59,6 +59,21 @@ class AudioService:
             raise ProviderNotSupportTextToSpeechServiceError()
 
         try:
-            return model_instance.invoke_tts(content_text=text.strip(), user=end_user, streaming=streaming)
+            return model_instance.invoke_tts(content_text=text.strip(), user=end_user, streaming=streaming, tenant_id=tenant_id, voice=voice)
+        except Exception as e:
+            raise e
+
+    @classmethod
+    def transcript_tts_voices(cls, tenant_id: str, language: str):
+        model_manager = ModelManager()
+        model_instance = model_manager.get_default_model_instance(
+            tenant_id=tenant_id,
+            model_type=ModelType.TTS
+        )
+        if model_instance is None:
+            raise ProviderNotSupportTextToSpeechServiceError()
+
+        try:
+            return model_instance.get_tts_voices(language)
         except Exception as e:
             raise e
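
The console and web controllers above both funnel into these two classmethods. A direct-call sketch at the service layer (placeholder tenant id; assumes a Flask app context and a default TTS model for the tenant):

    from services.audio_service import AudioService

    # Placeholder tenant id; mirrors what the TextModesApi and
    # ChatMessageTextApi endpoints do per request.
    voices = AudioService.transcript_tts_voices(tenant_id='<tenant-id>',
                                                language='en-US')
    audio = AudioService.transcript_tts(
        tenant_id='<tenant-id>',
        text='Hello Dify!',
        voice=voices[0]['value'] if voices else '',
        streaming=False,
    )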

+ 4 - 0
api/services/errors/audio.py

@@ -16,3 +16,7 @@ class ProviderNotSupportSpeechToTextServiceError(Exception):
 
 class ProviderNotSupportTextToSpeechServiceError(Exception):
     pass
+
+
+class ProviderNotSupportTextToSpeechLanguageServiceError(Exception):
+    pass

+ 10 - 1
web/app/components/app/configuration/base/feature-panel/index.tsx

@@ -2,6 +2,7 @@
 import type { FC, ReactNode } from 'react'
 import React from 'react'
 import cn from 'classnames'
+import ParamsConfig from '@/app/components/app/configuration/config-voice/param-config'
 
 export type IFeaturePanelProps = {
   className?: string
@@ -12,6 +13,7 @@ export type IFeaturePanelProps = {
   isFocus?: boolean
   noBodySpacing?: boolean
   children?: ReactNode
+  isShowTextToSpeech?: boolean
 }
 
 const FeaturePanel: FC<IFeaturePanelProps> = ({
@@ -23,6 +25,7 @@ const FeaturePanel: FC<IFeaturePanelProps> = ({
   isFocus,
   noBodySpacing,
   children,
+  isShowTextToSpeech,
 }) => {
   return (
     <div
@@ -41,7 +44,13 @@ const FeaturePanel: FC<IFeaturePanelProps> = ({
             <div className='text-sm font-semibold text-gray-800'>{title}</div>
           </div>
           <div>
-            {headerRight}
+            {isShowTextToSpeech
+              ? (
+                <div className='flex items-center'>
+                  <ParamsConfig/>
+                </div>
+              )
+              : headerRight}
           </div>
         </div>
       </div>

+ 187 - 0
web/app/components/app/configuration/config-voice/param-config-content.tsx

@@ -0,0 +1,187 @@
+'use client'
+import useSWR from 'swr'
+import type { FC } from 'react'
+import { useContext } from 'use-context-selector'
+import React, { Fragment } from 'react'
+import classNames from 'classnames'
+import { usePathname } from 'next/navigation'
+import { useTranslation } from 'react-i18next'
+import { Listbox, Transition } from '@headlessui/react'
+import { CheckIcon, ChevronDownIcon } from '@heroicons/react/20/solid'
+import type { Item } from '@/app/components/base/select'
+import ConfigContext from '@/context/debug-configuration'
+import { fetchAppVoices } from '@/service/apps'
+import Tooltip from '@/app/components/base/tooltip'
+import { HelpCircle } from '@/app/components/base/icons/src/vender/line/general'
+
+const VoiceParamConfig: FC = () => {
+  const { t } = useTranslation()
+  const pathname = usePathname()
+  const matched = pathname.match(/\/app\/([^/]+)/)
+  const appId = (matched?.length && matched[1]) ? matched[1] : ''
+
+  const LanguageItems = [
+    { value: 'zh-CN', name: '中文' },
+    { value: 'en-US', name: '英语' },
+    { value: 'de-DE', name: '德语' },
+    { value: 'fr-FR', name: '法语' },
+    { value: 'es-ES', name: '西班牙语' },
+    { value: 'it-IT', name: '意大利语' },
+    { value: 'th-TH', name: '泰语' },
+    { value: 'id-ID', name: '印尼语' },
+  ]
+  const {
+    textToSpeechConfig,
+    setTextToSpeechConfig,
+  } = useContext(ConfigContext)
+
+  const languageItem = LanguageItems.find(item => item.value === textToSpeechConfig.language)
+  const localLanguagePlaceholder = languageItem?.name || t('common.placeholder.select')
+
+  const voiceItems = useSWR({ url: `/apps/${appId}/text-to-audio/voices?language=${languageItem ? languageItem.value : 'zh-CN'}` }, fetchAppVoices).data
+  const voiceItem = voiceItems?.find(item => item.value === textToSpeechConfig.voice)
+  const localVoicePlaceholder = voiceItem?.name || t('common.placeholder.select')
+
+  return (
+    <div>
+      <div>
+        <div className='leading-6 text-base font-semibold text-gray-800'>{t('appDebug.voice.voiceSettings.title')}</div>
+        <div className='pt-3 space-y-6'>
+          <div>
+            <div className='mb-2 flex items-center  space-x-1'>
+              <div className='leading-[18px] text-[13px] font-semibold text-gray-800'>{t('appDebug.voice.voiceSettings.language')}</div>
+              <Tooltip htmlContent={<div className='w-[180px]' >
+                {t('appDebug.voice.voiceSettings.resolutionTooltip').split('\n').map(item => (
+                  <div key={item}>{item}</div>
+                ))}
+              </div>} selector='config-resolution-tooltip'>
+                <HelpCircle className='w-[14px] h-[14px] text-gray-400' />
+              </Tooltip>
+            </div>
+            <Listbox
+              value={languageItem}
+              onChange={(value: Item) => {
+                setTextToSpeechConfig({
+                  ...textToSpeechConfig,
+                  language: String(value.value),
+                })
+              }}
+            >
+              <div className={'relative h-9'}>
+                <Listbox.Button className={'w-full h-full rounded-lg border-0 bg-gray-100 py-1.5 pl-3 pr-10 sm:text-sm sm:leading-6 focus-visible:outline-none focus-visible:bg-gray-200 group-hover:bg-gray-200 cursor-pointer'}>
+                  <span className={classNames('block truncate text-left', !languageItem?.name && 'text-gray-400')}>{languageItem?.name ?? localLanguagePlaceholder}</span>
+                  <span className="pointer-events-none absolute inset-y-0 right-0 flex items-center pr-2">
+                    <ChevronDownIcon
+                      className="h-5 w-5 text-gray-400"
+                      aria-hidden="true"
+                    />
+                  </span>
+                </Listbox.Button>
+                <Transition
+                  as={Fragment}
+                  leave="transition ease-in duration-100"
+                  leaveFrom="opacity-100"
+                  leaveTo="opacity-0"
+                >
+
+                  <Listbox.Options className="absolute z-10 mt-1 px-1 max-h-60 w-full overflow-auto rounded-md bg-white py-1 text-base shadow-lg border-gray-200 border-[0.5px] focus:outline-none sm:text-sm">
+                    {LanguageItems.map((item: Item) => (
+                      <Listbox.Option
+                        key={item.value}
+                        className={({ active }) =>
+                          `relative cursor-pointer select-none py-2 pl-3 pr-9 rounded-lg hover:bg-gray-100 text-gray-700 ${active ? 'bg-gray-100' : ''
+                          }`
+                        }
+                        value={item}
+                        disabled={false}
+                      >
+                        {({ /* active, */ selected }) => (
+                          <>
+                            <span className={classNames('block', selected && 'font-normal')}>{item.name}</span>
+                            {(selected || item.value === textToSpeechConfig.language) && (
+                              <span
+                                className={classNames(
+                                  'absolute inset-y-0 right-0 flex items-center pr-4 text-gray-700',
+                                )}
+                              >
+                                <CheckIcon className="h-5 w-5" aria-hidden="true" />
+                              </span>
+                            )}
+                          </>
+                        )}
+                      </Listbox.Option>
+                    ))}
+                  </Listbox.Options>
+                </Transition>
+              </div>
+            </Listbox>
+          </div>
+
+          <div>
+            <div className='mb-2 leading-[18px] text-[13px] font-semibold text-gray-800'>{t('appDebug.voice.voiceSettings.voice')}</div>
+            <Listbox
+              value={voiceItem}
+              disabled={!languageItem}
+              onChange={(value: Item) => {
+                setTextToSpeechConfig({
+                  ...textToSpeechConfig,
+                  voice: String(value.value),
+                })
+              }}
+            >
+              <div className={'relative h-9'}>
+                <Listbox.Button className={'w-full h-full rounded-lg border-0 bg-gray-100 py-1.5 pl-3 pr-10 sm:text-sm sm:leading-6 focus-visible:outline-none focus-visible:bg-gray-200 group-hover:bg-gray-200 cursor-pointer'}>
+                  <span className={classNames('block truncate text-left', !voiceItem?.name && 'text-gray-400')}>{voiceItem?.name ?? localVoicePlaceholder}</span>
+                  <span className="pointer-events-none absolute inset-y-0 right-0 flex items-center pr-2">
+                    <ChevronDownIcon
+                      className="h-5 w-5 text-gray-400"
+                      aria-hidden="true"
+                    />
+                  </span>
+                </Listbox.Button>
+                <Transition
+                  as={Fragment}
+                  leave="transition ease-in duration-100"
+                  leaveFrom="opacity-100"
+                  leaveTo="opacity-0"
+                >
+
+                  <Listbox.Options className="absolute z-10 mt-1 px-1 max-h-60 w-full overflow-auto rounded-md bg-white py-1 text-base shadow-lg border-gray-200 border-[0.5px] focus:outline-none sm:text-sm">
+                    {voiceItems?.map((item: Item) => (
+                      <Listbox.Option
+                        key={item.value}
+                        className={({ active }) =>
+                          `relative cursor-pointer select-none py-2 pl-3 pr-9 rounded-lg hover:bg-gray-100 text-gray-700 ${active ? 'bg-gray-100' : ''
+                          }`
+                        }
+                        value={item}
+                        disabled={false}
+                      >
+                        {({ /* active, */ selected }) => (
+                          <>
+                            <span className={classNames('block', selected && 'font-normal')}>{item.name}</span>
+                            {(selected || item.value === textToSpeechConfig.voice) && (
+                              <span
+                                className={classNames(
+                                  'absolute inset-y-0 right-0 flex items-center pr-4 text-gray-700',
+                                )}
+                              >
+                                <CheckIcon className="h-5 w-5" aria-hidden="true" />
+                              </span>
+                            )}
+                          </>
+                        )}
+                      </Listbox.Option>
+                    ))}
+                  </Listbox.Options>
+                </Transition>
+              </div>
+            </Listbox>
+          </div>
+        </div>
+      </div>
+    </div>
+  )
+}
+
+export default React.memo(VoiceParamConfig)
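
Review note: the two `Listbox` instances above share one structure — the language box drives which voices are offered, and the voice box stays disabled until a language is picked. A minimal sketch of how the voice list might be loaded per language with the `fetchAppVoices` helper added in `web/service/apps.ts` below; the endpoint path and hook shape are assumptions for illustration, not something this diff pins down:

```ts
// Hypothetical sketch: derive Listbox items from the voices API.
// The URL is an assumed route; substitute the real backend path.
import useSWR from 'swr'
import { fetchAppVoices } from '@/service/apps'

const useVoiceItems = (appId: string, language?: string) => {
  const { data } = useSWR(
    // Skip the request entirely until a language has been chosen,
    // mirroring the disabled state of the voice Listbox.
    language ? { url: `/apps/${appId}/text-to-speech/voices?language=${language}` } : null,
    fetchAppVoices,
  )
  // The API shape ({ name, value }) maps directly onto the Item type.
  return data ?? []
}
```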

+ 41 - 0
web/app/components/app/configuration/config-voice/param-config.tsx

@@ -0,0 +1,41 @@
+'use client'
+import type { FC } from 'react'
+import { memo, useState } from 'react'
+import { useTranslation } from 'react-i18next'
+import cn from 'classnames'
+import VoiceParamConfig from './param-config-content'
+import { Settings01 } from '@/app/components/base/icons/src/vender/line/general'
+import {
+  PortalToFollowElem,
+  PortalToFollowElemContent,
+  PortalToFollowElemTrigger,
+} from '@/app/components/base/portal-to-follow-elem'
+
+const ParamsConfig: FC = () => {
+  const { t } = useTranslation()
+  const [open, setOpen] = useState(false)
+
+  return (
+    <PortalToFollowElem
+      open={open}
+      onOpenChange={setOpen}
+      placement='bottom-end'
+      offset={{
+        mainAxis: 4,
+      }}
+    >
+      <PortalToFollowElemTrigger onClick={() => setOpen(v => !v)}>
+        <div className={cn('flex items-center rounded-md h-7 px-3 space-x-1 text-gray-700 cursor-pointer hover:bg-gray-200', open && 'bg-gray-200')}>
+          <Settings01 className='w-3.5 h-3.5 ' />
+          <div className='ml-1 leading-[18px] text-xs font-medium '>{t('appDebug.voice.settings')}</div>
+        </div>
+      </PortalToFollowElemTrigger>
+      <PortalToFollowElemContent style={{ zIndex: 50 }}>
+        <div className='w-80 sm:w-[412px] p-4 bg-white rounded-lg border-[0.5px] border-gray-200 shadow-lg space-y-3'>
+          <VoiceParamConfig />
+        </div>
+      </PortalToFollowElemContent>
+    </PortalToFollowElem>
+  )
+}
+export default memo(ParamsConfig)

+ 3 - 0
web/app/components/app/configuration/config/index.tsx

@@ -119,6 +119,8 @@ const Config: FC = () => {
     setTextToSpeech: (value) => {
       setTextToSpeechConfig(produce(textToSpeechConfig, (draft: TextToSpeechConfig) => {
         draft.enabled = value
+        draft.voice = textToSpeechConfig?.voice
+        draft.language = textToSpeechConfig?.language
       }))
     },
     citation: citationConfig.enabled,
@@ -245,6 +247,7 @@ const Config: FC = () => {
         {(isAgent && isChatApp) && (
           <AgentTools />
         )}
+
         <ConfigVision />
 
         {/* Chat History */}
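
Review note: inside `produce`, `draft` already starts as a copy of `textToSpeechConfig`, so the two added assignments are no-ops at runtime; they read as documentation that the chosen voice and language should survive toggling the feature. An equivalent without immer, shown only as a sketch in the component's scope:

```ts
// Spread the previous config and flip the flag,
// leaving the chosen voice/language untouched.
const setTextToSpeech = (value: boolean) => {
  setTextToSpeechConfig({
    ...textToSpeechConfig,
    enabled: value,
  })
}
```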

+ 5 - 0
web/app/components/app/configuration/debug/debug-with-multiple-model/text-generation-item.tsx

@@ -61,6 +61,11 @@ const TextGenerationItem: FC<TextGenerationItemProps> = ({
     sensitive_word_avoidance: moderationConfig,
     external_data_tools: externalDataToolsConfig,
     more_like_this: moreLikeThisConfig,
+    text_to_speech: {
+      enabled: false,
+      voice: '',
+      language: '',
+    },
     agent_mode: {
       enabled: false,
       tools: [],

+ 5 - 3
web/app/components/app/configuration/debug/index.tsx

@@ -213,9 +213,6 @@ const Debug: FC<IDebug> = ({
     const contextVar = modelConfig.configs.prompt_variables.find(item => item.is_context_var)?.key
 
     const postModelConfig: BackendModelConfig = {
-      text_to_speech: {
-        enabled: false,
-      },
       pre_prompt: !isAdvancedMode ? modelConfig.configs.prompt_template : '',
       prompt_type: promptMode,
       chat_prompt_config: {},
@@ -234,6 +231,11 @@ const Debug: FC<IDebug> = ({
         mode: modelConfig.mode,
         completion_params: completionParams as any,
       },
+      text_to_speech: {
+        enabled: false,
+        voice: '',
+        language: '',
+      },
       agent_mode: {
         enabled: false,
         tools: [],

+ 1 - 0
web/app/components/app/configuration/features/chat-group/text-to-speech/index.tsx

@@ -19,6 +19,7 @@ const TextToSpeech: FC = () => {
         <div className='text-xs text-gray-500'>{t('appDebug.feature.textToSpeech.resDes')}</div>
       }
       noBodySpacing
+      isShowTextToSpeech
     />
   )
 }

+ 6 - 1
web/app/components/app/configuration/index.tsx

@@ -30,6 +30,7 @@ import type {
   MoreLikeThisConfig,
   PromptConfig,
   PromptVariable,
+  TextToSpeechConfig,
 } from '@/models/debug'
 import type { ExternalDataTool } from '@/models/common'
 import type { DataSet } from '@/models/datasets'
@@ -98,8 +99,10 @@ const Configuration: FC = () => {
   const [speechToTextConfig, setSpeechToTextConfig] = useState<MoreLikeThisConfig>({
     enabled: false,
   })
-  const [textToSpeechConfig, setTextToSpeechConfig] = useState<MoreLikeThisConfig>({
+  const [textToSpeechConfig, setTextToSpeechConfig] = useState<TextToSpeechConfig>({
     enabled: false,
+    voice: '',
+    language: '',
   })
   const [citationConfig, setCitationConfig] = useState<MoreLikeThisConfig>({
     enabled: false,
@@ -246,6 +249,8 @@ const Configuration: FC = () => {
     })
     setTextToSpeechConfig(modelConfig.text_to_speech || {
       enabled: false,
+      voice: '',
+      language: '',
     })
     setCitationConfig(modelConfig.retriever_resource || {
       enabled: false,

+ 2 - 1
web/app/components/base/chat/chat/answer/operation.tsx

@@ -73,7 +73,8 @@ const Operation: FC<OperationProps> = ({
           />
         )
       }
-      {!isOpeningStatement && config?.text_to_speech && (
+
+      {(!isOpeningStatement && config?.text_to_speech?.enabled) && (
         <AudioBtn
           value={content}
           className='hidden group-hover:block'

+ 2 - 0
web/context/debug-configuration.ts

@@ -156,6 +156,8 @@ const DebugConfigurationContext = createContext<IDebugConfiguration>({
   setSpeechToTextConfig: () => { },
   textToSpeechConfig: {
     enabled: false,
+    voice: '',
+    language: '',
   },
   setTextToSpeechConfig: () => { },
   citationConfig: {

+ 11 - 0
web/i18n/lang/app-debug.en.ts

@@ -298,6 +298,17 @@ const translation = {
       uploadLimit: 'Upload Limit',
     },
   },
+  voice: {
+    name: 'Voice',
+    description: 'Text-to-speech voice settings',
+    settings: 'Settings',
+    voiceSettings: {
+      title: 'Voice Settings',
+      language: 'Language',
+      resolutionTooltip: 'The languages supported by the text-to-speech voice.',
+      voice: 'Voice',
+    },
+  },
   openingStatement: {
     title: 'Conversation Opener',
     add: 'Add',

+ 11 - 0
web/i18n/lang/app-debug.zh.ts

@@ -294,6 +294,17 @@ const translation = {
       uploadLimit: '上传数量限制',
     },
   },
+  voice: {
+    name: '音色',
+    description: '文本转语音音色设置',
+    settings: '设置',
+    voiceSettings: {
+      title: '音色设置',
+      language: '语言',
+      resolutionTooltip: '文本转语音音色支持语言。',
+      voice: '音色',
+    },
+  },
   openingStatement: {
     title: '对话开场白',
     add: '添加开场白',

+ 5 - 0
web/models/app.ts

@@ -122,3 +122,8 @@ export type UpdateOpenAIKeyResponse = ValidateOpenAIKeyResponse
 export type GenerationIntroductionResponse = {
   introduction: string
 }
+
+export type AppVoicesListResponse = {
+  name: string
+  value: string
+}[]

+ 5 - 1
web/models/debug.ts

@@ -75,7 +75,11 @@ export type SuggestedQuestionsAfterAnswerConfig = MoreLikeThisConfig
 
 export type SpeechToTextConfig = MoreLikeThisConfig
 
-export type TextToSpeechConfig = MoreLikeThisConfig
+export type TextToSpeechConfig = {
+  enabled: boolean
+  voice?: string
+  language?: string
+}
 
 export type CitationConfig = MoreLikeThisConfig
 

+ 5 - 1
web/service/apps.ts

@@ -1,6 +1,6 @@
 import type { Fetcher } from 'swr'
 import { del, get, post } from './base'
-import type { ApikeysListResponse, AppDailyConversationsResponse, AppDailyEndUsersResponse, AppDetailResponse, AppListResponse, AppStatisticsResponse, AppTemplatesResponse, AppTokenCostsResponse, CreateApiKeyResponse, GenerationIntroductionResponse, UpdateAppModelConfigResponse, UpdateAppSiteCodeResponse, UpdateOpenAIKeyResponse, ValidateOpenAIKeyResponse } from '@/models/app'
+import type { ApikeysListResponse, AppDailyConversationsResponse, AppDailyEndUsersResponse, AppDetailResponse, AppListResponse, AppStatisticsResponse, AppTemplatesResponse, AppTokenCostsResponse, AppVoicesListResponse, CreateApiKeyResponse, GenerationIntroductionResponse, UpdateAppModelConfigResponse, UpdateAppSiteCodeResponse, UpdateOpenAIKeyResponse, ValidateOpenAIKeyResponse } from '@/models/app'
 import type { CommonResponse } from '@/models/common'
 import type { AppMode, ModelConfig } from '@/types/app'
 
@@ -93,3 +93,7 @@ export const updateOpenAIKey: Fetcher<UpdateOpenAIKeyResponse, { url: string; bo
 export const generationIntroduction: Fetcher<GenerationIntroductionResponse, { url: string; body: { prompt_template: string } }> = ({ url, body }) => {
   return post<GenerationIntroductionResponse>(url, { body })
 }
+
+export const fetchAppVoices: Fetcher<AppVoicesListResponse, { url: string }> = ({ url }) => {
+  return get<AppVoicesListResponse>(url)
+}
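
A minimal usage sketch for the new fetcher — the URL here is illustrative, since the actual route is defined by the backend controller and isn't part of this hunk:

```ts
// Inside an async function; appId and language come from component state.
const voices = await fetchAppVoices({
  url: `/apps/${appId}/text-to-speech/voices?language=${language}`,
})
// voices is AppVoicesListResponse: an array of { name, value } pairs
// ready to feed straight into the voice Listbox.
```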

+ 2 - 0
web/types/app.ts

@@ -155,6 +155,8 @@ export type ModelConfig = {
   }
   text_to_speech: {
     enabled: boolean
+    voice?: string
+    language?: string
   }
   retriever_resource: {
     enabled: boolean