
feat: add tts-streaming config and feature (#5492)

chenxu9741 9 months ago
parent
commit
6ef401a9f0
44 changed files with 1281 additions and 359 deletions
  1. 4 0
      api/constants/tts_auto_play_timeout.py
  2. 26 5
      api/controllers/console/app/audio.py
  3. 25 5
      api/controllers/console/explore/audio.py
  4. 22 11
      api/controllers/service_api/app/audio.py
  5. 24 5
      api/controllers/web/audio.py
  6. 135 0
      api/core/app/apps/advanced_chat/app_generator_tts_publisher.py
  7. 78 19
      api/core/app/apps/advanced_chat/generate_task_pipeline.py
  8. 0 1
      api/core/app/apps/base_app_queue_manager.py
  9. 62 1
      api/core/app/apps/workflow/generate_task_pipeline.py
  10. 37 0
      api/core/app/entities/task_entities.py
  11. 61 5
      api/core/app/task_pipeline/easy_ui_based_generate_task_pipeline.py
  12. 2 3
      api/core/model_manager.py
  13. 27 24
      api/core/model_runtime/model_providers/__base/tts_model.py
  14. 31 37
      api/core/model_runtime/model_providers/azure_openai/tts/tts.py
  15. 1 1
      api/core/model_runtime/model_providers/openai/tts/tts-1-hd.yaml
  16. 1 1
      api/core/model_runtime/model_providers/openai/tts/tts-1.yaml
  17. 31 31
      api/core/model_runtime/model_providers/openai/tts/tts.py
  18. 1 1
      api/core/model_runtime/model_providers/tongyi/tts/tts-1.yaml
  19. 67 30
      api/core/model_runtime/model_providers/tongyi/tts/tts.py
  20. 1 1
      api/pyproject.toml
  21. 2 0
      api/services/app_service.py
  22. 66 40
      api/services/audio_service.py
  23. 50 14
      web/app/components/app/configuration/config-voice/param-config-content.tsx
  24. 0 1
      web/app/components/app/configuration/features/chat-group/text-to-speech/index.tsx
  25. 1 2
      web/app/components/app/text-generate/item/index.tsx
  26. 53 0
      web/app/components/base/audio-btn/audio.player.manager.ts
  27. 263 0
      web/app/components/base/audio-btn/audio.ts
  28. 42 89
      web/app/components/base/audio-btn/index.tsx
  29. 10 7
      web/app/components/base/chat/chat/answer/index.tsx
  30. 1 1
      web/app/components/base/chat/chat/answer/operation.tsx
  31. 27 1
      web/app/components/base/chat/chat/hooks.ts
  32. 51 15
      web/app/components/base/features/feature-panel/text-to-speech/param-config-content.tsx
  33. 2 1
      web/app/components/base/features/types.ts
  34. 27 0
      web/app/components/workflow/hooks/use-workflow-run.ts
  35. 3 0
      web/i18n/en-US/app-debug.ts
  36. 3 0
      web/i18n/ja-JP/app-debug.ts
  37. 3 0
      web/i18n/zh-Hans/app-debug.ts
  38. 3 0
      web/i18n/zh-Hant/app-debug.ts
  39. 2 1
      web/models/debug.ts
  40. 1 0
      web/next.config.js
  41. 1 0
      web/service/apps.ts
  42. 19 3
      web/service/base.ts
  43. 9 3
      web/service/share.ts
  44. 6 0
      web/types/app.ts

+ 4 - 0
api/constants/tts_auto_play_timeout.py

@@ -0,0 +1,4 @@
+TTS_AUTO_PLAY_TIMEOUT = 5
+
+# sleep 20 ms ( 40ms => 1280 byte audio file,20ms => 640 byte audio file)
+TTS_AUTO_PLAY_YIELD_CPU_TIME = 0.02

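These two constants drive the auto-play drain loops added to the generate-task pipelines later in this commit: after the main answer stream ends, the pipeline keeps polling the TTS publisher until it reports "finish" or stays silent for TTS_AUTO_PLAY_TIMEOUT seconds, yielding the CPU for TTS_AUTO_PLAY_YIELD_CPU_TIME between polls. A minimal sketch of that pattern (the publisher object and send_audio callback are placeholders, not part of this commit):

import time

from constants.tts_auto_play_timeout import TTS_AUTO_PLAY_TIMEOUT, TTS_AUTO_PLAY_YIELD_CPU_TIME

def drain_tts_audio(publisher, send_audio):
    # Poll until the publisher reports "finish" or stays silent for the timeout.
    started = time.time()
    while (time.time() - started) < TTS_AUTO_PLAY_TIMEOUT:
        chunk = publisher.checkAndGetAudio()
        if chunk is None:
            time.sleep(TTS_AUTO_PLAY_YIELD_CPU_TIME)  # release the CPU between polls
            continue
        if chunk.status == "finish":
            break
        started = time.time()      # new audio arrived, reset the timeout window
        send_audio(chunk.audio)    # base64-encoded audio bytes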
+ 26 - 5
api/controllers/console/app/audio.py

@@ -81,15 +81,36 @@ class ChatMessageTextApi(Resource):
     @account_initialization_required
     @get_app_model
     def post(self, app_model):
+        from werkzeug.exceptions import InternalServerError
+
         try:
+            parser = reqparse.RequestParser()
+            parser.add_argument('message_id', type=str, location='json')
+            parser.add_argument('text', type=str, location='json')
+            parser.add_argument('voice', type=str, location='json')
+            parser.add_argument('streaming', type=bool, location='json')
+            args = parser.parse_args()
+
+            message_id = args.get('message_id', None)
+            text = args.get('text', None)
+            if (app_model.mode in [AppMode.ADVANCED_CHAT.value, AppMode.WORKFLOW.value]
+                    and app_model.workflow
+                    and app_model.workflow.features_dict):
+                text_to_speech = app_model.workflow.features_dict.get('text_to_speech')
+                voice = args.get('voice') if args.get('voice') else text_to_speech.get('voice')
+            else:
+                try:
+                    voice = args.get('voice') if args.get('voice') else app_model.app_model_config.text_to_speech_dict.get(
+                        'voice')
+                except Exception:
+                    voice = None
             response = AudioService.transcript_tts(
                 app_model=app_model,
-                text=request.form['text'],
-                voice=request.form['voice'],
-                streaming=False
+                text=text,
+                message_id=message_id,
+                voice=voice
             )
-
-            return {'data': response.data.decode('latin1')}
+            return response
         except services.errors.app_model_config.AppModelConfigBrokenError:
             logging.exception("App model config broken.")
             raise AppUnavailableError()

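After this change the console endpoint reads a JSON body instead of form fields, and either a message_id or raw text can be supplied. A request might look like the sketch below; the route path, host and auth header are assumptions for illustration and are not shown in this diff:

import requests

# Illustrative only: URL and authorization are assumed, not part of this commit.
resp = requests.post(
    "http://localhost:5001/console/api/apps/<app_id>/text-to-audio",
    headers={"Authorization": "Bearer <console-token>"},
    json={
        "message_id": "<message-uuid>",  # synthesize an existing answer, or
        "text": "Hello, Dify!",          # synthesize raw text
        "voice": "alloy",                # optional; falls back to the app's configured voice
    },
)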
+ 25 - 5
api/controllers/console/explore/audio.py

@@ -19,6 +19,7 @@ from controllers.console.app.error import (
 from controllers.console.explore.wraps import InstalledAppResource
 from core.errors.error import ModelCurrentlyNotSupportError, ProviderTokenNotInitError, QuotaExceededError
 from core.model_runtime.errors.invoke import InvokeError
+from models.model import AppMode
 from services.audio_service import AudioService
 from services.errors.audio import (
     AudioTooLargeServiceError,
@@ -70,16 +71,33 @@ class ChatAudioApi(InstalledAppResource):
 
 class ChatTextApi(InstalledAppResource):
     def post(self, installed_app):
-        app_model = installed_app.app
+        from flask_restful import reqparse
 
+        app_model = installed_app.app
         try:
+            parser = reqparse.RequestParser()
+            parser.add_argument('message_id', type=str, required=False, location='json')
+            parser.add_argument('voice', type=str, location='json')
+            parser.add_argument('streaming', type=bool, location='json')
+            args = parser.parse_args()
+
+            message_id = args.get('message_id')
+            if (app_model.mode in [AppMode.ADVANCED_CHAT.value, AppMode.WORKFLOW.value]
+                    and app_model.workflow
+                    and app_model.workflow.features_dict):
+                text_to_speech = app_model.workflow.features_dict.get('text_to_speech')
+                voice = args.get('voice') if args.get('voice') else text_to_speech.get('voice')
+            else:
+                try:
+                    voice = args.get('voice') if args.get('voice') else app_model.app_model_config.text_to_speech_dict.get('voice')
+                except Exception:
+                    voice = None
             response = AudioService.transcript_tts(
                 app_model=app_model,
-                text=request.form['text'],
-                voice=request.form['voice'] if request.form.get('voice') else app_model.app_model_config.text_to_speech_dict.get('voice'),
-                streaming=False
+                message_id=message_id,
+                voice=voice
             )
-            return {'data': response.data.decode('latin1')}
+            return response
         except services.errors.app_model_config.AppModelConfigBrokenError:
             logging.exception("App model config broken.")
             raise AppUnavailableError()
@@ -108,3 +126,5 @@ class ChatTextApi(InstalledAppResource):
 
 api.add_resource(ChatAudioApi, '/installed-apps/<uuid:installed_app_id>/audio-to-text', endpoint='installed_app_audio')
 api.add_resource(ChatTextApi, '/installed-apps/<uuid:installed_app_id>/text-to-audio', endpoint='installed_app_text')
+# api.add_resource(ChatTextApiWithMessageId, '/installed-apps/<uuid:installed_app_id>/text-to-audio/message-id',
+#                  endpoint='installed_app_text_with_message_id')

+ 22 - 11
api/controllers/service_api/app/audio.py

@@ -20,7 +20,7 @@ from controllers.service_api.app.error import (
 from controllers.service_api.wraps import FetchUserArg, WhereisUserArg, validate_app_token
 from core.errors.error import ModelCurrentlyNotSupportError, ProviderTokenNotInitError, QuotaExceededError
 from core.model_runtime.errors.invoke import InvokeError
-from models.model import App, EndUser
+from models.model import App, AppMode, EndUser
 from services.audio_service import AudioService
 from services.errors.audio import (
     AudioTooLargeServiceError,
@@ -72,19 +72,30 @@ class AudioApi(Resource):
 class TextApi(Resource):
     @validate_app_token(fetch_user_arg=FetchUserArg(fetch_from=WhereisUserArg.JSON))
     def post(self, app_model: App, end_user: EndUser):
-        parser = reqparse.RequestParser()
-        parser.add_argument('text', type=str, required=True, nullable=False, location='json')
-        parser.add_argument('voice', type=str, location='json')
-        parser.add_argument('streaming', type=bool, required=False, nullable=False, location='json')
-        args = parser.parse_args()
-
         try:
+            parser = reqparse.RequestParser()
+            parser.add_argument('message_id', type=str, required=False, location='json')
+            parser.add_argument('voice', type=str, location='json')
+            parser.add_argument('streaming', type=bool, location='json')
+            args = parser.parse_args()
+
+            message_id = args.get('message_id')
+            if (app_model.mode in [AppMode.ADVANCED_CHAT.value, AppMode.WORKFLOW.value]
+                    and app_model.workflow
+                    and app_model.workflow.features_dict):
+                text_to_speech = app_model.workflow.features_dict.get('text_to_speech')
+                voice = args.get('voice') if args.get('voice') else text_to_speech.get('voice')
+            else:
+                try:
+                    voice = args.get('voice') if args.get('voice') else app_model.app_model_config.text_to_speech_dict.get(
+                        'voice')
+                except Exception:
+                    voice = None
             response = AudioService.transcript_tts(
                 app_model=app_model,
-                text=args['text'],
-                end_user=end_user,
-                voice=args.get('voice'),
-                streaming=args['streaming']
+                message_id=message_id,
+                end_user=end_user.external_user_id,
+                voice=voice
             )
 
             return response

+ 24 - 5
api/controllers/web/audio.py

@@ -19,7 +19,7 @@ from controllers.web.error import (
 from controllers.web.wraps import WebApiResource
 from core.errors.error import ModelCurrentlyNotSupportError, ProviderTokenNotInitError, QuotaExceededError
 from core.model_runtime.errors.invoke import InvokeError
-from models.model import App
+from models.model import App, AppMode
 from services.audio_service import AudioService
 from services.errors.audio import (
     AudioTooLargeServiceError,
@@ -69,16 +69,35 @@ class AudioApi(WebApiResource):
 
 class TextApi(WebApiResource):
     def post(self, app_model: App, end_user):
+        from flask_restful import reqparse
         try:
+            parser = reqparse.RequestParser()
+            parser.add_argument('message_id', type=str, required=False, location='json')
+            parser.add_argument('voice', type=str, location='json')
+            parser.add_argument('streaming', type=bool, location='json')
+            args = parser.parse_args()
+
+            message_id = args.get('message_id')
+            if (app_model.mode in [AppMode.ADVANCED_CHAT.value, AppMode.WORKFLOW.value]
+                    and app_model.workflow
+                    and app_model.workflow.features_dict):
+                text_to_speech = app_model.workflow.features_dict.get('text_to_speech')
+                voice = args.get('voice') if args.get('voice') else text_to_speech.get('voice')
+            else:
+                try:
+                    voice = args.get('voice') if args.get(
+                        'voice') else app_model.app_model_config.text_to_speech_dict.get('voice')
+                except Exception:
+                    voice = None
+
             response = AudioService.transcript_tts(
                 app_model=app_model,
-                text=request.form['text'],
+                message_id=message_id,
                 end_user=end_user.external_user_id,
-                voice=request.form['voice'] if request.form.get('voice') else None,
-                streaming=False
+                voice=voice
             )
 
-            return {'data': response.data.decode('latin1')}
+            return response
         except services.errors.app_model_config.AppModelConfigBrokenError:
             logging.exception("App model config broken.")
             raise AppUnavailableError()

+ 135 - 0
api/core/app/apps/advanced_chat/app_generator_tts_publisher.py

@@ -0,0 +1,135 @@
+import base64
+import concurrent.futures
+import logging
+import queue
+import re
+import threading
+
+from core.app.entities.queue_entities import QueueAgentMessageEvent, QueueLLMChunkEvent, QueueTextChunkEvent
+from core.model_manager import ModelManager
+from core.model_runtime.entities.model_entities import ModelType
+
+
+class AudioTrunk:
+    def __init__(self, status: str, audio):
+        self.audio = audio
+        self.status = status
+
+
+def _invoiceTTS(text_content: str, model_instance, tenant_id: str, voice: str):
+    if not text_content or text_content.isspace():
+        return
+    return model_instance.invoke_tts(
+        content_text=text_content.strip(),
+        user="responding_tts",
+        tenant_id=tenant_id,
+        voice=voice
+    )
+
+
+def _process_future(future_queue, audio_queue):
+    while True:
+        try:
+            future = future_queue.get()
+            if future is None:
+                break
+            for audio in future.result():
+                audio_base64 = base64.b64encode(bytes(audio))
+                audio_queue.put(AudioTrunk("responding", audio=audio_base64))
+        except Exception as e:
+            logging.getLogger(__name__).warning(e)
+            break
+    audio_queue.put(AudioTrunk("finish", b''))
+
+
+class AppGeneratorTTSPublisher:
+
+    def __init__(self, tenant_id: str, voice: str):
+        self.logger = logging.getLogger(__name__)
+        self.tenant_id = tenant_id
+        self.msg_text = ''
+        self._audio_queue = queue.Queue()
+        self._msg_queue = queue.Queue()
+        self.match = re.compile(r'[。.!?]')
+        self.model_manager = ModelManager()
+        self.model_instance = self.model_manager.get_default_model_instance(
+            tenant_id=self.tenant_id,
+            model_type=ModelType.TTS
+        )
+        self.voices = self.model_instance.get_tts_voices()
+        values = [voice.get('value') for voice in self.voices]
+        self.voice = voice
+        if not voice or voice not in values:
+            self.voice = self.voices[0].get('value')
+        self.MAX_SENTENCE = 2
+        self._last_audio_event = None
+        self._runtime_thread = threading.Thread(target=self._runtime).start()
+        self.executor = concurrent.futures.ThreadPoolExecutor(max_workers=3)
+
+    def publish(self, message):
+        try:
+            self._msg_queue.put(message)
+        except Exception as e:
+            self.logger.warning(e)
+
+    def _runtime(self):
+        future_queue = queue.Queue()
+        threading.Thread(target=_process_future, args=(future_queue, self._audio_queue)).start()
+        while True:
+            try:
+                message = self._msg_queue.get()
+                if message is None:
+                    if self.msg_text and len(self.msg_text.strip()) > 0:
+                        futures_result = self.executor.submit(_invoiceTTS, self.msg_text,
+                                                              self.model_instance, self.tenant_id, self.voice)
+                        future_queue.put(futures_result)
+                    break
+                elif isinstance(message.event, QueueAgentMessageEvent | QueueLLMChunkEvent):
+                    self.msg_text += message.event.chunk.delta.message.content
+                elif isinstance(message.event, QueueTextChunkEvent):
+                    self.msg_text += message.event.text
+                self.last_message = message
+                sentence_arr, text_tmp = self._extract_sentence(self.msg_text)
+                if len(sentence_arr) >= min(self.MAX_SENTENCE, 7):
+                    self.MAX_SENTENCE += 1
+                    text_content = ''.join(sentence_arr)
+                    futures_result = self.executor.submit(_invoiceTTS, text_content,
+                                                          self.model_instance,
+                                                          self.tenant_id,
+                                                          self.voice)
+                    future_queue.put(futures_result)
+                    if text_tmp:
+                        self.msg_text = text_tmp
+                    else:
+                        self.msg_text = ''
+
+            except Exception as e:
+                self.logger.warning(e)
+                break
+        future_queue.put(None)
+
+    def checkAndGetAudio(self) -> AudioTrunk | None:
+        try:
+            if self._last_audio_event and self._last_audio_event.status == "finish":
+                if self.executor:
+                    self.executor.shutdown(wait=False)
+                return self.last_message
+            audio = self._audio_queue.get_nowait()
+            if audio and audio.status == "finish":
+                self.executor.shutdown(wait=False)
+                self._runtime_thread = None
+            if audio:
+                self._last_audio_event = audio
+            return audio
+        except queue.Empty:
+            return None
+
+    def _extract_sentence(self, org_text):
+        tx = self.match.finditer(org_text)
+        start = 0
+        result = []
+        for i in tx:
+            end = i.regs[0][1]
+            result.append(org_text[start:end])
+            start = end
+        return result, org_text[start:]

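Roughly, the generate-task pipelines below feed every queue message into this publisher and poll it for finished audio. A simplified sketch (queue_manager and send_to_client are placeholders standing in for the real pipeline plumbing):

# Sketch only: assumes the tenant has a default TTS model configured.
publisher = AppGeneratorTTSPublisher(tenant_id="<tenant-id>", voice=None)

for message in queue_manager.listen():       # placeholder for the real queue manager
    publisher.publish(message)               # text chunks are buffered and synthesized
    audio = publisher.checkAndGetAudio()     # non-blocking poll for finished audio
    if audio and audio.status != "finish":
        send_to_client(audio.audio)          # base64-encoded audio chunk

publisher.publish(None)                      # signal that the message stream has ended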
+ 78 - 19
api/core/app/apps/advanced_chat/generate_task_pipeline.py

@@ -4,6 +4,8 @@ import time
 from collections.abc import Generator
 from typing import Any, Optional, Union, cast
 
+from constants.tts_auto_play_timeout import TTS_AUTO_PLAY_TIMEOUT, TTS_AUTO_PLAY_YIELD_CPU_TIME
+from core.app.apps.advanced_chat.app_generator_tts_publisher import AppGeneratorTTSPublisher, AudioTrunk
 from core.app.apps.base_app_queue_manager import AppQueueManager, PublishFrom
 from core.app.entities.app_invoke_entities import (
     AdvancedChatAppGenerateEntity,
@@ -33,6 +35,8 @@ from core.app.entities.task_entities import (
     ChatbotAppStreamResponse,
     ChatflowStreamGenerateRoute,
     ErrorStreamResponse,
+    MessageAudioEndStreamResponse,
+    MessageAudioStreamResponse,
     MessageEndStreamResponse,
     StreamResponse,
 )
@@ -71,13 +75,13 @@ class AdvancedChatAppGenerateTaskPipeline(BasedGenerateTaskPipeline, WorkflowCyc
     _iteration_nested_relations: dict[str, list[str]]
 
     def __init__(
-        self, application_generate_entity: AdvancedChatAppGenerateEntity,
-        workflow: Workflow,
-        queue_manager: AppQueueManager,
-        conversation: Conversation,
-        message: Message,
-        user: Union[Account, EndUser],
-        stream: bool
+            self, application_generate_entity: AdvancedChatAppGenerateEntity,
+            workflow: Workflow,
+            queue_manager: AppQueueManager,
+            conversation: Conversation,
+            message: Message,
+            user: Union[Account, EndUser],
+            stream: bool
     ) -> None:
         """
         Initialize AdvancedChatAppGenerateTaskPipeline.
@@ -129,7 +133,7 @@ class AdvancedChatAppGenerateTaskPipeline(BasedGenerateTaskPipeline, WorkflowCyc
             self._application_generate_entity.query
         )
 
-        generator = self._process_stream_response(
+        generator = self._wrapper_process_stream_response(
             trace_manager=self._application_generate_entity.trace_manager
         )
         if self._stream:
@@ -138,7 +142,7 @@ class AdvancedChatAppGenerateTaskPipeline(BasedGenerateTaskPipeline, WorkflowCyc
             return self._to_blocking_response(generator)
 
     def _to_blocking_response(self, generator: Generator[StreamResponse, None, None]) \
-        -> ChatbotAppBlockingResponse:
+            -> ChatbotAppBlockingResponse:
         """
         Process blocking response.
         :return:
@@ -169,7 +173,7 @@ class AdvancedChatAppGenerateTaskPipeline(BasedGenerateTaskPipeline, WorkflowCyc
         raise Exception('Queue listening stopped unexpectedly.')
 
     def _to_stream_response(self, generator: Generator[StreamResponse, None, None]) \
-        -> Generator[ChatbotAppStreamResponse, None, None]:
+            -> Generator[ChatbotAppStreamResponse, None, None]:
         """
         To stream response.
         :return:
@@ -182,14 +186,68 @@ class AdvancedChatAppGenerateTaskPipeline(BasedGenerateTaskPipeline, WorkflowCyc
                 stream_response=stream_response
             )
 
+    def _listenAudioMsg(self, publisher, task_id: str):
+        if not publisher:
+            return None
+        audio_msg: AudioTrunk = publisher.checkAndGetAudio()
+        if audio_msg and audio_msg.status != "finish":
+            return MessageAudioStreamResponse(audio=audio_msg.audio, task_id=task_id)
+        return None
+
+    def _wrapper_process_stream_response(self, trace_manager: Optional[TraceQueueManager] = None) -> \
+            Generator[StreamResponse, None, None]:
+
+        publisher = None
+        task_id = self._application_generate_entity.task_id
+        tenant_id = self._application_generate_entity.app_config.tenant_id
+        features_dict = self._workflow.features_dict
+
+        if features_dict.get('text_to_speech') and features_dict['text_to_speech'].get('enabled') and features_dict[
+                'text_to_speech'].get('autoPlay') == 'enabled':
+            publisher = AppGeneratorTTSPublisher(tenant_id, features_dict['text_to_speech'].get('voice'))
+        for response in self._process_stream_response(publisher=publisher, trace_manager=trace_manager):
+            while True:
+                audio_response = self._listenAudioMsg(publisher, task_id=task_id)
+                if audio_response:
+                    yield audio_response
+                else:
+                    break
+            yield response
+
+        start_listener_time = time.time()
+        # timeout
+        while (time.time() - start_listener_time) < TTS_AUTO_PLAY_TIMEOUT:
+            try:
+                if not publisher:
+                    break
+                audio_trunk = publisher.checkAndGetAudio()
+                if audio_trunk is None:
+                    # release cpu
+                    # sleep 20 ms ( 40ms => 1280 byte audio file,20ms => 640 byte audio file)
+                    time.sleep(TTS_AUTO_PLAY_YIELD_CPU_TIME)
+                    continue
+                if audio_trunk.status == "finish":
+                    break
+                else:
+                    start_listener_time = time.time()
+                    yield MessageAudioStreamResponse(audio=audio_trunk.audio, task_id=task_id)
+            except Exception as e:
+                logger.error(e)
+                break
+        yield MessageAudioEndStreamResponse(audio='', task_id=task_id)
+
     def _process_stream_response(
-        self, trace_manager: Optional[TraceQueueManager] = None
+            self,
+            publisher: AppGeneratorTTSPublisher,
+            trace_manager: Optional[TraceQueueManager] = None
     ) -> Generator[StreamResponse, None, None]:
         """
         Process stream response.
         :return:
         """
         for message in self._queue_manager.listen():
+            if publisher:
+                publisher.publish(message=message)
             event = message.event
 
             if isinstance(event, QueueErrorEvent):
@@ -301,7 +359,7 @@ class AdvancedChatAppGenerateTaskPipeline(BasedGenerateTaskPipeline, WorkflowCyc
                     continue
 
                 if not self._is_stream_out_support(
-                    event=event
+                        event=event
                 ):
                     continue
 
@@ -318,7 +376,8 @@ class AdvancedChatAppGenerateTaskPipeline(BasedGenerateTaskPipeline, WorkflowCyc
                 yield self._ping_stream_response()
             else:
                 continue
-
+        if publisher:
+            publisher.publish(None)
         if self._conversation_name_generate_thread:
             self._conversation_name_generate_thread.join()
 
@@ -402,7 +461,7 @@ class AdvancedChatAppGenerateTaskPipeline(BasedGenerateTaskPipeline, WorkflowCyc
         return stream_generate_routes
 
     def _get_answer_start_at_node_ids(self, graph: dict, target_node_id: str) \
-        -> list[str]:
+            -> list[str]:
         """
         Get answer start at node id.
         :param graph: graph
@@ -457,7 +516,7 @@ class AdvancedChatAppGenerateTaskPipeline(BasedGenerateTaskPipeline, WorkflowCyc
                 start_node_id = target_node_id
                 start_node_ids.append(start_node_id)
             elif node_type == NodeType.START.value or \
-                node_iteration_id is not None and iteration_start_node_id == source_node.get('id'):
+                    node_iteration_id is not None and iteration_start_node_id == source_node.get('id'):
                 start_node_id = source_node_id
                 start_node_ids.append(start_node_id)
             else:
@@ -515,7 +574,7 @@ class AdvancedChatAppGenerateTaskPipeline(BasedGenerateTaskPipeline, WorkflowCyc
 
             # all route chunks are generated
             if self._task_state.current_stream_generate_state.current_route_position == len(
-                self._task_state.current_stream_generate_state.generate_route
+                    self._task_state.current_stream_generate_state.generate_route
             ):
                 self._task_state.current_stream_generate_state = None
 
@@ -525,7 +584,7 @@ class AdvancedChatAppGenerateTaskPipeline(BasedGenerateTaskPipeline, WorkflowCyc
         :return:
         """
         if not self._task_state.current_stream_generate_state:
-            return None
+            return
 
         route_chunks = self._task_state.current_stream_generate_state.generate_route[
                        self._task_state.current_stream_generate_state.current_route_position:]
@@ -573,7 +632,7 @@ class AdvancedChatAppGenerateTaskPipeline(BasedGenerateTaskPipeline, WorkflowCyc
                     # get route chunk node execution info
                     route_chunk_node_execution_info = self._task_state.ran_node_execution_infos[route_chunk_node_id]
                     if (route_chunk_node_execution_info.node_type == NodeType.LLM
-                        and latest_node_execution_info.node_type == NodeType.LLM):
+                            and latest_node_execution_info.node_type == NodeType.LLM):
                         # only LLM support chunk stream output
                         self._task_state.current_stream_generate_state.current_route_position += 1
                         continue
@@ -643,7 +702,7 @@ class AdvancedChatAppGenerateTaskPipeline(BasedGenerateTaskPipeline, WorkflowCyc
 
         # all route chunks are generated
         if self._task_state.current_stream_generate_state.current_route_position == len(
-            self._task_state.current_stream_generate_state.generate_route
+                self._task_state.current_stream_generate_state.generate_route
         ):
             self._task_state.current_stream_generate_state = None
 

+ 0 - 1
api/core/app/apps/base_app_queue_manager.py

@@ -51,7 +51,6 @@ class AppQueueManager:
         listen_timeout = current_app.config.get("APP_MAX_EXECUTION_TIME")
         start_time = time.time()
         last_ping_time = 0
-
         while True:
             try:
                 message = self._q.get(timeout=1)

+ 62 - 1
api/core/app/apps/workflow/generate_task_pipeline.py

@@ -1,7 +1,10 @@
 import logging
+import time
 from collections.abc import Generator
 from typing import Any, Optional, Union
 
+from constants.tts_auto_play_timeout import TTS_AUTO_PLAY_TIMEOUT, TTS_AUTO_PLAY_YIELD_CPU_TIME
+from core.app.apps.advanced_chat.app_generator_tts_publisher import AppGeneratorTTSPublisher, AudioTrunk
 from core.app.apps.base_app_queue_manager import AppQueueManager
 from core.app.entities.app_invoke_entities import (
     InvokeFrom,
@@ -25,6 +28,8 @@ from core.app.entities.queue_entities import (
 )
 from core.app.entities.task_entities import (
     ErrorStreamResponse,
+    MessageAudioEndStreamResponse,
+    MessageAudioStreamResponse,
     StreamResponse,
     TextChunkStreamResponse,
     TextReplaceStreamResponse,
@@ -105,7 +110,7 @@ class WorkflowAppGenerateTaskPipeline(BasedGenerateTaskPipeline, WorkflowCycleMa
         db.session.refresh(self._user)
         db.session.close()
 
-        generator = self._process_stream_response(
+        generator = self._wrapper_process_stream_response(
             trace_manager=self._application_generate_entity.trace_manager
         )
         if self._stream:
@@ -161,8 +166,58 @@ class WorkflowAppGenerateTaskPipeline(BasedGenerateTaskPipeline, WorkflowCycleMa
                 stream_response=stream_response
             )
 
+    def _listenAudioMsg(self, publisher, task_id: str):
+        if not publisher:
+            return None
+        audio_msg: AudioTrunk = publisher.checkAndGetAudio()
+        if audio_msg and audio_msg.status != "finish":
+            return MessageAudioStreamResponse(audio=audio_msg.audio, task_id=task_id)
+        return None
+
+    def _wrapper_process_stream_response(self, trace_manager: Optional[TraceQueueManager] = None) -> \
+            Generator[StreamResponse, None, None]:
+
+        publisher = None
+        task_id = self._application_generate_entity.task_id
+        tenant_id = self._application_generate_entity.app_config.tenant_id
+        features_dict = self._workflow.features_dict
+
+        if features_dict.get('text_to_speech') and features_dict['text_to_speech'].get('enabled') and features_dict[
+                'text_to_speech'].get('autoPlay') == 'enabled':
+            publisher = AppGeneratorTTSPublisher(tenant_id, features_dict['text_to_speech'].get('voice'))
+        for response in self._process_stream_response(publisher=publisher, trace_manager=trace_manager):
+            while True:
+                audio_response = self._listenAudioMsg(publisher, task_id=task_id)
+                if audio_response:
+                    yield audio_response
+                else:
+                    break
+            yield response
+
+        start_listener_time = time.time()
+        while (time.time() - start_listener_time) < TTS_AUTO_PLAY_TIMEOUT:
+            try:
+                if not publisher:
+                    break
+                audio_trunk = publisher.checkAndGetAudio()
+                if audio_trunk is None:
+                    # release cpu
+                    # sleep 20 ms ( 40ms => 1280 byte audio file,20ms => 640 byte audio file)
+                    time.sleep(TTS_AUTO_PLAY_YIELD_CPU_TIME)
+                    continue
+                if audio_trunk.status == "finish":
+                    break
+                else:
+                    yield MessageAudioStreamResponse(audio=audio_trunk.audio, task_id=task_id)
+            except Exception as e:
+                logger.error(e)
+                break
+        yield MessageAudioEndStreamResponse(audio='', task_id=task_id)
+
+
     def _process_stream_response(
         self,
+        publisher: AppGeneratorTTSPublisher,
         trace_manager: Optional[TraceQueueManager] = None
     ) -> Generator[StreamResponse, None, None]:
         """
@@ -170,6 +225,8 @@ class WorkflowAppGenerateTaskPipeline(BasedGenerateTaskPipeline, WorkflowCycleMa
         :return:
         """
         for message in self._queue_manager.listen():
+            if publisher:
+                publisher.publish(message=message)
             event = message.event
 
             if isinstance(event, QueueErrorEvent):
@@ -251,6 +308,10 @@ class WorkflowAppGenerateTaskPipeline(BasedGenerateTaskPipeline, WorkflowCycleMa
             else:
                 continue
 
+        if publisher:
+            publisher.publish(None)
+
+
     def _save_workflow_app_log(self, workflow_run: WorkflowRun) -> None:
         """
         Save workflow app log.

+ 37 - 0
api/core/app/entities/task_entities.py

@@ -69,6 +69,7 @@ class WorkflowTaskState(TaskState):
 
     iteration_nested_node_ids: list[str] = None
 
+
 class AdvancedChatTaskState(WorkflowTaskState):
     """
     AdvancedChatTaskState entity
@@ -86,6 +87,8 @@ class StreamEvent(Enum):
     ERROR = "error"
     MESSAGE = "message"
     MESSAGE_END = "message_end"
+    TTS_MESSAGE = "tts_message"
+    TTS_MESSAGE_END = "tts_message_end"
     MESSAGE_FILE = "message_file"
     MESSAGE_REPLACE = "message_replace"
     AGENT_THOUGHT = "agent_thought"
@@ -130,6 +133,22 @@ class MessageStreamResponse(StreamResponse):
     answer: str
 
 
+class MessageAudioStreamResponse(StreamResponse):
+    """
+    MessageStreamResponse entity
+    """
+    event: StreamEvent = StreamEvent.TTS_MESSAGE
+    audio: str
+
+
+class MessageAudioEndStreamResponse(StreamResponse):
+    """
+    MessageStreamResponse entity
+    """
+    event: StreamEvent = StreamEvent.TTS_MESSAGE_END
+    audio: str
+
+
 class MessageEndStreamResponse(StreamResponse):
     """
     MessageEndStreamResponse entity
@@ -186,6 +205,7 @@ class WorkflowStartStreamResponse(StreamResponse):
     """
     WorkflowStartStreamResponse entity
     """
+
     class Data(BaseModel):
         """
         Data entity
@@ -205,6 +225,7 @@ class WorkflowFinishStreamResponse(StreamResponse):
     """
     WorkflowFinishStreamResponse entity
     """
+
     class Data(BaseModel):
         """
         Data entity
@@ -232,6 +253,7 @@ class NodeStartStreamResponse(StreamResponse):
     """
     NodeStartStreamResponse entity
     """
+
     class Data(BaseModel):
         """
         Data entity
@@ -273,6 +295,7 @@ class NodeFinishStreamResponse(StreamResponse):
     """
     NodeFinishStreamResponse entity
     """
+
     class Data(BaseModel):
         """
         Data entity
@@ -323,10 +346,12 @@ class NodeFinishStreamResponse(StreamResponse):
             }
         }
 
+
 class IterationNodeStartStreamResponse(StreamResponse):
     """
     NodeStartStreamResponse entity
     """
+
     class Data(BaseModel):
         """
         Data entity
@@ -344,10 +369,12 @@ class IterationNodeStartStreamResponse(StreamResponse):
     workflow_run_id: str
     data: Data
 
+
 class IterationNodeNextStreamResponse(StreamResponse):
     """
     NodeStartStreamResponse entity
     """
+
     class Data(BaseModel):
         """
         Data entity
@@ -365,10 +392,12 @@ class IterationNodeNextStreamResponse(StreamResponse):
     workflow_run_id: str
     data: Data
 
+
 class IterationNodeCompletedStreamResponse(StreamResponse):
     """
     NodeCompletedStreamResponse entity
     """
+
     class Data(BaseModel):
         """
         Data entity
@@ -393,10 +422,12 @@ class IterationNodeCompletedStreamResponse(StreamResponse):
     workflow_run_id: str
     data: Data
 
+
 class TextChunkStreamResponse(StreamResponse):
     """
     TextChunkStreamResponse entity
     """
+
     class Data(BaseModel):
         """
         Data entity
@@ -411,6 +442,7 @@ class TextReplaceStreamResponse(StreamResponse):
     """
     TextReplaceStreamResponse entity
     """
+
     class Data(BaseModel):
         """
         Data entity
@@ -473,6 +505,7 @@ class ChatbotAppBlockingResponse(AppBlockingResponse):
     """
     ChatbotAppBlockingResponse entity
     """
+
     class Data(BaseModel):
         """
         Data entity
@@ -492,6 +525,7 @@ class CompletionAppBlockingResponse(AppBlockingResponse):
     """
     CompletionAppBlockingResponse entity
     """
+
     class Data(BaseModel):
         """
         Data entity
@@ -510,6 +544,7 @@ class WorkflowAppBlockingResponse(AppBlockingResponse):
     """
     WorkflowAppBlockingResponse entity
     """
+
     class Data(BaseModel):
         """
         Data entity
@@ -528,10 +563,12 @@ class WorkflowAppBlockingResponse(AppBlockingResponse):
     workflow_run_id: str
     data: Data
 
+
 class WorkflowIterationState(BaseModel):
     """
     WorkflowIterationState entity
     """
+
     class Data(BaseModel):
         """
         Data entity

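The two new TTS events are emitted on the same SSE stream as the existing text events. Roughly, a client sees something like the following (field values are illustrative; the exact serialization is handled by the existing StreamResponse plumbing):

# Illustrative construction of the new events; values are made up.
MessageAudioStreamResponse(task_id="<task-id>", audio="<base64 audio chunk>")
# on the wire, roughly: data: {"event": "tts_message", "task_id": "<task-id>", "audio": "<base64...>"}

MessageAudioEndStreamResponse(task_id="<task-id>", audio="")
# on the wire, roughly: data: {"event": "tts_message_end", "task_id": "<task-id>", "audio": ""}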
+ 61 - 5
api/core/app/task_pipeline/easy_ui_based_generate_task_pipeline.py

@@ -4,6 +4,8 @@ import time
 from collections.abc import Generator
 from typing import Optional, Union, cast
 
+from constants.tts_auto_play_timeout import TTS_AUTO_PLAY_TIMEOUT, TTS_AUTO_PLAY_YIELD_CPU_TIME
+from core.app.apps.advanced_chat.app_generator_tts_publisher import AppGeneratorTTSPublisher, AudioTrunk
 from core.app.apps.base_app_queue_manager import AppQueueManager, PublishFrom
 from core.app.entities.app_invoke_entities import (
     AgentChatAppGenerateEntity,
@@ -32,6 +34,8 @@ from core.app.entities.task_entities import (
     CompletionAppStreamResponse,
     EasyUITaskState,
     ErrorStreamResponse,
+    MessageAudioEndStreamResponse,
+    MessageAudioStreamResponse,
     MessageEndStreamResponse,
     StreamResponse,
 )
@@ -87,6 +91,7 @@ class EasyUIBasedGenerateTaskPipeline(BasedGenerateTaskPipeline, MessageCycleMan
         """
         super().__init__(application_generate_entity, queue_manager, user, stream)
         self._model_config = application_generate_entity.model_conf
+        self._app_config = application_generate_entity.app_config
         self._conversation = conversation
         self._message = message
 
@@ -102,7 +107,7 @@ class EasyUIBasedGenerateTaskPipeline(BasedGenerateTaskPipeline, MessageCycleMan
         self._conversation_name_generate_thread = None
 
     def process(
-        self,
+            self,
     ) -> Union[
         ChatbotAppBlockingResponse,
         CompletionAppBlockingResponse,
@@ -123,7 +128,7 @@ class EasyUIBasedGenerateTaskPipeline(BasedGenerateTaskPipeline, MessageCycleMan
                 self._application_generate_entity.query
             )
 
-        generator = self._process_stream_response(
+        generator = self._wrapper_process_stream_response(
             trace_manager=self._application_generate_entity.trace_manager
         )
         if self._stream:
@@ -202,14 +207,64 @@ class EasyUIBasedGenerateTaskPipeline(BasedGenerateTaskPipeline, MessageCycleMan
                     stream_response=stream_response
                 )
 
+    def _listenAudioMsg(self, publisher, task_id: str):
+        if publisher is None:
+            return None
+        audio_msg: AudioTrunk = publisher.checkAndGetAudio()
+        if audio_msg and audio_msg.status != "finish":
+            # audio_str = audio_msg.audio.decode('utf-8', errors='ignore')
+            return MessageAudioStreamResponse(audio=audio_msg.audio, task_id=task_id)
+        return None
+
+    def _wrapper_process_stream_response(self, trace_manager: Optional[TraceQueueManager] = None) -> \
+            Generator[StreamResponse, None, None]:
+
+        tenant_id = self._application_generate_entity.app_config.tenant_id
+        task_id = self._application_generate_entity.task_id
+        publisher = None
+        text_to_speech_dict = self._app_config.app_model_config_dict.get('text_to_speech')
+        if text_to_speech_dict and text_to_speech_dict.get('autoPlay') == 'enabled' and text_to_speech_dict.get('enabled'):
+            publisher = AppGeneratorTTSPublisher(tenant_id, text_to_speech_dict.get('voice', None))
+        for response in self._process_stream_response(publisher=publisher, trace_manager=trace_manager):
+            while True:
+                audio_response = self._listenAudioMsg(publisher, task_id)
+                if audio_response:
+                    yield audio_response
+                else:
+                    break
+            yield response
+
+        start_listener_time = time.time()
+        # timeout
+        while (time.time() - start_listener_time) < TTS_AUTO_PLAY_TIMEOUT:
+            if publisher is None:
+                break
+            audio = publisher.checkAndGetAudio()
+            if audio is None:
+                # release cpu
+                # sleep 20 ms ( 40ms => 1280 byte audio file,20ms => 640 byte audio file)
+                time.sleep(TTS_AUTO_PLAY_YIELD_CPU_TIME)
+                continue
+            if audio.status == "finish":
+                break
+            else:
+                start_listener_time = time.time()
+                yield MessageAudioStreamResponse(audio=audio.audio,
+                                                 task_id=task_id)
+        yield MessageAudioEndStreamResponse(audio='', task_id=task_id)
+
     def _process_stream_response(
-        self, trace_manager: Optional[TraceQueueManager] = None
+            self,
+            publisher: AppGeneratorTTSPublisher,
+            trace_manager: Optional[TraceQueueManager] = None
     ) -> Generator[StreamResponse, None, None]:
         """
         Process stream response.
         :return:
         """
         for message in self._queue_manager.listen():
+            if publisher:
+                publisher.publish(message)
             event = message.event
 
             if isinstance(event, QueueErrorEvent):
@@ -272,12 +327,13 @@ class EasyUIBasedGenerateTaskPipeline(BasedGenerateTaskPipeline, MessageCycleMan
                 yield self._ping_stream_response()
             else:
                 continue
-
+        if publisher:
+            publisher.publish(None)
         if self._conversation_name_generate_thread:
             self._conversation_name_generate_thread.join()
 
     def _save_message(
-        self, trace_manager: Optional[TraceQueueManager] = None
+            self, trace_manager: Optional[TraceQueueManager] = None
     ) -> None:
         """
         Save message.

+ 2 - 3
api/core/model_manager.py

@@ -264,7 +264,7 @@ class ModelInstance:
             user=user
         )
 
-    def invoke_tts(self, content_text: str, tenant_id: str, voice: str, streaming: bool, user: Optional[str] = None) \
+    def invoke_tts(self, content_text: str, tenant_id: str, voice: str, user: Optional[str] = None) \
             -> str:
         """
         Invoke large language tts model
@@ -287,8 +287,7 @@ class ModelInstance:
             content_text=content_text,
             user=user,
             tenant_id=tenant_id,
-            voice=voice,
-            streaming=streaming
+            voice=voice
         )
 
     def _round_robin_invoke(self, function: Callable, *args, **kwargs):

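With the streaming flag gone, a caller of ModelInstance.invoke_tts now passes only the text, tenant, voice and user. A hedged sketch of a call site (tenant id and voice are placeholders):

from core.model_manager import ModelManager
from core.model_runtime.entities.model_entities import ModelType

model_instance = ModelManager().get_default_model_instance(
    tenant_id="<tenant-id>",
    model_type=ModelType.TTS,
)
audio = model_instance.invoke_tts(
    content_text="Hello there.",
    tenant_id="<tenant-id>",
    voice="alloy",        # must be a voice supported by the configured TTS model
    user="tts-demo",
)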
+ 27 - 24
api/core/model_runtime/model_providers/__base/tts_model.py

@@ -1,4 +1,6 @@
 import hashlib
+import logging
+import re
 import subprocess
 import uuid
 from abc import abstractmethod
@@ -10,7 +12,7 @@ from core.model_runtime.entities.model_entities import ModelPropertyKey, ModelTy
 from core.model_runtime.errors.invoke import InvokeBadRequestError
 from core.model_runtime.model_providers.__base.ai_model import AIModel
 
-
+logger = logging.getLogger(__name__)
 class TTSModel(AIModel):
     """
     Model class for ttstext model.
@@ -20,7 +22,7 @@ class TTSModel(AIModel):
     # pydantic configs
     model_config = ConfigDict(protected_namespaces=())
 
-    def invoke(self, model: str, tenant_id: str, credentials: dict, content_text: str, voice: str, streaming: bool,
+    def invoke(self, model: str, tenant_id: str, credentials: dict, content_text: str, voice: str,
                user: Optional[str] = None):
         """
         Invoke large language model
@@ -35,14 +37,15 @@ class TTSModel(AIModel):
         :return: translated audio file
         :return: translated audio file
         """
         """
         try:
         try:
+            logger.info(f"Invoke TTS model: {model} , invoke content : {content_text}")
             self._is_ffmpeg_installed()
             self._is_ffmpeg_installed()
-            return self._invoke(model=model, credentials=credentials, user=user, streaming=streaming,
+            return self._invoke(model=model, credentials=credentials, user=user,
                                 content_text=content_text, voice=voice, tenant_id=tenant_id)
                                 content_text=content_text, voice=voice, tenant_id=tenant_id)
         except Exception as e:
         except Exception as e:
             raise self._transform_invoke_error(e)
             raise self._transform_invoke_error(e)
 
 
     @abstractmethod
     @abstractmethod
-    def _invoke(self, model: str, tenant_id: str, credentials: dict, content_text: str, voice: str, streaming: bool,
+    def _invoke(self, model: str, tenant_id: str, credentials: dict, content_text: str, voice: str,
                 user: Optional[str] = None):
                 user: Optional[str] = None):
         """
         """
         Invoke large language model
         Invoke large language model
@@ -123,26 +126,26 @@ class TTSModel(AIModel):
             return model_schema.model_properties[ModelPropertyKey.MAX_WORKERS]
             return model_schema.model_properties[ModelPropertyKey.MAX_WORKERS]
 
 
     @staticmethod
     @staticmethod
-    def _split_text_into_sentences(text: str, limit: int, delimiters=None):
-        if delimiters is None:
-            delimiters = set('。!?;\n')
-
-        buf = []
-        word_count = 0
-        for char in text:
-            buf.append(char)
-            if char in delimiters:
-                if word_count >= limit:
-                    yield ''.join(buf)
-                    buf = []
-                    word_count = 0
-                else:
-                    word_count += 1
-            else:
-                word_count += 1
-
-        if buf:
-            yield ''.join(buf)
+    def _split_text_into_sentences(org_text, max_length=2000, pattern=r'[。.!?]'):
+        match = re.compile(pattern)
+        tx = match.finditer(org_text)
+        start = 0
+        result = []
+        one_sentence = ''
+        for i in tx:
+            end = i.regs[0][1]
+            tmp = org_text[start:end]
+            if len(one_sentence + tmp) > max_length:
+                result.append(one_sentence)
+                one_sentence = ''
+            one_sentence += tmp
+            start = end
+        last_sens = org_text[start:]
+        if last_sens:
+            one_sentence += last_sens
+        if one_sentence != '':
+            result.append(one_sentence)
+        return result
 
 
     @staticmethod
     @staticmethod
     def _is_ffmpeg_installed():
     def _is_ffmpeg_installed():
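
The rewritten _split_text_into_sentences groups regex-delimited sentences into chunks that stay under max_length characters instead of counting words around delimiters. A standalone sketch that mirrors its behaviour for illustration (same logic, slightly compacted):

    import re

    def split_text_into_sentences(org_text: str, max_length: int = 2000,
                                  pattern: str = r'[。.!?]') -> list[str]:
        """Group sentence-sized pieces (split on `pattern` delimiters) into
        chunks no longer than `max_length` characters."""
        result, chunk, start = [], '', 0
        for match in re.finditer(pattern, org_text):
            end = match.end()
            piece = org_text[start:end]
            if len(chunk + piece) > max_length:
                result.append(chunk)
                chunk = ''
            chunk += piece
            start = end
        chunk += org_text[start:]  # keep any trailing text after the last delimiter
        if chunk:
            result.append(chunk)
        return result

    print(split_text_into_sentences('Hello world. How are you? Fine!', max_length=15))
    # ['Hello world.', ' How are you?', ' Fine!']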

+ 31 - 37
api/core/model_runtime/model_providers/azure_openai/tts/tts.py

@@ -4,7 +4,7 @@ from functools import reduce
 from io import BytesIO
 from io import BytesIO
 from typing import Optional
 from typing import Optional
 
 
-from flask import Response, stream_with_context
+from flask import Response
 from openai import AzureOpenAI
 from openai import AzureOpenAI
 from pydub import AudioSegment
 from pydub import AudioSegment
 
 
@@ -14,7 +14,6 @@ from core.model_runtime.errors.validate import CredentialsValidateFailedError
 from core.model_runtime.model_providers.__base.tts_model import TTSModel
 from core.model_runtime.model_providers.__base.tts_model import TTSModel
 from core.model_runtime.model_providers.azure_openai._common import _CommonAzureOpenAI
 from core.model_runtime.model_providers.azure_openai._common import _CommonAzureOpenAI
 from core.model_runtime.model_providers.azure_openai._constant import TTS_BASE_MODELS, AzureBaseModel
 from core.model_runtime.model_providers.azure_openai._constant import TTS_BASE_MODELS, AzureBaseModel
-from extensions.ext_storage import storage
 
 
 
 
 class AzureOpenAIText2SpeechModel(_CommonAzureOpenAI, TTSModel):
 class AzureOpenAIText2SpeechModel(_CommonAzureOpenAI, TTSModel):
@@ -23,7 +22,7 @@ class AzureOpenAIText2SpeechModel(_CommonAzureOpenAI, TTSModel):
     """
     """
 
 
     def _invoke(self, model: str, tenant_id: str, credentials: dict,
     def _invoke(self, model: str, tenant_id: str, credentials: dict,
-                content_text: str, voice: str, streaming: bool, user: Optional[str] = None) -> any:
+                content_text: str, voice: str, user: Optional[str] = None) -> any:
         """
         """
         _invoke text2speech model
         _invoke text2speech model
 
 
@@ -32,30 +31,23 @@ class AzureOpenAIText2SpeechModel(_CommonAzureOpenAI, TTSModel):
         :param credentials: model credentials
         :param credentials: model credentials
         :param content_text: text content to be translated
         :param content_text: text content to be translated
         :param voice: model timbre
         :param voice: model timbre
-        :param streaming: output is streaming
         :param user: unique user id
         :param user: unique user id
         :return: text translated to audio file
         :return: text translated to audio file
         """
         """
-        audio_type = self._get_model_audio_type(model, credentials)
         if not voice or voice not in [d['value'] for d in self.get_tts_model_voices(model=model, credentials=credentials)]:
         if not voice or voice not in [d['value'] for d in self.get_tts_model_voices(model=model, credentials=credentials)]:
             voice = self._get_model_default_voice(model, credentials)
             voice = self._get_model_default_voice(model, credentials)
-        if streaming:
-            return Response(stream_with_context(self._tts_invoke_streaming(model=model,
-                                                                           credentials=credentials,
-                                                                           content_text=content_text,
-                                                                           tenant_id=tenant_id,
-                                                                           voice=voice)),
-                            status=200, mimetype=f'audio/{audio_type}')
-        else:
-            return self._tts_invoke(model=model, credentials=credentials, content_text=content_text, voice=voice)
-
-    def validate_credentials(self, model: str, credentials: dict, user: Optional[str] = None) -> None:
+
+        return self._tts_invoke_streaming(model=model,
+                                          credentials=credentials,
+                                          content_text=content_text,
+                                          voice=voice)
+
+    def validate_credentials(self, model: str, credentials: dict) -> None:
         """
         """
         validate credentials text2speech model
         validate credentials text2speech model
 
 
         :param model: model name
         :param model: model name
         :param credentials: model credentials
         :param credentials: model credentials
-        :param user: unique user id
         :return: text translated to audio file
         :return: text translated to audio file
         """
         """
         try:
         try:
@@ -82,7 +74,7 @@ class AzureOpenAIText2SpeechModel(_CommonAzureOpenAI, TTSModel):
         word_limit = self._get_model_word_limit(model, credentials)
         word_limit = self._get_model_word_limit(model, credentials)
         max_workers = self._get_model_workers_limit(model, credentials)
         max_workers = self._get_model_workers_limit(model, credentials)
         try:
         try:
-            sentences = list(self._split_text_into_sentences(text=content_text, limit=word_limit))
+            sentences = list(self._split_text_into_sentences(org_text=content_text, max_length=word_limit))
             audio_bytes_list = []
             audio_bytes_list = []
 
 
             # Create a thread pool and map the function to the list of sentences
             # Create a thread pool and map the function to the list of sentences
@@ -107,34 +99,37 @@ class AzureOpenAIText2SpeechModel(_CommonAzureOpenAI, TTSModel):
         except Exception as ex:
         except Exception as ex:
             raise InvokeBadRequestError(str(ex))
             raise InvokeBadRequestError(str(ex))
 
 
-    # Todo: To improve the streaming function
-    def _tts_invoke_streaming(self, model: str, tenant_id: str, credentials: dict, content_text: str,
+    def _tts_invoke_streaming(self, model: str,  credentials: dict, content_text: str,
                               voice: str) -> any:
                               voice: str) -> any:
         """
         """
         _tts_invoke_streaming text2speech model
         _tts_invoke_streaming text2speech model
-
         :param model: model name
         :param model: model name
-        :param tenant_id: user tenant id
         :param credentials: model credentials
         :param credentials: model credentials
         :param content_text: text content to be translated
         :param content_text: text content to be translated
         :param voice: model timbre
         :param voice: model timbre
         :return: text translated to audio file
         :return: text translated to audio file
         """
         """
-        # transform credentials to kwargs for model instance
-        credentials_kwargs = self._to_credential_kwargs(credentials)
-        if not voice or voice not in self.get_tts_model_voices(model=model, credentials=credentials):
-            voice = self._get_model_default_voice(model, credentials)
-        word_limit = self._get_model_word_limit(model, credentials)
-        audio_type = self._get_model_audio_type(model, credentials)
-        tts_file_id = self._get_file_name(content_text)
-        file_path = f'generate_files/audio/{tenant_id}/{tts_file_id}.{audio_type}'
         try:
         try:
+            # doc: https://platform.openai.com/docs/guides/text-to-speech
+            credentials_kwargs = self._to_credential_kwargs(credentials)
             client = AzureOpenAI(**credentials_kwargs)
             client = AzureOpenAI(**credentials_kwargs)
-            sentences = list(self._split_text_into_sentences(text=content_text, limit=word_limit))
-            for sentence in sentences:
-                response = client.audio.speech.create(model=model, voice=voice, input=sentence.strip())
-                # response.stream_to_file(file_path)
-                storage.save(file_path, response.read())
+            # max font is 4096,there is 3500 limit for each request
+            max_length = 3500
+            if len(content_text) > max_length:
+                sentences = self._split_text_into_sentences(content_text, max_length=max_length)
+                executor = concurrent.futures.ThreadPoolExecutor(max_workers=min(3, len(sentences)))
+                futures = [executor.submit(client.audio.speech.with_streaming_response.create, model=model,
+                                           response_format="mp3",
+                                           input=sentences[i], voice=voice) for i in range(len(sentences))]
+                for index, future in enumerate(futures):
+                    yield from future.result().__enter__().iter_bytes(1024)
+
+            else:
+                response = client.audio.speech.with_streaming_response.create(model=model, voice=voice,
+                                                                              response_format="mp3",
+                                                                              input=content_text.strip())
+
+                yield from response.__enter__().iter_bytes(1024)
         except Exception as ex:
         except Exception as ex:
             raise InvokeBadRequestError(str(ex))
             raise InvokeBadRequestError(str(ex))
 
 
@@ -162,7 +157,7 @@ class AzureOpenAIText2SpeechModel(_CommonAzureOpenAI, TTSModel):
 
 
 
 
     @staticmethod
     @staticmethod
-    def _get_ai_model_entity(base_model_name: str, model: str) -> AzureBaseModel:
+    def _get_ai_model_entity(base_model_name: str, model: str) -> AzureBaseModel | None:
         for ai_model_entity in TTS_BASE_MODELS:
         for ai_model_entity in TTS_BASE_MODELS:
             if ai_model_entity.base_model_name == base_model_name:
             if ai_model_entity.base_model_name == base_model_name:
                 ai_model_entity_copy = copy.deepcopy(ai_model_entity)
                 ai_model_entity_copy = copy.deepcopy(ai_model_entity)
@@ -170,5 +165,4 @@ class AzureOpenAIText2SpeechModel(_CommonAzureOpenAI, TTSModel):
                 ai_model_entity_copy.entity.label.en_US = model
                 ai_model_entity_copy.entity.label.en_US = model
                 ai_model_entity_copy.entity.label.zh_Hans = model
                 ai_model_entity_copy.entity.label.zh_Hans = model
                 return ai_model_entity_copy
                 return ai_model_entity_copy
-
         return None
         return None
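
For texts longer than the per-request limit, the new streaming path splits the input, submits each piece to a small thread pool, and then walks the futures in submission order so the audio comes back in sequence. A self-contained sketch of that ordering pattern with a stand-in synthesis function (fake_synthesize is a placeholder, not the Azure client call):

    import concurrent.futures

    def fake_synthesize(sentence: str) -> bytes:
        # Stand-in for a provider call that returns encoded audio for one chunk.
        return sentence.encode("utf-8")

    def synthesize_in_order(sentences: list[str]):
        """Run synthesis concurrently but yield results in the original order."""
        with concurrent.futures.ThreadPoolExecutor(max_workers=min(3, len(sentences))) as executor:
            futures = [executor.submit(fake_synthesize, s) for s in sentences]
            for future in futures:  # iterating the list (not as_completed) preserves order
                yield future.result()

    for chunk in synthesize_in_order(["First sentence.", "Second sentence.", "Third sentence."]):
        print(chunk)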

+ 1 - 1
api/core/model_runtime/model_providers/openai/tts/tts-1-hd.yaml

@@ -21,7 +21,7 @@ model_properties:
     - mode: 'shimmer'
     - mode: 'shimmer'
       name: 'Shimmer'
       name: 'Shimmer'
       language: [ 'zh-Hans', 'en-US', 'de-DE', 'fr-FR', 'es-ES', 'it-IT', 'th-TH', 'id-ID' ]
       language: [ 'zh-Hans', 'en-US', 'de-DE', 'fr-FR', 'es-ES', 'it-IT', 'th-TH', 'id-ID' ]
-  word_limit: 120
+  word_limit: 3500
   audio_type: 'mp3'
   audio_type: 'mp3'
   max_workers: 5
   max_workers: 5
 pricing:
 pricing:

+ 1 - 1
api/core/model_runtime/model_providers/openai/tts/tts-1.yaml

@@ -21,7 +21,7 @@ model_properties:
     - mode: 'shimmer'
     - mode: 'shimmer'
       name: 'Shimmer'
       name: 'Shimmer'
       language: ['zh-Hans', 'en-US', 'de-DE', 'fr-FR', 'es-ES', 'it-IT', 'th-TH', 'id-ID']
       language: ['zh-Hans', 'en-US', 'de-DE', 'fr-FR', 'es-ES', 'it-IT', 'th-TH', 'id-ID']
-  word_limit: 120
+  word_limit: 3500
   audio_type: 'mp3'
   audio_type: 'mp3'
   max_workers: 5
   max_workers: 5
 pricing:
 pricing:

+ 31 - 31
api/core/model_runtime/model_providers/openai/tts/tts.py

@@ -3,7 +3,7 @@ from functools import reduce
 from io import BytesIO
 from io import BytesIO
 from typing import Optional
 from typing import Optional
 
 
-from flask import Response, stream_with_context
+from flask import Response
 from openai import OpenAI
 from openai import OpenAI
 from pydub import AudioSegment
 from pydub import AudioSegment
 
 
@@ -11,7 +11,6 @@ from core.model_runtime.errors.invoke import InvokeBadRequestError
 from core.model_runtime.errors.validate import CredentialsValidateFailedError
 from core.model_runtime.errors.validate import CredentialsValidateFailedError
 from core.model_runtime.model_providers.__base.tts_model import TTSModel
 from core.model_runtime.model_providers.__base.tts_model import TTSModel
 from core.model_runtime.model_providers.openai._common import _CommonOpenAI
 from core.model_runtime.model_providers.openai._common import _CommonOpenAI
-from extensions.ext_storage import storage
 
 
 
 
 class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel):
 class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel):
@@ -20,7 +19,7 @@ class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel):
     """
     """
 
 
     def _invoke(self, model: str, tenant_id: str, credentials: dict,
     def _invoke(self, model: str, tenant_id: str, credentials: dict,
-                content_text: str, voice: str, streaming: bool, user: Optional[str] = None) -> any:
+                content_text: str, voice: str, user: Optional[str] = None) -> any:
         """
         """
         _invoke text2speech model
         _invoke text2speech model
 
 
@@ -29,22 +28,17 @@ class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel):
         :param credentials: model credentials
         :param credentials: model credentials
         :param content_text: text content to be translated
         :param content_text: text content to be translated
         :param voice: model timbre
         :param voice: model timbre
-        :param streaming: output is streaming
         :param user: unique user id
         :param user: unique user id
         :return: text translated to audio file
         :return: text translated to audio file
         """
         """
-        audio_type = self._get_model_audio_type(model, credentials)
+
         if not voice or voice not in [d['value'] for d in self.get_tts_model_voices(model=model, credentials=credentials)]:
         if not voice or voice not in [d['value'] for d in self.get_tts_model_voices(model=model, credentials=credentials)]:
             voice = self._get_model_default_voice(model, credentials)
             voice = self._get_model_default_voice(model, credentials)
-        if streaming:
-            return Response(stream_with_context(self._tts_invoke_streaming(model=model,
-                                                                           credentials=credentials,
-                                                                           content_text=content_text,
-                                                                           tenant_id=tenant_id,
-                                                                           voice=voice)),
-                            status=200, mimetype=f'audio/{audio_type}')
-        else:
-            return self._tts_invoke(model=model, credentials=credentials, content_text=content_text, voice=voice)
+        # if streaming:
+        return self._tts_invoke_streaming(model=model,
+                                          credentials=credentials,
+                                          content_text=content_text,
+                                          voice=voice)
 
 
     def validate_credentials(self, model: str, credentials: dict, user: Optional[str] = None) -> None:
     def validate_credentials(self, model: str, credentials: dict, user: Optional[str] = None) -> None:
         """
         """
@@ -79,7 +73,7 @@ class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel):
         word_limit = self._get_model_word_limit(model, credentials)
         word_limit = self._get_model_word_limit(model, credentials)
         max_workers = self._get_model_workers_limit(model, credentials)
         max_workers = self._get_model_workers_limit(model, credentials)
         try:
         try:
-            sentences = list(self._split_text_into_sentences(text=content_text, limit=word_limit))
+            sentences = list(self._split_text_into_sentences(org_text=content_text, max_length=word_limit))
             audio_bytes_list = []
             audio_bytes_list = []
 
 
             # Create a thread pool and map the function to the list of sentences
             # Create a thread pool and map the function to the list of sentences
@@ -104,34 +98,40 @@ class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel):
         except Exception as ex:
         except Exception as ex:
             raise InvokeBadRequestError(str(ex))
             raise InvokeBadRequestError(str(ex))
 
 
-    # Todo: To improve the streaming function
-    def _tts_invoke_streaming(self, model: str, tenant_id: str, credentials: dict, content_text: str,
+
+    def _tts_invoke_streaming(self, model: str, credentials: dict, content_text: str,
                               voice: str) -> any:
                               voice: str) -> any:
         """
         """
         _tts_invoke_streaming text2speech model
         _tts_invoke_streaming text2speech model
 
 
         :param model: model name
         :param model: model name
-        :param tenant_id: user tenant id
         :param credentials: model credentials
         :param credentials: model credentials
         :param content_text: text content to be translated
         :param content_text: text content to be translated
         :param voice: model timbre
         :param voice: model timbre
         :return: text translated to audio file
         :return: text translated to audio file
         """
         """
-        # transform credentials to kwargs for model instance
-        credentials_kwargs = self._to_credential_kwargs(credentials)
-        if not voice or voice not in self.get_tts_model_voices(model=model, credentials=credentials):
-            voice = self._get_model_default_voice(model, credentials)
-        word_limit = self._get_model_word_limit(model, credentials)
-        audio_type = self._get_model_audio_type(model, credentials)
-        tts_file_id = self._get_file_name(content_text)
-        file_path = f'generate_files/audio/{tenant_id}/{tts_file_id}.{audio_type}'
         try:
         try:
+            # doc: https://platform.openai.com/docs/guides/text-to-speech
+            credentials_kwargs = self._to_credential_kwargs(credentials)
             client = OpenAI(**credentials_kwargs)
             client = OpenAI(**credentials_kwargs)
-            sentences = list(self._split_text_into_sentences(text=content_text, limit=word_limit))
-            for sentence in sentences:
-                response = client.audio.speech.create(model=model, voice=voice, input=sentence.strip())
-                # response.stream_to_file(file_path)
-                storage.save(file_path, response.read())
+            if not voice or voice not in self.get_tts_model_voices(model=model, credentials=credentials):
+                voice = self._get_model_default_voice(model, credentials)
+            word_limit = self._get_model_word_limit(model, credentials)
+            if len(content_text) > word_limit:
+                sentences = self._split_text_into_sentences(content_text, max_length=word_limit)
+                executor = concurrent.futures.ThreadPoolExecutor(max_workers=min(3, len(sentences)))
+                futures = [executor.submit(client.audio.speech.with_streaming_response.create, model=model,
+                                           response_format="mp3",
+                                           input=sentences[i], voice=voice) for i in range(len(sentences))]
+                for index, future in enumerate(futures):
+                    yield from future.result().__enter__().iter_bytes(1024)
+
+            else:
+                response = client.audio.speech.with_streaming_response.create(model=model, voice=voice,
+                                                                              response_format="mp3",
+                                                                              input=content_text.strip())
+
+                yield from response.__enter__().iter_bytes(1024)
         except Exception as ex:
         except Exception as ex:
             raise InvokeBadRequestError(str(ex))
             raise InvokeBadRequestError(str(ex))
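
Both OpenAI-style providers use the SDK's streaming response helper and call __enter__() directly so the bytes can keep flowing after the method returns. A hedged sketch of consuming a single request with an explicit context manager instead (model name, voice and key are placeholders; with_streaming_response and iter_bytes are the same calls already used in the diff above):

    from openai import OpenAI

    def stream_speech(api_key: str, text: str, voice: str = "alloy"):
        """Yield mp3 byte chunks for `text` from a single speech request."""
        client = OpenAI(api_key=api_key)
        with client.audio.speech.with_streaming_response.create(
            model="tts-1",               # placeholder model name
            voice=voice,
            response_format="mp3",
            input=text.strip(),
        ) as response:
            yield from response.iter_bytes(1024)

    # for chunk in stream_speech("sk-...", "Hello, streaming TTS"): forward chunk to the client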
 
 

+ 1 - 1
api/core/model_runtime/model_providers/tongyi/tts/tts-1.yaml

@@ -129,7 +129,7 @@ model_properties:
     - mode: "sambert-waan-v1"
     - mode: "sambert-waan-v1"
       name: "Waan(泰语女声)"
       name: "Waan(泰语女声)"
       language: [ "th-TH" ]
       language: [ "th-TH" ]
-  word_limit: 120
+  word_limit: 7000
   audio_type: 'mp3'
   audio_type: 'mp3'
   max_workers: 5
   max_workers: 5
 pricing:
 pricing:

+ 67 - 30
api/core/model_runtime/model_providers/tongyi/tts/tts.py

@@ -1,17 +1,21 @@
 import concurrent.futures
 import concurrent.futures
+import threading
 from functools import reduce
 from functools import reduce
 from io import BytesIO
 from io import BytesIO
+from queue import Queue
 from typing import Optional
 from typing import Optional
 
 
 import dashscope
 import dashscope
-from flask import Response, stream_with_context
+from dashscope import SpeechSynthesizer
+from dashscope.api_entities.dashscope_response import SpeechSynthesisResponse
+from dashscope.audio.tts import ResultCallback, SpeechSynthesisResult
+from flask import Response
 from pydub import AudioSegment
 from pydub import AudioSegment
 
 
 from core.model_runtime.errors.invoke import InvokeBadRequestError
 from core.model_runtime.errors.invoke import InvokeBadRequestError
 from core.model_runtime.errors.validate import CredentialsValidateFailedError
 from core.model_runtime.errors.validate import CredentialsValidateFailedError
 from core.model_runtime.model_providers.__base.tts_model import TTSModel
 from core.model_runtime.model_providers.__base.tts_model import TTSModel
 from core.model_runtime.model_providers.tongyi._common import _CommonTongyi
 from core.model_runtime.model_providers.tongyi._common import _CommonTongyi
-from extensions.ext_storage import storage
 
 
 
 
 class TongyiText2SpeechModel(_CommonTongyi, TTSModel):
 class TongyiText2SpeechModel(_CommonTongyi, TTSModel):
@@ -19,7 +23,7 @@ class TongyiText2SpeechModel(_CommonTongyi, TTSModel):
     Model class for Tongyi Speech to text model.
     Model class for Tongyi Speech to text model.
     """
     """
 
 
-    def _invoke(self, model: str, tenant_id: str, credentials: dict, content_text: str, voice: str, streaming: bool,
+    def _invoke(self, model: str, tenant_id: str, credentials: dict, content_text: str, voice: str,
                 user: Optional[str] = None) -> any:
                 user: Optional[str] = None) -> any:
         """
         """
         _invoke text2speech model
         _invoke text2speech model
@@ -29,22 +33,17 @@ class TongyiText2SpeechModel(_CommonTongyi, TTSModel):
         :param credentials: model credentials
         :param credentials: model credentials
         :param voice: model timbre
         :param voice: model timbre
         :param content_text: text content to be translated
         :param content_text: text content to be translated
-        :param streaming: output is streaming
         :param user: unique user id
         :param user: unique user id
         :return: text translated to audio file
         :return: text translated to audio file
         """
         """
-        audio_type = self._get_model_audio_type(model, credentials)
-        if not voice or voice not in [d['value'] for d in self.get_tts_model_voices(model=model, credentials=credentials)]:
+        if not voice or voice not in [d['value'] for d in
+                                      self.get_tts_model_voices(model=model, credentials=credentials)]:
             voice = self._get_model_default_voice(model, credentials)
             voice = self._get_model_default_voice(model, credentials)
-        if streaming:
-            return Response(stream_with_context(self._tts_invoke_streaming(model=model,
-                                                                           credentials=credentials,
-                                                                           content_text=content_text,
-                                                                           voice=voice,
-                                                                           tenant_id=tenant_id)),
-                            status=200, mimetype=f'audio/{audio_type}')
-        else:
-            return self._tts_invoke(model=model, credentials=credentials, content_text=content_text, voice=voice)
+
+        return self._tts_invoke_streaming(model=model,
+                                          credentials=credentials,
+                                          content_text=content_text,
+                                          voice=voice)
 
 
     def validate_credentials(self, model: str, credentials: dict, user: Optional[str] = None) -> None:
     def validate_credentials(self, model: str, credentials: dict, user: Optional[str] = None) -> None:
         """
         """
@@ -79,7 +78,7 @@ class TongyiText2SpeechModel(_CommonTongyi, TTSModel):
         word_limit = self._get_model_word_limit(model, credentials)
         word_limit = self._get_model_word_limit(model, credentials)
         max_workers = self._get_model_workers_limit(model, credentials)
         max_workers = self._get_model_workers_limit(model, credentials)
         try:
         try:
-            sentences = list(self._split_text_into_sentences(text=content_text, limit=word_limit))
+            sentences = list(self._split_text_into_sentences(org_text=content_text, max_length=word_limit))
             audio_bytes_list = []
             audio_bytes_list = []
 
 
             # Create a thread pool and map the function to the list of sentences
             # Create a thread pool and map the function to the list of sentences
@@ -105,14 +104,12 @@ class TongyiText2SpeechModel(_CommonTongyi, TTSModel):
         except Exception as ex:
         except Exception as ex:
             raise InvokeBadRequestError(str(ex))
             raise InvokeBadRequestError(str(ex))
 
 
-    # Todo: To improve the streaming function
-    def _tts_invoke_streaming(self, model: str, tenant_id: str, credentials: dict, content_text: str,
+    def _tts_invoke_streaming(self, model: str, credentials: dict, content_text: str,
                               voice: str) -> any:
                               voice: str) -> any:
         """
         """
         _tts_invoke_streaming text2speech model
         _tts_invoke_streaming text2speech model
 
 
         :param model: model name
         :param model: model name
-        :param tenant_id: user tenant id
         :param credentials: model credentials
         :param credentials: model credentials
         :param voice: model timbre
         :param voice: model timbre
         :param content_text: text content to be translated
         :param content_text: text content to be translated
@@ -120,18 +117,32 @@ class TongyiText2SpeechModel(_CommonTongyi, TTSModel):
         """
         """
         word_limit = self._get_model_word_limit(model, credentials)
         word_limit = self._get_model_word_limit(model, credentials)
         audio_type = self._get_model_audio_type(model, credentials)
         audio_type = self._get_model_audio_type(model, credentials)
-        tts_file_id = self._get_file_name(content_text)
-        file_path = f'generate_files/audio/{tenant_id}/{tts_file_id}.{audio_type}'
         try:
         try:
-            sentences = list(self._split_text_into_sentences(text=content_text, limit=word_limit))
-            for sentence in sentences:
-                response = dashscope.audio.tts.SpeechSynthesizer.call(model=voice, sample_rate=48000,
-                                                                      api_key=credentials.get('dashscope_api_key'),
-                                                                      text=sentence.strip(),
-                                                                      format=audio_type, word_timestamp_enabled=True,
-                                                                      phoneme_timestamp_enabled=True)
-                if isinstance(response.get_audio_data(), bytes):
-                    storage.save(file_path, response.get_audio_data())
+            audio_queue: Queue = Queue()
+            callback = Callback(queue=audio_queue)
+
+            def invoke_remote(content, v, api_key, cb, at, wl):
+                if len(content) < word_limit:
+                    sentences = [content]
+                else:
+                    sentences = list(self._split_text_into_sentences(org_text=content, max_length=wl))
+                for sentence in sentences:
+                    SpeechSynthesizer.call(model=v, sample_rate=16000,
+                                           api_key=api_key,
+                                           text=sentence.strip(),
+                                           callback=cb,
+                                           format=at, word_timestamp_enabled=True,
+                                           phoneme_timestamp_enabled=True)
+
+            threading.Thread(target=invoke_remote, args=(
+                content_text, voice, credentials.get('dashscope_api_key'), callback, audio_type, word_limit)).start()
+
+            while True:
+                audio = audio_queue.get()
+                if audio is None:
+                    break
+                yield audio
+
         except Exception as ex:
         except Exception as ex:
             raise InvokeBadRequestError(str(ex))
             raise InvokeBadRequestError(str(ex))
 
 
@@ -152,3 +163,29 @@ class TongyiText2SpeechModel(_CommonTongyi, TTSModel):
                                                               format=audio_type)
                                                               format=audio_type)
         if isinstance(response.get_audio_data(), bytes):
         if isinstance(response.get_audio_data(), bytes):
             return response.get_audio_data()
             return response.get_audio_data()
+
+
+class Callback(ResultCallback):
+
+    def __init__(self, queue: Queue):
+        self._queue = queue
+
+    def on_open(self):
+        pass
+
+    def on_complete(self):
+        self._queue.put(None)
+        self._queue.task_done()
+
+    def on_error(self, response: SpeechSynthesisResponse):
+        self._queue.put(None)
+        self._queue.task_done()
+
+    def on_close(self):
+        self._queue.put(None)
+        self._queue.task_done()
+
+    def on_event(self, result: SpeechSynthesisResult):
+        ad = result.get_audio_frame()
+        if ad:
+            self._queue.put(ad)
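
The Tongyi provider bridges DashScope's callback interface to a Python generator: a worker thread kicks off synthesis, the callback pushes each audio frame into a Queue, and complete/error/close all push a None sentinel that ends the loop. A provider-agnostic sketch of the same bridge (synthesize_with_callbacks is a stand-in, not the DashScope SDK):

    import threading
    from queue import Queue

    def synthesize_with_callbacks(text: str, on_frame, on_done):
        # Stand-in for an SDK that delivers audio frames through callbacks.
        for word in text.split():
            on_frame(word.encode("utf-8"))
        on_done()

    def stream_audio(text: str):
        """Yield audio frames produced on a background thread, ending on a None sentinel."""
        frames: Queue = Queue()
        worker = threading.Thread(
            target=synthesize_with_callbacks,
            args=(text, frames.put, lambda: frames.put(None)),
        )
        worker.start()
        while True:
            frame = frames.get()
            if frame is None:  # completion, error and close all push the sentinel
                break
            yield frame

    print(list(stream_audio("streaming tts demo")))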

+ 1 - 1
api/pyproject.toml

@@ -49,7 +49,7 @@ ignore = [
     "B006", # mutable-argument-default
     "B006", # mutable-argument-default
     "B007", # unused-loop-control-variable
     "B007", # unused-loop-control-variable
     "B026", # star-arg-unpacking-after-keyword-arg
     "B026", # star-arg-unpacking-after-keyword-arg
-    "B901", # return-in-generator
+#    "B901", # return-in-generator
     "B904", # raise-without-from-inside-except
     "B904", # raise-without-from-inside-except
     "B905", # zip-without-explicit-strict
     "B905", # zip-without-explicit-strict
 ]
 ]

+ 2 - 0
api/services/app_service.py

@@ -123,6 +123,8 @@ class AppService:
         app.icon = args['icon']
         app.icon = args['icon']
         app.icon_background = args['icon_background']
         app.icon_background = args['icon_background']
         app.tenant_id = tenant_id
         app.tenant_id = tenant_id
+        app.api_rph = args.get('api_rph', 0)
+        app.api_rpm = args.get('api_rpm', 0)
 
 
         db.session.add(app)
         db.session.add(app)
         db.session.flush()
         db.session.flush()

+ 66 - 40
api/services/audio_service.py

@@ -1,11 +1,12 @@
 import io
 import io
+import logging
 from typing import Optional
 from typing import Optional
 
 
 from werkzeug.datastructures import FileStorage
 from werkzeug.datastructures import FileStorage
 
 
 from core.model_manager import ModelManager
 from core.model_manager import ModelManager
 from core.model_runtime.entities.model_entities import ModelType
 from core.model_runtime.entities.model_entities import ModelType
-from models.model import App, AppMode, AppModelConfig
+from models.model import App, AppMode, AppModelConfig, Message
 from services.errors.audio import (
 from services.errors.audio import (
     AudioTooLargeServiceError,
     AudioTooLargeServiceError,
     NoAudioUploadedServiceError,
     NoAudioUploadedServiceError,
@@ -18,6 +19,8 @@ FILE_SIZE = 30
 FILE_SIZE_LIMIT = FILE_SIZE * 1024 * 1024
 FILE_SIZE_LIMIT = FILE_SIZE * 1024 * 1024
 ALLOWED_EXTENSIONS = ['mp3', 'mp4', 'mpeg', 'mpga', 'm4a', 'wav', 'webm', 'amr']
 ALLOWED_EXTENSIONS = ['mp3', 'mp4', 'mpeg', 'mpga', 'm4a', 'wav', 'webm', 'amr']
 
 
+logger = logging.getLogger(__name__)
+
 
 
 class AudioService:
 class AudioService:
     @classmethod
     @classmethod
@@ -64,51 +67,74 @@ class AudioService:
         return {"text": model_instance.invoke_speech2text(file=buffer, user=end_user)}
         return {"text": model_instance.invoke_speech2text(file=buffer, user=end_user)}
 
 
     @classmethod
     @classmethod
-    def transcript_tts(cls, app_model: App, text: str, streaming: bool,
-                       voice: Optional[str] = None, end_user: Optional[str] = None):
-        if app_model.mode in [AppMode.ADVANCED_CHAT.value, AppMode.WORKFLOW.value]:
-            workflow = app_model.workflow
-            if workflow is None:
-                raise ValueError("TTS is not enabled")
+    def transcript_tts(cls, app_model: App, text: Optional[str] = None,
+                       voice: Optional[str] = None, end_user: Optional[str] = None, message_id: Optional[str] = None):
+        from collections.abc import Generator
 
 
-            features_dict = workflow.features_dict
-            if 'text_to_speech' not in features_dict or not features_dict['text_to_speech'].get('enabled'):
-                raise ValueError("TTS is not enabled")
+        from flask import Response, stream_with_context
 
 
-            voice = features_dict['text_to_speech'].get('voice') if voice is None else voice
-        else:
-            text_to_speech_dict = app_model.app_model_config.text_to_speech_dict
-
-            if not text_to_speech_dict.get('enabled'):
-                raise ValueError("TTS is not enabled")
+        from app import app
+        from extensions.ext_database import db
 
 
-            voice = text_to_speech_dict.get('voice') if voice is None else voice
+        def invoke_tts(text_content: str, app_model, voice: Optional[str] = None):
+            with app.app_context():
+                if app_model.mode in [AppMode.ADVANCED_CHAT.value, AppMode.WORKFLOW.value]:
+                    workflow = app_model.workflow
+                    if workflow is None:
+                        raise ValueError("TTS is not enabled")
 
 
-        model_manager = ModelManager()
-        model_instance = model_manager.get_default_model_instance(
-            tenant_id=app_model.tenant_id,
-            model_type=ModelType.TTS
-        )
-        if model_instance is None:
-            raise ProviderNotSupportTextToSpeechServiceError()
+                    features_dict = workflow.features_dict
+                    if 'text_to_speech' not in features_dict or not features_dict['text_to_speech'].get('enabled'):
+                        raise ValueError("TTS is not enabled")
 
 
-        try:
-            if not voice:
-                voices = model_instance.get_tts_voices()
-                if voices:
-                    voice = voices[0].get('value')
+                    voice = features_dict['text_to_speech'].get('voice') if voice is None else voice
                 else:
                 else:
-                    raise ValueError("Sorry, no voice available.")
-
-            return model_instance.invoke_tts(
-                content_text=text.strip(),
-                user=end_user,
-                streaming=streaming,
-                tenant_id=app_model.tenant_id,
-                voice=voice
-            )
-        except Exception as e:
-            raise e
+                    text_to_speech_dict = app_model.app_model_config.text_to_speech_dict
+
+                    if not text_to_speech_dict.get('enabled'):
+                        raise ValueError("TTS is not enabled")
+
+                    voice = text_to_speech_dict.get('voice') if voice is None else voice
+
+                model_manager = ModelManager()
+                model_instance = model_manager.get_default_model_instance(
+                    tenant_id=app_model.tenant_id,
+                    model_type=ModelType.TTS
+                )
+                try:
+                    if not voice:
+                        voices = model_instance.get_tts_voices()
+                        if voices:
+                            voice = voices[0].get('value')
+                        else:
+                            raise ValueError("Sorry, no voice available.")
+
+                    return model_instance.invoke_tts(
+                        content_text=text_content.strip(),
+                        user=end_user,
+                        tenant_id=app_model.tenant_id,
+                        voice=voice
+                    )
+                except Exception as e:
+                    raise e
+
+        if message_id:
+            message = db.session.query(Message).filter(
+                Message.id == message_id
+            ).first()
+            if message.answer == '' and message.status == 'normal':
+                return None
+
+            else:
+                response = invoke_tts(message.answer, app_model=app_model, voice=voice)
+                if isinstance(response, Generator):
+                    return Response(stream_with_context(response), content_type='audio/mpeg')
+                return response
+        else:
+            response = invoke_tts(text, app_model, voice)
+            if isinstance(response, Generator):
+                return Response(stream_with_context(response), content_type='audio/mpeg')
+            return response
 
 
     @classmethod
     @classmethod
     def transcript_tts_voices(cls, tenant_id: str, language: str):
     def transcript_tts_voices(cls, tenant_id: str, language: str):
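
On the service side, transcript_tts now checks whether the provider handed back a generator and, if so, streams it to the client instead of buffering a whole file. A minimal Flask sketch of that wrapping step (generate_audio is a placeholder for model_instance.invoke_tts; the route path is illustrative):

    from collections.abc import Generator

    from flask import Flask, Response, stream_with_context

    app = Flask(__name__)

    def generate_audio(text: str):
        # Placeholder for model_instance.invoke_tts(...): yields encoded audio chunks.
        for piece in text.split():
            yield piece.encode("utf-8")

    @app.post("/demo/text-to-audio")
    def text_to_audio():
        result = generate_audio("hello streaming world")
        if isinstance(result, Generator):
            # Stream chunks as they are produced rather than waiting for the whole file.
            return Response(stream_with_context(result), content_type="audio/mpeg")
        return result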

+ 50 - 14
web/app/components/app/configuration/config-voice/param-config-content.tsx

@@ -11,11 +11,13 @@ import { usePathname } from 'next/navigation'
 import { useTranslation } from 'react-i18next'
 import { useTranslation } from 'react-i18next'
 import { Listbox, Transition } from '@headlessui/react'
 import { Listbox, Transition } from '@headlessui/react'
 import { CheckIcon, ChevronDownIcon } from '@heroicons/react/20/solid'
 import { CheckIcon, ChevronDownIcon } from '@heroicons/react/20/solid'
+import RadioGroup from '@/app/components/app/configuration/config-vision/radio-group'
 import type { Item } from '@/app/components/base/select'
 import type { Item } from '@/app/components/base/select'
 import ConfigContext from '@/context/debug-configuration'
 import ConfigContext from '@/context/debug-configuration'
 import { fetchAppVoices } from '@/service/apps'
 import { fetchAppVoices } from '@/service/apps'
 import Tooltip from '@/app/components/base/tooltip'
 import Tooltip from '@/app/components/base/tooltip'
 import { languages } from '@/i18n/language'
 import { languages } from '@/i18n/language'
+import { TtsAutoPlay } from '@/types/app'
 const VoiceParamConfig: FC = () => {
 const VoiceParamConfig: FC = () => {
   const { t } = useTranslation()
   const { t } = useTranslation()
   const pathname = usePathname()
   const pathname = usePathname()
@@ -27,12 +29,16 @@ const VoiceParamConfig: FC = () => {
     setTextToSpeechConfig,
     setTextToSpeechConfig,
   } = useContext(ConfigContext)
   } = useContext(ConfigContext)
 
 
-  const languageItem = languages.find(item => item.value === textToSpeechConfig.language)
+  let languageItem = languages.find(item => item.value === textToSpeechConfig.language)
   const localLanguagePlaceholder = languageItem?.name || t('common.placeholder.select')
   const localLanguagePlaceholder = languageItem?.name || t('common.placeholder.select')
-
+  if (languages && !languageItem)
+    languageItem = languages[0]
   const language = languageItem?.value
   const language = languageItem?.value
   const voiceItems = useSWR({ appId, language }, fetchAppVoices).data
   const voiceItems = useSWR({ appId, language }, fetchAppVoices).data
-  const voiceItem = voiceItems?.find(item => item.value === textToSpeechConfig.voice)
+  let voiceItem = voiceItems?.find(item => item.value === textToSpeechConfig.voice)
+  if (voiceItems && !voiceItem)
+    voiceItem = voiceItems[0]
+
   const localVoicePlaceholder = voiceItem?.name || t('common.placeholder.select')
   const localVoicePlaceholder = voiceItem?.name || t('common.placeholder.select')
 
 
   return (
   return (
@@ -42,8 +48,9 @@ const VoiceParamConfig: FC = () => {
         <div className='pt-3 space-y-6'>
         <div className='pt-3 space-y-6'>
           <div>
           <div>
             <div className='mb-2 flex items-center  space-x-1'>
             <div className='mb-2 flex items-center  space-x-1'>
-              <div className='leading-[18px] text-[13px] font-semibold text-gray-800'>{t('appDebug.voice.voiceSettings.language')}</div>
-              <Tooltip htmlContent={<div className='w-[180px]' >
+              <div
+                className='leading-[18px] text-[13px] font-semibold text-gray-800'>{t('appDebug.voice.voiceSettings.language')}</div>
+              <Tooltip htmlContent={<div className='w-[180px]'>
                 {t('appDebug.voice.voiceSettings.resolutionTooltip').split('\n').map(item => (
                 {t('appDebug.voice.voiceSettings.resolutionTooltip').split('\n').map(item => (
                   <div key={item}>{item}</div>
                   <div key={item}>{item}</div>
                 ))}
                 ))}
@@ -61,7 +68,8 @@ const VoiceParamConfig: FC = () => {
               }}
               }}
             >
             >
               <div className={'relative h-9'}>
               <div className={'relative h-9'}>
-                <Listbox.Button className={'w-full h-full rounded-lg border-0 bg-gray-100 py-1.5 pl-3 pr-10 sm:text-sm sm:leading-6 focus-visible:outline-none focus-visible:bg-gray-200 group-hover:bg-gray-200 cursor-pointer'}>
+                <Listbox.Button
+                  className={'w-full h-full rounded-lg border-0 bg-gray-100 py-1.5 pl-3 pr-10 sm:text-sm sm:leading-6 focus-visible:outline-none focus-visible:bg-gray-200 group-hover:bg-gray-200 cursor-pointer'}>
                   <span className={classNames('block truncate text-left', !languageItem?.name && 'text-gray-400')}>
                   <span className={classNames('block truncate text-left', !languageItem?.name && 'text-gray-400')}>
                     {languageItem?.name ? t(`common.voice.language.${languageItem?.value.replace('-', '')}`) : localLanguagePlaceholder}
                     {languageItem?.name ? t(`common.voice.language.${languageItem?.value.replace('-', '')}`) : localLanguagePlaceholder}
                   </span>
                   </span>
@@ -79,7 +87,8 @@ const VoiceParamConfig: FC = () => {
                   leaveTo="opacity-0"
                   leaveTo="opacity-0"
                 >
                 >
 
 
-                  <Listbox.Options className="absolute z-10 mt-1 px-1 max-h-60 w-full overflow-auto rounded-md bg-white py-1 text-base shadow-lg border-gray-200 border-[0.5px] focus:outline-none sm:text-sm">
+                  <Listbox.Options
+                    className="absolute z-10 mt-1 px-1 max-h-60 w-full overflow-auto rounded-md bg-white py-1 text-base shadow-lg border-gray-200 border-[0.5px] focus:outline-none sm:text-sm">
                     {languages.map((item: Item) => (
                     {languages.map((item: Item) => (
                       <Listbox.Option
                       <Listbox.Option
                         key={item.value}
                         key={item.value}
@@ -100,7 +109,7 @@ const VoiceParamConfig: FC = () => {
                                   'absolute inset-y-0 right-0 flex items-center pr-4 text-gray-700',
                                   'absolute inset-y-0 right-0 flex items-center pr-4 text-gray-700',
                                 )}
                                 )}
                               >
                               >
-                                <CheckIcon className="h-5 w-5" aria-hidden="true" />
+                                <CheckIcon className="h-5 w-5" aria-hidden="true"/>
                               </span>
                               </span>
                             )}
                             )}
                           </>
                           </>
@@ -112,9 +121,9 @@ const VoiceParamConfig: FC = () => {
               </div>
               </div>
             </Listbox>
             </Listbox>
           </div>
           </div>
-
           <div>
           <div>
-            <div className='mb-2 leading-[18px] text-[13px] font-semibold text-gray-800'>{t('appDebug.voice.voiceSettings.voice')}</div>
+            <div
+              className='mb-2 leading-[18px] text-[13px] font-semibold text-gray-800'>{t('appDebug.voice.voiceSettings.voice')}</div>
             <Listbox
             <Listbox
               value={voiceItem}
               value={voiceItem}
               disabled={!languageItem}
               disabled={!languageItem}
@@ -126,8 +135,10 @@ const VoiceParamConfig: FC = () => {
               }}
               }}
             >
             >
               <div className={'relative h-9'}>
               <div className={'relative h-9'}>
-                <Listbox.Button className={'w-full h-full rounded-lg border-0 bg-gray-100 py-1.5 pl-3 pr-10 sm:text-sm sm:leading-6 focus-visible:outline-none focus-visible:bg-gray-200 group-hover:bg-gray-200 cursor-pointer'}>
-                  <span className={classNames('block truncate text-left', !voiceItem?.name && 'text-gray-400')}>{voiceItem?.name ?? localVoicePlaceholder}</span>
+                <Listbox.Button
+                  className={'w-full h-full rounded-lg border-0 bg-gray-100 py-1.5 pl-3 pr-10 sm:text-sm sm:leading-6 focus-visible:outline-none focus-visible:bg-gray-200 group-hover:bg-gray-200 cursor-pointer'}>
+                  <span
+                    className={classNames('block truncate text-left', !voiceItem?.name && 'text-gray-400')}>{voiceItem?.name ?? localVoicePlaceholder}</span>
                   <span className="pointer-events-none absolute inset-y-0 right-0 flex items-center pr-2">
                   <span className="pointer-events-none absolute inset-y-0 right-0 flex items-center pr-2">
                     <ChevronDownIcon
                     <ChevronDownIcon
                       className="h-5 w-5 text-gray-400"
                       className="h-5 w-5 text-gray-400"
@@ -142,7 +153,8 @@ const VoiceParamConfig: FC = () => {
                   leaveTo="opacity-0"
                   leaveTo="opacity-0"
                 >
                 >
 
 
-                  <Listbox.Options className="absolute z-10 mt-1 px-1 max-h-60 w-full overflow-auto rounded-md bg-white py-1 text-base shadow-lg border-gray-200 border-[0.5px] focus:outline-none sm:text-sm">
+                  <Listbox.Options
+                    className="absolute z-10 mt-1 px-1 max-h-60 w-full overflow-auto rounded-md bg-white py-1 text-base shadow-lg border-gray-200 border-[0.5px] focus:outline-none sm:text-sm">
                     {voiceItems?.map((item: Item) => (
                     {voiceItems?.map((item: Item) => (
                       <Listbox.Option
                       <Listbox.Option
                         key={item.value}
                         key={item.value}
@@ -162,7 +174,7 @@ const VoiceParamConfig: FC = () => {
                                   'absolute inset-y-0 right-0 flex items-center pr-4 text-gray-700',
                                   'absolute inset-y-0 right-0 flex items-center pr-4 text-gray-700',
                                 )}
                                 )}
                               >
                               >
-                                <CheckIcon className="h-5 w-5" aria-hidden="true" />
+                                <CheckIcon className="h-5 w-5" aria-hidden="true"/>
                               </span>
                               </span>
                             )}
                             )}
                           </>
                           </>
@@ -174,6 +186,30 @@ const VoiceParamConfig: FC = () => {
               </div>
               </div>
             </Listbox>
             </Listbox>
           </div>
           </div>
+          <div>
+            <div
+              className='mb-2 leading-[18px] text-[13px] font-semibold text-gray-800'>{t('appDebug.voice.voiceSettings.autoPlay')}</div>
+            <RadioGroup
+              className='space-x-3'
+              options={[
+                {
+                  label: t('appDebug.voice.voiceSettings.autoPlayEnabled'),
+                  value: TtsAutoPlay.enabled,
+                },
+                {
+                  label: t('appDebug.voice.voiceSettings.autoPlayDisabled'),
+                  value: TtsAutoPlay.disabled,
+                },
+              ]}
+              value={textToSpeechConfig.autoPlay ? textToSpeechConfig.autoPlay : TtsAutoPlay.disabled}
+              onChange={(value: TtsAutoPlay) => {
+                setTextToSpeechConfig({
+                  ...textToSpeechConfig,
+                  autoPlay: value,
+                })
+              }}
+            />
+          </div>
         </div>
         </div>
       </div>
       </div>
     </div>
     </div>

+ 0 - 1
web/app/components/app/configuration/features/chat-group/text-to-speech/index.tsx

@@ -40,7 +40,6 @@ const TextToSpeech: FC = () => {
           { languageInfo?.example && (
           { languageInfo?.example && (
             <AudioBtn
             <AudioBtn
               value={languageInfo?.example}
               value={languageInfo?.example}
-              voice={voiceItem?.value}
               isAudition
               isAudition
               noCache
               noCache
             />
             />

+ 1 - 2
web/app/components/app/text-generate/item/index.tsx

@@ -428,8 +428,7 @@ const GenerationItem: FC<IGenerationItemProps> = ({
                   <>
                   <>
                     <div className='ml-2 mr-2 h-[14px] w-[1px] bg-gray-200'></div>
                     <div className='ml-2 mr-2 h-[14px] w-[1px] bg-gray-200'></div>
                     <AudioBtn
                     <AudioBtn
-                      value={content}
-                      noCache={false}
+                      id={messageId!}
                       className={'mr-1'}
                       className={'mr-1'}
                     />
                     />
                   </>
                   </>

+ 53 - 0
web/app/components/base/audio-btn/audio.player.manager.ts

@@ -0,0 +1,53 @@
+import AudioPlayer from '@/app/components/base/audio-btn/audio'
+declare global {
+  // eslint-disable-next-line @typescript-eslint/consistent-type-definitions
+  interface AudioPlayerManager {
+    instance: AudioPlayerManager
+  }
+
+}
+
+export class AudioPlayerManager {
+  private static instance: AudioPlayerManager
+  private audioPlayers: AudioPlayer | null = null
+  private msgId: string | undefined
+
+  private constructor() {
+  }
+
+  public static getInstance(): AudioPlayerManager {
+    if (!AudioPlayerManager.instance) {
+      AudioPlayerManager.instance = new AudioPlayerManager()
+      this.instance = AudioPlayerManager.instance
+    }
+
+    return AudioPlayerManager.instance
+  }
+
+  public getAudioPlayer(url: string, isPublic: boolean, id: string | undefined, msgContent: string | null | undefined, voice: string | undefined, callback: ((event: string) => {}) | null): AudioPlayer {
+    if (this.msgId && this.msgId === id && this.audioPlayers) {
+      this.audioPlayers.setCallback(callback)
+      return this.audioPlayers
+    }
+    else {
+      if (this.audioPlayers) {
+        try {
+          this.audioPlayers.pauseAudio()
+          this.audioPlayers.cacheBuffers = []
+          this.audioPlayers.sourceBuffer?.abort()
+        }
+        catch (e) {
+        }
+      }
+
+      this.msgId = id
+      this.audioPlayers = new AudioPlayer(url, isPublic, id, msgContent, callback)
+      return this.audioPlayers
+    }
+  }
+
+  public resetMsgId(msgId: string) {
+    this.msgId = msgId
+    this.audioPlayers?.resetMsgId(msgId)
+  }
+}

+ 263 - 0
web/app/components/base/audio-btn/audio.ts

@@ -0,0 +1,263 @@
+import Toast from '@/app/components/base/toast'
+import { textToAudioStream } from '@/service/share'
+
+declare global {
+  // eslint-disable-next-line @typescript-eslint/consistent-type-definitions
+  interface Window {
+    ManagedMediaSource: any
+  }
+}
+
+export default class AudioPlayer {
+  mediaSource: MediaSource | null
+  audio: HTMLAudioElement
+  audioContext: AudioContext
+  sourceBuffer?: SourceBuffer
+  cacheBuffers: ArrayBuffer[] = []
+  pauseTimer: number | null = null
+  msgId: string | undefined
+  msgContent: string | null | undefined = null
+  voice: string | undefined = undefined
+  isLoadData = false
+  url: string
+  isPublic: boolean
+  callback: ((event: string) => {}) | null
+
+  constructor(streamUrl: string, isPublic: boolean, msgId: string | undefined, msgContent: string | null | undefined, callback: ((event: string) => {}) | null) {
+    this.audioContext = new AudioContext()
+    this.msgId = msgId
+    this.msgContent = msgContent
+    this.url = streamUrl
+    this.isPublic = isPublic
+    this.callback = callback
+
+    // Compatible with iphone ios17 ManagedMediaSource
+    const MediaSource = window.MediaSource || window.ManagedMediaSource
+    if (!MediaSource) {
+      Toast.notify({
+        message: 'Your browser does not support audio streaming, if you are using an iPhone, please update to iOS 17.1 or later.',
+        type: 'error',
+      })
+    }
+    this.mediaSource = MediaSource ? new MediaSource() : null
+    this.audio = new Audio()
+    this.setCallback(callback)
+    this.audio.src = this.mediaSource ? URL.createObjectURL(this.mediaSource) : ''
+    this.audio.autoplay = true
+
+    const source = this.audioContext.createMediaElementSource(this.audio)
+    source.connect(this.audioContext.destination)
+    this.listenMediaSource('audio/mpeg')
+  }
+
+  public resetMsgId(msgId: string) {
+    this.msgId = msgId
+  }
+
+  private listenMediaSource(contentType: string) {
+    this.mediaSource?.addEventListener('sourceopen', () => {
+      if (this.sourceBuffer)
+        return
+
+      this.sourceBuffer = this.mediaSource?.addSourceBuffer(contentType)
+    //   this.sourceBuffer?.addEventListener('update', () => {
+    //     if (this.cacheBuffers.length && !this.sourceBuffer?.updating) {
+    //       const cacheBuffer = this.cacheBuffers.shift()!
+    //       this.sourceBuffer?.appendBuffer(cacheBuffer)
+    //     }
+    //     // this.pauseAudio()
+    //   })
+    //
+    //   this.sourceBuffer?.addEventListener('updateend', () => {
+    //     if (this.cacheBuffers.length && !this.sourceBuffer?.updating) {
+    //       const cacheBuffer = this.cacheBuffers.shift()!
+    //       this.sourceBuffer?.appendBuffer(cacheBuffer)
+    //     }
+    //     // this.pauseAudio()
+    //   })
+    })
+  }
+
+  public setCallback(callback: ((event: string) => {}) | null) {
+    this.callback = callback
+    if (callback) {
+      this.audio.addEventListener('ended', () => {
+        callback('ended')
+      }, false)
+      this.audio.addEventListener('paused', () => {
+        callback('paused')
+      }, true)
+      this.audio.addEventListener('loaded', () => {
+        callback('loaded')
+      }, true)
+      this.audio.addEventListener('play', () => {
+        callback('play')
+      }, true)
+      this.audio.addEventListener('timeupdate', () => {
+        callback('timeupdate')
+      }, true)
+      this.audio.addEventListener('loadeddate', () => {
+        callback('loadeddate')
+      }, true)
+      this.audio.addEventListener('canplay', () => {
+        callback('canplay')
+      }, true)
+      this.audio.addEventListener('error', () => {
+        callback('error')
+      }, true)
+    }
+  }
+
+  private async loadAudio() {
+    try {
+      const audioResponse: any = await textToAudioStream(this.url, this.isPublic, { content_type: 'audio/mpeg' }, {
+        message_id: this.msgId,
+        streaming: true,
+        voice: this.voice,
+        text: this.msgContent,
+      })
+
+      if (audioResponse.status !== 200) {
+        this.isLoadData = false
+        if (this.callback)
+          this.callback('error')
+      }
+
+      const reader = audioResponse.body.getReader()
+      while (true) {
+        const { value, done } = await reader.read()
+
+        if (done) {
+          this.receiveAudioData(value)
+          break
+        }
+
+        this.receiveAudioData(value)
+      }
+    }
+    catch (error) {
+      this.isLoadData = false
+      this.callback && this.callback('error')
+    }
+  }
+
+  // play audio
+  public playAudio() {
+    if (this.isLoadData) {
+      if (this.audioContext.state === 'suspended') {
+        this.audioContext.resume().then((_) => {
+          this.audio.play()
+          this.callback && this.callback('play')
+        })
+      }
+      else if (this.audio.ended) {
+        this.audio.play()
+        this.callback && this.callback('play')
+      }
+      if (this.callback)
+        this.callback('play')
+    }
+    else {
+      this.isLoadData = true
+      this.loadAudio()
+    }
+  }
+
+  private theEndOfStream() {
+    const endTimer = setInterval(() => {
+      if (!this.sourceBuffer?.updating) {
+        this.mediaSource?.endOfStream()
+        clearInterval(endTimer)
+      }
+      console.log('finishStream  endOfStream endTimer')
+    }, 10)
+  }
+
+  private finishStream() {
+    const timer = setInterval(() => {
+      if (!this.cacheBuffers.length) {
+        this.theEndOfStream()
+        clearInterval(timer)
+      }
+
+      if (this.cacheBuffers.length && !this.sourceBuffer?.updating) {
+        const arrayBuffer = this.cacheBuffers.shift()!
+        this.sourceBuffer?.appendBuffer(arrayBuffer)
+      }
+      console.log('finishStream  timer')
+    }, 10)
+  }
+
+  public async playAudioWithAudio(audio: string, play = true) {
+    if (!audio || !audio.length) {
+      this.finishStream()
+      return
+    }
+
+    const audioContent = Buffer.from(audio, 'base64')
+    this.receiveAudioData(new Uint8Array(audioContent))
+    if (play) {
+      this.isLoadData = true
+      if (this.audio.paused) {
+        this.audioContext.resume().then((_) => {
+          this.audio.play()
+          this.callback && this.callback('play')
+        })
+      }
+      else if (this.audio.ended) {
+        this.audio.play()
+        this.callback && this.callback('play')
+      }
+      else if (this.audio.played) { /* empty */ }
+
+      else {
+        this.audio.play()
+        this.callback && this.callback('play')
+      }
+    }
+  }
+
+  public pauseAudio() {
+    this.callback && this.callback('paused')
+    this.audio.pause()
+    this.audioContext.suspend()
+  }
+
+  private cancer() {
+
+  }
+
+  private receiveAudioData(unit8Array: Uint8Array) {
+    if (!unit8Array) {
+      this.finishStream()
+      return
+    }
+    const audioData = this.byteArrayToArrayBuffer(unit8Array)
+    if (!audioData.byteLength) {
+      if (this.mediaSource?.readyState === 'open')
+        this.finishStream()
+      return
+    }
+
+    if (this.sourceBuffer?.updating) {
+      this.cacheBuffers.push(audioData)
+    }
+    else {
+      if (this.cacheBuffers.length && !this.sourceBuffer?.updating) {
+        this.cacheBuffers.push(audioData)
+        const cacheBuffer = this.cacheBuffers.shift()!
+        this.sourceBuffer?.appendBuffer(cacheBuffer)
+      }
+      else {
+        this.sourceBuffer?.appendBuffer(audioData)
+      }
+    }
+  }
+
+  private byteArrayToArrayBuffer(byteArray: Uint8Array): ArrayBuffer {
+    const arrayBuffer = new ArrayBuffer(byteArray.length)
+    const uint8Array = new Uint8Array(arrayBuffer)
+    uint8Array.set(byteArray)
+    return arrayBuffer
+  }
+}
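A short sketch (assumptions only, not part of the commit) of the two ways this player is fed: playAudio() pulls the audio stream itself, while playAudioWithAudio() pushes base64 MPEG chunks that arrive over SSE; an empty chunk finalizes the MediaSource stream.

  import { AudioPlayerManager } from '@/app/components/base/audio-btn/audio.player.manager'

  const player = AudioPlayerManager.getInstance()
    .getAudioPlayer('/apps/my-app/text-to-audio', false, 'message-123', 'Hello world', undefined, null)

  // Pull mode: the player fetches the stream and appends each chunk to its SourceBuffer.
  player.playAudio()

  // Push mode: chunks already received elsewhere (e.g. tts_message events) are appended;
  // `true` starts playback, `false` only appends.
  declare const base64Chunk: string // placeholder for a chunk received from the server
  player.playAudioWithAudio(base64Chunk, true)
  player.playAudioWithAudio('', false) // empty payload ends the stream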

+ 42 - 89
web/app/components/base/audio-btn/index.tsx

@@ -1,124 +1,78 @@
 'use client'
-import { useEffect, useRef, useState } from 'react'
+import { useRef, useState } from 'react'
 import { t } from 'i18next'
 import { useParams, usePathname } from 'next/navigation'
 import s from './style.module.css'
 import Tooltip from '@/app/components/base/tooltip'
 import { randomString } from '@/utils'
-import { textToAudio } from '@/service/share'
 import Loading from '@/app/components/base/loading'
+import { AudioPlayerManager } from '@/app/components/base/audio-btn/audio.player.manager'

 type AudioBtnProps = {
-  value: string
+  id?: string
   voice?: string
+  value?: string
   className?: string
   isAudition?: boolean
-  noCache: boolean
+  noCache?: boolean
 }

 type AudioState = 'initial' | 'loading' | 'playing' | 'paused' | 'ended'

 const AudioBtn = ({
-  value,
+  id,
   voice,
+  value,
   className,
   isAudition,
-  noCache,
 }: AudioBtnProps) => {
-  const audioRef = useRef<HTMLAudioElement | null>(null)
   const [audioState, setAudioState] = useState<AudioState>('initial')

   const selector = useRef(`play-tooltip-${randomString(4)}`)
   const params = useParams()
   const pathname = usePathname()
-  const removeCodeBlocks = (inputText: any) => {
-    const codeBlockRegex = /```[\s\S]*?```/g
-    if (inputText)
-      return inputText.replace(codeBlockRegex, '')
-    return ''
-  }
-
-  const loadAudio = async () => {
-    const formData = new FormData()
-    formData.append('text', removeCodeBlocks(value))
-    formData.append('voice', removeCodeBlocks(voice))
-
-    if (value !== '') {
-      setAudioState('loading')
-
-      let url = ''
-      let isPublic = false
-
-      if (params.token) {
-        url = '/text-to-audio'
-        isPublic = true
-      }
-      else if (params.appId) {
-        if (pathname.search('explore/installed') > -1)
-          url = `/installed-apps/${params.appId}/text-to-audio`
-        else
-          url = `/apps/${params.appId}/text-to-audio`
-      }
-
-      try {
-        const audioResponse = await textToAudio(url, isPublic, formData)
-        const blob_bytes = Buffer.from(audioResponse.data, 'latin1')
-        const blob = new Blob([blob_bytes], { type: 'audio/wav' })
-        const audioUrl = URL.createObjectURL(blob)
-        audioRef.current!.src = audioUrl
-      }
-      catch (error) {
-        setAudioState('initial')
-        console.error('Error playing audio:', error)
-      }
+  const audio_finished_call = (event: string): any => {
+    switch (event) {
+      case 'ended':
+        setAudioState('ended')
+        break
+      case 'paused':
+        setAudioState('ended')
+        break
+      case 'loaded':
+        setAudioState('loading')
+        break
+      case 'play':
+        setAudioState('playing')
+        break
+      case 'error':
+        setAudioState('ended')
+        break
     }
   }
+  let url = ''
+  let isPublic = false

+  if (params.token) {
+    url = '/text-to-audio'
+    isPublic = true
+  }
+  else if (params.appId) {
+    if (pathname.search('explore/installed') > -1)
+      url = `/installed-apps/${params.appId}/text-to-audio`
+    else
+      url = `/apps/${params.appId}/text-to-audio`
+  }
   const handleToggle = async () => {
-    if (audioState === 'initial' || noCache) {
-      await loadAudio()
+    if (audioState === 'playing' || audioState === 'loading') {
+      setAudioState('paused')
+      AudioPlayerManager.getInstance().getAudioPlayer(url, isPublic, id, value, voice, audio_finished_call).pauseAudio()
     }
-    else if (audioRef.current) {
-      if (audioState === 'playing') {
-        audioRef.current.pause()
-        setAudioState('paused')
-      }
-      else {
-        audioRef.current.play()
-        setAudioState('playing')
-      }
-    }
-  }
-
-  useEffect(() => {
-    const currentAudio = audioRef.current
-
-    const handleLoading = () => {
+    else {
       setAudioState('loading')
+      AudioPlayerManager.getInstance().getAudioPlayer(url, isPublic, id, value, voice, audio_finished_call).playAudio()
     }
-
-    const handlePlay = () => {
-      currentAudio?.play()
-      setAudioState('playing')
-    }
-
-    const handleEnded = () => {
-      setAudioState('ended')
-    }
-
-    currentAudio?.addEventListener('progress', handleLoading)
-    currentAudio?.addEventListener('canplaythrough', handlePlay)
-    currentAudio?.addEventListener('ended', handleEnded)
-
-    return () => {
-      currentAudio?.removeEventListener('progress', handleLoading)
-      currentAudio?.removeEventListener('canplaythrough', handlePlay)
-      currentAudio?.removeEventListener('ended', handleEnded)
-      URL.revokeObjectURL(currentAudio?.src || '')
-      currentAudio?.pause()
-      currentAudio?.setAttribute('src', '')
-    }
-  }, [])
+  }

   const tooltipContent = {
     initial: t('appApi.play'),
@@ -151,7 +105,6 @@ const AudioBtn = ({
             )}
         </button>
       </Tooltip>
-      <audio ref={audioRef} src='' className='hidden' />
     </div>
   )
 }
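A hedged sketch of the two ways the reworked button is used after this change (props mirror the call sites elsewhere in this commit; the surrounding variables are assumed to exist): chat answers replay by message id through the shared streaming player, while the voice-settings audition path still passes raw text plus a voice.

  // Replaying a chat answer: the id selects the per-message streaming player.
  <AudioBtn id={messageId} value={content} className='mr-1' />

  // Voice audition in the TTS settings panel: sample text plus an explicit voice.
  <AudioBtn value={languageInfo?.example} voice={voiceItem?.value} isAudition noCache />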

+ 10 - 7
web/app/components/base/chat/chat/answer/index.tsx

@@ -8,6 +8,7 @@ import type {
   ChatConfig,
   ChatItem,
 } from '../../types'
+import { useChatContext } from '../context'
 import Operation from './operation'
 import AgentContent from './agent-content'
 import BasicContent from './basic-content'
@@ -59,23 +60,25 @@ const Answer: FC<AnswerProps> = ({
   } = item
   const hasAgentThoughts = !!agent_thoughts?.length

-  const [containerWidth, setContainerWidth] = useState(0)
+  const [containerWidth] = useState(0)
   const [contentWidth, setContentWidth] = useState(0)
   const containerRef = useRef<HTMLDivElement>(null)
   const contentRef = useRef<HTMLDivElement>(null)

-  const getContainerWidth = () => {
-    if (containerRef.current)
-      setContainerWidth(containerRef.current?.clientWidth + 16)
-  }
+  const {
+    config: chatContextConfig,
+  } = useChatContext()
+
+  const voiceRef = useRef(chatContextConfig?.text_to_speech?.voice)
   const getContentWidth = () => {
     if (contentRef.current)
       setContentWidth(contentRef.current?.clientWidth)
   }

   useEffect(() => {
-    getContainerWidth()
-  }, [])
+    voiceRef.current = chatContextConfig?.text_to_speech?.voice
+  }
+  , [chatContextConfig?.text_to_speech?.voice])

   useEffect(() => {
     if (!responding)

+ 1 - 1
web/app/components/base/chat/chat/answer/operation.tsx

@@ -119,9 +119,9 @@ const Operation: FC<OperationProps> = ({
               <>
                 <div className='mx-1 w-[1px] h-[14px] bg-gray-200'/>
                 <AudioBtn
+                  id={id}
                   value={content}
                   noCache={false}
-                  voice={config?.text_to_speech?.voice}
                   className='hidden group-hover:block'
                 />
               </>

+ 27 - 1
web/app/components/base/chat/chat/hooks.ts

@@ -6,6 +6,8 @@ import {
 } from 'react'
 import { useTranslation } from 'react-i18next'
 import { produce, setAutoFreeze } from 'immer'
+import { useParams, usePathname } from 'next/navigation'
+import { v4 as uuidV4 } from 'uuid'
 import type {
   ChatConfig,
   ChatItem,
@@ -20,6 +22,7 @@ import { replaceStringWithValues } from '@/app/components/app/configuration/prom
 import type { Annotation } from '@/models/log'
 import { WorkflowRunningStatus } from '@/app/components/workflow/types'
 import useTimestamp from '@/hooks/use-timestamp'
+import { AudioPlayerManager } from '@/app/components/base/audio-btn/audio.player.manager'

 type GetAbortController = (abortController: AbortController) => void
 type SendCallback = {
@@ -91,7 +94,8 @@ export const useChat = (
   const conversationMessagesAbortControllerRef = useRef<AbortController | null>(null)
   const suggestedQuestionsAbortControllerRef = useRef<AbortController | null>(null)
   const checkPromptVariables = useCheckPromptVariables()
-
+  const params = useParams()
+  const pathname = usePathname()
   useEffect(() => {
     setAutoFreeze(false)
     return () => {
@@ -262,6 +266,19 @@ export const useChat = (
     let isAgentMode = false
     let hasSetResponseId = false

+    let ttsUrl = ''
+    let ttsIsPublic = false
+    if (params.token) {
+      ttsUrl = '/text-to-audio'
+      ttsIsPublic = true
+    }
+    else if (params.appId) {
+      if (pathname.search('explore/installed') > -1)
+        ttsUrl = `/installed-apps/${params.appId}/text-to-audio`
+      else
+        ttsUrl = `/apps/${params.appId}/text-to-audio`
+    }
+    const player = AudioPlayerManager.getInstance().getAudioPlayer(ttsUrl, ttsIsPublic, uuidV4(), 'none', 'none', (_: any): any => {})
     ssePost(
       url,
       {
@@ -530,6 +547,15 @@ export const useChat = (
             }
           }))
         },
+        onTTSChunk: (messageId: string, audio: string) => {
+          if (!audio || audio === '')
+            return
+          player.playAudioWithAudio(audio, true)
+          AudioPlayerManager.getInstance().resetMsgId(messageId)
+        },
+        onTTSEnd: (messageId: string, audio: string) => {
+          player.playAudioWithAudio(audio, false)
+        },
       })
     return true
   }, [
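For orientation, a hedged sketch of the payloads these two callbacks receive; the field names are taken from the handling added to web/service/base.ts below, the concrete values are assumptions.

  // Shape of the streamed TTS events as parsed in handleStream:
  type TtsMessageEvent = {
    event: 'tts_message' | 'tts_message_end'
    message_id: string
    audio: string        // base64-encoded audio chunk; empty on the end event
    audio_type?: string  // e.g. 'audio/mpeg'
  }
  // onTTSChunk -> player.playAudioWithAudio(audio, true)   (append and keep playing)
  // onTTSEnd   -> player.playAudioWithAudio(audio, false)  (final append, stream finishes)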

+ 51 - 15
web/app/components/base/features/feature-panel/text-to-speech/param-config-content.tsx

@@ -19,6 +19,8 @@ import type { Item } from '@/app/components/base/select'
 import { fetchAppVoices } from '@/service/apps'
 import Tooltip from '@/app/components/base/tooltip'
 import { languages } from '@/i18n/language'
+import RadioGroup from '@/app/components/app/configuration/config-vision/radio-group'
+import { TtsAutoPlay } from '@/types/app'

 type VoiceParamConfigProps = {
   onChange?: OnFeaturesChange
@@ -33,12 +35,16 @@ const VoiceParamConfig = ({
   const text2speech = useFeatures(state => state.features.text2speech)
   const featuresStore = useFeaturesStore()

-  const languageItem = languages.find(item => item.value === text2speech.language)
+  let languageItem = languages.find(item => item.value === text2speech?.language)
+  if (languages && !languageItem)
+    languageItem = languages[0]
   const localLanguagePlaceholder = languageItem?.name || t('common.placeholder.select')

   const language = languageItem?.value
   const voiceItems = useSWR({ appId, language }, fetchAppVoices).data
-  const voiceItem = voiceItems?.find(item => item.value === text2speech.voice)
+  let voiceItem = voiceItems?.find(item => item.value === text2speech?.voice)
+  if (voiceItems && !voiceItem)
+    voiceItem = voiceItems[0]
   const localVoicePlaceholder = voiceItem?.name || t('common.placeholder.select')

   const handleChange = (value: Record<string, string>) => {
@@ -66,13 +72,14 @@ const VoiceParamConfig = ({
         <div className='pt-3 space-y-6'>
           <div>
             <div className='mb-2 flex items-center  space-x-1'>
-              <div className='leading-[18px] text-[13px] font-semibold text-gray-800'>{t('appDebug.voice.voiceSettings.language')}</div>
-              <Tooltip htmlContent={<div className='w-[180px]' >
+              <div
+                className='leading-[18px] text-[13px] font-semibold text-gray-800'>{t('appDebug.voice.voiceSettings.language')}</div>
+              <Tooltip htmlContent={<div className='w-[180px]'>
                 {t('appDebug.voice.voiceSettings.resolutionTooltip').split('\n').map(item => (
                   <div key={item}>{item}</div>
                 ))}
               </div>} selector='config-resolution-tooltip'>
-                <RiQuestionLine className='w-[14px] h-[14px] text-gray-400' />
+                <RiQuestionLine className='w-[14px] h-[14px] text-gray-400'/>
               </Tooltip>
             </div>
             <Listbox
@@ -84,7 +91,8 @@ const VoiceParamConfig = ({
               }}
             >
               <div className={'relative h-9'}>
-                <Listbox.Button className={'w-full h-full rounded-lg border-0 bg-gray-100 py-1.5 pl-3 pr-10 sm:text-sm sm:leading-6 focus-visible:outline-none focus-visible:bg-gray-200 group-hover:bg-gray-200 cursor-pointer'}>
+                <Listbox.Button
+                  className={'w-full h-full rounded-lg border-0 bg-gray-100 py-1.5 pl-3 pr-10 sm:text-sm sm:leading-6 focus-visible:outline-none focus-visible:bg-gray-200 group-hover:bg-gray-200 cursor-pointer'}>
                   <span className={classNames('block truncate text-left', !languageItem?.name && 'text-gray-400')}>
                     {languageItem?.name ? t(`common.voice.language.${languageItem?.value.replace('-', '')}`) : localLanguagePlaceholder}
                   </span>
@@ -102,7 +110,8 @@ const VoiceParamConfig = ({
                   leaveTo="opacity-0"
                 >

-                  <Listbox.Options className="absolute z-10 mt-1 px-1 max-h-60 w-full overflow-auto rounded-md bg-white py-1 text-base shadow-lg border-gray-200 border-[0.5px] focus:outline-none sm:text-sm">
+                  <Listbox.Options
+                    className="absolute z-10 mt-1 px-1 max-h-60 w-full overflow-auto rounded-md bg-white py-1 text-base shadow-lg border-gray-200 border-[0.5px] focus:outline-none sm:text-sm">
                     {languages.map((item: Item) => (
                       <Listbox.Option
                         key={item.value}
@@ -117,13 +126,13 @@ const VoiceParamConfig = ({
                           <>
                             <span
                               className={classNames('block', selected && 'font-normal')}>{t(`common.voice.language.${(item.value).toString().replace('-', '')}`)}</span>
-                            {(selected || item.value === text2speech.language) && (
+                            {(selected || item.value === text2speech?.language) && (
                               <span
                                 className={classNames(
                                   'absolute inset-y-0 right-0 flex items-center pr-4 text-gray-700',
                                 )}
                               >
-                                <CheckIcon className="h-5 w-5" aria-hidden="true" />
+                                <CheckIcon className="h-5 w-5" aria-hidden="true"/>
                               </span>
                             )}
                           </>
@@ -137,7 +146,8 @@ const VoiceParamConfig = ({
           </div>

           <div>
-            <div className='mb-2 leading-[18px] text-[13px] font-semibold text-gray-800'>{t('appDebug.voice.voiceSettings.voice')}</div>
+            <div
+              className='mb-2 leading-[18px] text-[13px] font-semibold text-gray-800'>{t('appDebug.voice.voiceSettings.voice')}</div>
             <Listbox
               value={voiceItem}
               disabled={!languageItem}
@@ -148,8 +158,10 @@ const VoiceParamConfig = ({
               }}
             >
               <div className={'relative h-9'}>
-                <Listbox.Button className={'w-full h-full rounded-lg border-0 bg-gray-100 py-1.5 pl-3 pr-10 sm:text-sm sm:leading-6 focus-visible:outline-none focus-visible:bg-gray-200 group-hover:bg-gray-200 cursor-pointer'}>
-                  <span className={classNames('block truncate text-left', !voiceItem?.name && 'text-gray-400')}>{voiceItem?.name ?? localVoicePlaceholder}</span>
+                <Listbox.Button
+                  className={'w-full h-full rounded-lg border-0 bg-gray-100 py-1.5 pl-3 pr-10 sm:text-sm sm:leading-6 focus-visible:outline-none focus-visible:bg-gray-200 group-hover:bg-gray-200 cursor-pointer'}>
+                  <span
+                    className={classNames('block truncate text-left', !voiceItem?.name && 'text-gray-400')}>{voiceItem?.name ?? localVoicePlaceholder}</span>
                   <span className="pointer-events-none absolute inset-y-0 right-0 flex items-center pr-2">
                   <span className="pointer-events-none absolute inset-y-0 right-0 flex items-center pr-2">
                     <ChevronDownIcon
                     <ChevronDownIcon
                       className="h-5 w-5 text-gray-400"
                       className="h-5 w-5 text-gray-400"
@@ -164,7 +176,8 @@ const VoiceParamConfig = ({
                   leaveTo="opacity-0"
                 >

-                  <Listbox.Options className="absolute z-10 mt-1 px-1 max-h-60 w-full overflow-auto rounded-md bg-white py-1 text-base shadow-lg border-gray-200 border-[0.5px] focus:outline-none sm:text-sm">
+                  <Listbox.Options
+                    className="absolute z-10 mt-1 px-1 max-h-60 w-full overflow-auto rounded-md bg-white py-1 text-base shadow-lg border-gray-200 border-[0.5px] focus:outline-none sm:text-sm">
                     {voiceItems?.map((item: Item) => (
                       <Listbox.Option
                         key={item.value}
@@ -178,13 +191,13 @@ const VoiceParamConfig = ({
                         {({ /* active, */ selected }) => (
                           <>
                             <span className={classNames('block', selected && 'font-normal')}>{item.name}</span>
-                            {(selected || item.value === text2speech.voice) && (
+                            {(selected || item.value === text2speech?.voice) && (
                               <span
                                 className={classNames(
                                   'absolute inset-y-0 right-0 flex items-center pr-4 text-gray-700',
                                 )}
                               >
-                                <CheckIcon className="h-5 w-5" aria-hidden="true" />
+                                <CheckIcon className="h-5 w-5" aria-hidden="true"/>
                               </span>
                             )}
                           </>
@@ -196,6 +209,29 @@ const VoiceParamConfig = ({
               </div>
             </Listbox>
           </div>
+          <div>
+            <div
+              className='mb-2 leading-[18px] text-[13px] font-semibold text-gray-800'>{t('appDebug.voice.voiceSettings.autoPlay')}</div>
+            <RadioGroup
+              className='space-x-3'
+              options={[
+                {
+                  label: t('appDebug.voice.voiceSettings.autoPlayEnabled'),
+                  value: TtsAutoPlay.enabled,
+                },
+                {
+                  label: t('appDebug.voice.voiceSettings.autoPlayDisabled'),
+                  value: TtsAutoPlay.disabled,
+                },
+              ]}
+              value={text2speech?.autoPlay ? text2speech?.autoPlay : TtsAutoPlay.disabled}
+              onChange={(value: TtsAutoPlay) => {
+                handleChange({
+                  autoPlay: value,
+                })
+              }}
+            />
+          </div>
         </div>
       </div>
     </div>
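For reference, a sketch of the text-to-speech feature object this panel now edits, mirroring TextToSpeech in web/app/components/base/features/types.ts; the concrete language and voice values are examples only.

  import type { TextToSpeech } from '@/app/components/base/features/types'
  import { TtsAutoPlay } from '@/types/app'

  const text2speech: TextToSpeech = {
    enabled: true,
    language: 'en-US',   // example; the list comes from @/i18n/language
    voice: 'alloy',      // example; actual voices come from fetchAppVoices
    autoPlay: TtsAutoPlay.enabled,
  }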

+ 2 - 1
web/app/components/base/features/types.ts

@@ -1,4 +1,4 @@
-import type { TransferMethod } from '@/types/app'
+import type { TransferMethod, TtsAutoPlay } from '@/types/app'
 
 
 export type EnabledOrDisabled = {
   enabled?: boolean
@@ -14,6 +14,7 @@ export type SuggestedQuestionsAfterAnswer = EnabledOrDisabled
 export type TextToSpeech = EnabledOrDisabled & {
   language?: string
   voice?: string
+  autoPlay?: TtsAutoPlay
 }

 export type SpeechToText = EnabledOrDisabled

+ 27 - 0
web/app/components/workflow/hooks/use-workflow-run.ts

@@ -4,6 +4,8 @@ import {
   useStoreApi,
 } from 'reactflow'
 import produce from 'immer'
+import { v4 as uuidV4 } from 'uuid'
+import { usePathname } from 'next/navigation'
 import { useWorkflowStore } from '../store'
 import { useNodesSyncDraft } from '../hooks'
 import {
@@ -19,6 +21,7 @@ import {
   stopWorkflowRun,
 } from '@/service/workflow'
 import { useFeaturesStore } from '@/app/components/base/features/hooks'
+import { AudioPlayerManager } from '@/app/components/base/audio-btn/audio.player.manager'

 export const useWorkflowRun = () => {
   const store = useStoreApi()
@@ -27,6 +30,7 @@ export const useWorkflowRun = () => {
   const featuresStore = useFeaturesStore()
   const { doSyncWorkflowDraft } = useNodesSyncDraft()
   const { handleUpdateWorkflowCanvas } = useWorkflowUpdate()
+  const pathname = usePathname()

   const handleBackupDraft = useCallback(() => {
     const {
@@ -134,6 +138,20 @@ export const useWorkflowRun = () => {
     let isInIteration = false
     let iterationLength = 0

+    let ttsUrl = ''
+    let ttsIsPublic = false
+    if (params.token) {
+      ttsUrl = '/text-to-audio'
+      ttsIsPublic = true
+    }
+    else if (params.appId) {
+      if (pathname.search('explore/installed') > -1)
+        ttsUrl = `/installed-apps/${params.appId}/text-to-audio`
+      else
+        ttsUrl = `/apps/${params.appId}/text-to-audio`
+    }
+    const player = AudioPlayerManager.getInstance().getAudioPlayer(ttsUrl, ttsIsPublic, uuidV4(), 'none', 'none', (_: any): any => {})
+
     ssePost(
       url,
       {
@@ -468,6 +486,15 @@ export const useWorkflowRun = () => {
             draft.resultText = text
           }))
         },
+        onTTSChunk: (messageId: string, audio: string, audioType?: string) => {
+          if (!audio || audio === '')
+            return
+          player.playAudioWithAudio(audio, true)
+          AudioPlayerManager.getInstance().resetMsgId(messageId)
+        },
+        onTTSEnd: (messageId: string, audio: string, audioType?: string) => {
+          player.playAudioWithAudio(audio, false)
+        },
         ...restCallback,
       },
     )

+ 3 - 0
web/i18n/en-US/app-debug.ts

@@ -323,6 +323,9 @@ const translation = {
       language: 'Language',
       resolutionTooltip: 'Text-to-speech voice support language。',
       voice: 'Voice',
+      autoPlay: 'Auto Play',
+      autoPlayEnabled: 'Turn On',
+      autoPlayDisabled: 'Turn Off',
     },
   },
   openingStatement: {

+ 3 - 0
web/i18n/ja-JP/app-debug.ts

@@ -319,6 +319,9 @@ const translation = {
       language: '言語',
       resolutionTooltip: 'テキスト読み上げの音声言語をサポートします。',
       voice: '音声',
+      autoPlay: '自動再生',
+      autoPlayEnabled: '開ける',
+      autoPlayDisabled: '關閉',
     },
   },
   openingStatement: {

+ 3 - 0
web/i18n/zh-Hans/app-debug.ts

@@ -319,6 +319,9 @@ const translation = {
       language: '语言',
       resolutionTooltip: '文本转语音音色支持语言。',
       voice: '音色',
+      autoPlay: '自动播放',
+      autoPlayEnabled: '开启',
+      autoPlayDisabled: '关闭',
     },
   },
   openingStatement: {

+ 3 - 0
web/i18n/zh-Hant/app-debug.ts

@@ -318,6 +318,9 @@ const translation = {
       language: '語言',
       resolutionTooltip: '文字轉語音音色支援語言。',
       voice: '音色',
+      autoPlay: '自動播放',
+      autoPlayEnabled: '開啟',
+      autoPlayDisabled: '關閉',
     },
   },
   openingStatement: {

+ 2 - 1
web/models/debug.ts

@@ -1,4 +1,4 @@
-import type { AgentStrategy, ModelModeType, RETRIEVE_TYPE, ToolItem } from '@/types/app'
+import type { AgentStrategy, ModelModeType, RETRIEVE_TYPE, ToolItem, TtsAutoPlay } from '@/types/app'
 export type Inputs = Record<string, string | number | object>

 export enum PromptMode {
@@ -79,6 +79,7 @@ export type TextToSpeechConfig = {
   enabled: boolean
   voice?: string
   language?: string
+  autoPlay?: TtsAutoPlay
 }

 export type CitationConfig = MoreLikeThisConfig

+ 1 - 0
web/next.config.js

@@ -34,6 +34,7 @@ const nextConfig = {
     // https://nextjs.org/docs/api-reference/next.config.js/ignoring-typescript-errors
     ignoreBuildErrors: true,
   },
+  reactStrictMode: true,
   async redirects() {
     return [
       {

+ 1 - 0
web/service/apps.ts

@@ -120,6 +120,7 @@ export const generationIntroduction: Fetcher<GenerationIntroductionResponse, { u
 }

 export const fetchAppVoices: Fetcher<AppVoicesListResponse, { appId: string; language?: string }> = ({ appId, language }) => {
+  language = language || 'en-US'
   return get<AppVoicesListResponse>(`apps/${appId}/text-to-audio/voices?language=${language}`)
 }


+ 19 - 3
web/service/base.ts

@@ -19,6 +19,7 @@ const TIME_OUT = 100000
 const ContentType = {
   json: 'application/json',
   stream: 'text/event-stream',
+  audio: 'audio/mpeg',
   form: 'application/x-www-form-urlencoded; charset=UTF-8',
   download: 'application/octet-stream', // for download
   upload: 'multipart/form-data', // for upload
@@ -59,6 +60,8 @@ export type IOnIterationStarted = (workflowStarted: IterationStartedResponse) =>
 export type IOnIterationNexted = (workflowStarted: IterationNextedResponse) => void
 export type IOnIterationFinished = (workflowFinished: IterationFinishedResponse) => void
 export type IOnTextChunk = (textChunk: TextChunkResponse) => void
+export type IOnTTSChunk = (messageId: string, audioStr: string, audioType?: string) => void
+export type IOnTTSEnd = (messageId: string, audioStr: string, audioType?: string) => void
 export type IOnTextReplace = (textReplace: TextReplaceResponse) => void

 export type IOtherOptions = {
@@ -84,6 +87,8 @@ export type IOtherOptions = {
   onIterationNext?: IOnIterationNexted
   onIterationFinish?: IOnIterationFinished
   onTextChunk?: IOnTextChunk
+  onTTSChunk?: IOnTTSChunk
+  onTTSEnd?: IOnTTSEnd
   onTextReplace?: IOnTextReplace
 }

@@ -135,6 +140,8 @@ const handleStream = (
   onIterationNext?: IOnIterationNexted,
   onIterationFinish?: IOnIterationFinished,
   onTextChunk?: IOnTextChunk,
+  onTTSChunk?: IOnTTSChunk,
+  onTTSEnd?: IOnTTSEnd,
   onTextReplace?: IOnTextReplace,
 ) => {
   if (!response.ok)
@@ -227,6 +234,12 @@ const handleStream = (
             else if (bufferObj.event === 'text_replace') {
               onTextReplace?.(bufferObj as TextReplaceResponse)
             }
+            else if (bufferObj.event === 'tts_message') {
+              onTTSChunk?.(bufferObj.message_id, bufferObj.audio, bufferObj.audio_type)
+            }
+            else if (bufferObj.event === 'tts_message_end') {
+              onTTSEnd?.(bufferObj.message_id, bufferObj.audio)
+            }
           }
         })
         buffer = lines[lines.length - 1]
@@ -390,9 +403,10 @@ const baseFetch = <T>(
           }

           // return data
-          const data: Promise<T> = options.headers.get('Content-type') === ContentType.download ? res.blob() : res.json()
+          if (options.headers.get('Content-type') === ContentType.download || options.headers.get('Content-type') === ContentType.audio)
+            resolve(needAllResponseContent ? resClone : res.blob())
 
 
-          resolve(needAllResponseContent ? resClone : data)
+          else resolve(needAllResponseContent ? resClone : res.json())
         })
         .catch((err) => {
           if (!silent)
@@ -475,6 +489,8 @@ export const ssePost = (
     onIterationNext,
     onIterationFinish,
     onTextChunk,
+    onTTSChunk,
+    onTTSEnd,
     onTextReplace,
     onError,
     getAbortController,
@@ -527,7 +543,7 @@ export const ssePost = (
           return
         }
         onData?.(str, isFirstMessage, moreInfo)
-      }, onCompleted, onThought, onMessageEnd, onMessageReplace, onFile, onWorkflowStarted, onWorkflowFinished, onNodeStarted, onNodeFinished, onIterationStart, onIterationNext, onIterationFinish, onTextChunk, onTextReplace)
+      }, onCompleted, onThought, onMessageEnd, onMessageReplace, onFile, onWorkflowStarted, onWorkflowFinished, onNodeStarted, onNodeFinished, onIterationStart, onIterationNext, onIterationFinish, onTextChunk, onTTSChunk, onTTSEnd, onTextReplace)
     }).catch((e) => {
       if (e.toString() !== 'AbortError: The user aborted a request.')
         Toast.notify({ type: 'error', message: e })
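A hedged sketch of wiring the new callbacks into ssePost (the url and body are placeholders, and real callers forward the chunks to the shared AudioPlayer as the chat and workflow hooks above do):

  ssePost('chat-messages', { body: { query: 'hi', response_mode: 'streaming' } }, {
    onData: () => {},
    onCompleted: () => {},
    onTTSChunk: (messageId, audio) => console.log('tts chunk for', messageId, audio.length),
    onTTSEnd: messageId => console.log('tts finished for', messageId),
  })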

+ 9 - 3
web/service/share.ts

@@ -1,4 +1,4 @@
-import type { IOnCompleted, IOnData, IOnError, IOnFile, IOnIterationFinished, IOnIterationNexted, IOnIterationStarted, IOnMessageEnd, IOnMessageReplace, IOnNodeFinished, IOnNodeStarted, IOnTextChunk, IOnTextReplace, IOnThought, IOnWorkflowFinished, IOnWorkflowStarted } from './base'
+import type { IOnCompleted, IOnData, IOnError, IOnFile, IOnIterationFinished, IOnIterationNexted, IOnIterationStarted, IOnMessageEnd, IOnMessageReplace, IOnNodeFinished, IOnNodeStarted, IOnTTSChunk, IOnTTSEnd, IOnTextChunk, IOnTextReplace, IOnThought, IOnWorkflowFinished, IOnWorkflowStarted } from './base'
 import {
   del as consoleDel, get as consoleGet, patch as consolePatch, post as consolePost,
   delPublic as del, getPublic as get, patchPublic as patch, postPublic as post, ssePost,
@@ -30,7 +30,7 @@ export function getUrl(url: string, isInstalledApp: boolean, installedAppId: str
   return isInstalledApp ? `installed-apps/${installedAppId}/${url.startsWith('/') ? url.slice(1) : url}` : url
 }

-export const sendChatMessage = async (body: Record<string, any>, { onData, onCompleted, onThought, onFile, onError, getAbortController, onMessageEnd, onMessageReplace }: {
+export const sendChatMessage = async (body: Record<string, any>, { onData, onCompleted, onThought, onFile, onError, getAbortController, onMessageEnd, onMessageReplace, onTTSChunk, onTTSEnd }: {
   onData: IOnData
   onCompleted: IOnCompleted
   onFile: IOnFile
@@ -39,13 +39,15 @@ export const sendChatMessage = async (body: Record<string, any>, { onData, onCom
   onMessageEnd?: IOnMessageEnd
   onMessageReplace?: IOnMessageReplace
   getAbortController?: (abortController: AbortController) => void
+  onTTSChunk?: IOnTTSChunk
+  onTTSEnd?: IOnTTSEnd
 }, isInstalledApp: boolean, installedAppId = '') => {
   return ssePost(getUrl('chat-messages', isInstalledApp, installedAppId), {
     body: {
       ...body,
       response_mode: 'streaming',
     },
-  }, { onData, onCompleted, onThought, onFile, isPublicAPI: !isInstalledApp, onError, getAbortController, onMessageEnd, onMessageReplace })
+  }, { onData, onCompleted, onThought, onFile, isPublicAPI: !isInstalledApp, onError, getAbortController, onMessageEnd, onMessageReplace, onTTSChunk, onTTSEnd })
 }

 export const stopChatMessageResponding = async (appId: string, taskId: string, isInstalledApp: boolean, installedAppId = '') => {
@@ -214,6 +216,10 @@ export const textToAudio = (url: string, isPublicAPI: boolean, body: FormData) =
   return (getAction('post', !isPublicAPI))(url, { body }, { bodyStringify: false, deleteContentType: true }) as Promise<{ data: string }>
 }

+export const textToAudioStream = (url: string, isPublicAPI: boolean, header: { content_type: string }, body: { streaming: boolean; voice?: string; message_id?: string; text?: string | null | undefined }) => {
+  return (getAction('post', !isPublicAPI))(url, { body, header }, { needAllResponseContent: true })
+}
+
 export const fetchAccessToken = async (appCode: string) => {
   const headers = new Headers()
   headers.append('X-App-Code', appCode)
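A brief usage sketch for the new helper (hedged; the url and ids are placeholders, and the calls assume an async context). Because needAllResponseContent is set, the raw Response comes back and the MPEG stream can be read chunk by chunk, which mirrors what AudioPlayer.loadAudio does above.

  const response: any = await textToAudioStream(
    '/apps/my-app/text-to-audio', // placeholder url
    false,                        // isPublicAPI
    { content_type: 'audio/mpeg' },
    { streaming: true, message_id: 'message-123', text: null },
  )
  const reader = response.body.getReader()
  for (;;) {
    const { value, done } = await reader.read()
    if (done) break
    // `value` is a Uint8Array of MPEG audio, appended to the player's SourceBuffer
  }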

+ 6 - 0
web/types/app.ts

@@ -160,6 +160,7 @@ export type ModelConfig = {
     enabled: boolean
     voice?: string
     language?: string
+    autoPlay?: TtsAutoPlay
   }
   retriever_resource: {
     enabled: boolean
@@ -349,6 +350,11 @@ export enum TransferMethod {
   remote_url = 'remote_url',
 }

+export enum TtsAutoPlay {
+  enabled = 'enabled',
+  disabled = 'disabled',
+}
+
 export const ALLOW_FILE_EXTENSIONS = ['png', 'jpg', 'jpeg', 'webp', 'gif']

 export type VisionSettings = {