
feat: add tts-streaming config and feature (#5492)

chenxu9741 9 months ago
parent
commit
6ef401a9f0
44 changed files with 1281 additions and 359 deletions
  1. 4 0
      api/constants/tts_auto_play_timeout.py
  2. 26 5
      api/controllers/console/app/audio.py
  3. 25 5
      api/controllers/console/explore/audio.py
  4. 22 11
      api/controllers/service_api/app/audio.py
  5. 24 5
      api/controllers/web/audio.py
  6. 135 0
      api/core/app/apps/advanced_chat/app_generator_tts_publisher.py
  7. 78 19
      api/core/app/apps/advanced_chat/generate_task_pipeline.py
  8. 0 1
      api/core/app/apps/base_app_queue_manager.py
  9. 62 1
      api/core/app/apps/workflow/generate_task_pipeline.py
  10. 37 0
      api/core/app/entities/task_entities.py
  11. 61 5
      api/core/app/task_pipeline/easy_ui_based_generate_task_pipeline.py
  12. 2 3
      api/core/model_manager.py
  13. 27 24
      api/core/model_runtime/model_providers/__base/tts_model.py
  14. 31 37
      api/core/model_runtime/model_providers/azure_openai/tts/tts.py
  15. 1 1
      api/core/model_runtime/model_providers/openai/tts/tts-1-hd.yaml
  16. 1 1
      api/core/model_runtime/model_providers/openai/tts/tts-1.yaml
  17. 31 31
      api/core/model_runtime/model_providers/openai/tts/tts.py
  18. 1 1
      api/core/model_runtime/model_providers/tongyi/tts/tts-1.yaml
  19. 67 30
      api/core/model_runtime/model_providers/tongyi/tts/tts.py
  20. 1 1
      api/pyproject.toml
  21. 2 0
      api/services/app_service.py
  22. 66 40
      api/services/audio_service.py
  23. 50 14
      web/app/components/app/configuration/config-voice/param-config-content.tsx
  24. 0 1
      web/app/components/app/configuration/features/chat-group/text-to-speech/index.tsx
  25. 1 2
      web/app/components/app/text-generate/item/index.tsx
  26. 53 0
      web/app/components/base/audio-btn/audio.player.manager.ts
  27. 263 0
      web/app/components/base/audio-btn/audio.ts
  28. 42 89
      web/app/components/base/audio-btn/index.tsx
  29. 10 7
      web/app/components/base/chat/chat/answer/index.tsx
  30. 1 1
      web/app/components/base/chat/chat/answer/operation.tsx
  31. 27 1
      web/app/components/base/chat/chat/hooks.ts
  32. 51 15
      web/app/components/base/features/feature-panel/text-to-speech/param-config-content.tsx
  33. 2 1
      web/app/components/base/features/types.ts
  34. 27 0
      web/app/components/workflow/hooks/use-workflow-run.ts
  35. 3 0
      web/i18n/en-US/app-debug.ts
  36. 3 0
      web/i18n/ja-JP/app-debug.ts
  37. 3 0
      web/i18n/zh-Hans/app-debug.ts
  38. 3 0
      web/i18n/zh-Hant/app-debug.ts
  39. 2 1
      web/models/debug.ts
  40. 1 0
      web/next.config.js
  41. 1 0
      web/service/apps.ts
  42. 19 3
      web/service/base.ts
  43. 9 3
      web/service/share.ts
  44. 6 0
      web/types/app.ts

+ 4 - 0
api/constants/tts_auto_play_timeout.py

@@ -0,0 +1,4 @@
+TTS_AUTO_PLAY_TIMEOUT = 5
+
+# sleep 20 ms ( 40ms => 1280 byte audio file,20ms => 640 byte audio file)
+TTS_AUTO_PLAY_YIELD_CPU_TIME = 0.02

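These two constants drive the auto-play drain loops added to the generate-task pipelines later in this commit: after the main answer stream ends, the pipeline keeps polling the TTS publisher until it reports "finish" or stays silent for TTS_AUTO_PLAY_TIMEOUT seconds, yielding the CPU for TTS_AUTO_PLAY_YIELD_CPU_TIME between polls. A minimal sketch of that pattern (the publisher object and send_audio callback are placeholders, not part of this commit):

import time

from constants.tts_auto_play_timeout import TTS_AUTO_PLAY_TIMEOUT, TTS_AUTO_PLAY_YIELD_CPU_TIME

def drain_tts_audio(publisher, send_audio):
    # Poll until the publisher reports "finish" or stays silent for the timeout.
    started = time.time()
    while (time.time() - started) < TTS_AUTO_PLAY_TIMEOUT:
        chunk = publisher.checkAndGetAudio()
        if chunk is None:
            time.sleep(TTS_AUTO_PLAY_YIELD_CPU_TIME)  # release the CPU between polls
            continue
        if chunk.status == "finish":
            break
        started = time.time()      # new audio arrived, reset the timeout window
        send_audio(chunk.audio)    # base64-encoded audio bytes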
+ 26 - 5
api/controllers/console/app/audio.py

@@ -81,15 +81,36 @@ class ChatMessageTextApi(Resource):
     @account_initialization_required
     @get_app_model
     def post(self, app_model):
+        from werkzeug.exceptions import InternalServerError
+
         try:
+            parser = reqparse.RequestParser()
+            parser.add_argument('message_id', type=str, location='json')
+            parser.add_argument('text', type=str, location='json')
+            parser.add_argument('voice', type=str, location='json')
+            parser.add_argument('streaming', type=bool, location='json')
+            args = parser.parse_args()
+
+            message_id = args.get('message_id', None)
+            text = args.get('text', None)
+            if (app_model.mode in [AppMode.ADVANCED_CHAT.value, AppMode.WORKFLOW.value]
+                    and app_model.workflow
+                    and app_model.workflow.features_dict):
+                text_to_speech = app_model.workflow.features_dict.get('text_to_speech')
+                voice = args.get('voice') if args.get('voice') else text_to_speech.get('voice')
+            else:
+                try:
+                    voice = args.get('voice') if args.get('voice') else app_model.app_model_config.text_to_speech_dict.get(
+                        'voice')
+                except Exception:
+                    voice = None
             response = AudioService.transcript_tts(
                 app_model=app_model,
-                text=request.form['text'],
-                voice=request.form['voice'],
-                streaming=False
+                text=text,
+                message_id=message_id,
+                voice=voice
             )
-
-            return {'data': response.data.decode('latin1')}
+            return response
         except services.errors.app_model_config.AppModelConfigBrokenError:
             logging.exception("App model config broken.")
             raise AppUnavailableError()

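After this change the console endpoint reads a JSON body instead of form fields, and either a message_id or raw text can be supplied. A request might look like the sketch below; the route path, host and auth header are assumptions for illustration and are not shown in this diff:

import requests

# Illustrative only: URL and authorization are assumed, not part of this commit.
resp = requests.post(
    "http://localhost:5001/console/api/apps/<app_id>/text-to-audio",
    headers={"Authorization": "Bearer <console-token>"},
    json={
        "message_id": "<message-uuid>",  # synthesize an existing answer, or
        "text": "Hello, Dify!",          # synthesize raw text
        "voice": "alloy",                # optional; falls back to the app's configured voice
    },
)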
+ 25 - 5
api/controllers/console/explore/audio.py

@@ -19,6 +19,7 @@ from controllers.console.app.error import (
 from controllers.console.explore.wraps import InstalledAppResource
 from core.errors.error import ModelCurrentlyNotSupportError, ProviderTokenNotInitError, QuotaExceededError
 from core.model_runtime.errors.invoke import InvokeError
+from models.model import AppMode
 from services.audio_service import AudioService
 from services.errors.audio import (
     AudioTooLargeServiceError,
@@ -70,16 +71,33 @@ class ChatAudioApi(InstalledAppResource):
 
 class ChatTextApi(InstalledAppResource):
     def post(self, installed_app):
-        app_model = installed_app.app
+        from flask_restful import reqparse
 
+        app_model = installed_app.app
         try:
+            parser = reqparse.RequestParser()
+            parser.add_argument('message_id', type=str, required=False, location='json')
+            parser.add_argument('voice', type=str, location='json')
+            parser.add_argument('streaming', type=bool, location='json')
+            args = parser.parse_args()
+
+            message_id = args.get('message_id')
+            if (app_model.mode in [AppMode.ADVANCED_CHAT.value, AppMode.WORKFLOW.value]
+                    and app_model.workflow
+                    and app_model.workflow.features_dict):
+                text_to_speech = app_model.workflow.features_dict.get('text_to_speech')
+                voice = args.get('voice') if args.get('voice') else text_to_speech.get('voice')
+            else:
+                try:
+                    voice = args.get('voice') if args.get('voice') else app_model.app_model_config.text_to_speech_dict.get('voice')
+                except Exception:
+                    voice = None
             response = AudioService.transcript_tts(
                 app_model=app_model,
-                text=request.form['text'],
-                voice=request.form['voice'] if request.form.get('voice') else app_model.app_model_config.text_to_speech_dict.get('voice'),
-                streaming=False
+                message_id=message_id,
+                voice=voice
             )
-            return {'data': response.data.decode('latin1')}
+            return response
         except services.errors.app_model_config.AppModelConfigBrokenError:
             logging.exception("App model config broken.")
             raise AppUnavailableError()
@@ -108,3 +126,5 @@ class ChatTextApi(InstalledAppResource):
 
 api.add_resource(ChatAudioApi, '/installed-apps/<uuid:installed_app_id>/audio-to-text', endpoint='installed_app_audio')
 api.add_resource(ChatTextApi, '/installed-apps/<uuid:installed_app_id>/text-to-audio', endpoint='installed_app_text')
+# api.add_resource(ChatTextApiWithMessageId, '/installed-apps/<uuid:installed_app_id>/text-to-audio/message-id',
+#                  endpoint='installed_app_text_with_message_id')

+ 22 - 11
api/controllers/service_api/app/audio.py

@@ -20,7 +20,7 @@ from controllers.service_api.app.error import (
 from controllers.service_api.wraps import FetchUserArg, WhereisUserArg, validate_app_token
 from core.errors.error import ModelCurrentlyNotSupportError, ProviderTokenNotInitError, QuotaExceededError
 from core.model_runtime.errors.invoke import InvokeError
-from models.model import App, EndUser
+from models.model import App, AppMode, EndUser
 from services.audio_service import AudioService
 from services.errors.audio import (
     AudioTooLargeServiceError,
@@ -72,19 +72,30 @@ class AudioApi(Resource):
 class TextApi(Resource):
     @validate_app_token(fetch_user_arg=FetchUserArg(fetch_from=WhereisUserArg.JSON))
     def post(self, app_model: App, end_user: EndUser):
-        parser = reqparse.RequestParser()
-        parser.add_argument('text', type=str, required=True, nullable=False, location='json')
-        parser.add_argument('voice', type=str, location='json')
-        parser.add_argument('streaming', type=bool, required=False, nullable=False, location='json')
-        args = parser.parse_args()
-
         try:
+            parser = reqparse.RequestParser()
+            parser.add_argument('message_id', type=str, required=False, location='json')
+            parser.add_argument('voice', type=str, location='json')
+            parser.add_argument('streaming', type=bool, location='json')
+            args = parser.parse_args()
+
+            message_id = args.get('message_id')
+            if (app_model.mode in [AppMode.ADVANCED_CHAT.value, AppMode.WORKFLOW.value]
+                    and app_model.workflow
+                    and app_model.workflow.features_dict):
+                text_to_speech = app_model.workflow.features_dict.get('text_to_speech')
+                voice = args.get('voice') if args.get('voice') else text_to_speech.get('voice')
+            else:
+                try:
+                    voice = args.get('voice') if args.get('voice') else app_model.app_model_config.text_to_speech_dict.get(
+                        'voice')
+                except Exception:
+                    voice = None
             response = AudioService.transcript_tts(
                 app_model=app_model,
-                text=args['text'],
-                end_user=end_user,
-                voice=args.get('voice'),
-                streaming=args['streaming']
+                message_id=message_id,
+                end_user=end_user.external_user_id,
+                voice=voice
             )
 
             return response

+ 24 - 5
api/controllers/web/audio.py

@@ -19,7 +19,7 @@ from controllers.web.error import (
 from controllers.web.wraps import WebApiResource
 from core.errors.error import ModelCurrentlyNotSupportError, ProviderTokenNotInitError, QuotaExceededError
 from core.model_runtime.errors.invoke import InvokeError
-from models.model import App
+from models.model import App, AppMode
 from services.audio_service import AudioService
 from services.errors.audio import (
     AudioTooLargeServiceError,
@@ -69,16 +69,35 @@ class AudioApi(WebApiResource):
 
 class TextApi(WebApiResource):
     def post(self, app_model: App, end_user):
+        from flask_restful import reqparse
         try:
+            parser = reqparse.RequestParser()
+            parser.add_argument('message_id', type=str, required=False, location='json')
+            parser.add_argument('voice', type=str, location='json')
+            parser.add_argument('streaming', type=bool, location='json')
+            args = parser.parse_args()
+
+            message_id = args.get('message_id')
+            if (app_model.mode in [AppMode.ADVANCED_CHAT.value, AppMode.WORKFLOW.value]
+                    and app_model.workflow
+                    and app_model.workflow.features_dict):
+                text_to_speech = app_model.workflow.features_dict.get('text_to_speech')
+                voice = args.get('voice') if args.get('voice') else text_to_speech.get('voice')
+            else:
+                try:
+                    voice = args.get('voice') if args.get(
+                        'voice') else app_model.app_model_config.text_to_speech_dict.get('voice')
+                except Exception:
+                    voice = None
+
             response = AudioService.transcript_tts(
                 app_model=app_model,
-                text=request.form['text'],
+                message_id=message_id,
                 end_user=end_user.external_user_id,
-                voice=request.form['voice'] if request.form.get('voice') else None,
-                streaming=False
+                voice=voice
             )
 
-            return {'data': response.data.decode('latin1')}
+            return response
         except services.errors.app_model_config.AppModelConfigBrokenError:
             logging.exception("App model config broken.")
             raise AppUnavailableError()

+ 135 - 0
api/core/app/apps/advanced_chat/app_generator_tts_publisher.py

@@ -0,0 +1,135 @@
+import base64
+import concurrent.futures
+import logging
+import queue
+import re
+import threading
+
+from core.app.entities.queue_entities import QueueAgentMessageEvent, QueueLLMChunkEvent, QueueTextChunkEvent
+from core.model_manager import ModelManager
+from core.model_runtime.entities.model_entities import ModelType
+
+
+class AudioTrunk:
+    def __init__(self, status: str, audio):
+        self.audio = audio
+        self.status = status
+
+
+def _invoiceTTS(text_content: str, model_instance, tenant_id: str, voice: str):
+    if not text_content or text_content.isspace():
+        return
+    return model_instance.invoke_tts(
+        content_text=text_content.strip(),
+        user="responding_tts",
+        tenant_id=tenant_id,
+        voice=voice
+    )
+
+
+def _process_future(future_queue, audio_queue):
+    while True:
+        try:
+            future = future_queue.get()
+            if future is None:
+                break
+            for audio in future.result():
+                audio_base64 = base64.b64encode(bytes(audio))
+                audio_queue.put(AudioTrunk("responding", audio=audio_base64))
+        except Exception as e:
+            logging.getLogger(__name__).warning(e)
+            break
+    audio_queue.put(AudioTrunk("finish", b''))
+
+
+class AppGeneratorTTSPublisher:
+
+    def __init__(self, tenant_id: str, voice: str):
+        self.logger = logging.getLogger(__name__)
+        self.tenant_id = tenant_id
+        self.msg_text = ''
+        self._audio_queue = queue.Queue()
+        self._msg_queue = queue.Queue()
+        self.match = re.compile(r'[。.!?]')
+        self.model_manager = ModelManager()
+        self.model_instance = self.model_manager.get_default_model_instance(
+            tenant_id=self.tenant_id,
+            model_type=ModelType.TTS
+        )
+        self.voices = self.model_instance.get_tts_voices()
+        values = [voice.get('value') for voice in self.voices]
+        self.voice = voice
+        if not voice or voice not in values:
+            self.voice = self.voices[0].get('value')
+        self.MAX_SENTENCE = 2
+        self._last_audio_event = None
+        self._runtime_thread = threading.Thread(target=self._runtime).start()
+        self.executor = concurrent.futures.ThreadPoolExecutor(max_workers=3)
+
+    def publish(self, message):
+        try:
+            self._msg_queue.put(message)
+        except Exception as e:
+            self.logger.warning(e)
+
+    def _runtime(self):
+        future_queue = queue.Queue()
+        threading.Thread(target=_process_future, args=(future_queue, self._audio_queue)).start()
+        while True:
+            try:
+                message = self._msg_queue.get()
+                if message is None:
+                    if self.msg_text and len(self.msg_text.strip()) > 0:
+                        futures_result = self.executor.submit(_invoiceTTS, self.msg_text,
+                                                              self.model_instance, self.tenant_id, self.voice)
+                        future_queue.put(futures_result)
+                    break
+                elif isinstance(message.event, QueueAgentMessageEvent | QueueLLMChunkEvent):
+                    self.msg_text += message.event.chunk.delta.message.content
+                elif isinstance(message.event, QueueTextChunkEvent):
+                    self.msg_text += message.event.text
+                self.last_message = message
+                sentence_arr, text_tmp = self._extract_sentence(self.msg_text)
+                if len(sentence_arr) >= min(self.MAX_SENTENCE, 7):
+                    self.MAX_SENTENCE += 1
+                    text_content = ''.join(sentence_arr)
+                    futures_result = self.executor.submit(_invoiceTTS, text_content,
+                                                          self.model_instance,
+                                                          self.tenant_id,
+                                                          self.voice)
+                    future_queue.put(futures_result)
+                    if text_tmp:
+                        self.msg_text = text_tmp
+                    else:
+                        self.msg_text = ''
+
+            except Exception as e:
+                self.logger.warning(e)
+                break
+        future_queue.put(None)
+
+    def checkAndGetAudio(self) -> AudioTrunk | None:
+        try:
+            if self._last_audio_event and self._last_audio_event.status == "finish":
+                if self.executor:
+                    self.executor.shutdown(wait=False)
+                return self.last_message
+            audio = self._audio_queue.get_nowait()
+            if audio and audio.status == "finish":
+                self.executor.shutdown(wait=False)
+                self._runtime_thread = None
+            if audio:
+                self._last_audio_event = audio
+            return audio
+        except queue.Empty:
+            return None
+
+    def _extract_sentence(self, org_text):
+        tx = self.match.finditer(org_text)
+        start = 0
+        result = []
+        for i in tx:
+            end = i.regs[0][1]
+            result.append(org_text[start:end])
+            start = end
+        return result, org_text[start:]

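Roughly, the generate-task pipelines below feed every queue message into this publisher and poll it for finished audio. A simplified sketch (queue_manager and send_to_client are placeholders standing in for the real pipeline plumbing):

# Sketch only: assumes the tenant has a default TTS model configured.
publisher = AppGeneratorTTSPublisher(tenant_id="<tenant-id>", voice=None)

for message in queue_manager.listen():       # placeholder for the real queue manager
    publisher.publish(message)               # text chunks are buffered and synthesized
    audio = publisher.checkAndGetAudio()     # non-blocking poll for finished audio
    if audio and audio.status != "finish":
        send_to_client(audio.audio)          # base64-encoded audio chunk

publisher.publish(None)                      # signal that the message stream has ended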
+ 78 - 19
api/core/app/apps/advanced_chat/generate_task_pipeline.py

@@ -4,6 +4,8 @@ import time
 from collections.abc import Generator
 from typing import Any, Optional, Union, cast
 
+from constants.tts_auto_play_timeout import TTS_AUTO_PLAY_TIMEOUT, TTS_AUTO_PLAY_YIELD_CPU_TIME
+from core.app.apps.advanced_chat.app_generator_tts_publisher import AppGeneratorTTSPublisher, AudioTrunk
 from core.app.apps.base_app_queue_manager import AppQueueManager, PublishFrom
 from core.app.entities.app_invoke_entities import (
     AdvancedChatAppGenerateEntity,
@@ -33,6 +35,8 @@ from core.app.entities.task_entities import (
     ChatbotAppStreamResponse,
     ChatflowStreamGenerateRoute,
     ErrorStreamResponse,
+    MessageAudioEndStreamResponse,
+    MessageAudioStreamResponse,
     MessageEndStreamResponse,
     StreamResponse,
 )
@@ -71,13 +75,13 @@ class AdvancedChatAppGenerateTaskPipeline(BasedGenerateTaskPipeline, WorkflowCyc
     _iteration_nested_relations: dict[str, list[str]]
 
     def __init__(
-        self, application_generate_entity: AdvancedChatAppGenerateEntity,
-        workflow: Workflow,
-        queue_manager: AppQueueManager,
-        conversation: Conversation,
-        message: Message,
-        user: Union[Account, EndUser],
-        stream: bool
+            self, application_generate_entity: AdvancedChatAppGenerateEntity,
+            workflow: Workflow,
+            queue_manager: AppQueueManager,
+            conversation: Conversation,
+            message: Message,
+            user: Union[Account, EndUser],
+            stream: bool
     ) -> None:
         """
         Initialize AdvancedChatAppGenerateTaskPipeline.
@@ -129,7 +133,7 @@ class AdvancedChatAppGenerateTaskPipeline(BasedGenerateTaskPipeline, WorkflowCyc
             self._application_generate_entity.query
         )
 
-        generator = self._process_stream_response(
+        generator = self._wrapper_process_stream_response(
             trace_manager=self._application_generate_entity.trace_manager
         )
         if self._stream:
@@ -138,7 +142,7 @@ class AdvancedChatAppGenerateTaskPipeline(BasedGenerateTaskPipeline, WorkflowCyc
             return self._to_blocking_response(generator)
 
     def _to_blocking_response(self, generator: Generator[StreamResponse, None, None]) \
-        -> ChatbotAppBlockingResponse:
+            -> ChatbotAppBlockingResponse:
         """
         Process blocking response.
         :return:
@@ -169,7 +173,7 @@ class AdvancedChatAppGenerateTaskPipeline(BasedGenerateTaskPipeline, WorkflowCyc
         raise Exception('Queue listening stopped unexpectedly.')
 
     def _to_stream_response(self, generator: Generator[StreamResponse, None, None]) \
-        -> Generator[ChatbotAppStreamResponse, None, None]:
+            -> Generator[ChatbotAppStreamResponse, None, None]:
         """
         To stream response.
         :return:
@@ -182,14 +186,68 @@ class AdvancedChatAppGenerateTaskPipeline(BasedGenerateTaskPipeline, WorkflowCyc
                 stream_response=stream_response
             )
 
+    def _listenAudioMsg(self, publisher, task_id: str):
+        if not publisher:
+            return None
+        audio_msg: AudioTrunk = publisher.checkAndGetAudio()
+        if audio_msg and audio_msg.status != "finish":
+            return MessageAudioStreamResponse(audio=audio_msg.audio, task_id=task_id)
+        return None
+
+    def _wrapper_process_stream_response(self, trace_manager: Optional[TraceQueueManager] = None) -> \
+            Generator[StreamResponse, None, None]:
+
+        publisher = None
+        task_id = self._application_generate_entity.task_id
+        tenant_id = self._application_generate_entity.app_config.tenant_id
+        features_dict = self._workflow.features_dict
+
+        if features_dict.get('text_to_speech') and features_dict['text_to_speech'].get('enabled') and features_dict[
+                'text_to_speech'].get('autoPlay') == 'enabled':
+            publisher = AppGeneratorTTSPublisher(tenant_id, features_dict['text_to_speech'].get('voice'))
+        for response in self._process_stream_response(publisher=publisher, trace_manager=trace_manager):
+            while True:
+                audio_response = self._listenAudioMsg(publisher, task_id=task_id)
+                if audio_response:
+                    yield audio_response
+                else:
+                    break
+            yield response
+
+        start_listener_time = time.time()
+        # timeout
+        while (time.time() - start_listener_time) < TTS_AUTO_PLAY_TIMEOUT:
+            try:
+                if not publisher:
+                    break
+                audio_trunk = publisher.checkAndGetAudio()
+                if audio_trunk is None:
+                    # release cpu
+                    # sleep 20 ms ( 40ms => 1280 byte audio file,20ms => 640 byte audio file)
+                    time.sleep(TTS_AUTO_PLAY_YIELD_CPU_TIME)
+                    continue
+                if audio_trunk.status == "finish":
+                    break
+                else:
+                    start_listener_time = time.time()
+                    yield MessageAudioStreamResponse(audio=audio_trunk.audio, task_id=task_id)
+            except Exception as e:
+                logger.error(e)
+                break
+        yield MessageAudioEndStreamResponse(audio='', task_id=task_id)
+
     def _process_stream_response(
-        self, trace_manager: Optional[TraceQueueManager] = None
+            self,
+            publisher: AppGeneratorTTSPublisher,
+            trace_manager: Optional[TraceQueueManager] = None
     ) -> Generator[StreamResponse, None, None]:
         """
         Process stream response.
         :return:
         """
         for message in self._queue_manager.listen():
+            if publisher:
+                publisher.publish(message=message)
             event = message.event
 
             if isinstance(event, QueueErrorEvent):
@@ -301,7 +359,7 @@ class AdvancedChatAppGenerateTaskPipeline(BasedGenerateTaskPipeline, WorkflowCyc
                     continue
 
                 if not self._is_stream_out_support(
-                    event=event
+                        event=event
                 ):
                     continue
 
@@ -318,7 +376,8 @@ class AdvancedChatAppGenerateTaskPipeline(BasedGenerateTaskPipeline, WorkflowCyc
                 yield self._ping_stream_response()
             else:
                 continue
-
+        if publisher:
+            publisher.publish(None)
         if self._conversation_name_generate_thread:
             self._conversation_name_generate_thread.join()
 
@@ -402,7 +461,7 @@ class AdvancedChatAppGenerateTaskPipeline(BasedGenerateTaskPipeline, WorkflowCyc
         return stream_generate_routes
 
     def _get_answer_start_at_node_ids(self, graph: dict, target_node_id: str) \
-        -> list[str]:
+            -> list[str]:
         """
         Get answer start at node id.
         :param graph: graph
@@ -457,7 +516,7 @@ class AdvancedChatAppGenerateTaskPipeline(BasedGenerateTaskPipeline, WorkflowCyc
                 start_node_id = target_node_id
                 start_node_ids.append(start_node_id)
             elif node_type == NodeType.START.value or \
-                node_iteration_id is not None and iteration_start_node_id == source_node.get('id'):
+                    node_iteration_id is not None and iteration_start_node_id == source_node.get('id'):
                 start_node_id = source_node_id
                 start_node_ids.append(start_node_id)
             else:
@@ -515,7 +574,7 @@ class AdvancedChatAppGenerateTaskPipeline(BasedGenerateTaskPipeline, WorkflowCyc
 
             # all route chunks are generated
             if self._task_state.current_stream_generate_state.current_route_position == len(
-                self._task_state.current_stream_generate_state.generate_route
+                    self._task_state.current_stream_generate_state.generate_route
             ):
                 self._task_state.current_stream_generate_state = None
 
@@ -525,7 +584,7 @@ class AdvancedChatAppGenerateTaskPipeline(BasedGenerateTaskPipeline, WorkflowCyc
         :return:
         """
         if not self._task_state.current_stream_generate_state:
-            return None
+            return
 
         route_chunks = self._task_state.current_stream_generate_state.generate_route[
                        self._task_state.current_stream_generate_state.current_route_position:]
@@ -573,7 +632,7 @@ class AdvancedChatAppGenerateTaskPipeline(BasedGenerateTaskPipeline, WorkflowCyc
                     # get route chunk node execution info
                     route_chunk_node_execution_info = self._task_state.ran_node_execution_infos[route_chunk_node_id]
                     if (route_chunk_node_execution_info.node_type == NodeType.LLM
-                        and latest_node_execution_info.node_type == NodeType.LLM):
+                            and latest_node_execution_info.node_type == NodeType.LLM):
                         # only LLM support chunk stream output
                         self._task_state.current_stream_generate_state.current_route_position += 1
                         continue
@@ -643,7 +702,7 @@ class AdvancedChatAppGenerateTaskPipeline(BasedGenerateTaskPipeline, WorkflowCyc
 
         # all route chunks are generated
         if self._task_state.current_stream_generate_state.current_route_position == len(
-            self._task_state.current_stream_generate_state.generate_route
+                self._task_state.current_stream_generate_state.generate_route
         ):
             self._task_state.current_stream_generate_state = None
 

+ 0 - 1
api/core/app/apps/base_app_queue_manager.py

@@ -51,7 +51,6 @@ class AppQueueManager:
         listen_timeout = current_app.config.get("APP_MAX_EXECUTION_TIME")
         start_time = time.time()
         last_ping_time = 0
-
         while True:
             try:
                 message = self._q.get(timeout=1)

+ 62 - 1
api/core/app/apps/workflow/generate_task_pipeline.py

@@ -1,7 +1,10 @@
 import logging
+import time
 from collections.abc import Generator
 from typing import Any, Optional, Union
 
+from constants.tts_auto_play_timeout import TTS_AUTO_PLAY_TIMEOUT, TTS_AUTO_PLAY_YIELD_CPU_TIME
+from core.app.apps.advanced_chat.app_generator_tts_publisher import AppGeneratorTTSPublisher, AudioTrunk
 from core.app.apps.base_app_queue_manager import AppQueueManager
 from core.app.entities.app_invoke_entities import (
     InvokeFrom,
@@ -25,6 +28,8 @@ from core.app.entities.queue_entities import (
 )
 from core.app.entities.task_entities import (
     ErrorStreamResponse,
+    MessageAudioEndStreamResponse,
+    MessageAudioStreamResponse,
     StreamResponse,
     TextChunkStreamResponse,
     TextReplaceStreamResponse,
@@ -105,7 +110,7 @@ class WorkflowAppGenerateTaskPipeline(BasedGenerateTaskPipeline, WorkflowCycleMa
         db.session.refresh(self._user)
         db.session.close()
 
-        generator = self._process_stream_response(
+        generator = self._wrapper_process_stream_response(
             trace_manager=self._application_generate_entity.trace_manager
         )
         if self._stream:
@@ -161,8 +166,58 @@ class WorkflowAppGenerateTaskPipeline(BasedGenerateTaskPipeline, WorkflowCycleMa
                 stream_response=stream_response
             )
 
+    def _listenAudioMsg(self, publisher, task_id: str):
+        if not publisher:
+            return None
+        audio_msg: AudioTrunk = publisher.checkAndGetAudio()
+        if audio_msg and audio_msg.status != "finish":
+            return MessageAudioStreamResponse(audio=audio_msg.audio, task_id=task_id)
+        return None
+
+    def _wrapper_process_stream_response(self, trace_manager: Optional[TraceQueueManager] = None) -> \
+            Generator[StreamResponse, None, None]:
+
+        publisher = None
+        task_id = self._application_generate_entity.task_id
+        tenant_id = self._application_generate_entity.app_config.tenant_id
+        features_dict = self._workflow.features_dict
+
+        if features_dict.get('text_to_speech') and features_dict['text_to_speech'].get('enabled') and features_dict[
+                'text_to_speech'].get('autoPlay') == 'enabled':
+            publisher = AppGeneratorTTSPublisher(tenant_id, features_dict['text_to_speech'].get('voice'))
+        for response in self._process_stream_response(publisher=publisher, trace_manager=trace_manager):
+            while True:
+                audio_response = self._listenAudioMsg(publisher, task_id=task_id)
+                if audio_response:
+                    yield audio_response
+                else:
+                    break
+            yield response
+
+        start_listener_time = time.time()
+        while (time.time() - start_listener_time) < TTS_AUTO_PLAY_TIMEOUT:
+            try:
+                if not publisher:
+                    break
+                audio_trunk = publisher.checkAndGetAudio()
+                if audio_trunk is None:
+                    # release cpu
+                    # sleep 20 ms ( 40ms => 1280 byte audio file,20ms => 640 byte audio file)
+                    time.sleep(TTS_AUTO_PLAY_YIELD_CPU_TIME)
+                    continue
+                if audio_trunk.status == "finish":
+                    break
+                else:
+                    yield MessageAudioStreamResponse(audio=audio_trunk.audio, task_id=task_id)
+            except Exception as e:
+                logger.error(e)
+                break
+        yield MessageAudioEndStreamResponse(audio='', task_id=task_id)
+
+
     def _process_stream_response(
         self,
+        publisher: AppGeneratorTTSPublisher,
         trace_manager: Optional[TraceQueueManager] = None
     ) -> Generator[StreamResponse, None, None]:
         """
@@ -170,6 +225,8 @@ class WorkflowAppGenerateTaskPipeline(BasedGenerateTaskPipeline, WorkflowCycleMa
         :return:
         """
         for message in self._queue_manager.listen():
+            if publisher:
+                publisher.publish(message=message)
             event = message.event
 
             if isinstance(event, QueueErrorEvent):
@@ -251,6 +308,10 @@ class WorkflowAppGenerateTaskPipeline(BasedGenerateTaskPipeline, WorkflowCycleMa
             else:
                 continue
 
+        if publisher:
+            publisher.publish(None)
+
+
     def _save_workflow_app_log(self, workflow_run: WorkflowRun) -> None:
         """
         Save workflow app log.

+ 37 - 0
api/core/app/entities/task_entities.py

@@ -69,6 +69,7 @@ class WorkflowTaskState(TaskState):
 
     iteration_nested_node_ids: list[str] = None
 
+
 class AdvancedChatTaskState(WorkflowTaskState):
     """
     AdvancedChatTaskState entity
@@ -86,6 +87,8 @@ class StreamEvent(Enum):
     ERROR = "error"
     MESSAGE = "message"
     MESSAGE_END = "message_end"
+    TTS_MESSAGE = "tts_message"
+    TTS_MESSAGE_END = "tts_message_end"
     MESSAGE_FILE = "message_file"
     MESSAGE_REPLACE = "message_replace"
     AGENT_THOUGHT = "agent_thought"
@@ -130,6 +133,22 @@ class MessageStreamResponse(StreamResponse):
     answer: str
 
 
+class MessageAudioStreamResponse(StreamResponse):
+    """
+    MessageStreamResponse entity
+    """
+    event: StreamEvent = StreamEvent.TTS_MESSAGE
+    audio: str
+
+
+class MessageAudioEndStreamResponse(StreamResponse):
+    """
+    MessageStreamResponse entity
+    """
+    event: StreamEvent = StreamEvent.TTS_MESSAGE_END
+    audio: str
+
+
 class MessageEndStreamResponse(StreamResponse):
     """
     MessageEndStreamResponse entity
@@ -186,6 +205,7 @@ class WorkflowStartStreamResponse(StreamResponse):
     """
     WorkflowStartStreamResponse entity
     """
+
     class Data(BaseModel):
         """
         Data entity
@@ -205,6 +225,7 @@ class WorkflowFinishStreamResponse(StreamResponse):
     """
     WorkflowFinishStreamResponse entity
     """
+
     class Data(BaseModel):
         """
         Data entity
@@ -232,6 +253,7 @@ class NodeStartStreamResponse(StreamResponse):
     """
     NodeStartStreamResponse entity
     """
+
     class Data(BaseModel):
         """
         Data entity
@@ -273,6 +295,7 @@ class NodeFinishStreamResponse(StreamResponse):
     """
     NodeFinishStreamResponse entity
     """
+
     class Data(BaseModel):
         """
         Data entity
@@ -323,10 +346,12 @@ class NodeFinishStreamResponse(StreamResponse):
             }
         }
 
+
 class IterationNodeStartStreamResponse(StreamResponse):
     """
     NodeStartStreamResponse entity
     """
+
     class Data(BaseModel):
         """
         Data entity
@@ -344,10 +369,12 @@ class IterationNodeStartStreamResponse(StreamResponse):
     workflow_run_id: str
     data: Data
 
+
 class IterationNodeNextStreamResponse(StreamResponse):
     """
     NodeStartStreamResponse entity
     """
+
     class Data(BaseModel):
         """
         Data entity
@@ -365,10 +392,12 @@ class IterationNodeNextStreamResponse(StreamResponse):
     workflow_run_id: str
     data: Data
 
+
 class IterationNodeCompletedStreamResponse(StreamResponse):
     """
     NodeCompletedStreamResponse entity
     """
+
     class Data(BaseModel):
         """
         Data entity
@@ -393,10 +422,12 @@ class IterationNodeCompletedStreamResponse(StreamResponse):
     workflow_run_id: str
     data: Data
 
+
 class TextChunkStreamResponse(StreamResponse):
     """
     TextChunkStreamResponse entity
     """
+
     class Data(BaseModel):
         """
         Data entity
@@ -411,6 +442,7 @@ class TextReplaceStreamResponse(StreamResponse):
     """
     TextReplaceStreamResponse entity
     """
+
     class Data(BaseModel):
         """
         Data entity
@@ -473,6 +505,7 @@ class ChatbotAppBlockingResponse(AppBlockingResponse):
     """
     ChatbotAppBlockingResponse entity
     """
+
     class Data(BaseModel):
         """
         Data entity
@@ -492,6 +525,7 @@ class CompletionAppBlockingResponse(AppBlockingResponse):
     """
     CompletionAppBlockingResponse entity
     """
+
     class Data(BaseModel):
         """
         Data entity
@@ -510,6 +544,7 @@ class WorkflowAppBlockingResponse(AppBlockingResponse):
     """
     WorkflowAppBlockingResponse entity
     """
+
     class Data(BaseModel):
         """
         Data entity
@@ -528,10 +563,12 @@ class WorkflowAppBlockingResponse(AppBlockingResponse):
     workflow_run_id: str
     data: Data
 
+
 class WorkflowIterationState(BaseModel):
     """
     WorkflowIterationState entity
     """
+
     class Data(BaseModel):
         """
         Data entity

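The two new TTS events are emitted on the same SSE stream as the existing text events. Roughly, a client sees something like the following (field values are illustrative; the exact serialization is handled by the existing StreamResponse plumbing):

# Illustrative construction of the new events; values are made up.
MessageAudioStreamResponse(task_id="<task-id>", audio="<base64 audio chunk>")
# on the wire, roughly: data: {"event": "tts_message", "task_id": "<task-id>", "audio": "<base64...>"}

MessageAudioEndStreamResponse(task_id="<task-id>", audio="")
# on the wire, roughly: data: {"event": "tts_message_end", "task_id": "<task-id>", "audio": ""}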
+ 61 - 5
api/core/app/task_pipeline/easy_ui_based_generate_task_pipeline.py

@@ -4,6 +4,8 @@ import time
 from collections.abc import Generator
 from typing import Optional, Union, cast
 
+from constants.tts_auto_play_timeout import TTS_AUTO_PLAY_TIMEOUT, TTS_AUTO_PLAY_YIELD_CPU_TIME
+from core.app.apps.advanced_chat.app_generator_tts_publisher import AppGeneratorTTSPublisher, AudioTrunk
 from core.app.apps.base_app_queue_manager import AppQueueManager, PublishFrom
 from core.app.entities.app_invoke_entities import (
     AgentChatAppGenerateEntity,
@@ -32,6 +34,8 @@ from core.app.entities.task_entities import (
     CompletionAppStreamResponse,
     EasyUITaskState,
     ErrorStreamResponse,
+    MessageAudioEndStreamResponse,
+    MessageAudioStreamResponse,
     MessageEndStreamResponse,
     StreamResponse,
 )
@@ -87,6 +91,7 @@ class EasyUIBasedGenerateTaskPipeline(BasedGenerateTaskPipeline, MessageCycleMan
         """
         super().__init__(application_generate_entity, queue_manager, user, stream)
         self._model_config = application_generate_entity.model_conf
+        self._app_config = application_generate_entity.app_config
         self._conversation = conversation
         self._message = message
 
@@ -102,7 +107,7 @@ class EasyUIBasedGenerateTaskPipeline(BasedGenerateTaskPipeline, MessageCycleMan
         self._conversation_name_generate_thread = None
 
     def process(
-        self,
+            self,
     ) -> Union[
         ChatbotAppBlockingResponse,
         CompletionAppBlockingResponse,
@@ -123,7 +128,7 @@ class EasyUIBasedGenerateTaskPipeline(BasedGenerateTaskPipeline, MessageCycleMan
                 self._application_generate_entity.query
             )
 
-        generator = self._process_stream_response(
+        generator = self._wrapper_process_stream_response(
             trace_manager=self._application_generate_entity.trace_manager
         )
         if self._stream:
@@ -202,14 +207,64 @@ class EasyUIBasedGenerateTaskPipeline(BasedGenerateTaskPipeline, MessageCycleMan
                     stream_response=stream_response
                 )
 
+    def _listenAudioMsg(self, publisher, task_id: str):
+        if publisher is None:
+            return None
+        audio_msg: AudioTrunk = publisher.checkAndGetAudio()
+        if audio_msg and audio_msg.status != "finish":
+            # audio_str = audio_msg.audio.decode('utf-8', errors='ignore')
+            return MessageAudioStreamResponse(audio=audio_msg.audio, task_id=task_id)
+        return None
+
+    def _wrapper_process_stream_response(self, trace_manager: Optional[TraceQueueManager] = None) -> \
+            Generator[StreamResponse, None, None]:
+
+        tenant_id = self._application_generate_entity.app_config.tenant_id
+        task_id = self._application_generate_entity.task_id
+        publisher = None
+        text_to_speech_dict = self._app_config.app_model_config_dict.get('text_to_speech')
+        if text_to_speech_dict and text_to_speech_dict.get('autoPlay') == 'enabled' and text_to_speech_dict.get('enabled'):
+            publisher = AppGeneratorTTSPublisher(tenant_id, text_to_speech_dict.get('voice', None))
+        for response in self._process_stream_response(publisher=publisher, trace_manager=trace_manager):
+            while True:
+                audio_response = self._listenAudioMsg(publisher, task_id)
+                if audio_response:
+                    yield audio_response
+                else:
+                    break
+            yield response
+
+        start_listener_time = time.time()
+        # timeout
+        while (time.time() - start_listener_time) < TTS_AUTO_PLAY_TIMEOUT:
+            if publisher is None:
+                break
+            audio = publisher.checkAndGetAudio()
+            if audio is None:
+                # release cpu
+                # sleep 20 ms ( 40ms => 1280 byte audio file,20ms => 640 byte audio file)
+                time.sleep(TTS_AUTO_PLAY_YIELD_CPU_TIME)
+                continue
+            if audio.status == "finish":
+                break
+            else:
+                start_listener_time = time.time()
+                yield MessageAudioStreamResponse(audio=audio.audio,
+                                                 task_id=task_id)
+        yield MessageAudioEndStreamResponse(audio='', task_id=task_id)
+
     def _process_stream_response(
-        self, trace_manager: Optional[TraceQueueManager] = None
+            self,
+            publisher: AppGeneratorTTSPublisher,
+            trace_manager: Optional[TraceQueueManager] = None
     ) -> Generator[StreamResponse, None, None]:
         """
         Process stream response.
         :return:
         """
         for message in self._queue_manager.listen():
+            if publisher:
+                publisher.publish(message)
             event = message.event
 
             if isinstance(event, QueueErrorEvent):
@@ -272,12 +327,13 @@ class EasyUIBasedGenerateTaskPipeline(BasedGenerateTaskPipeline, MessageCycleMan
                 yield self._ping_stream_response()
             else:
                 continue
-
+        if publisher:
+            publisher.publish(None)
         if self._conversation_name_generate_thread:
             self._conversation_name_generate_thread.join()
 
     def _save_message(
-        self, trace_manager: Optional[TraceQueueManager] = None
+            self, trace_manager: Optional[TraceQueueManager] = None
     ) -> None:
         """
         Save message.

+ 2 - 3
api/core/model_manager.py

@@ -264,7 +264,7 @@ class ModelInstance:
             user=user
         )
 
-    def invoke_tts(self, content_text: str, tenant_id: str, voice: str, streaming: bool, user: Optional[str] = None) \
+    def invoke_tts(self, content_text: str, tenant_id: str, voice: str, user: Optional[str] = None) \
             -> str:
         """
         Invoke large language tts model
@@ -287,8 +287,7 @@ class ModelInstance:
             content_text=content_text,
             user=user,
             tenant_id=tenant_id,
-            voice=voice,
-            streaming=streaming
+            voice=voice
         )
 
     def _round_robin_invoke(self, function: Callable, *args, **kwargs):

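With the streaming flag gone, a caller of ModelInstance.invoke_tts now passes only the text, tenant, voice and user. A hedged sketch of a call site (tenant id and voice are placeholders):

from core.model_manager import ModelManager
from core.model_runtime.entities.model_entities import ModelType

model_instance = ModelManager().get_default_model_instance(
    tenant_id="<tenant-id>",
    model_type=ModelType.TTS,
)
audio = model_instance.invoke_tts(
    content_text="Hello there.",
    tenant_id="<tenant-id>",
    voice="alloy",        # must be a voice supported by the configured TTS model
    user="tts-demo",
)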
+ 27 - 24
api/core/model_runtime/model_providers/__base/tts_model.py

@@ -1,4 +1,6 @@
 import hashlib
+import logging
+import re
 import subprocess
 import uuid
 from abc import abstractmethod
@@ -10,7 +12,7 @@ from core.model_runtime.entities.model_entities import ModelPropertyKey, ModelTy
 from core.model_runtime.errors.invoke import InvokeBadRequestError
 from core.model_runtime.model_providers.__base.ai_model import AIModel
 
-
+logger = logging.getLogger(__name__)
 class TTSModel(AIModel):
     """
     Model class for ttstext model.
@@ -20,7 +22,7 @@ class TTSModel(AIModel):
     # pydantic configs
     model_config = ConfigDict(protected_namespaces=())
 
-    def invoke(self, model: str, tenant_id: str, credentials: dict, content_text: str, voice: str, streaming: bool,
+    def invoke(self, model: str, tenant_id: str, credentials: dict, content_text: str, voice: str,
                user: Optional[str] = None):
         """
         Invoke large language model
@@ -35,14 +37,15 @@ class TTSModel(AIModel):
         :return: translated audio file
         :return: translated audio file
         """
         """
         try:
         try:
+            logger.info(f"Invoke TTS model: {model} , invoke content : {content_text}")
             self._is_ffmpeg_installed()
             self._is_ffmpeg_installed()
-            return self._invoke(model=model, credentials=credentials, user=user, streaming=streaming,
+            return self._invoke(model=model, credentials=credentials, user=user,
                                 content_text=content_text, voice=voice, tenant_id=tenant_id)
                                 content_text=content_text, voice=voice, tenant_id=tenant_id)
         except Exception as e:
         except Exception as e:
             raise self._transform_invoke_error(e)
             raise self._transform_invoke_error(e)
 
 
     @abstractmethod
     @abstractmethod
-    def _invoke(self, model: str, tenant_id: str, credentials: dict, content_text: str, voice: str, streaming: bool,
+    def _invoke(self, model: str, tenant_id: str, credentials: dict, content_text: str, voice: str,
                 user: Optional[str] = None):
                 user: Optional[str] = None):
         """
         """
         Invoke large language model
         Invoke large language model
@@ -123,26 +126,26 @@ class TTSModel(AIModel):
             return model_schema.model_properties[ModelPropertyKey.MAX_WORKERS]
             return model_schema.model_properties[ModelPropertyKey.MAX_WORKERS]
 
 
     @staticmethod
     @staticmethod
-    def _split_text_into_sentences(text: str, limit: int, delimiters=None):
-        if delimiters is None:
-            delimiters = set('。!?;\n')
-
-        buf = []
-        word_count = 0
-        for char in text:
-            buf.append(char)
-            if char in delimiters:
-                if word_count >= limit:
-                    yield ''.join(buf)
-                    buf = []
-                    word_count = 0
-                else:
-                    word_count += 1
-            else:
-                word_count += 1
-
-        if buf:
-            yield ''.join(buf)
+    def _split_text_into_sentences(org_text, max_length=2000, pattern=r'[。.!?]'):
+        match = re.compile(pattern)
+        tx = match.finditer(org_text)
+        start = 0
+        result = []
+        one_sentence = ''
+        for i in tx:
+            end = i.regs[0][1]
+            tmp = org_text[start:end]
+            if len(one_sentence + tmp) > max_length:
+                result.append(one_sentence)
+                one_sentence = ''
+            one_sentence += tmp
+            start = end
+        last_sens = org_text[start:]
+        if last_sens:
+            one_sentence += last_sens
+        if one_sentence != '':
+            result.append(one_sentence)
+        return result
 
 
     @staticmethod
     @staticmethod
     def _is_ffmpeg_installed():
     def _is_ffmpeg_installed():
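
The rewritten _split_text_into_sentences groups regex-delimited sentences into chunks that stay under max_length characters instead of counting words around delimiters. A standalone sketch that mirrors its behaviour for illustration (same logic, slightly compacted):

    import re

    def split_text_into_sentences(org_text: str, max_length: int = 2000,
                                  pattern: str = r'[。.!?]') -> list[str]:
        """Group sentence-sized pieces (split on `pattern` delimiters) into
        chunks no longer than `max_length` characters."""
        result, chunk, start = [], '', 0
        for match in re.finditer(pattern, org_text):
            end = match.end()
            piece = org_text[start:end]
            if len(chunk + piece) > max_length:
                result.append(chunk)
                chunk = ''
            chunk += piece
            start = end
        chunk += org_text[start:]  # keep any trailing text after the last delimiter
        if chunk:
            result.append(chunk)
        return result

    print(split_text_into_sentences('Hello world. How are you? Fine!', max_length=15))
    # ['Hello world.', ' How are you?', ' Fine!']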

+ 31 - 37
api/core/model_runtime/model_providers/azure_openai/tts/tts.py

@@ -4,7 +4,7 @@ from functools import reduce
 from io import BytesIO
 from io import BytesIO
 from typing import Optional
 from typing import Optional
 
 
-from flask import Response, stream_with_context
+from flask import Response
 from openai import AzureOpenAI
 from openai import AzureOpenAI
 from pydub import AudioSegment
 from pydub import AudioSegment
 
 
@@ -14,7 +14,6 @@ from core.model_runtime.errors.validate import CredentialsValidateFailedError
 from core.model_runtime.model_providers.__base.tts_model import TTSModel
 from core.model_runtime.model_providers.__base.tts_model import TTSModel
 from core.model_runtime.model_providers.azure_openai._common import _CommonAzureOpenAI
 from core.model_runtime.model_providers.azure_openai._common import _CommonAzureOpenAI
 from core.model_runtime.model_providers.azure_openai._constant import TTS_BASE_MODELS, AzureBaseModel
 from core.model_runtime.model_providers.azure_openai._constant import TTS_BASE_MODELS, AzureBaseModel
-from extensions.ext_storage import storage
 
 
 
 
 class AzureOpenAIText2SpeechModel(_CommonAzureOpenAI, TTSModel):
 class AzureOpenAIText2SpeechModel(_CommonAzureOpenAI, TTSModel):
@@ -23,7 +22,7 @@ class AzureOpenAIText2SpeechModel(_CommonAzureOpenAI, TTSModel):
     """
     """
 
 
     def _invoke(self, model: str, tenant_id: str, credentials: dict,
     def _invoke(self, model: str, tenant_id: str, credentials: dict,
-                content_text: str, voice: str, streaming: bool, user: Optional[str] = None) -> any:
+                content_text: str, voice: str, user: Optional[str] = None) -> any:
         """
         """
         _invoke text2speech model
         _invoke text2speech model
 
 
@@ -32,30 +31,23 @@ class AzureOpenAIText2SpeechModel(_CommonAzureOpenAI, TTSModel):
         :param credentials: model credentials
         :param credentials: model credentials
         :param content_text: text content to be translated
         :param content_text: text content to be translated
         :param voice: model timbre
         :param voice: model timbre
-        :param streaming: output is streaming
         :param user: unique user id
         :param user: unique user id
         :return: text translated to audio file
         :return: text translated to audio file
         """
         """
-        audio_type = self._get_model_audio_type(model, credentials)
         if not voice or voice not in [d['value'] for d in self.get_tts_model_voices(model=model, credentials=credentials)]:
         if not voice or voice not in [d['value'] for d in self.get_tts_model_voices(model=model, credentials=credentials)]:
             voice = self._get_model_default_voice(model, credentials)
             voice = self._get_model_default_voice(model, credentials)
-        if streaming:
-            return Response(stream_with_context(self._tts_invoke_streaming(model=model,
-                                                                           credentials=credentials,
-                                                                           content_text=content_text,
-                                                                           tenant_id=tenant_id,
-                                                                           voice=voice)),
-                            status=200, mimetype=f'audio/{audio_type}')
-        else:
-            return self._tts_invoke(model=model, credentials=credentials, content_text=content_text, voice=voice)
-
-    def validate_credentials(self, model: str, credentials: dict, user: Optional[str] = None) -> None:
+
+        return self._tts_invoke_streaming(model=model,
+                                          credentials=credentials,
+                                          content_text=content_text,
+                                          voice=voice)
+
+    def validate_credentials(self, model: str, credentials: dict) -> None:
         """
         """
         validate credentials text2speech model
         validate credentials text2speech model
 
 
         :param model: model name
         :param model: model name
         :param credentials: model credentials
         :param credentials: model credentials
-        :param user: unique user id
         :return: text translated to audio file
         :return: text translated to audio file
         """
         """
         try:
         try:
@@ -82,7 +74,7 @@ class AzureOpenAIText2SpeechModel(_CommonAzureOpenAI, TTSModel):
         word_limit = self._get_model_word_limit(model, credentials)
         word_limit = self._get_model_word_limit(model, credentials)
         max_workers = self._get_model_workers_limit(model, credentials)
         max_workers = self._get_model_workers_limit(model, credentials)
         try:
         try:
-            sentences = list(self._split_text_into_sentences(text=content_text, limit=word_limit))
+            sentences = list(self._split_text_into_sentences(org_text=content_text, max_length=word_limit))
             audio_bytes_list = []
             audio_bytes_list = []
 
 
             # Create a thread pool and map the function to the list of sentences
             # Create a thread pool and map the function to the list of sentences
@@ -107,34 +99,37 @@ class AzureOpenAIText2SpeechModel(_CommonAzureOpenAI, TTSModel):
         except Exception as ex:
         except Exception as ex:
             raise InvokeBadRequestError(str(ex))
             raise InvokeBadRequestError(str(ex))
 
 
-    # Todo: To improve the streaming function
-    def _tts_invoke_streaming(self, model: str, tenant_id: str, credentials: dict, content_text: str,
+    def _tts_invoke_streaming(self, model: str,  credentials: dict, content_text: str,
                               voice: str) -> any:
                               voice: str) -> any:
         """
         """
         _tts_invoke_streaming text2speech model
         _tts_invoke_streaming text2speech model
-
         :param model: model name
         :param model: model name
-        :param tenant_id: user tenant id
         :param credentials: model credentials
         :param credentials: model credentials
         :param content_text: text content to be translated
         :param content_text: text content to be translated
         :param voice: model timbre
         :param voice: model timbre
         :return: text translated to audio file
         :return: text translated to audio file
         """
         """
-        # transform credentials to kwargs for model instance
-        credentials_kwargs = self._to_credential_kwargs(credentials)
-        if not voice or voice not in self.get_tts_model_voices(model=model, credentials=credentials):
-            voice = self._get_model_default_voice(model, credentials)
-        word_limit = self._get_model_word_limit(model, credentials)
-        audio_type = self._get_model_audio_type(model, credentials)
-        tts_file_id = self._get_file_name(content_text)
-        file_path = f'generate_files/audio/{tenant_id}/{tts_file_id}.{audio_type}'
         try:
         try:
+            # doc: https://platform.openai.com/docs/guides/text-to-speech
+            credentials_kwargs = self._to_credential_kwargs(credentials)
             client = AzureOpenAI(**credentials_kwargs)
             client = AzureOpenAI(**credentials_kwargs)
-            sentences = list(self._split_text_into_sentences(text=content_text, limit=word_limit))
-            for sentence in sentences:
-                response = client.audio.speech.create(model=model, voice=voice, input=sentence.strip())
-                # response.stream_to_file(file_path)
-                storage.save(file_path, response.read())
+            # max font is 4096,there is 3500 limit for each request
+            max_length = 3500
+            if len(content_text) > max_length:
+                sentences = self._split_text_into_sentences(content_text, max_length=max_length)
+                executor = concurrent.futures.ThreadPoolExecutor(max_workers=min(3, len(sentences)))
+                futures = [executor.submit(client.audio.speech.with_streaming_response.create, model=model,
+                                           response_format="mp3",
+                                           input=sentences[i], voice=voice) for i in range(len(sentences))]
+                for index, future in enumerate(futures):
+                    yield from future.result().__enter__().iter_bytes(1024)
+
+            else:
+                response = client.audio.speech.with_streaming_response.create(model=model, voice=voice,
+                                                                              response_format="mp3",
+                                                                              input=content_text.strip())
+
+                yield from response.__enter__().iter_bytes(1024)
         except Exception as ex:
         except Exception as ex:
             raise InvokeBadRequestError(str(ex))
             raise InvokeBadRequestError(str(ex))
 
 
@@ -162,7 +157,7 @@ class AzureOpenAIText2SpeechModel(_CommonAzureOpenAI, TTSModel):
 
 
 
 
     @staticmethod
     @staticmethod
-    def _get_ai_model_entity(base_model_name: str, model: str) -> AzureBaseModel:
+    def _get_ai_model_entity(base_model_name: str, model: str) -> AzureBaseModel | None:
         for ai_model_entity in TTS_BASE_MODELS:
         for ai_model_entity in TTS_BASE_MODELS:
             if ai_model_entity.base_model_name == base_model_name:
             if ai_model_entity.base_model_name == base_model_name:
                 ai_model_entity_copy = copy.deepcopy(ai_model_entity)
                 ai_model_entity_copy = copy.deepcopy(ai_model_entity)
@@ -170,5 +165,4 @@ class AzureOpenAIText2SpeechModel(_CommonAzureOpenAI, TTSModel):
                 ai_model_entity_copy.entity.label.en_US = model
                 ai_model_entity_copy.entity.label.en_US = model
                 ai_model_entity_copy.entity.label.zh_Hans = model
                 ai_model_entity_copy.entity.label.zh_Hans = model
                 return ai_model_entity_copy
                 return ai_model_entity_copy
-
         return None
         return None
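
For texts longer than the per-request limit, the new streaming path splits the input, submits each piece to a small thread pool, and then walks the futures in submission order so the audio comes back in sequence. A self-contained sketch of that ordering pattern with a stand-in synthesis function (fake_synthesize is a placeholder, not the Azure client call):

    import concurrent.futures

    def fake_synthesize(sentence: str) -> bytes:
        # Stand-in for a provider call that returns encoded audio for one chunk.
        return sentence.encode("utf-8")

    def synthesize_in_order(sentences: list[str]):
        """Run synthesis concurrently but yield results in the original order."""
        with concurrent.futures.ThreadPoolExecutor(max_workers=min(3, len(sentences))) as executor:
            futures = [executor.submit(fake_synthesize, s) for s in sentences]
            for future in futures:  # iterating the list (not as_completed) preserves order
                yield future.result()

    for chunk in synthesize_in_order(["First sentence.", "Second sentence.", "Third sentence."]):
        print(chunk)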

+ 1 - 1
api/core/model_runtime/model_providers/openai/tts/tts-1-hd.yaml

@@ -21,7 +21,7 @@ model_properties:
     - mode: 'shimmer'
     - mode: 'shimmer'
       name: 'Shimmer'
       name: 'Shimmer'
       language: [ 'zh-Hans', 'en-US', 'de-DE', 'fr-FR', 'es-ES', 'it-IT', 'th-TH', 'id-ID' ]
       language: [ 'zh-Hans', 'en-US', 'de-DE', 'fr-FR', 'es-ES', 'it-IT', 'th-TH', 'id-ID' ]
-  word_limit: 120
+  word_limit: 3500
   audio_type: 'mp3'
   audio_type: 'mp3'
   max_workers: 5
   max_workers: 5
 pricing:
 pricing:

+ 1 - 1
api/core/model_runtime/model_providers/openai/tts/tts-1.yaml

@@ -21,7 +21,7 @@ model_properties:
     - mode: 'shimmer'
     - mode: 'shimmer'
       name: 'Shimmer'
       name: 'Shimmer'
       language: ['zh-Hans', 'en-US', 'de-DE', 'fr-FR', 'es-ES', 'it-IT', 'th-TH', 'id-ID']
       language: ['zh-Hans', 'en-US', 'de-DE', 'fr-FR', 'es-ES', 'it-IT', 'th-TH', 'id-ID']
-  word_limit: 120
+  word_limit: 3500
   audio_type: 'mp3'
   audio_type: 'mp3'
   max_workers: 5
   max_workers: 5
 pricing:
 pricing:

+ 31 - 31
api/core/model_runtime/model_providers/openai/tts/tts.py

@@ -3,7 +3,7 @@ from functools import reduce
 from io import BytesIO
 from io import BytesIO
 from typing import Optional
 from typing import Optional
 
 
-from flask import Response, stream_with_context
+from flask import Response
 from openai import OpenAI
 from openai import OpenAI
 from pydub import AudioSegment
 from pydub import AudioSegment
 
 
@@ -11,7 +11,6 @@ from core.model_runtime.errors.invoke import InvokeBadRequestError
 from core.model_runtime.errors.validate import CredentialsValidateFailedError
 from core.model_runtime.errors.validate import CredentialsValidateFailedError
 from core.model_runtime.model_providers.__base.tts_model import TTSModel
 from core.model_runtime.model_providers.__base.tts_model import TTSModel
 from core.model_runtime.model_providers.openai._common import _CommonOpenAI
 from core.model_runtime.model_providers.openai._common import _CommonOpenAI
-from extensions.ext_storage import storage
 
 
 
 
 class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel):
 class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel):
@@ -20,7 +19,7 @@ class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel):
     """
     """
 
 
     def _invoke(self, model: str, tenant_id: str, credentials: dict,
     def _invoke(self, model: str, tenant_id: str, credentials: dict,
-                content_text: str, voice: str, streaming: bool, user: Optional[str] = None) -> any:
+                content_text: str, voice: str, user: Optional[str] = None) -> any:
         """
         """
         _invoke text2speech model
         _invoke text2speech model
 
 
@@ -29,22 +28,17 @@ class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel):
         :param credentials: model credentials
         :param credentials: model credentials
         :param content_text: text content to be translated
         :param content_text: text content to be translated
         :param voice: model timbre
         :param voice: model timbre
-        :param streaming: output is streaming
         :param user: unique user id
         :param user: unique user id
         :return: text translated to audio file
         :return: text translated to audio file
         """
         """
-        audio_type = self._get_model_audio_type(model, credentials)
+
         if not voice or voice not in [d['value'] for d in self.get_tts_model_voices(model=model, credentials=credentials)]:
         if not voice or voice not in [d['value'] for d in self.get_tts_model_voices(model=model, credentials=credentials)]:
             voice = self._get_model_default_voice(model, credentials)
             voice = self._get_model_default_voice(model, credentials)
-        if streaming:
-            return Response(stream_with_context(self._tts_invoke_streaming(model=model,
-                                                                           credentials=credentials,
-                                                                           content_text=content_text,
-                                                                           tenant_id=tenant_id,
-                                                                           voice=voice)),
-                            status=200, mimetype=f'audio/{audio_type}')
-        else:
-            return self._tts_invoke(model=model, credentials=credentials, content_text=content_text, voice=voice)
+        # if streaming:
+        return self._tts_invoke_streaming(model=model,
+                                          credentials=credentials,
+                                          content_text=content_text,
+                                          voice=voice)
 
 
     def validate_credentials(self, model: str, credentials: dict, user: Optional[str] = None) -> None:
     def validate_credentials(self, model: str, credentials: dict, user: Optional[str] = None) -> None:
         """
         """
@@ -79,7 +73,7 @@ class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel):
         word_limit = self._get_model_word_limit(model, credentials)
         word_limit = self._get_model_word_limit(model, credentials)
         max_workers = self._get_model_workers_limit(model, credentials)
         max_workers = self._get_model_workers_limit(model, credentials)
         try:
         try:
-            sentences = list(self._split_text_into_sentences(text=content_text, limit=word_limit))
+            sentences = list(self._split_text_into_sentences(org_text=content_text, max_length=word_limit))
             audio_bytes_list = []
             audio_bytes_list = []
 
 
             # Create a thread pool and map the function to the list of sentences
             # Create a thread pool and map the function to the list of sentences
@@ -104,34 +98,40 @@ class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel):
         except Exception as ex:
         except Exception as ex:
             raise InvokeBadRequestError(str(ex))
             raise InvokeBadRequestError(str(ex))
 
 
-    # Todo: To improve the streaming function
-    def _tts_invoke_streaming(self, model: str, tenant_id: str, credentials: dict, content_text: str,
+
+    def _tts_invoke_streaming(self, model: str, credentials: dict, content_text: str,
                               voice: str) -> any:
                               voice: str) -> any:
         """
         """
         _tts_invoke_streaming text2speech model
         _tts_invoke_streaming text2speech model
 
 
         :param model: model name
         :param model: model name
-        :param tenant_id: user tenant id
         :param credentials: model credentials
         :param credentials: model credentials
         :param content_text: text content to be translated
         :param content_text: text content to be translated
         :param voice: model timbre
         :param voice: model timbre
         :return: text translated to audio file
         :return: text translated to audio file
         """
         """
-        # transform credentials to kwargs for model instance
-        credentials_kwargs = self._to_credential_kwargs(credentials)
-        if not voice or voice not in self.get_tts_model_voices(model=model, credentials=credentials):
-            voice = self._get_model_default_voice(model, credentials)
-        word_limit = self._get_model_word_limit(model, credentials)
-        audio_type = self._get_model_audio_type(model, credentials)
-        tts_file_id = self._get_file_name(content_text)
-        file_path = f'generate_files/audio/{tenant_id}/{tts_file_id}.{audio_type}'
         try:
         try:
+            # doc: https://platform.openai.com/docs/guides/text-to-speech
+            credentials_kwargs = self._to_credential_kwargs(credentials)
             client = OpenAI(**credentials_kwargs)
             client = OpenAI(**credentials_kwargs)
-            sentences = list(self._split_text_into_sentences(text=content_text, limit=word_limit))
-            for sentence in sentences:
-                response = client.audio.speech.create(model=model, voice=voice, input=sentence.strip())
-                # response.stream_to_file(file_path)
-                storage.save(file_path, response.read())
+            if not voice or voice not in self.get_tts_model_voices(model=model, credentials=credentials):
+                voice = self._get_model_default_voice(model, credentials)
+            word_limit = self._get_model_word_limit(model, credentials)
+            if len(content_text) > word_limit:
+                sentences = self._split_text_into_sentences(content_text, max_length=word_limit)
+                executor = concurrent.futures.ThreadPoolExecutor(max_workers=min(3, len(sentences)))
+                futures = [executor.submit(client.audio.speech.with_streaming_response.create, model=model,
+                                           response_format="mp3",
+                                           input=sentences[i], voice=voice) for i in range(len(sentences))]
+                for index, future in enumerate(futures):
+                    yield from future.result().__enter__().iter_bytes(1024)
+
+            else:
+                response = client.audio.speech.with_streaming_response.create(model=model, voice=voice,
+                                                                              response_format="mp3",
+                                                                              input=content_text.strip())
+
+                yield from response.__enter__().iter_bytes(1024)
         except Exception as ex:
         except Exception as ex:
             raise InvokeBadRequestError(str(ex))
             raise InvokeBadRequestError(str(ex))
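
Both OpenAI-style providers use the SDK's streaming response helper and call __enter__() directly so the bytes can keep flowing after the method returns. A hedged sketch of consuming a single request with an explicit context manager instead (model name, voice and key are placeholders; with_streaming_response and iter_bytes are the same calls already used in the diff above):

    from openai import OpenAI

    def stream_speech(api_key: str, text: str, voice: str = "alloy"):
        """Yield mp3 byte chunks for `text` from a single speech request."""
        client = OpenAI(api_key=api_key)
        with client.audio.speech.with_streaming_response.create(
            model="tts-1",               # placeholder model name
            voice=voice,
            response_format="mp3",
            input=text.strip(),
        ) as response:
            yield from response.iter_bytes(1024)

    # for chunk in stream_speech("sk-...", "Hello, streaming TTS"): forward chunk to the client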
 
 

+ 1 - 1
api/core/model_runtime/model_providers/tongyi/tts/tts-1.yaml

@@ -129,7 +129,7 @@ model_properties:
     - mode: "sambert-waan-v1"
     - mode: "sambert-waan-v1"
       name: "Waan(泰语女声)"
       name: "Waan(泰语女声)"
       language: [ "th-TH" ]
       language: [ "th-TH" ]
-  word_limit: 120
+  word_limit: 7000
   audio_type: 'mp3'
   audio_type: 'mp3'
   max_workers: 5
   max_workers: 5
 pricing:
 pricing:

+ 67 - 30
api/core/model_runtime/model_providers/tongyi/tts/tts.py

@@ -1,17 +1,21 @@
 import concurrent.futures
 import concurrent.futures
+import threading
 from functools import reduce
 from functools import reduce
 from io import BytesIO
 from io import BytesIO
+from queue import Queue
 from typing import Optional
 from typing import Optional
 
 
 import dashscope
 import dashscope
-from flask import Response, stream_with_context
+from dashscope import SpeechSynthesizer
+from dashscope.api_entities.dashscope_response import SpeechSynthesisResponse
+from dashscope.audio.tts import ResultCallback, SpeechSynthesisResult
+from flask import Response
 from pydub import AudioSegment
 from pydub import AudioSegment
 
 
 from core.model_runtime.errors.invoke import InvokeBadRequestError
 from core.model_runtime.errors.invoke import InvokeBadRequestError
 from core.model_runtime.errors.validate import CredentialsValidateFailedError
 from core.model_runtime.errors.validate import CredentialsValidateFailedError
 from core.model_runtime.model_providers.__base.tts_model import TTSModel
 from core.model_runtime.model_providers.__base.tts_model import TTSModel
 from core.model_runtime.model_providers.tongyi._common import _CommonTongyi
 from core.model_runtime.model_providers.tongyi._common import _CommonTongyi
-from extensions.ext_storage import storage
 
 
 
 
 class TongyiText2SpeechModel(_CommonTongyi, TTSModel):
 class TongyiText2SpeechModel(_CommonTongyi, TTSModel):
@@ -19,7 +23,7 @@ class TongyiText2SpeechModel(_CommonTongyi, TTSModel):
     Model class for Tongyi Speech to text model.
     Model class for Tongyi Speech to text model.
     """
     """
 
 
-    def _invoke(self, model: str, tenant_id: str, credentials: dict, content_text: str, voice: str, streaming: bool,
+    def _invoke(self, model: str, tenant_id: str, credentials: dict, content_text: str, voice: str,
                 user: Optional[str] = None) -> any:
                 user: Optional[str] = None) -> any:
         """
         """
         _invoke text2speech model
         _invoke text2speech model
@@ -29,22 +33,17 @@ class TongyiText2SpeechModel(_CommonTongyi, TTSModel):
         :param credentials: model credentials
         :param credentials: model credentials
         :param voice: model timbre
         :param voice: model timbre
         :param content_text: text content to be translated
         :param content_text: text content to be translated
-        :param streaming: output is streaming
         :param user: unique user id
         :param user: unique user id
         :return: text translated to audio file
         :return: text translated to audio file
         """
         """
-        audio_type = self._get_model_audio_type(model, credentials)
-        if not voice or voice not in [d['value'] for d in self.get_tts_model_voices(model=model, credentials=credentials)]:
+        if not voice or voice not in [d['value'] for d in
+                                      self.get_tts_model_voices(model=model, credentials=credentials)]:
             voice = self._get_model_default_voice(model, credentials)
             voice = self._get_model_default_voice(model, credentials)
-        if streaming:
-            return Response(stream_with_context(self._tts_invoke_streaming(model=model,
-                                                                           credentials=credentials,
-                                                                           content_text=content_text,
-                                                                           voice=voice,
-                                                                           tenant_id=tenant_id)),
-                            status=200, mimetype=f'audio/{audio_type}')
-        else:
-            return self._tts_invoke(model=model, credentials=credentials, content_text=content_text, voice=voice)
+
+        return self._tts_invoke_streaming(model=model,
+                                          credentials=credentials,
+                                          content_text=content_text,
+                                          voice=voice)
 
 
     def validate_credentials(self, model: str, credentials: dict, user: Optional[str] = None) -> None:
     def validate_credentials(self, model: str, credentials: dict, user: Optional[str] = None) -> None:
         """
         """
@@ -79,7 +78,7 @@ class TongyiText2SpeechModel(_CommonTongyi, TTSModel):
         word_limit = self._get_model_word_limit(model, credentials)
         word_limit = self._get_model_word_limit(model, credentials)
         max_workers = self._get_model_workers_limit(model, credentials)
         max_workers = self._get_model_workers_limit(model, credentials)
         try:
         try:
-            sentences = list(self._split_text_into_sentences(text=content_text, limit=word_limit))
+            sentences = list(self._split_text_into_sentences(org_text=content_text, max_length=word_limit))
             audio_bytes_list = []
             audio_bytes_list = []
 
 
             # Create a thread pool and map the function to the list of sentences
             # Create a thread pool and map the function to the list of sentences
@@ -105,14 +104,12 @@ class TongyiText2SpeechModel(_CommonTongyi, TTSModel):
         except Exception as ex:
         except Exception as ex:
             raise InvokeBadRequestError(str(ex))
             raise InvokeBadRequestError(str(ex))
 
 
-    # Todo: To improve the streaming function
-    def _tts_invoke_streaming(self, model: str, tenant_id: str, credentials: dict, content_text: str,
+    def _tts_invoke_streaming(self, model: str, credentials: dict, content_text: str,
                               voice: str) -> any:
                               voice: str) -> any:
         """
         """
         _tts_invoke_streaming text2speech model
         _tts_invoke_streaming text2speech model
 
 
         :param model: model name
         :param model: model name
-        :param tenant_id: user tenant id
         :param credentials: model credentials
         :param credentials: model credentials
         :param voice: model timbre
         :param voice: model timbre
         :param content_text: text content to be translated
         :param content_text: text content to be translated
@@ -120,18 +117,32 @@ class TongyiText2SpeechModel(_CommonTongyi, TTSModel):
         """
         """
         word_limit = self._get_model_word_limit(model, credentials)
         word_limit = self._get_model_word_limit(model, credentials)
         audio_type = self._get_model_audio_type(model, credentials)
         audio_type = self._get_model_audio_type(model, credentials)
-        tts_file_id = self._get_file_name(content_text)
-        file_path = f'generate_files/audio/{tenant_id}/{tts_file_id}.{audio_type}'
         try:
         try:
-            sentences = list(self._split_text_into_sentences(text=content_text, limit=word_limit))
-            for sentence in sentences:
-                response = dashscope.audio.tts.SpeechSynthesizer.call(model=voice, sample_rate=48000,
-                                                                      api_key=credentials.get('dashscope_api_key'),
-                                                                      text=sentence.strip(),
-                                                                      format=audio_type, word_timestamp_enabled=True,
-                                                                      phoneme_timestamp_enabled=True)
-                if isinstance(response.get_audio_data(), bytes):
-                    storage.save(file_path, response.get_audio_data())
+            audio_queue: Queue = Queue()
+            callback = Callback(queue=audio_queue)
+
+            def invoke_remote(content, v, api_key, cb, at, wl):
+                if len(content) < word_limit:
+                    sentences = [content]
+                else:
+                    sentences = list(self._split_text_into_sentences(org_text=content, max_length=wl))
+                for sentence in sentences:
+                    SpeechSynthesizer.call(model=v, sample_rate=16000,
+                                           api_key=api_key,
+                                           text=sentence.strip(),
+                                           callback=cb,
+                                           format=at, word_timestamp_enabled=True,
+                                           phoneme_timestamp_enabled=True)
+
+            threading.Thread(target=invoke_remote, args=(
+                content_text, voice, credentials.get('dashscope_api_key'), callback, audio_type, word_limit)).start()
+
+            while True:
+                audio = audio_queue.get()
+                if audio is None:
+                    break
+                yield audio
+
         except Exception as ex:
         except Exception as ex:
             raise InvokeBadRequestError(str(ex))
             raise InvokeBadRequestError(str(ex))
 
 
@@ -152,3 +163,29 @@ class TongyiText2SpeechModel(_CommonTongyi, TTSModel):
                                                               format=audio_type)
                                                               format=audio_type)
         if isinstance(response.get_audio_data(), bytes):
         if isinstance(response.get_audio_data(), bytes):
             return response.get_audio_data()
             return response.get_audio_data()
+
+
+class Callback(ResultCallback):
+
+    def __init__(self, queue: Queue):
+        self._queue = queue
+
+    def on_open(self):
+        pass
+
+    def on_complete(self):
+        self._queue.put(None)
+        self._queue.task_done()
+
+    def on_error(self, response: SpeechSynthesisResponse):
+        self._queue.put(None)
+        self._queue.task_done()
+
+    def on_close(self):
+        self._queue.put(None)
+        self._queue.task_done()
+
+    def on_event(self, result: SpeechSynthesisResult):
+        ad = result.get_audio_frame()
+        if ad:
+            self._queue.put(ad)
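
The Tongyi provider bridges DashScope's callback interface to a Python generator: a worker thread kicks off synthesis, the callback pushes each audio frame into a Queue, and complete/error/close all push a None sentinel that ends the loop. A provider-agnostic sketch of the same bridge (synthesize_with_callbacks is a stand-in, not the DashScope SDK):

    import threading
    from queue import Queue

    def synthesize_with_callbacks(text: str, on_frame, on_done):
        # Stand-in for an SDK that delivers audio frames through callbacks.
        for word in text.split():
            on_frame(word.encode("utf-8"))
        on_done()

    def stream_audio(text: str):
        """Yield audio frames produced on a background thread, ending on a None sentinel."""
        frames: Queue = Queue()
        worker = threading.Thread(
            target=synthesize_with_callbacks,
            args=(text, frames.put, lambda: frames.put(None)),
        )
        worker.start()
        while True:
            frame = frames.get()
            if frame is None:  # completion, error and close all push the sentinel
                break
            yield frame

    print(list(stream_audio("streaming tts demo")))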

+ 1 - 1
api/pyproject.toml

@@ -49,7 +49,7 @@ ignore = [
     "B006", # mutable-argument-default
     "B006", # mutable-argument-default
     "B007", # unused-loop-control-variable
     "B007", # unused-loop-control-variable
     "B026", # star-arg-unpacking-after-keyword-arg
     "B026", # star-arg-unpacking-after-keyword-arg
-    "B901", # return-in-generator
+#    "B901", # return-in-generator
     "B904", # raise-without-from-inside-except
     "B904", # raise-without-from-inside-except
     "B905", # zip-without-explicit-strict
     "B905", # zip-without-explicit-strict
 ]
 ]

+ 2 - 0
api/services/app_service.py

@@ -123,6 +123,8 @@ class AppService:
         app.icon = args['icon']
         app.icon = args['icon']
         app.icon_background = args['icon_background']
         app.icon_background = args['icon_background']
         app.tenant_id = tenant_id
         app.tenant_id = tenant_id
+        app.api_rph = args.get('api_rph', 0)
+        app.api_rpm = args.get('api_rpm', 0)
 
 
         db.session.add(app)
         db.session.add(app)
         db.session.flush()
         db.session.flush()

+ 66 - 40
api/services/audio_service.py

@@ -1,11 +1,12 @@
 import io
 import io
+import logging
 from typing import Optional
 from typing import Optional
 
 
 from werkzeug.datastructures import FileStorage
 from werkzeug.datastructures import FileStorage
 
 
 from core.model_manager import ModelManager
 from core.model_manager import ModelManager
 from core.model_runtime.entities.model_entities import ModelType
 from core.model_runtime.entities.model_entities import ModelType
-from models.model import App, AppMode, AppModelConfig
+from models.model import App, AppMode, AppModelConfig, Message
 from services.errors.audio import (
 from services.errors.audio import (
     AudioTooLargeServiceError,
     AudioTooLargeServiceError,
     NoAudioUploadedServiceError,
     NoAudioUploadedServiceError,
@@ -18,6 +19,8 @@ FILE_SIZE = 30
 FILE_SIZE_LIMIT = FILE_SIZE * 1024 * 1024
 FILE_SIZE_LIMIT = FILE_SIZE * 1024 * 1024
 ALLOWED_EXTENSIONS = ['mp3', 'mp4', 'mpeg', 'mpga', 'm4a', 'wav', 'webm', 'amr']
 ALLOWED_EXTENSIONS = ['mp3', 'mp4', 'mpeg', 'mpga', 'm4a', 'wav', 'webm', 'amr']
 
 
+logger = logging.getLogger(__name__)
+
 
 
 class AudioService:
 class AudioService:
     @classmethod
     @classmethod
@@ -64,51 +67,74 @@ class AudioService:
         return {"text": model_instance.invoke_speech2text(file=buffer, user=end_user)}
         return {"text": model_instance.invoke_speech2text(file=buffer, user=end_user)}
 
 
     @classmethod
     @classmethod
-    def transcript_tts(cls, app_model: App, text: str, streaming: bool,
-                       voice: Optional[str] = None, end_user: Optional[str] = None):
-        if app_model.mode in [AppMode.ADVANCED_CHAT.value, AppMode.WORKFLOW.value]:
-            workflow = app_model.workflow
-            if workflow is None:
-                raise ValueError("TTS is not enabled")
+    def transcript_tts(cls, app_model: App, text: Optional[str] = None,
+                       voice: Optional[str] = None, end_user: Optional[str] = None, message_id: Optional[str] = None):
+        from collections.abc import Generator
 
 
-            features_dict = workflow.features_dict
-            if 'text_to_speech' not in features_dict or not features_dict['text_to_speech'].get('enabled'):
-                raise ValueError("TTS is not enabled")
+        from flask import Response, stream_with_context
 
 
-            voice = features_dict['text_to_speech'].get('voice') if voice is None else voice
-        else:
-            text_to_speech_dict = app_model.app_model_config.text_to_speech_dict
-
-            if not text_to_speech_dict.get('enabled'):
-                raise ValueError("TTS is not enabled")
+        from app import app
+        from extensions.ext_database import db
 
 
-            voice = text_to_speech_dict.get('voice') if voice is None else voice
+        def invoke_tts(text_content: str, app_model, voice: Optional[str] = None):
+            with app.app_context():
+                if app_model.mode in [AppMode.ADVANCED_CHAT.value, AppMode.WORKFLOW.value]:
+                    workflow = app_model.workflow
+                    if workflow is None:
+                        raise ValueError("TTS is not enabled")
 
 
-        model_manager = ModelManager()
-        model_instance = model_manager.get_default_model_instance(
-            tenant_id=app_model.tenant_id,
-            model_type=ModelType.TTS
-        )
-        if model_instance is None:
-            raise ProviderNotSupportTextToSpeechServiceError()
+                    features_dict = workflow.features_dict
+                    if 'text_to_speech' not in features_dict or not features_dict['text_to_speech'].get('enabled'):
+                        raise ValueError("TTS is not enabled")
 
 
-        try:
-            if not voice:
-                voices = model_instance.get_tts_voices()
-                if voices:
-                    voice = voices[0].get('value')
+                    voice = features_dict['text_to_speech'].get('voice') if voice is None else voice
                 else:
                 else:
-                    raise ValueError("Sorry, no voice available.")
-
-            return model_instance.invoke_tts(
-                content_text=text.strip(),
-                user=end_user,
-                streaming=streaming,
-                tenant_id=app_model.tenant_id,
-                voice=voice
-            )
-        except Exception as e:
-            raise e
+                    text_to_speech_dict = app_model.app_model_config.text_to_speech_dict
+
+                    if not text_to_speech_dict.get('enabled'):
+                        raise ValueError("TTS is not enabled")
+
+                    voice = text_to_speech_dict.get('voice') if voice is None else voice
+
+                model_manager = ModelManager()
+                model_instance = model_manager.get_default_model_instance(
+                    tenant_id=app_model.tenant_id,
+                    model_type=ModelType.TTS
+                )
+                try:
+                    if not voice:
+                        voices = model_instance.get_tts_voices()
+                        if voices:
+                            voice = voices[0].get('value')
+                        else:
+                            raise ValueError("Sorry, no voice available.")
+
+                    return model_instance.invoke_tts(
+                        content_text=text_content.strip(),
+                        user=end_user,
+                        tenant_id=app_model.tenant_id,
+                        voice=voice
+                    )
+                except Exception as e:
+                    raise e
+
+        if message_id:
+            message = db.session.query(Message).filter(
+                Message.id == message_id
+            ).first()
+            if message.answer == '' and message.status == 'normal':
+                return None
+
+            else:
+                response = invoke_tts(message.answer, app_model=app_model, voice=voice)
+                if isinstance(response, Generator):
+                    return Response(stream_with_context(response), content_type='audio/mpeg')
+                return response
+        else:
+            response = invoke_tts(text, app_model, voice)
+            if isinstance(response, Generator):
+                return Response(stream_with_context(response), content_type='audio/mpeg')
+            return response
 
 
     @classmethod
     @classmethod
     def transcript_tts_voices(cls, tenant_id: str, language: str):
     def transcript_tts_voices(cls, tenant_id: str, language: str):
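
On the service side, transcript_tts now checks whether the provider handed back a generator and, if so, streams it to the client instead of buffering a whole file. A minimal Flask sketch of that wrapping step (generate_audio is a placeholder for model_instance.invoke_tts; the route path is illustrative):

    from collections.abc import Generator

    from flask import Flask, Response, stream_with_context

    app = Flask(__name__)

    def generate_audio(text: str):
        # Placeholder for model_instance.invoke_tts(...): yields encoded audio chunks.
        for piece in text.split():
            yield piece.encode("utf-8")

    @app.post("/demo/text-to-audio")
    def text_to_audio():
        result = generate_audio("hello streaming world")
        if isinstance(result, Generator):
            # Stream chunks as they are produced rather than waiting for the whole file.
            return Response(stream_with_context(result), content_type="audio/mpeg")
        return result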

+ 50 - 14
web/app/components/app/configuration/config-voice/param-config-content.tsx

@@ -11,11 +11,13 @@ import { usePathname } from 'next/navigation'
 import { useTranslation } from 'react-i18next'
 import { useTranslation } from 'react-i18next'
 import { Listbox, Transition } from '@headlessui/react'
 import { Listbox, Transition } from '@headlessui/react'
 import { CheckIcon, ChevronDownIcon } from '@heroicons/react/20/solid'
 import { CheckIcon, ChevronDownIcon } from '@heroicons/react/20/solid'
+import RadioGroup from '@/app/components/app/configuration/config-vision/radio-group'
 import type { Item } from '@/app/components/base/select'
 import type { Item } from '@/app/components/base/select'
 import ConfigContext from '@/context/debug-configuration'
 import ConfigContext from '@/context/debug-configuration'
 import { fetchAppVoices } from '@/service/apps'
 import { fetchAppVoices } from '@/service/apps'
 import Tooltip from '@/app/components/base/tooltip'
 import Tooltip from '@/app/components/base/tooltip'
 import { languages } from '@/i18n/language'
 import { languages } from '@/i18n/language'
+import { TtsAutoPlay } from '@/types/app'
 const VoiceParamConfig: FC = () => {
 const VoiceParamConfig: FC = () => {
   const { t } = useTranslation()
   const { t } = useTranslation()
   const pathname = usePathname()
   const pathname = usePathname()
@@ -27,12 +29,16 @@ const VoiceParamConfig: FC = () => {
     setTextToSpeechConfig,
     setTextToSpeechConfig,
   } = useContext(ConfigContext)
   } = useContext(ConfigContext)
 
 
-  const languageItem = languages.find(item => item.value === textToSpeechConfig.language)
+  let languageItem = languages.find(item => item.value === textToSpeechConfig.language)
   const localLanguagePlaceholder = languageItem?.name || t('common.placeholder.select')
   const localLanguagePlaceholder = languageItem?.name || t('common.placeholder.select')
-
+  if (languages && !languageItem)
+    languageItem = languages[0]
   const language = languageItem?.value
   const language = languageItem?.value
   const voiceItems = useSWR({ appId, language }, fetchAppVoices).data
   const voiceItems = useSWR({ appId, language }, fetchAppVoices).data
-  const voiceItem = voiceItems?.find(item => item.value === textToSpeechConfig.voice)
+  let voiceItem = voiceItems?.find(item => item.value === textToSpeechConfig.voice)
+  if (voiceItems && !voiceItem)
+    voiceItem = voiceItems[0]
+
   const localVoicePlaceholder = voiceItem?.name || t('common.placeholder.select')
   const localVoicePlaceholder = voiceItem?.name || t('common.placeholder.select')
 
 
   return (
   return (
@@ -42,8 +48,9 @@ const VoiceParamConfig: FC = () => {
         <div className='pt-3 space-y-6'>
         <div className='pt-3 space-y-6'>
           <div>
           <div>
             <div className='mb-2 flex items-center  space-x-1'>
             <div className='mb-2 flex items-center  space-x-1'>
-              <div className='leading-[18px] text-[13px] font-semibold text-gray-800'>{t('appDebug.voice.voiceSettings.language')}</div>
-              <Tooltip htmlContent={<div className='w-[180px]' >
+              <div
+                className='leading-[18px] text-[13px] font-semibold text-gray-800'>{t('appDebug.voice.voiceSettings.language')}</div>
+              <Tooltip htmlContent={<div className='w-[180px]'>
                 {t('appDebug.voice.voiceSettings.resolutionTooltip').split('\n').map(item => (
                 {t('appDebug.voice.voiceSettings.resolutionTooltip').split('\n').map(item => (
                   <div key={item}>{item}</div>
                   <div key={item}>{item}</div>
                 ))}
                 ))}
@@ -61,7 +68,8 @@ const VoiceParamConfig: FC = () => {
               }}
               }}
             >
             >
               <div className={'relative h-9'}>
               <div className={'relative h-9'}>
-                <Listbox.Button className={'w-full h-full rounded-lg border-0 bg-gray-100 py-1.5 pl-3 pr-10 sm:text-sm sm:leading-6 focus-visible:outline-none focus-visible:bg-gray-200 group-hover:bg-gray-200 cursor-pointer'}>
+                <Listbox.Button
+                  className={'w-full h-full rounded-lg border-0 bg-gray-100 py-1.5 pl-3 pr-10 sm:text-sm sm:leading-6 focus-visible:outline-none focus-visible:bg-gray-200 group-hover:bg-gray-200 cursor-pointer'}>
                   <span className={classNames('block truncate text-left', !languageItem?.name && 'text-gray-400')}>
                   <span className={classNames('block truncate text-left', !languageItem?.name && 'text-gray-400')}>
                     {languageItem?.name ? t(`common.voice.language.${languageItem?.value.replace('-', '')}`) : localLanguagePlaceholder}
                     {languageItem?.name ? t(`common.voice.language.${languageItem?.value.replace('-', '')}`) : localLanguagePlaceholder}
                   </span>
                   </span>
@@ -79,7 +87,8 @@ const VoiceParamConfig: FC = () => {
                   leaveTo="opacity-0"
                   leaveTo="opacity-0"
                 >
                 >
 
 
-                  <Listbox.Options className="absolute z-10 mt-1 px-1 max-h-60 w-full overflow-auto rounded-md bg-white py-1 text-base shadow-lg border-gray-200 border-[0.5px] focus:outline-none sm:text-sm">
+                  <Listbox.Options
+                    className="absolute z-10 mt-1 px-1 max-h-60 w-full overflow-auto rounded-md bg-white py-1 text-base shadow-lg border-gray-200 border-[0.5px] focus:outline-none sm:text-sm">
                     {languages.map((item: Item) => (
                     {languages.map((item: Item) => (
                       <Listbox.Option
                       <Listbox.Option
                         key={item.value}
                         key={item.value}
@@ -100,7 +109,7 @@ const VoiceParamConfig: FC = () => {
                                   'absolute inset-y-0 right-0 flex items-center pr-4 text-gray-700',
                                   'absolute inset-y-0 right-0 flex items-center pr-4 text-gray-700',
                                 )}
                                 )}
                               >
                               >
-                                <CheckIcon className="h-5 w-5" aria-hidden="true" />
+                                <CheckIcon className="h-5 w-5" aria-hidden="true"/>
                               </span>
                               </span>
                             )}
                             )}
                           </>
                           </>
@@ -112,9 +121,9 @@ const VoiceParamConfig: FC = () => {
               </div>
               </div>
             </Listbox>
             </Listbox>
           </div>
           </div>
-
           <div>
           <div>
-            <div className='mb-2 leading-[18px] text-[13px] font-semibold text-gray-800'>{t('appDebug.voice.voiceSettings.voice')}</div>
+            <div
+              className='mb-2 leading-[18px] text-[13px] font-semibold text-gray-800'>{t('appDebug.voice.voiceSettings.voice')}</div>
             <Listbox
             <Listbox
               value={voiceItem}
               value={voiceItem}
               disabled={!languageItem}
               disabled={!languageItem}
@@ -126,8 +135,10 @@ const VoiceParamConfig: FC = () => {
               }}
               }}
             >
             >
               <div className={'relative h-9'}>
               <div className={'relative h-9'}>
-                <Listbox.Button className={'w-full h-full rounded-lg border-0 bg-gray-100 py-1.5 pl-3 pr-10 sm:text-sm sm:leading-6 focus-visible:outline-none focus-visible:bg-gray-200 group-hover:bg-gray-200 cursor-pointer'}>
-                  <span className={classNames('block truncate text-left', !voiceItem?.name && 'text-gray-400')}>{voiceItem?.name ?? localVoicePlaceholder}</span>
+                <Listbox.Button
+                  className={'w-full h-full rounded-lg border-0 bg-gray-100 py-1.5 pl-3 pr-10 sm:text-sm sm:leading-6 focus-visible:outline-none focus-visible:bg-gray-200 group-hover:bg-gray-200 cursor-pointer'}>
+                  <span
+                    className={classNames('block truncate text-left', !voiceItem?.name && 'text-gray-400')}>{voiceItem?.name ?? localVoicePlaceholder}</span>
                   <span className="pointer-events-none absolute inset-y-0 right-0 flex items-center pr-2">
                   <span className="pointer-events-none absolute inset-y-0 right-0 flex items-center pr-2">
                     <ChevronDownIcon
                     <ChevronDownIcon
                       className="h-5 w-5 text-gray-400"
                       className="h-5 w-5 text-gray-400"
@@ -142,7 +153,8 @@ const VoiceParamConfig: FC = () => {
                   leaveTo="opacity-0"
                   leaveTo="opacity-0"
                 >
                 >
 
 
-                  <Listbox.Options className="absolute z-10 mt-1 px-1 max-h-60 w-full overflow-auto rounded-md bg-white py-1 text-base shadow-lg border-gray-200 border-[0.5px] focus:outline-none sm:text-sm">
+                  <Listbox.Options
+                    className="absolute z-10 mt-1 px-1 max-h-60 w-full overflow-auto rounded-md bg-white py-1 text-base shadow-lg border-gray-200 border-[0.5px] focus:outline-none sm:text-sm">
                     {voiceItems?.map((item: Item) => (
                     {voiceItems?.map((item: Item) => (
                       <Listbox.Option
                       <Listbox.Option
                         key={item.value}
                         key={item.value}
@@ -162,7 +174,7 @@ const VoiceParamConfig: FC = () => {
                                   'absolute inset-y-0 right-0 flex items-center pr-4 text-gray-700',
                                   'absolute inset-y-0 right-0 flex items-center pr-4 text-gray-700',
                                 )}
                                 )}
                               >
                               >
-                                <CheckIcon className="h-5 w-5" aria-hidden="true" />
+                                <CheckIcon className="h-5 w-5" aria-hidden="true"/>
                               </span>
                               </span>
                             )}
                             )}
                           </>
                           </>
@@ -174,6 +186,30 @@ const VoiceParamConfig: FC = () => {
               </div>
               </div>
             </Listbox>
             </Listbox>
           </div>
           </div>
+          <div>
+            <div
+              className='mb-2 leading-[18px] text-[13px] font-semibold text-gray-800'>{t('appDebug.voice.voiceSettings.autoPlay')}</div>
+            <RadioGroup
+              className='space-x-3'
+              options={[
+                {
+                  label: t('appDebug.voice.voiceSettings.autoPlayEnabled'),
+                  value: TtsAutoPlay.enabled,
+                },
+                {
+                  label: t('appDebug.voice.voiceSettings.autoPlayDisabled'),
+                  value: TtsAutoPlay.disabled,
+                },
+              ]}
+              value={textToSpeechConfig.autoPlay ? textToSpeechConfig.autoPlay : TtsAutoPlay.disabled}
+              onChange={(value: TtsAutoPlay) => {
+                setTextToSpeechConfig({
+                  ...textToSpeechConfig,
+                  autoPlay: value,
+                })
+              }}
+            />
+          </div>
         </div>
         </div>
       </div>
       </div>
     </div>
     </div>

+ 0 - 1
web/app/components/app/configuration/features/chat-group/text-to-speech/index.tsx

@@ -40,7 +40,6 @@ const TextToSpeech: FC = () => {
           { languageInfo?.example && (
           { languageInfo?.example && (
             <AudioBtn
             <AudioBtn
               value={languageInfo?.example}
               value={languageInfo?.example}
-              voice={voiceItem?.value}
               isAudition
               isAudition
               noCache
               noCache
             />
             />

+ 1 - 2
web/app/components/app/text-generate/item/index.tsx

@@ -428,8 +428,7 @@ const GenerationItem: FC<IGenerationItemProps> = ({
                   <>
                   <>
                     <div className='ml-2 mr-2 h-[14px] w-[1px] bg-gray-200'></div>
                     <div className='ml-2 mr-2 h-[14px] w-[1px] bg-gray-200'></div>
                     <AudioBtn
                     <AudioBtn
-                      value={content}
-                      noCache={false}
+                      id={messageId!}
                       className={'mr-1'}
                       className={'mr-1'}
                     />
                     />
                   </>
                   </>

+ 53 - 0
web/app/components/base/audio-btn/audio.player.manager.ts

@@ -0,0 +1,53 @@
+import AudioPlayer from '@/app/components/base/audio-btn/audio'
+declare global {
+  // eslint-disable-next-line @typescript-eslint/consistent-type-definitions
+  interface AudioPlayerManager {
+    instance: AudioPlayerManager
+  }
+
+}
+
+export class AudioPlayerManager {
+  private static instance: AudioPlayerManager
+  private audioPlayers: AudioPlayer | null = null
+  private msgId: string | undefined
+
+  private constructor() {
+  }
+
+  public static getInstance(): AudioPlayerManager {
+    if (!AudioPlayerManager.instance) {
+      AudioPlayerManager.instance = new AudioPlayerManager()
+      this.instance = AudioPlayerManager.instance
+    }
+
+    return AudioPlayerManager.instance
+  }
+
+  public getAudioPlayer(url: string, isPublic: boolean, id: string | undefined, msgContent: string | null | undefined, voice: string | undefined, callback: ((event: string) => {}) | null): AudioPlayer {
+    if (this.msgId && this.msgId === id && this.audioPlayers) {
+      this.audioPlayers.setCallback(callback)
+      return this.audioPlayers
+    }
+    else {
+      if (this.audioPlayers) {
+        try {
+          this.audioPlayers.pauseAudio()
+          this.audioPlayers.cacheBuffers = []
+          this.audioPlayers.sourceBuffer?.abort()
+        }
+        catch (e) {
+        }
+      }
+
+      this.msgId = id
+      this.audioPlayers = new AudioPlayer(url, isPublic, id, msgContent, callback)
+      return this.audioPlayers
+    }
+  }
+
+  public resetMsgId(msgId: string) {
+    this.msgId = msgId
+    this.audioPlayers?.resetMsgId(msgId)
+  }
+}

+ 263 - 0
web/app/components/base/audio-btn/audio.ts

@@ -0,0 +1,263 @@
+import Toast from '@/app/components/base/toast'
+import { textToAudioStream } from '@/service/share'
+
+declare global {
+  // eslint-disable-next-line @typescript-eslint/consistent-type-definitions
+  interface Window {
+    ManagedMediaSource: any
+  }
+}
+
+export default class AudioPlayer {
+  mediaSource: MediaSource | null
+  audio: HTMLAudioElement
+  audioContext: AudioContext
+  sourceBuffer?: SourceBuffer
+  cacheBuffers: ArrayBuffer[] = []
+  pauseTimer: number | null = null
+  msgId: string | undefined
+  msgContent: string | null | undefined = null
+  voice: string | undefined = undefined
+  isLoadData = false
+  url: string
+  isPublic: boolean
+  callback: ((event: string) => {}) | null
+
+  constructor(streamUrl: string, isPublic: boolean, msgId: string | undefined, msgContent: string | null | undefined, callback: ((event: string) => {}) | null) {
+    this.audioContext = new AudioContext()
+    this.msgId = msgId
+    this.msgContent = msgContent
+    this.url = streamUrl
+    this.isPublic = isPublic
+    this.callback = callback
+
+    // Compatible with iphone ios17 ManagedMediaSource
+    const MediaSource = window.MediaSource || window.ManagedMediaSource
+    if (!MediaSource) {
+      Toast.notify({
+        message: 'Your browser does not support audio streaming, if you are using an iPhone, please update to iOS 17.1 or later.',
+        type: 'error',
+      })
+    }
+    this.mediaSource = MediaSource ? new MediaSource() : null
+    this.audio = new Audio()
+    this.setCallback(callback)
+    this.audio.src = this.mediaSource ? URL.createObjectURL(this.mediaSource) : ''
+    this.audio.autoplay = true
+
+    const source = this.audioContext.createMediaElementSource(this.audio)
+    source.connect(this.audioContext.destination)
+    this.listenMediaSource('audio/mpeg')
+  }
+
+  public resetMsgId(msgId: string) {
+    this.msgId = msgId
+  }
+
+  private listenMediaSource(contentType: string) {
+    this.mediaSource?.addEventListener('sourceopen', () => {
+      if (this.sourceBuffer)
+        return
+
+      this.sourceBuffer = this.mediaSource?.addSourceBuffer(contentType)
+    //   this.sourceBuffer?.addEventListener('update', () => {
+    //     if (this.cacheBuffers.length && !this.sourceBuffer?.updating) {
+    //       const cacheBuffer = this.cacheBuffers.shift()!
+    //       this.sourceBuffer?.appendBuffer(cacheBuffer)
+    //     }
+    //     // this.pauseAudio()
+    //   })
+    //
+    //   this.sourceBuffer?.addEventListener('updateend', () => {
+    //     if (this.cacheBuffers.length && !this.sourceBuffer?.updating) {
+    //       const cacheBuffer = this.cacheBuffers.shift()!
+    //       this.sourceBuffer?.appendBuffer(cacheBuffer)
+    //     }
+    //     // this.pauseAudio()
+    //   })
+    })
+  }
+
+  public setCallback(callback: ((event: string) => {}) | null) {
+    this.callback = callback
+    if (callback) {
+      this.audio.addEventListener('ended', () => {
+        callback('ended')
+      }, false)
+      this.audio.addEventListener('paused', () => {
+        callback('paused')
+      }, true)
+      this.audio.addEventListener('loaded', () => {
+        callback('loaded')
+      }, true)
+      this.audio.addEventListener('play', () => {
+        callback('play')
+      }, true)
+      this.audio.addEventListener('timeupdate', () => {
+        callback('timeupdate')
+      }, true)
+      this.audio.addEventListener('loadeddate', () => {
+        callback('loadeddate')
+      }, true)
+      this.audio.addEventListener('canplay', () => {
+        callback('canplay')
+      }, true)
+      this.audio.addEventListener('error', () => {
+        callback('error')
+      }, true)
+    }
+  }
+
+  private async loadAudio() {
+    try {
+      const audioResponse: any = await textToAudioStream(this.url, this.isPublic, { content_type: 'audio/mpeg' }, {
+        message_id: this.msgId,
+        streaming: true,
+        voice: this.voice,
+        text: this.msgContent,
+      })
+
+      if (audioResponse.status !== 200) {
+        this.isLoadData = false
+        if (this.callback)
+          this.callback('error')
+      }
+
+      const reader = audioResponse.body.getReader()
+      while (true) {
+        const { value, done } = await reader.read()
+
+        if (done) {
+          this.receiveAudioData(value)
+          break
+        }
+
+        this.receiveAudioData(value)
+      }
+    }
+    catch (error) {
+      this.isLoadData = false
+      this.callback && this.callback('error')
+    }
+  }
+
+  // play audio
+  public playAudio() {
+    if (this.isLoadData) {
+      if (this.audioContext.state === 'suspended') {
+        this.audioContext.resume().then((_) => {
+          this.audio.play()
+          this.callback && this.callback('play')
+        })
+      }
+      else if (this.audio.ended) {
+        this.audio.play()
+        this.callback && this.callback('play')
+      }
+      if (this.callback)
+        this.callback('play')
+    }
+    else {
+      this.isLoadData = true
+      this.loadAudio()
+    }
+  }
+
+  private theEndOfStream() {
+    const endTimer = setInterval(() => {
+      if (!this.sourceBuffer?.updating) {
+        this.mediaSource?.endOfStream()
+        clearInterval(endTimer)
+      }
+      console.log('finishStream  endOfStream endTimer')
+    }, 10)
+  }
+
+  private finishStream() {
+    const timer = setInterval(() => {
+      if (!this.cacheBuffers.length) {
+        this.theEndOfStream()
+        clearInterval(timer)
+      }
+
+      if (this.cacheBuffers.length && !this.sourceBuffer?.updating) {
+        const arrayBuffer = this.cacheBuffers.shift()!
+        this.sourceBuffer?.appendBuffer(arrayBuffer)
+      }
+      console.log('finishStream  timer')
+    }, 10)
+  }
+
+  public async playAudioWithAudio(audio: string, play = true) {
+    if (!audio || !audio.length) {
+      this.finishStream()
+      return
+    }
+
+    const audioContent = Buffer.from(audio, 'base64')
+    this.receiveAudioData(new Uint8Array(audioContent))
+    if (play) {
+      this.isLoadData = true
+      if (this.audio.paused) {
+        this.audioContext.resume().then((_) => {
+          this.audio.play()
+          this.callback && this.callback('play')
+        })
+      }
+      else if (this.audio.ended) {
+        this.audio.play()
+        this.callback && this.callback('play')
+      }
+      else if (this.audio.played) { /* empty */ }
+
+      else {
+        this.audio.play()
+        this.callback && this.callback('play')
+      }
+    }
+  }
+
+  public pauseAudio() {
+    this.callback && this.callback('paused')
+    this.audio.pause()
+    this.audioContext.suspend()
+  }
+
+  private cancer() {
+
+  }
+
+  private receiveAudioData(unit8Array: Uint8Array) {
+    if (!unit8Array) {
+      this.finishStream()
+      return
+    }
+    const audioData = this.byteArrayToArrayBuffer(unit8Array)
+    if (!audioData.byteLength) {
+      if (this.mediaSource?.readyState === 'open')
+        this.finishStream()
+      return
+    }
+
+    if (this.sourceBuffer?.updating) {
+      this.cacheBuffers.push(audioData)
+    }
+    else {
+      if (this.cacheBuffers.length && !this.sourceBuffer?.updating) {
+        this.cacheBuffers.push(audioData)
+        const cacheBuffer = this.cacheBuffers.shift()!
+        this.sourceBuffer?.appendBuffer(cacheBuffer)
+      }
+      else {
+        this.sourceBuffer?.appendBuffer(audioData)
+      }
+    }
+  }
+
+  private byteArrayToArrayBuffer(byteArray: Uint8Array): ArrayBuffer {
+    const arrayBuffer = new ArrayBuffer(byteArray.length)
+    const uint8Array = new Uint8Array(arrayBuffer)
+    uint8Array.set(byteArray)
+    return arrayBuffer
+  }
+}
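A short sketch (assumptions only, not part of the commit) of the two ways this player is fed: playAudio() pulls the audio stream itself, while playAudioWithAudio() pushes base64 MPEG chunks that arrive over SSE; an empty chunk finalizes the MediaSource stream.

  import { AudioPlayerManager } from '@/app/components/base/audio-btn/audio.player.manager'

  const player = AudioPlayerManager.getInstance()
    .getAudioPlayer('/apps/my-app/text-to-audio', false, 'message-123', 'Hello world', undefined, null)

  // Pull mode: the player fetches the stream and appends each chunk to its SourceBuffer.
  player.playAudio()

  // Push mode: chunks already received elsewhere (e.g. tts_message events) are appended;
  // `true` starts playback, `false` only appends.
  declare const base64Chunk: string // placeholder for a chunk received from the server
  player.playAudioWithAudio(base64Chunk, true)
  player.playAudioWithAudio('', false) // empty payload ends the stream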

+ 42 - 89
web/app/components/base/audio-btn/index.tsx

@@ -1,124 +1,78 @@
 'use client'
-import { useEffect, useRef, useState } from 'react'
+import { useRef, useState } from 'react'
 import { t } from 'i18next'
 import { useParams, usePathname } from 'next/navigation'
 import s from './style.module.css'
 import Tooltip from '@/app/components/base/tooltip'
 import { randomString } from '@/utils'
-import { textToAudio } from '@/service/share'
 import Loading from '@/app/components/base/loading'
+import { AudioPlayerManager } from '@/app/components/base/audio-btn/audio.player.manager'

 type AudioBtnProps = {
-  value: string
+  id?: string
   voice?: string
+  value?: string
   className?: string
   isAudition?: boolean
-  noCache: boolean
+  noCache?: boolean
 }

 type AudioState = 'initial' | 'loading' | 'playing' | 'paused' | 'ended'

 const AudioBtn = ({
-  value,
+  id,
   voice,
+  value,
   className,
   isAudition,
-  noCache,
 }: AudioBtnProps) => {
-  const audioRef = useRef<HTMLAudioElement | null>(null)
   const [audioState, setAudioState] = useState<AudioState>('initial')

   const selector = useRef(`play-tooltip-${randomString(4)}`)
   const params = useParams()
   const pathname = usePathname()
-  const removeCodeBlocks = (inputText: any) => {
-    const codeBlockRegex = /```[\s\S]*?```/g
-    if (inputText)
-      return inputText.replace(codeBlockRegex, '')
-    return ''
-  }
-
-  const loadAudio = async () => {
-    const formData = new FormData()
-    formData.append('text', removeCodeBlocks(value))
-    formData.append('voice', removeCodeBlocks(voice))
-
-    if (value !== '') {
-      setAudioState('loading')
-
-      let url = ''
-      let isPublic = false
-
-      if (params.token) {
-        url = '/text-to-audio'
-        isPublic = true
-      }
-      else if (params.appId) {
-        if (pathname.search('explore/installed') > -1)
-          url = `/installed-apps/${params.appId}/text-to-audio`
-        else
-          url = `/apps/${params.appId}/text-to-audio`
-      }
-
-      try {
-        const audioResponse = await textToAudio(url, isPublic, formData)
-        const blob_bytes = Buffer.from(audioResponse.data, 'latin1')
-        const blob = new Blob([blob_bytes], { type: 'audio/wav' })
-        const audioUrl = URL.createObjectURL(blob)
-        audioRef.current!.src = audioUrl
-      }
-      catch (error) {
-        setAudioState('initial')
-        console.error('Error playing audio:', error)
-      }
+  const audio_finished_call = (event: string): any => {
+    switch (event) {
+      case 'ended':
+        setAudioState('ended')
+        break
+      case 'paused':
+        setAudioState('ended')
+        break
+      case 'loaded':
+        setAudioState('loading')
+        break
+      case 'play':
+        setAudioState('playing')
+        break
+      case 'error':
+        setAudioState('ended')
+        break
     }
   }
+  let url = ''
+  let isPublic = false

+  if (params.token) {
+    url = '/text-to-audio'
+    isPublic = true
+  }
+  else if (params.appId) {
+    if (pathname.search('explore/installed') > -1)
+      url = `/installed-apps/${params.appId}/text-to-audio`
+    else
+      url = `/apps/${params.appId}/text-to-audio`
+  }
   const handleToggle = async () => {
-    if (audioState === 'initial' || noCache) {
-      await loadAudio()
+    if (audioState === 'playing' || audioState === 'loading') {
+      setAudioState('paused')
+      AudioPlayerManager.getInstance().getAudioPlayer(url, isPublic, id, value, voice, audio_finished_call).pauseAudio()
     }
-    else if (audioRef.current) {
-      if (audioState === 'playing') {
-        audioRef.current.pause()
-        setAudioState('paused')
-      }
-      else {
-        audioRef.current.play()
-        setAudioState('playing')
-      }
-    }
-  }
-
-  useEffect(() => {
-    const currentAudio = audioRef.current
-
-    const handleLoading = () => {
+    else {
       setAudioState('loading')
+      AudioPlayerManager.getInstance().getAudioPlayer(url, isPublic, id, value, voice, audio_finished_call).playAudio()
     }
-
-    const handlePlay = () => {
-      currentAudio?.play()
-      setAudioState('playing')
-    }
-
-    const handleEnded = () => {
-      setAudioState('ended')
-    }
-
-    currentAudio?.addEventListener('progress', handleLoading)
-    currentAudio?.addEventListener('canplaythrough', handlePlay)
-    currentAudio?.addEventListener('ended', handleEnded)
-
-    return () => {
-      currentAudio?.removeEventListener('progress', handleLoading)
-      currentAudio?.removeEventListener('canplaythrough', handlePlay)
-      currentAudio?.removeEventListener('ended', handleEnded)
-      URL.revokeObjectURL(currentAudio?.src || '')
-      currentAudio?.pause()
-      currentAudio?.setAttribute('src', '')
-    }
-  }, [])
+  }

   const tooltipContent = {
     initial: t('appApi.play'),
@@ -151,7 +105,6 @@ const AudioBtn = ({
             )}
         </button>
       </Tooltip>
-      <audio ref={audioRef} src='' className='hidden' />
     </div>
   )
 }
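A hedged sketch of the two ways the reworked button is used after this change (props mirror the call sites elsewhere in this commit; the surrounding variables are assumed to exist): chat answers replay by message id through the shared streaming player, while the voice-settings audition path still passes raw text plus a voice.

  // Replaying a chat answer: the id selects the per-message streaming player.
  <AudioBtn id={messageId} value={content} className='mr-1' />

  // Voice audition in the TTS settings panel: sample text plus an explicit voice.
  <AudioBtn value={languageInfo?.example} voice={voiceItem?.value} isAudition noCache />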

+ 10 - 7
web/app/components/base/chat/chat/answer/index.tsx

@@ -8,6 +8,7 @@ import type {
   ChatConfig,
   ChatItem,
 } from '../../types'
+import { useChatContext } from '../context'
 import Operation from './operation'
 import AgentContent from './agent-content'
 import BasicContent from './basic-content'
@@ -59,23 +60,25 @@ const Answer: FC<AnswerProps> = ({
   } = item
   const hasAgentThoughts = !!agent_thoughts?.length

-  const [containerWidth, setContainerWidth] = useState(0)
+  const [containerWidth] = useState(0)
   const [contentWidth, setContentWidth] = useState(0)
   const containerRef = useRef<HTMLDivElement>(null)
   const contentRef = useRef<HTMLDivElement>(null)

-  const getContainerWidth = () => {
-    if (containerRef.current)
-      setContainerWidth(containerRef.current?.clientWidth + 16)
-  }
+  const {
+    config: chatContextConfig,
+  } = useChatContext()
+
+  const voiceRef = useRef(chatContextConfig?.text_to_speech?.voice)
   const getContentWidth = () => {
     if (contentRef.current)
       setContentWidth(contentRef.current?.clientWidth)
   }

   useEffect(() => {
-    getContainerWidth()
-  }, [])
+    voiceRef.current = chatContextConfig?.text_to_speech?.voice
+  }
+  , [chatContextConfig?.text_to_speech?.voice])

   useEffect(() => {
     if (!responding)

+ 1 - 1
web/app/components/base/chat/chat/answer/operation.tsx

@@ -119,9 +119,9 @@ const Operation: FC<OperationProps> = ({
               <>
                 <div className='mx-1 w-[1px] h-[14px] bg-gray-200'/>
                 <AudioBtn
+                  id={id}
                   value={content}
                   noCache={false}
-                  voice={config?.text_to_speech?.voice}
                   className='hidden group-hover:block'
                 />
               </>

+ 27 - 1
web/app/components/base/chat/chat/hooks.ts

@@ -6,6 +6,8 @@ import {
 } from 'react'
 import { useTranslation } from 'react-i18next'
 import { produce, setAutoFreeze } from 'immer'
+import { useParams, usePathname } from 'next/navigation'
+import { v4 as uuidV4 } from 'uuid'
 import type {
   ChatConfig,
   ChatItem,
@@ -20,6 +22,7 @@ import { replaceStringWithValues } from '@/app/components/app/configuration/prom
 import type { Annotation } from '@/models/log'
 import { WorkflowRunningStatus } from '@/app/components/workflow/types'
 import useTimestamp from '@/hooks/use-timestamp'
+import { AudioPlayerManager } from '@/app/components/base/audio-btn/audio.player.manager'

 type GetAbortController = (abortController: AbortController) => void
 type SendCallback = {
@@ -91,7 +94,8 @@ export const useChat = (
   const conversationMessagesAbortControllerRef = useRef<AbortController | null>(null)
   const suggestedQuestionsAbortControllerRef = useRef<AbortController | null>(null)
   const checkPromptVariables = useCheckPromptVariables()
-
+  const params = useParams()
+  const pathname = usePathname()
   useEffect(() => {
     setAutoFreeze(false)
     return () => {
@@ -262,6 +266,19 @@ export const useChat = (
     let isAgentMode = false
     let hasSetResponseId = false

+    let ttsUrl = ''
+    let ttsIsPublic = false
+    if (params.token) {
+      ttsUrl = '/text-to-audio'
+      ttsIsPublic = true
+    }
+    else if (params.appId) {
+      if (pathname.search('explore/installed') > -1)
+        ttsUrl = `/installed-apps/${params.appId}/text-to-audio`
+      else
+        ttsUrl = `/apps/${params.appId}/text-to-audio`
+    }
+    const player = AudioPlayerManager.getInstance().getAudioPlayer(ttsUrl, ttsIsPublic, uuidV4(), 'none', 'none', (_: any): any => {})
     ssePost(
       url,
       {
@@ -530,6 +547,15 @@ export const useChat = (
             }
           }))
         },
+        onTTSChunk: (messageId: string, audio: string) => {
+          if (!audio || audio === '')
+            return
+          player.playAudioWithAudio(audio, true)
+          AudioPlayerManager.getInstance().resetMsgId(messageId)
+        },
+        onTTSEnd: (messageId: string, audio: string) => {
+          player.playAudioWithAudio(audio, false)
+        },
       })
     return true
   }, [
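For orientation, a hedged sketch of the payloads these two callbacks receive; the field names are taken from the handling added to web/service/base.ts below, the concrete values are assumptions.

  // Shape of the streamed TTS events as parsed in handleStream:
  type TtsMessageEvent = {
    event: 'tts_message' | 'tts_message_end'
    message_id: string
    audio: string        // base64-encoded audio chunk; empty on the end event
    audio_type?: string  // e.g. 'audio/mpeg'
  }
  // onTTSChunk -> player.playAudioWithAudio(audio, true)   (append and keep playing)
  // onTTSEnd   -> player.playAudioWithAudio(audio, false)  (final append, stream finishes)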

+ 51 - 15
web/app/components/base/features/feature-panel/text-to-speech/param-config-content.tsx

@@ -19,6 +19,8 @@ import type { Item } from '@/app/components/base/select'
 import { fetchAppVoices } from '@/service/apps'
 import Tooltip from '@/app/components/base/tooltip'
 import { languages } from '@/i18n/language'
+import RadioGroup from '@/app/components/app/configuration/config-vision/radio-group'
+import { TtsAutoPlay } from '@/types/app'

 type VoiceParamConfigProps = {
   onChange?: OnFeaturesChange
@@ -33,12 +35,16 @@ const VoiceParamConfig = ({
   const text2speech = useFeatures(state => state.features.text2speech)
   const featuresStore = useFeaturesStore()

-  const languageItem = languages.find(item => item.value === text2speech.language)
+  let languageItem = languages.find(item => item.value === text2speech?.language)
+  if (languages && !languageItem)
+    languageItem = languages[0]
   const localLanguagePlaceholder = languageItem?.name || t('common.placeholder.select')

   const language = languageItem?.value
   const voiceItems = useSWR({ appId, language }, fetchAppVoices).data
-  const voiceItem = voiceItems?.find(item => item.value === text2speech.voice)
+  let voiceItem = voiceItems?.find(item => item.value === text2speech?.voice)
+  if (voiceItems && !voiceItem)
+    voiceItem = voiceItems[0]
   const localVoicePlaceholder = voiceItem?.name || t('common.placeholder.select')

   const handleChange = (value: Record<string, string>) => {
@@ -66,13 +72,14 @@ const VoiceParamConfig = ({
         <div className='pt-3 space-y-6'>
           <div>
             <div className='mb-2 flex items-center  space-x-1'>
-              <div className='leading-[18px] text-[13px] font-semibold text-gray-800'>{t('appDebug.voice.voiceSettings.language')}</div>
-              <Tooltip htmlContent={<div className='w-[180px]' >
+              <div
+                className='leading-[18px] text-[13px] font-semibold text-gray-800'>{t('appDebug.voice.voiceSettings.language')}</div>
+              <Tooltip htmlContent={<div className='w-[180px]'>
                 {t('appDebug.voice.voiceSettings.resolutionTooltip').split('\n').map(item => (
                   <div key={item}>{item}</div>
                 ))}
               </div>} selector='config-resolution-tooltip'>
-                <RiQuestionLine className='w-[14px] h-[14px] text-gray-400' />
+                <RiQuestionLine className='w-[14px] h-[14px] text-gray-400'/>
               </Tooltip>
             </div>
             <Listbox
@@ -84,7 +91,8 @@ const VoiceParamConfig = ({
               }}
             >
               <div className={'relative h-9'}>
-                <Listbox.Button className={'w-full h-full rounded-lg border-0 bg-gray-100 py-1.5 pl-3 pr-10 sm:text-sm sm:leading-6 focus-visible:outline-none focus-visible:bg-gray-200 group-hover:bg-gray-200 cursor-pointer'}>
+                <Listbox.Button
+                  className={'w-full h-full rounded-lg border-0 bg-gray-100 py-1.5 pl-3 pr-10 sm:text-sm sm:leading-6 focus-visible:outline-none focus-visible:bg-gray-200 group-hover:bg-gray-200 cursor-pointer'}>
                   <span className={classNames('block truncate text-left', !languageItem?.name && 'text-gray-400')}>
                     {languageItem?.name ? t(`common.voice.language.${languageItem?.value.replace('-', '')}`) : localLanguagePlaceholder}
                   </span>
@@ -102,7 +110,8 @@ const VoiceParamConfig = ({
                   leaveTo="opacity-0"
                 >

-                  <Listbox.Options className="absolute z-10 mt-1 px-1 max-h-60 w-full overflow-auto rounded-md bg-white py-1 text-base shadow-lg border-gray-200 border-[0.5px] focus:outline-none sm:text-sm">
+                  <Listbox.Options
+                    className="absolute z-10 mt-1 px-1 max-h-60 w-full overflow-auto rounded-md bg-white py-1 text-base shadow-lg border-gray-200 border-[0.5px] focus:outline-none sm:text-sm">
                     {languages.map((item: Item) => (
                       <Listbox.Option
                         key={item.value}
@@ -117,13 +126,13 @@ const VoiceParamConfig = ({
                           <>
                             <span
                               className={classNames('block', selected && 'font-normal')}>{t(`common.voice.language.${(item.value).toString().replace('-', '')}`)}</span>
-                            {(selected || item.value === text2speech.language) && (
+                            {(selected || item.value === text2speech?.language) && (
                               <span
                                 className={classNames(
                                   'absolute inset-y-0 right-0 flex items-center pr-4 text-gray-700',
                                 )}
                               >
-                                <CheckIcon className="h-5 w-5" aria-hidden="true" />
+                                <CheckIcon className="h-5 w-5" aria-hidden="true"/>
                               </span>
                             )}
                           </>
@@ -137,7 +146,8 @@ const VoiceParamConfig = ({
           </div>

           <div>
-            <div className='mb-2 leading-[18px] text-[13px] font-semibold text-gray-800'>{t('appDebug.voice.voiceSettings.voice')}</div>
+            <div
+              className='mb-2 leading-[18px] text-[13px] font-semibold text-gray-800'>{t('appDebug.voice.voiceSettings.voice')}</div>
             <Listbox
               value={voiceItem}
               disabled={!languageItem}
@@ -148,8 +158,10 @@ const VoiceParamConfig = ({
               }}
             >
               <div className={'relative h-9'}>
-                <Listbox.Button className={'w-full h-full rounded-lg border-0 bg-gray-100 py-1.5 pl-3 pr-10 sm:text-sm sm:leading-6 focus-visible:outline-none focus-visible:bg-gray-200 group-hover:bg-gray-200 cursor-pointer'}>
-                  <span className={classNames('block truncate text-left', !voiceItem?.name && 'text-gray-400')}>{voiceItem?.name ?? localVoicePlaceholder}</span>
+                <Listbox.Button
+                  className={'w-full h-full rounded-lg border-0 bg-gray-100 py-1.5 pl-3 pr-10 sm:text-sm sm:leading-6 focus-visible:outline-none focus-visible:bg-gray-200 group-hover:bg-gray-200 cursor-pointer'}>
+                  <span
+                    className={classNames('block truncate text-left', !voiceItem?.name && 'text-gray-400')}>{voiceItem?.name ?? localVoicePlaceholder}</span>
                   <span className="pointer-events-none absolute inset-y-0 right-0 flex items-center pr-2">
                   <span className="pointer-events-none absolute inset-y-0 right-0 flex items-center pr-2">
                     <ChevronDownIcon
                     <ChevronDownIcon
                       className="h-5 w-5 text-gray-400"
                       className="h-5 w-5 text-gray-400"
@@ -164,7 +176,8 @@ const VoiceParamConfig = ({
                   leaveTo="opacity-0"
                 >

-                  <Listbox.Options className="absolute z-10 mt-1 px-1 max-h-60 w-full overflow-auto rounded-md bg-white py-1 text-base shadow-lg border-gray-200 border-[0.5px] focus:outline-none sm:text-sm">
+                  <Listbox.Options
+                    className="absolute z-10 mt-1 px-1 max-h-60 w-full overflow-auto rounded-md bg-white py-1 text-base shadow-lg border-gray-200 border-[0.5px] focus:outline-none sm:text-sm">
                     {voiceItems?.map((item: Item) => (
                       <Listbox.Option
                         key={item.value}
@@ -178,13 +191,13 @@ const VoiceParamConfig = ({
                         {({ /* active, */ selected }) => (
                           <>
                             <span className={classNames('block', selected && 'font-normal')}>{item.name}</span>
-                            {(selected || item.value === text2speech.voice) && (
+                            {(selected || item.value === text2speech?.voice) && (
                               <span
                                 className={classNames(
                                   'absolute inset-y-0 right-0 flex items-center pr-4 text-gray-700',
                                 )}
                               >
-                                <CheckIcon className="h-5 w-5" aria-hidden="true" />
+                                <CheckIcon className="h-5 w-5" aria-hidden="true"/>
                               </span>
                             )}
                           </>
@@ -196,6 +209,29 @@ const VoiceParamConfig = ({
               </div>
             </Listbox>
           </div>
+          <div>
+            <div
+              className='mb-2 leading-[18px] text-[13px] font-semibold text-gray-800'>{t('appDebug.voice.voiceSettings.autoPlay')}</div>
+            <RadioGroup
+              className='space-x-3'
+              options={[
+                {
+                  label: t('appDebug.voice.voiceSettings.autoPlayEnabled'),
+                  value: TtsAutoPlay.enabled,
+                },
+                {
+                  label: t('appDebug.voice.voiceSettings.autoPlayDisabled'),
+                  value: TtsAutoPlay.disabled,
+                },
+              ]}
+              value={text2speech?.autoPlay ? text2speech?.autoPlay : TtsAutoPlay.disabled}
+              onChange={(value: TtsAutoPlay) => {
+                handleChange({
+                  autoPlay: value,
+                })
+              }}
+            />
+          </div>
         </div>
       </div>
     </div>
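For reference, a sketch of the text-to-speech feature object this panel now edits, mirroring TextToSpeech in web/app/components/base/features/types.ts; the concrete language and voice values are examples only.

  import type { TextToSpeech } from '@/app/components/base/features/types'
  import { TtsAutoPlay } from '@/types/app'

  const text2speech: TextToSpeech = {
    enabled: true,
    language: 'en-US',   // example; the list comes from @/i18n/language
    voice: 'alloy',      // example; actual voices come from fetchAppVoices
    autoPlay: TtsAutoPlay.enabled,
  }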

+ 2 - 1
web/app/components/base/features/types.ts

@@ -1,4 +1,4 @@
-import type { TransferMethod } from '@/types/app'
+import type { TransferMethod, TtsAutoPlay } from '@/types/app'
 
 
 export type EnabledOrDisabled = {
   enabled?: boolean
@@ -14,6 +14,7 @@ export type SuggestedQuestionsAfterAnswer = EnabledOrDisabled
 export type TextToSpeech = EnabledOrDisabled & {
   language?: string
   voice?: string
+  autoPlay?: TtsAutoPlay
 }

 export type SpeechToText = EnabledOrDisabled

+ 27 - 0
web/app/components/workflow/hooks/use-workflow-run.ts

@@ -4,6 +4,8 @@ import {
   useStoreApi,
 } from 'reactflow'
 import produce from 'immer'
+import { v4 as uuidV4 } from 'uuid'
+import { usePathname } from 'next/navigation'
 import { useWorkflowStore } from '../store'
 import { useNodesSyncDraft } from '../hooks'
 import {
@@ -19,6 +21,7 @@ import {
   stopWorkflowRun,
 } from '@/service/workflow'
 import { useFeaturesStore } from '@/app/components/base/features/hooks'
+import { AudioPlayerManager } from '@/app/components/base/audio-btn/audio.player.manager'

 export const useWorkflowRun = () => {
   const store = useStoreApi()
@@ -27,6 +30,7 @@ export const useWorkflowRun = () => {
   const featuresStore = useFeaturesStore()
   const { doSyncWorkflowDraft } = useNodesSyncDraft()
   const { handleUpdateWorkflowCanvas } = useWorkflowUpdate()
+  const pathname = usePathname()

   const handleBackupDraft = useCallback(() => {
     const {
@@ -134,6 +138,20 @@ export const useWorkflowRun = () => {
     let isInIteration = false
     let iterationLength = 0

+    let ttsUrl = ''
+    let ttsIsPublic = false
+    if (params.token) {
+      ttsUrl = '/text-to-audio'
+      ttsIsPublic = true
+    }
+    else if (params.appId) {
+      if (pathname.search('explore/installed') > -1)
+        ttsUrl = `/installed-apps/${params.appId}/text-to-audio`
+      else
+        ttsUrl = `/apps/${params.appId}/text-to-audio`
+    }
+    const player = AudioPlayerManager.getInstance().getAudioPlayer(ttsUrl, ttsIsPublic, uuidV4(), 'none', 'none', (_: any): any => {})
+
     ssePost(
       url,
       {
@@ -468,6 +486,15 @@ export const useWorkflowRun = () => {
             draft.resultText = text
           }))
         },
+        onTTSChunk: (messageId: string, audio: string, audioType?: string) => {
+          if (!audio || audio === '')
+            return
+          player.playAudioWithAudio(audio, true)
+          AudioPlayerManager.getInstance().resetMsgId(messageId)
+        },
+        onTTSEnd: (messageId: string, audio: string, audioType?: string) => {
+          player.playAudioWithAudio(audio, false)
+        },
         ...restCallback,
       },
     )

+ 3 - 0
web/i18n/en-US/app-debug.ts

@@ -323,6 +323,9 @@ const translation = {
       language: 'Language',
       resolutionTooltip: 'Text-to-speech voice support language。',
       voice: 'Voice',
+      autoPlay: 'Auto Play',
+      autoPlayEnabled: 'Turn On',
+      autoPlayDisabled: 'Turn Off',
     },
   },
   openingStatement: {

+ 3 - 0
web/i18n/ja-JP/app-debug.ts

@@ -319,6 +319,9 @@ const translation = {
       language: '言語',
       resolutionTooltip: 'テキスト読み上げの音声言語をサポートします。',
       voice: '音声',
+      autoPlay: '自動再生',
+      autoPlayEnabled: '開ける',
+      autoPlayDisabled: '關閉',
     },
   },
   openingStatement: {

+ 3 - 0
web/i18n/zh-Hans/app-debug.ts

@@ -319,6 +319,9 @@ const translation = {
       language: '语言',
       resolutionTooltip: '文本转语音音色支持语言。',
       voice: '音色',
+      autoPlay: '自动播放',
+      autoPlayEnabled: '开启',
+      autoPlayDisabled: '关闭',
     },
   },
   openingStatement: {

+ 3 - 0
web/i18n/zh-Hant/app-debug.ts

@@ -318,6 +318,9 @@ const translation = {
       language: '語言',
       resolutionTooltip: '文字轉語音音色支援語言。',
       voice: '音色',
+      autoPlay: '自動播放',
+      autoPlayEnabled: '開啟',
+      autoPlayDisabled: '關閉',
     },
   },
   openingStatement: {

+ 2 - 1
web/models/debug.ts

@@ -1,4 +1,4 @@
-import type { AgentStrategy, ModelModeType, RETRIEVE_TYPE, ToolItem } from '@/types/app'
+import type { AgentStrategy, ModelModeType, RETRIEVE_TYPE, ToolItem, TtsAutoPlay } from '@/types/app'
 export type Inputs = Record<string, string | number | object>

 export enum PromptMode {
@@ -79,6 +79,7 @@ export type TextToSpeechConfig = {
   enabled: boolean
   voice?: string
   language?: string
+  autoPlay?: TtsAutoPlay
 }

 export type CitationConfig = MoreLikeThisConfig

+ 1 - 0
web/next.config.js

@@ -34,6 +34,7 @@ const nextConfig = {
     // https://nextjs.org/docs/api-reference/next.config.js/ignoring-typescript-errors
     ignoreBuildErrors: true,
   },
+  reactStrictMode: true,
   async redirects() {
     return [
       {

+ 1 - 0
web/service/apps.ts

@@ -120,6 +120,7 @@ export const generationIntroduction: Fetcher<GenerationIntroductionResponse, { u
 }

 export const fetchAppVoices: Fetcher<AppVoicesListResponse, { appId: string; language?: string }> = ({ appId, language }) => {
+  language = language || 'en-US'
   return get<AppVoicesListResponse>(`apps/${appId}/text-to-audio/voices?language=${language}`)
 }


+ 19 - 3
web/service/base.ts

@@ -19,6 +19,7 @@ const TIME_OUT = 100000
 const ContentType = {
   json: 'application/json',
   stream: 'text/event-stream',
+  audio: 'audio/mpeg',
   form: 'application/x-www-form-urlencoded; charset=UTF-8',
   download: 'application/octet-stream', // for download
   upload: 'multipart/form-data', // for upload
@@ -59,6 +60,8 @@ export type IOnIterationStarted = (workflowStarted: IterationStartedResponse) =>
 export type IOnIterationNexted = (workflowStarted: IterationNextedResponse) => void
 export type IOnIterationFinished = (workflowFinished: IterationFinishedResponse) => void
 export type IOnTextChunk = (textChunk: TextChunkResponse) => void
+export type IOnTTSChunk = (messageId: string, audioStr: string, audioType?: string) => void
+export type IOnTTSEnd = (messageId: string, audioStr: string, audioType?: string) => void
 export type IOnTextReplace = (textReplace: TextReplaceResponse) => void

 export type IOtherOptions = {
@@ -84,6 +87,8 @@ export type IOtherOptions = {
   onIterationNext?: IOnIterationNexted
   onIterationFinish?: IOnIterationFinished
   onTextChunk?: IOnTextChunk
+  onTTSChunk?: IOnTTSChunk
+  onTTSEnd?: IOnTTSEnd
   onTextReplace?: IOnTextReplace
 }

@@ -135,6 +140,8 @@ const handleStream = (
   onIterationNext?: IOnIterationNexted,
   onIterationFinish?: IOnIterationFinished,
   onTextChunk?: IOnTextChunk,
+  onTTSChunk?: IOnTTSChunk,
+  onTTSEnd?: IOnTTSEnd,
   onTextReplace?: IOnTextReplace,
 ) => {
   if (!response.ok)
@@ -227,6 +234,12 @@ const handleStream = (
             else if (bufferObj.event === 'text_replace') {
               onTextReplace?.(bufferObj as TextReplaceResponse)
             }
+            else if (bufferObj.event === 'tts_message') {
+              onTTSChunk?.(bufferObj.message_id, bufferObj.audio, bufferObj.audio_type)
+            }
+            else if (bufferObj.event === 'tts_message_end') {
+              onTTSEnd?.(bufferObj.message_id, bufferObj.audio)
+            }
           }
         })
         buffer = lines[lines.length - 1]
@@ -390,9 +403,10 @@ const baseFetch = <T>(
           }

           // return data
-          const data: Promise<T> = options.headers.get('Content-type') === ContentType.download ? res.blob() : res.json()
+          if (options.headers.get('Content-type') === ContentType.download || options.headers.get('Content-type') === ContentType.audio)
+            resolve(needAllResponseContent ? resClone : res.blob())
 
 
-          resolve(needAllResponseContent ? resClone : data)
+          else resolve(needAllResponseContent ? resClone : res.json())
         })
         .catch((err) => {
           if (!silent)
@@ -475,6 +489,8 @@ export const ssePost = (
     onIterationNext,
     onIterationFinish,
     onTextChunk,
+    onTTSChunk,
+    onTTSEnd,
     onTextReplace,
     onError,
     getAbortController,
@@ -527,7 +543,7 @@ export const ssePost = (
           return
         }
         onData?.(str, isFirstMessage, moreInfo)
-      }, onCompleted, onThought, onMessageEnd, onMessageReplace, onFile, onWorkflowStarted, onWorkflowFinished, onNodeStarted, onNodeFinished, onIterationStart, onIterationNext, onIterationFinish, onTextChunk, onTextReplace)
+      }, onCompleted, onThought, onMessageEnd, onMessageReplace, onFile, onWorkflowStarted, onWorkflowFinished, onNodeStarted, onNodeFinished, onIterationStart, onIterationNext, onIterationFinish, onTextChunk, onTTSChunk, onTTSEnd, onTextReplace)
     }).catch((e) => {
       if (e.toString() !== 'AbortError: The user aborted a request.')
         Toast.notify({ type: 'error', message: e })
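A hedged sketch of wiring the new callbacks into ssePost (the url and body are placeholders, and real callers forward the chunks to the shared AudioPlayer as the chat and workflow hooks above do):

  ssePost('chat-messages', { body: { query: 'hi', response_mode: 'streaming' } }, {
    onData: () => {},
    onCompleted: () => {},
    onTTSChunk: (messageId, audio) => console.log('tts chunk for', messageId, audio.length),
    onTTSEnd: messageId => console.log('tts finished for', messageId),
  })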

+ 9 - 3
web/service/share.ts

@@ -1,4 +1,4 @@
-import type { IOnCompleted, IOnData, IOnError, IOnFile, IOnIterationFinished, IOnIterationNexted, IOnIterationStarted, IOnMessageEnd, IOnMessageReplace, IOnNodeFinished, IOnNodeStarted, IOnTextChunk, IOnTextReplace, IOnThought, IOnWorkflowFinished, IOnWorkflowStarted } from './base'
+import type { IOnCompleted, IOnData, IOnError, IOnFile, IOnIterationFinished, IOnIterationNexted, IOnIterationStarted, IOnMessageEnd, IOnMessageReplace, IOnNodeFinished, IOnNodeStarted, IOnTTSChunk, IOnTTSEnd, IOnTextChunk, IOnTextReplace, IOnThought, IOnWorkflowFinished, IOnWorkflowStarted } from './base'
 import {
   del as consoleDel, get as consoleGet, patch as consolePatch, post as consolePost,
   delPublic as del, getPublic as get, patchPublic as patch, postPublic as post, ssePost,
@@ -30,7 +30,7 @@ export function getUrl(url: string, isInstalledApp: boolean, installedAppId: str
   return isInstalledApp ? `installed-apps/${installedAppId}/${url.startsWith('/') ? url.slice(1) : url}` : url
 }

-export const sendChatMessage = async (body: Record<string, any>, { onData, onCompleted, onThought, onFile, onError, getAbortController, onMessageEnd, onMessageReplace }: {
+export const sendChatMessage = async (body: Record<string, any>, { onData, onCompleted, onThought, onFile, onError, getAbortController, onMessageEnd, onMessageReplace, onTTSChunk, onTTSEnd }: {
   onData: IOnData
   onCompleted: IOnCompleted
   onFile: IOnFile
@@ -39,13 +39,15 @@ export const sendChatMessage = async (body: Record<string, any>, { onData, onCom
   onMessageEnd?: IOnMessageEnd
   onMessageReplace?: IOnMessageReplace
   getAbortController?: (abortController: AbortController) => void
+  onTTSChunk?: IOnTTSChunk
+  onTTSEnd?: IOnTTSEnd
 }, isInstalledApp: boolean, installedAppId = '') => {
   return ssePost(getUrl('chat-messages', isInstalledApp, installedAppId), {
     body: {
       ...body,
       response_mode: 'streaming',
     },
-  }, { onData, onCompleted, onThought, onFile, isPublicAPI: !isInstalledApp, onError, getAbortController, onMessageEnd, onMessageReplace })
+  }, { onData, onCompleted, onThought, onFile, isPublicAPI: !isInstalledApp, onError, getAbortController, onMessageEnd, onMessageReplace, onTTSChunk, onTTSEnd })
 }

 export const stopChatMessageResponding = async (appId: string, taskId: string, isInstalledApp: boolean, installedAppId = '') => {
@@ -214,6 +216,10 @@ export const textToAudio = (url: string, isPublicAPI: boolean, body: FormData) =
   return (getAction('post', !isPublicAPI))(url, { body }, { bodyStringify: false, deleteContentType: true }) as Promise<{ data: string }>
 }

+export const textToAudioStream = (url: string, isPublicAPI: boolean, header: { content_type: string }, body: { streaming: boolean; voice?: string; message_id?: string; text?: string | null | undefined }) => {
+  return (getAction('post', !isPublicAPI))(url, { body, header }, { needAllResponseContent: true })
+}
+
 export const fetchAccessToken = async (appCode: string) => {
   const headers = new Headers()
   headers.append('X-App-Code', appCode)
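A brief usage sketch for the new helper (hedged; the url and ids are placeholders, and the calls assume an async context). Because needAllResponseContent is set, the raw Response comes back and the MPEG stream can be read chunk by chunk, which mirrors what AudioPlayer.loadAudio does above.

  const response: any = await textToAudioStream(
    '/apps/my-app/text-to-audio', // placeholder url
    false,                        // isPublicAPI
    { content_type: 'audio/mpeg' },
    { streaming: true, message_id: 'message-123', text: null },
  )
  const reader = response.body.getReader()
  for (;;) {
    const { value, done } = await reader.read()
    if (done) break
    // `value` is a Uint8Array of MPEG audio, appended to the player's SourceBuffer
  }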

+ 6 - 0
web/types/app.ts

@@ -160,6 +160,7 @@ export type ModelConfig = {
     enabled: boolean
     voice?: string
     language?: string
+    autoPlay?: TtsAutoPlay
   }
   retriever_resource: {
     enabled: boolean
@@ -349,6 +350,11 @@ export enum TransferMethod {
   remote_url = 'remote_url',
 }

+export enum TtsAutoPlay {
+  enabled = 'enabled',
+  disabled = 'disabled',
+}
+
 export const ALLOW_FILE_EXTENSIONS = ['png', 'jpg', 'jpeg', 'webp', 'gif']

 export type VisionSettings = {