Ver Fonte

Add Tongyi TTS & TTS function optimization (#2177)

Co-authored-by: luowei <glpat-EjySCyNjWiLqAED-YmwM>
Co-authored-by: crazywoola <427733928@qq.com>
Co-authored-by: crazywoola <100913391+crazywoola@users.noreply.github.com>
Charlie.Wei há 1 ano atrás
pai
commit
ac4bb5c35f

+ 98 - 0
api/core/model_runtime/model_providers/__base/tts_model.py

@@ -1,8 +1,13 @@
+import uuid
+import hashlib
+import subprocess
 from abc import abstractmethod
 from abc import abstractmethod
 from typing import Optional
 from typing import Optional
 
 
+from core.model_runtime.errors.invoke import InvokeBadRequestError
 from core.model_runtime.entities.model_entities import ModelType
 from core.model_runtime.entities.model_entities import ModelType
 from core.model_runtime.model_providers.__base.ai_model import AIModel
 from core.model_runtime.model_providers.__base.ai_model import AIModel
+from core.model_runtime.entities.model_entities import ModelPropertyKey
 
 
 
 
 class TTSModel(AIModel):
 class TTSModel(AIModel):
@@ -40,3 +45,96 @@ class TTSModel(AIModel):
         :return: translated audio file
         :return: translated audio file
         """
         """
         raise NotImplementedError
         raise NotImplementedError
+
+    def _get_model_voice(self, model: str, credentials: dict) -> any:
+        """
+        Get voice for given tts model
+
+        :param model: model name
+        :param credentials: model credentials
+        :return: voice
+        """
+        model_schema = self.get_model_schema(model, credentials)
+
+        if model_schema and ModelPropertyKey.DEFAULT_VOICE in model_schema.model_properties:
+            return model_schema.model_properties[ModelPropertyKey.DEFAULT_VOICE]
+
+    def _get_model_audio_type(self, model: str, credentials: dict) -> str:
+        """
+        Get audio type for given tts model
+
+        :param model: model name
+        :param credentials: model credentials
+        :return: audio type
+        """
+        model_schema = self.get_model_schema(model, credentials)
+
+        if model_schema and ModelPropertyKey.AUDOI_TYPE in model_schema.model_properties:
+            return model_schema.model_properties[ModelPropertyKey.AUDOI_TYPE]
+
+    def _get_model_word_limit(self, model: str, credentials: dict) -> int:
+        """
+        Get word limit for given tts model
+        :return: word limit
+        """
+        model_schema = self.get_model_schema(model, credentials)
+
+        if model_schema and ModelPropertyKey.WORD_LIMIT in model_schema.model_properties:
+            return model_schema.model_properties[ModelPropertyKey.WORD_LIMIT]
+
+    def _get_model_workers_limit(self, model: str, credentials: dict) -> int:
+        """
+        Get audio max workers for given tts model
+        :return: max workers
+        """
+        model_schema = self.get_model_schema(model, credentials)
+
+        if model_schema and ModelPropertyKey.MAX_WORKERS in model_schema.model_properties:
+            return model_schema.model_properties[ModelPropertyKey.MAX_WORKERS]
+
+    @staticmethod
+    def _split_text_into_sentences(text: str, limit: int, delimiters=None):
+        if delimiters is None:
+            delimiters = set('。!?;\n')
+
+        buf = []
+        word_count = 0
+        for char in text:
+            buf.append(char)
+            if char in delimiters:
+                if word_count >= limit:
+                    yield ''.join(buf)
+                    buf = []
+                    word_count = 0
+                else:
+                    word_count += 1
+            else:
+                word_count += 1
+
+        if buf:
+            yield ''.join(buf)
+
+    @staticmethod
+    def _is_ffmpeg_installed():
+        try:
+            output = subprocess.check_output("ffmpeg -version", shell=True)
+            if "ffmpeg version" in output.decode("utf-8"):
+                return True
+            else:
+                raise InvokeBadRequestError("ffmpeg is not installed, "
+                                            "details: https://docs.dify.ai/getting-started/install-self-hosted"
+                                            "/install-faq#id-14.-what-to-do-if-this-error-occurs-in-text-to-speech")
+        except Exception:
+            raise InvokeBadRequestError("ffmpeg is not installed, "
+                                        "details: https://docs.dify.ai/getting-started/install-self-hosted"
+                                        "/install-faq#id-14.-what-to-do-if-this-error-occurs-in-text-to-speech")
+
+    # Todo: To improve the streaming function
+    @staticmethod
+    def _get_file_name(file_content: str) -> str:
+        hash_object = hashlib.sha256(file_content.encode())
+        hex_digest = hash_object.hexdigest()
+
+        namespace_uuid = uuid.UUID('a5da6ef9-b303-596f-8e88-bf8fa40f4b31')
+        unique_uuid = uuid.uuid5(namespace_uuid, hex_digest)
+        return str(unique_uuid)

+ 3 - 104
api/core/model_runtime/model_providers/openai/tts/tts.py

@@ -1,18 +1,13 @@
-import uuid
-import hashlib
-import subprocess
 from io import BytesIO
 from io import BytesIO
 from typing import Optional
 from typing import Optional
 from functools import reduce
 from functools import reduce
 from pydub import AudioSegment
 from pydub import AudioSegment
 
 
-from core.model_runtime.entities.model_entities import ModelPropertyKey
 from core.model_runtime.errors.validate import CredentialsValidateFailedError
 from core.model_runtime.errors.validate import CredentialsValidateFailedError
 from core.model_runtime.errors.invoke import InvokeBadRequestError
 from core.model_runtime.errors.invoke import InvokeBadRequestError
 from core.model_runtime.model_providers.__base.tts_model import TTSModel
 from core.model_runtime.model_providers.__base.tts_model import TTSModel
 from core.model_runtime.model_providers.openai._common import _CommonOpenAI
 from core.model_runtime.model_providers.openai._common import _CommonOpenAI
 
 
-from typing_extensions import Literal
 from flask import Response, stream_with_context
 from flask import Response, stream_with_context
 from openai import OpenAI
 from openai import OpenAI
 import concurrent.futures
 import concurrent.futures
@@ -22,9 +17,7 @@ class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel):
     """
     """
     Model class for OpenAI Speech to text model.
     Model class for OpenAI Speech to text model.
     """
     """
-
-    def _invoke(self, model: str, credentials: dict, content_text: str, streaming: bool,
-                user: Optional[str] = None) -> any:
+    def _invoke(self, model: str, credentials: dict, content_text: str, streaming: bool, user: Optional[str] = None) -> any:
         """
         """
         _invoke text2speech model
         _invoke text2speech model
 
 
@@ -65,7 +58,7 @@ class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel):
         except Exception as ex:
         except Exception as ex:
             raise CredentialsValidateFailedError(str(ex))
             raise CredentialsValidateFailedError(str(ex))
 
 
-    def _tts_invoke(self, model: str, credentials: dict, content_text: str, user: Optional[str] = None) -> any:
+    def _tts_invoke(self, model: str, credentials: dict, content_text: str, user: Optional[str] = None) -> Response:
         """
         """
         _tts_invoke text2speech model
         _tts_invoke text2speech model
 
 
@@ -104,8 +97,7 @@ class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel):
             raise InvokeBadRequestError(str(ex))
             raise InvokeBadRequestError(str(ex))
 
 
     # Todo: To improve the streaming function
     # Todo: To improve the streaming function
-    def _tts_invoke_streaming(self, model: str, credentials: dict, content_text: str,
-                              user: Optional[str] = None) -> any:
+    def _tts_invoke_streaming(self, model: str, credentials: dict, content_text: str, user: Optional[str] = None) -> any:
         """
         """
         _tts_invoke_streaming text2speech model
         _tts_invoke_streaming text2speech model
 
 
@@ -131,84 +123,6 @@ class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel):
         except Exception as ex:
         except Exception as ex:
             raise InvokeBadRequestError(str(ex))
             raise InvokeBadRequestError(str(ex))
 
 
-    def _get_model_voice(self, model: str, credentials: dict) -> Literal[
-        "alloy", "echo", "fable", "onyx", "nova", "shimmer"]:
-        """
-        Get voice for given tts model
-
-        :param model: model name
-        :param credentials: model credentials
-        :return: voice
-        """
-        model_schema = self.get_model_schema(model, credentials)
-
-        if model_schema and ModelPropertyKey.DEFAULT_VOICE in model_schema.model_properties:
-            return model_schema.model_properties[ModelPropertyKey.DEFAULT_VOICE]
-
-    def _get_model_audio_type(self, model: str, credentials: dict) -> str:
-        """
-        Get audio type for given tts model
-
-        :param model: model name
-        :param credentials: model credentials
-        :return: voice
-        """
-        model_schema = self.get_model_schema(model, credentials)
-
-        if model_schema and ModelPropertyKey.AUDOI_TYPE in model_schema.model_properties:
-            return model_schema.model_properties[ModelPropertyKey.AUDOI_TYPE]
-
-    def _get_model_word_limit(self, model: str, credentials: dict) -> int:
-        """
-        Get audio type for given tts model
-        :return: audio type
-        """
-        model_schema = self.get_model_schema(model, credentials)
-
-        if model_schema and ModelPropertyKey.WORD_LIMIT in model_schema.model_properties:
-            return model_schema.model_properties[ModelPropertyKey.WORD_LIMIT]
-
-    def _get_model_workers_limit(self, model: str, credentials: dict) -> int:
-        """
-        Get audio max workers for given tts model
-        :return: audio type
-        """
-        model_schema = self.get_model_schema(model, credentials)
-
-        if model_schema and ModelPropertyKey.MAX_WORKERS in model_schema.model_properties:
-            return model_schema.model_properties[ModelPropertyKey.MAX_WORKERS]
-
-    @staticmethod
-    def _split_text_into_sentences(text: str, limit: int, delimiters=None):
-        if delimiters is None:
-            delimiters = set('。!?;\n')
-
-        buf = []
-        word_count = 0
-        for char in text:
-            buf.append(char)
-            if char in delimiters:
-                if word_count >= limit:
-                    yield ''.join(buf)
-                    buf = []
-                    word_count = 0
-                else:
-                    word_count += 1
-            else:
-                word_count += 1
-
-        if buf:
-            yield ''.join(buf)
-
-    @staticmethod
-    def _get_file_name(file_content: str) -> str:
-        hash_object = hashlib.sha256(file_content.encode())
-        hex_digest = hash_object.hexdigest()
-
-        namespace_uuid = uuid.UUID('a5da6ef9-b303-596f-8e88-bf8fa40f4b31')
-        unique_uuid = uuid.uuid5(namespace_uuid, hex_digest)
-        return str(unique_uuid)
-
     def _process_sentence(self, sentence: str, model: str, credentials: dict):
     def _process_sentence(self, sentence: str, model: str, credentials: dict):
         """
         """
         _tts_invoke openai text2speech model api
         _tts_invoke openai text2speech model api
@@ -226,18 +140,3 @@ class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel):
         response = client.audio.speech.create(model=model, voice=voice_name, input=sentence.strip())
         response = client.audio.speech.create(model=model, voice=voice_name, input=sentence.strip())
         if isinstance(response.read(), bytes):
         if isinstance(response.read(), bytes):
             return response.read()
             return response.read()
-
-    @staticmethod
-    def _is_ffmpeg_installed():
-        try:
-            output = subprocess.check_output("ffmpeg -version", shell=True)
-            if "ffmpeg version" in output.decode("utf-8"):
-                return True
-            else:
-                raise InvokeBadRequestError("ffmpeg is not installed, "
-                                            "details: https://docs.dify.ai/getting-started/install-self-hosted"
-                                            "/install-faq#id-14.-what-to-do-if-this-error-occurs-in-text-to-speech")
-        except Exception:
-            raise InvokeBadRequestError("ffmpeg is not installed, "
-                                        "details: https://docs.dify.ai/getting-started/install-self-hosted"
-                                        "/install-faq#id-14.-what-to-do-if-this-error-occurs-in-text-to-speech")

+ 23 - 0
api/core/model_runtime/model_providers/tongyi/_common.py

@@ -0,0 +1,23 @@
+from core.model_runtime.errors.invoke import InvokeError
+
+
+class _CommonTongyi:
+    @staticmethod
+    def _to_credential_kwargs(credentials: dict) -> dict:
+        credentials_kwargs = {
+            "dashscope_api_key": credentials['dashscope_api_key'],
+        }
+
+        return credentials_kwargs
+
+    @property
+    def _invoke_error_mapping(self) -> dict[type[InvokeError], list[type[Exception]]]:
+        """
+        Map model invoke error to unified error
+        The key is the error type thrown to the caller
+        The value is the error type thrown by the model,
+        which needs to be converted into a unified error type for the caller.
+
+        :return: Invoke error mapping
+        """
+        pass

+ 1 - 0
api/core/model_runtime/model_providers/tongyi/tongyi.yaml

@@ -16,6 +16,7 @@ help:
     en_US: https://dashscope.console.aliyun.com/api-key_management
     en_US: https://dashscope.console.aliyun.com/api-key_management
 supported_model_types:
 supported_model_types:
   - llm
   - llm
+  - tts
 configurate_methods:
 configurate_methods:
   - predefined-model
   - predefined-model
 provider_credential_schema:
 provider_credential_schema:

+ 0 - 0
api/core/model_runtime/model_providers/tongyi/tts/__init__.py


+ 7 - 0
api/core/model_runtime/model_providers/tongyi/tts/tts-1.yaml

@@ -0,0 +1,7 @@
+model: tts-1
+model_type: tts
+model_properties:
+  default_voice: 'sambert-zhiru-v1' # For available voices see https://help.aliyun.com/zh/dashscope/model-list
+  word_limit: 120
+  audio_type: 'mp3'
+  max_workers: 5

+ 142 - 0
api/core/model_runtime/model_providers/tongyi/tts/tts.py

@@ -0,0 +1,142 @@
+from io import BytesIO
+from typing import Optional
+from functools import reduce
+from pydub import AudioSegment
+
+from core.model_runtime.errors.validate import CredentialsValidateFailedError
+from core.model_runtime.errors.invoke import InvokeBadRequestError
+from core.model_runtime.model_providers.__base.tts_model import TTSModel
+from core.model_runtime.model_providers.tongyi._common import _CommonTongyi
+
+import dashscope
+from flask import Response, stream_with_context
+import concurrent.futures
+
+
+class TongyiText2SpeechModel(_CommonTongyi, TTSModel):
+    """
+    Model class for Tongyi Text-to-Speech model.
+    """
+    def _invoke(self, model: str, credentials: dict, content_text: str, streaming: bool, user: Optional[str] = None) -> any:
+        """
+        _invoke text2speech model
+
+        :param model: model name
+        :param credentials: model credentials
+        :param content_text: text content to be translated
+        :param streaming: output is streaming
+        :param user: unique user id
+        :return: text translated to audio file
+        """
+        self._is_ffmpeg_installed()
+        audio_type = self._get_model_audio_type(model, credentials)
+        if streaming:
+            return Response(stream_with_context(self._tts_invoke_streaming(model=model,
+                                                                           credentials=credentials,
+                                                                           content_text=content_text,
+                                                                           user=user)),
+                            status=200, mimetype=f'audio/{audio_type}')
+        else:
+            return self._tts_invoke(model=model, credentials=credentials, content_text=content_text, user=user)
+
+    def validate_credentials(self, model: str, credentials: dict, user: Optional[str] = None) -> None:
+        """
+        validate credentials text2speech model
+
+        :param model: model name
+        :param credentials: model credentials
+        :param user: unique user id
+        :return: None; raises CredentialsValidateFailedError if credentials are invalid
+        """
+        try:
+            self._tts_invoke(
+                model=model,
+                credentials=credentials,
+                content_text='Hello world!',
+                user=user
+            )
+        except Exception as ex:
+            raise CredentialsValidateFailedError(str(ex))
+
+    def _tts_invoke(self, model: str, credentials: dict, content_text: str, user: Optional[str] = None) -> Response:
+        """
+        _tts_invoke text2speech model
+
+        :param model: model name
+        :param credentials: model credentials
+        :param content_text: text content to be translated
+        :param user: unique user id
+        :return: text translated to audio file
+        """
+        audio_type = self._get_model_audio_type(model, credentials)
+        word_limit = self._get_model_word_limit(model, credentials)
+        max_workers = self._get_model_workers_limit(model, credentials)
+
+        try:
+            sentences = list(self._split_text_into_sentences(text=content_text, limit=word_limit))
+            audio_bytes_list = list()
+
+            # Create a thread pool and map the function to the list of sentences
+            with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
+                futures = [executor.submit(self._process_sentence, model=model, sentence=sentence,
+                                           credentials=credentials, audio_type=audio_type) for sentence in sentences]
+                for future in futures:
+                    try:
+                        audio_bytes_list.append(future.result())
+                    except Exception as ex:
+                        raise InvokeBadRequestError(str(ex))
+
+            audio_segments = [AudioSegment.from_file(BytesIO(audio_bytes), format=audio_type) for audio_bytes in
+                              audio_bytes_list if audio_bytes]
+            combined_segment = reduce(lambda x, y: x + y, audio_segments)
+            buffer: BytesIO = BytesIO()
+            combined_segment.export(buffer, format=audio_type)
+            buffer.seek(0)
+            return Response(buffer.read(), status=200, mimetype=f"audio/{audio_type}")
+        except Exception as ex:
+            raise InvokeBadRequestError(str(ex))
+
+    # Todo: To improve the streaming function
+    def _tts_invoke_streaming(self, model: str, credentials: dict, content_text: str, user: Optional[str] = None) -> any:
+        """
+        _tts_invoke_streaming text2speech model
+
+        :param model: model name
+        :param credentials: model credentials
+        :param content_text: text content to be translated
+        :param user: unique user id
+        :return: text translated to audio file
+        """
+        # transform credentials to kwargs for model instance
+        dashscope.api_key = credentials.get('dashscope_api_key')
+        voice_name = self._get_model_voice(model, credentials)
+        word_limit = self._get_model_word_limit(model, credentials)
+        audio_type = self._get_model_audio_type(model, credentials)
+        try:
+            sentences = list(self._split_text_into_sentences(text=content_text, limit=word_limit))
+            for sentence in sentences:
+                response = dashscope.audio.tts.SpeechSynthesizer.call(model=voice_name, sample_rate=48000, text=sentence.strip(),
+                                                                      format=audio_type, word_timestamp_enabled=True,
+                                                                      phoneme_timestamp_enabled=True)
+                if isinstance(response.get_audio_data(), bytes):
+                    return response.get_audio_data()
+        except Exception as ex:
+            raise InvokeBadRequestError(str(ex))
+
+    def _process_sentence(self, sentence: str, model: str, credentials: dict, audio_type: str):
+        """
+        _tts_invoke Tongyi text2speech model api
+
+        :param model: model name
+        :param credentials: model credentials
+        :param sentence: text content to be translated
+        :param audio_type: audio file type
+        :return: text translated to audio file
+        """
+        # transform credentials to kwargs for model instance
+        dashscope.api_key = credentials.get('dashscope_api_key')
+        voice_name = self._get_model_voice(model, credentials)
+
+        response = dashscope.audio.tts.SpeechSynthesizer.call(model=voice_name, sample_rate=48000, text=sentence.strip(), format=audio_type)
+        if isinstance(response.get_audio_data(), bytes):
+            return response.get_audio_data()

+ 1 - 1
web/app/components/develop/template/template.en.mdx

@@ -495,7 +495,7 @@ The text generation application offers non-session support and is ideal for tran
 />
 />
 <Row>
 <Row>
   <Col>
   <Col>
-    Text to speech, only supports openai model.
+    Text to speech.
 
 
     ### Request Body
     ### Request Body
 
 

+ 1 - 1
web/app/components/develop/template/template.zh.mdx

@@ -458,7 +458,7 @@ import { Row, Col, Properties, Property, Heading, SubProperty } from '../md.tsx'
 />
 />
 <Row>
 <Row>
   <Col>
   <Col>
-    文字转语音,仅支持 openai 模型
+    文字转语音。
 
 
     ### Request Body
     ### Request Body
 
 

+ 1 - 1
web/app/components/develop/template/template_chat.en.mdx

@@ -845,7 +845,7 @@ Chat applications support session persistence, allowing previous chat history to
 />
 />
 <Row>
 <Row>
   <Col>
   <Col>
-    Text to speech, only supports openai model.
+    Text to speech.
 
 
     ### Request Body
     ### Request Body
 
 

+ 1 - 1
web/app/components/develop/template/template_chat.zh.mdx

@@ -917,7 +917,7 @@ import { Row, Col, Properties, Property, Heading, SubProperty } from '../md.tsx'
 />
 />
 <Row>
 <Row>
   <Col>
   <Col>
-    文字转语音,仅支持 openai 模型
+    文字转语音。
 
 
     ### Request Body
     ### Request Body