
Add tts document&fix bug (#2156)

Co-authored-by: luowei <glpat-EjySCyNjWiLqAED-YmwM>
Co-authored-by: crazywoola <427733928@qq.com>
Co-authored-by: crazywoola <100913391+crazywoola@users.noreply.github.com>
Co-authored-by: Yeuoly <45712896+Yeuoly@users.noreply.github.com>
Charlie.Wei, 1 year ago
parent
commit
fd5c45ae10

+ 1 - 0
api/controllers/console/explore/audio.py

@@ -32,6 +32,7 @@ class ChatAudioApi(InstalledAppResource):
             response = AudioService.transcript_asr(
                 tenant_id=app_model.tenant_id,
                 file=file,
+                end_user=None
             )
 
             return response

+ 2 - 1
api/controllers/service_api/app/audio.py

@@ -66,6 +66,7 @@ class TextApi(AppApiResource):
         parser = reqparse.RequestParser()
         parser.add_argument('text', type=str, required=True, nullable=False, location='json')
         parser.add_argument('user', type=str, required=True, nullable=False, location='json')
+        parser.add_argument('streaming', type=bool, required=False, nullable=False, location='json')
         args = parser.parse_args()
 
         try:
@@ -73,7 +74,7 @@ class TextApi(AppApiResource):
                 tenant_id=app_model.tenant_id,
                 text=args['text'],
                 end_user=args['user'],
-                streaming=False
+                streaming=args['streaming']
             )
 
             return response
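One subtlety in the `streaming` argument added above: with `required=False` and no `default`, reqparse returns `None` when the field is absent, so `streaming=args['streaming']` may pass `None` downstream. A minimal sketch of a safe coercion (plain Python standing in for the reqparse call; the helper name is hypothetical, not part of this commit):

```python
# Hypothetical helper mirroring the optional 'streaming' field parsed by
# TextApi above: an absent field comes back as None, so coerce to bool.
def parse_streaming(body: dict) -> bool:
    value = body.get("streaming")  # reqparse would yield None if absent
    return bool(value) if value is not None else False
```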

+ 1 - 0
api/controllers/web/audio.py

@@ -31,6 +31,7 @@ class AudioApi(WebApiResource):
             response = AudioService.transcript_asr(
                 tenant_id=app_model.tenant_id,
                 file=file,
+                end_user=end_user
             )
 
             return response

+ 1 - 0
api/core/model_runtime/README.md

@@ -13,6 +13,7 @@ This module provides the interface for invoking and authenticating various model
   - `Text Embedding Model` - Text Embedding, pre-computed tokens capability
   - `Rerank Model` - Segment Rerank capability
   - `Speech-to-text Model` - Speech to text capability
+  - `Text-to-speech Model` - Text to speech capability
   - `Moderation` - Moderation capability
 
 - Model provider display

+ 1 - 0
api/core/model_runtime/README_CN.md

@@ -13,6 +13,7 @@
  - `Text Embedding Model` - 文本 Embedding ,预计算 tokens 能力
   - `Rerank Model` - 分段 Rerank 能力
   - `Speech-to-text Model` - 语音转文本能力
+  - `Text-to-speech Model` - 文本转语音能力
   - `Moderation` - Moderation 能力
 
 - 模型供应商展示

+ 41 - 3
api/core/model_runtime/docs/en_US/interfaces.md

@@ -299,9 +299,7 @@ Inherit the `__base.speech2text_model.Speech2TextModel` base class and implement
 - Invoke Invocation
 
   ```python
-  def _invoke(self, model: str, credentials: dict,
-              file: IO[bytes], user: Optional[str] = None) \
-          -> str:
+  def _invoke(self, model: str, credentials: dict, file: IO[bytes], user: Optional[str] = None) -> str:
       """
       Invoke large language model
   
@@ -331,6 +329,46 @@ Inherit the `__base.speech2text_model.Speech2TextModel` base class and implement
 
     The string after speech-to-text conversion.
 
+### Text2speech
+
+Inherit the `__base.text2speech_model.Text2SpeechModel` base class and implement the following interfaces:
+
+- Invoke Invocation
+
+  ```python
+  def _invoke(self, model: str, credentials: dict, content_text: str, streaming: bool, user: Optional[str] = None):
+      """
+      Invoke text-to-speech model
+  
+      :param model: model name
+      :param credentials: model credentials
+      :param content_text: text content to be converted
+      :param streaming: whether to stream the output
+      :param user: unique user id
+      :return: generated audio stream or file
+      """
+  ```
+
+  - Parameters:
+
+    - `model` (string) Model name
+
+    - `credentials` (object) Credential information
+
+      The parameters of credential information are defined by either the `provider_credential_schema` or `model_credential_schema` in the provider's YAML configuration file. Inputs such as `api_key` are included.
+
+    - `content_text` (string) The text content that needs to be converted
+
+    - `streaming` (bool) Whether to stream output
+
+    - `user` (string) [optional] Unique identifier of the user
+
+      This can help the provider monitor and detect abusive behavior.
+
+  - Returns:
+
+    The audio stream converted from the text.
+
 ### Moderation
 
 Inherit the `__base.moderation_model.ModerationModel` base class and implement the following interfaces:
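The Text2speech `_invoke` contract documented in the diff above can be sketched as a concrete implementation. This is an illustrative stand-in only: the real base class is `__base.text2speech_model.Text2SpeechModel`, and the chunk size, class name, and `_synthesize` helper are assumptions, not part of the commit:

```python
from typing import Iterator, Optional, Union

class DummyText2SpeechModel:
    """Sketch of a provider implementing the documented _invoke contract."""

    CHUNK_SIZE = 1024  # bytes per streamed chunk (an assumption)

    def _invoke(self, model: str, credentials: dict, content_text: str,
                streaming: bool, user: Optional[str] = None
                ) -> Union[bytes, Iterator[bytes]]:
        audio = self._synthesize(model, credentials, content_text)
        if streaming:
            # Yield fixed-size chunks so the caller can play audio as it arrives.
            return (audio[i:i + self.CHUNK_SIZE]
                    for i in range(0, len(audio), self.CHUNK_SIZE))
        return audio

    def _synthesize(self, model: str, credentials: dict, text: str) -> bytes:
        # Placeholder: a real provider would call its TTS API here.
        return b"\x00" * 4096
```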

+ 1 - 0
api/core/model_runtime/docs/en_US/provider_scale_out.md

@@ -94,6 +94,7 @@ The currently supported model types are as follows:
 - `text_embedding` Text Embedding model
 - `rerank` Rerank model
 - `speech2text` Speech to text
+- `tts` Text to speech
 - `moderation` Moderation
 
 Continuing with `Anthropic` as an example, since `Anthropic` only supports LLM, we create a `module` named `llm` in `model_providers.anthropic`.

+ 5 - 0
api/core/model_runtime/docs/en_US/schema.md

@@ -47,6 +47,10 @@
   - `max_chunks` (int) Maximum number of chunks (available for model types `text-embedding`, `moderation`)
   - `file_upload_limit` (int) Maximum file upload limit, in MB (available for model type `speech2text`)
   - `supported_file_extensions` (string) Supported file extension formats, e.g., mp3, mp4 (available for model type `speech2text`)
+  - `default_voice` (string) Default voice, e.g.: alloy, echo, fable, onyx, nova, shimmer (available for model type `tts`)
+  - `word_limit` (int) Word limit per single conversion; text is split by paragraph by default (available for model type `tts`)
+  - `audio_type` (string) Supported audio file extension formats, e.g.: mp3, wav (available for model type `tts`)
+  - `max_workers` (int) Number of concurrent workers for text-to-audio conversion (available for model type `tts`)
   - `max_characters_per_chunk` (int) Maximum characters per chunk (available for model type `moderation`)
 - `parameter_rules` (array[[ParameterRule](#ParameterRule)]) [optional] Model invocation parameter rules
 - `pricing` ([PriceConfig](#PriceConfig)) [optional] Pricing information
@@ -58,6 +62,7 @@
 - `text-embedding` Text Embedding model
 - `rerank` Rerank model
 - `speech2text` Speech to text
+- `tts` Text to speech
 - `moderation` Moderation
 
 ### ConfigurateMethod

+ 1 - 0
api/core/model_runtime/docs/zh_Hans/customizable_model_scale_out.md

@@ -23,6 +23,7 @@
 - `text_embedding` 文本 Embedding 模型
 - `rerank` Rerank 模型
 - `speech2text` 语音转文字
+- `tts` 文字转语音
 - `moderation` 审查
 
 `Xinference`支持`LLM`和`Text Embedding`和Rerank,那么我们开始编写`xinference.yaml`。

+ 40 - 0
api/core/model_runtime/docs/zh_Hans/interfaces.md

@@ -369,6 +369,46 @@ class XinferenceProvider(Provider):
 
     语音转换后的字符串。
 
+### Text2speech
+
+继承 `__base.text2speech_model.Text2SpeechModel` 基类,实现以下接口:
+
+- Invoke 调用
+
+  ```python
+  def _invoke(self, model: str, credentials: dict, content_text: str, streaming: bool, user: Optional[str] = None):
+      """
+      Invoke text-to-speech model
+  
+      :param model: model name
+      :param credentials: model credentials
+      :param content_text: text content to be converted
+      :param streaming: whether to stream the output
+      :param user: unique user id
+      :return: generated audio stream or file
+      """
+  ```
+
+  - 参数:
+
+    - `model` (string) 模型名称
+
+    - `credentials` (object) 凭据信息
+
+      凭据信息的参数由供应商 YAML 配置文件的 `provider_credential_schema` 或 `model_credential_schema` 定义,传入如:`api_key` 等。
+
+    - `content_text` (string) 需要转换的文本内容
+
+    - `streaming` (bool) 是否进行流式输出
+
+    - `user` (string) [optional] 用户的唯一标识符
+
+      可以帮助供应商监控和检测滥用行为。
+
+  - 返回:
+
+    文本转换后的语音流。
+
 ### Moderation
 
 继承 `__base.moderation_model.ModerationModel` 基类,实现以下接口:

+ 1 - 0
api/core/model_runtime/docs/zh_Hans/predefined_model_scale_out.md

@@ -10,6 +10,7 @@
 - `text_embedding` 文本 Embedding 模型
 - `rerank` Rerank 模型
 - `speech2text` 语音转文字
+- `tts` 文字转语音
 - `moderation` 审查
 
 依旧以 `Anthropic` 为例,`Anthropic` 仅支持 LLM,因此在 `model_providers.anthropic` 创建一个 `llm` 为名称的 `module`。

+ 5 - 0
api/core/model_runtime/docs/zh_Hans/schema.md

@@ -48,6 +48,10 @@
   - `max_chunks` (int) 最大分块数量 (模型类型 `text-embedding ` `moderation` 可用)
   - `file_upload_limit` (int) 文件最大上传限制,单位:MB。(模型类型 `speech2text` 可用)
   - `supported_file_extensions` (string)  支持文件扩展格式,如:mp3,mp4(模型类型 `speech2text` 可用)
+  - `default_voice` (string)  缺省音色,可选:alloy,echo,fable,onyx,nova,shimmer(模型类型 `tts` 可用)
+  - `word_limit` (int)  单次转换字数限制,默认按段落分段(模型类型 `tts` 可用)
+  - `audio_type` (string)  支持音频文件扩展格式,如:mp3,wav(模型类型 `tts` 可用)
+  - `max_workers` (int)  支持文字音频转换并发任务数(模型类型 `tts` 可用)
   - `max_characters_per_chunk` (int) 每块最大字符数 (模型类型  `moderation` 可用)
 - `parameter_rules` (array[[ParameterRule](#ParameterRule)]) [optional] 模型调用参数规则
 - `pricing` ([PriceConfig](#PriceConfig)) [optional] 价格信息
@@ -59,6 +63,7 @@
 - `text-embedding` 文本 Embedding 模型
 - `rerank` Rerank 模型
 - `speech2text` 语音转文字
+- `tts` 文字转语音
 - `moderation` 审查
 
 ### ConfigurateMethod

+ 5 - 0
api/core/model_runtime/model_providers/tongyi/tts/tts-1.yaml

@@ -5,3 +5,8 @@ model_properties:
   word_limit: 120
   audio_type: 'mp3'
   max_workers: 5
+pricing:
+  input: '1'
+  output: '0'
+  unit: '0.0001'
+  currency: RMB
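Read together, the pricing fields added to `tts-1.yaml` imply a per-character cost of `input × unit`. A sketch of that multiplication (the exact formula Dify applies is an assumption here, as is the helper name):

```python
from decimal import Decimal

# Sketch: input price × unit scale × character count, using the
# tts-1.yaml values above (input '1', unit '0.0001', currency RMB).
def tts_cost(characters: int, input_price: str = "1",
             unit: str = "0.0001") -> Decimal:
    return Decimal(input_price) * Decimal(unit) * characters
```

Under this reading, converting 1,000 characters would cost 1 × 0.0001 × 1000 = 0.1 RMB.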

+ 0 - 1
api/requirements.txt

@@ -62,7 +62,6 @@ bs4~=0.0.1
 markdown~=3.5.1
 google-generativeai~=0.3.2
 httpx[socks]~=0.24.1
-pydub~=0.25.1
 matplotlib~=3.8.2
 yfinance~=0.2.35
 pydub~=0.25.1

+ 1 - 2
api/services/audio_service.py

@@ -56,7 +56,6 @@ class AudioService:
             raise ProviderNotSupportTextToSpeechServiceError()
 
         try:
-            audio_response = model_instance.invoke_tts(content_text=text.strip(), user=end_user, streaming=streaming)
-            return audio_response
+            return model_instance.invoke_tts(content_text=text.strip(), user=end_user, streaming=streaming)
         except Exception as e:
             raise e