Ver código fonte

Feat/tool-D-ID (#6278)

Matri 8 meses atrás
pai
commit
4dfa8eedb8

Diferenças do arquivo suprimidas por serem muito extensas
+ 11 - 0
api/core/tools/provider/builtin/did/_assets/icon.svg


+ 21 - 0
api/core/tools/provider/builtin/did/did.py

@@ -0,0 +1,21 @@
+from core.tools.errors import ToolProviderCredentialValidationError
+from core.tools.provider.builtin.did.tools.talks import TalksTool
+from core.tools.provider.builtin_tool_provider import BuiltinToolProviderController
+
+
class DIDProvider(BuiltinToolProviderController):
    """Tool provider for D-ID; validates credentials by invoking the talks tool."""

    def _validate_credentials(self, credentials: dict) -> None:
        """Verify the supplied D-ID credentials by performing a real talks request.

        Raises:
            ToolProviderCredentialValidationError: if the test invocation fails
                for any reason (bad key, network error, rejected request).
        """
        try:
            # Exercise the talks tool end-to-end with a known-good sample payload;
            # any auth/connectivity failure surfaces as an exception here.
            TalksTool().fork_tool_runtime(
                runtime={"credentials": credentials}
            ).invoke(
                user_id='',
                tool_parameters={
                    "source_url": "https://www.d-id.com/wp-content/uploads/2023/11/Hero-image-1.png",
                    "text_input": "Hello, welcome to use D-ID tool in Dify",
                }
            )
        except Exception as e:
            # Chain the original exception so the root cause stays visible in tracebacks.
            raise ToolProviderCredentialValidationError(str(e)) from e

+ 28 - 0
api/core/tools/provider/builtin/did/did.yaml

@@ -0,0 +1,28 @@
+identity:
+  author: Matri Qi
+  name: did
+  label:
+    en_US: D-ID
+  description:
+    en_US: D-ID is a tool enabling the creation of high-quality, custom videos of Digital Humans from a single image.
+  icon: icon.svg
+  tags:
+    - videos
+credentials_for_provider:
+  did_api_key:
+    type: secret-input
+    required: true
+    label:
+      en_US: D-ID API Key
+    placeholder:
+      en_US: Please input your D-ID API key
+    help:
+      en_US: Get your D-ID API key from your D-ID account settings.
+    url: https://studio.d-id.com/account-settings
+  base_url:
+    type: text-input
+    required: false
+    label:
+      en_US: D-ID server's Base URL
+    placeholder:
+      en_US: https://api.d-id.com

+ 87 - 0
api/core/tools/provider/builtin/did/did_appx.py

@@ -0,0 +1,87 @@
+import logging
+import time
+from collections.abc import Mapping
+from typing import Any
+
+import requests
+from requests.exceptions import HTTPError
+
+logger = logging.getLogger(__name__)
+
+
+class DIDApp:
+    def __init__(self, api_key: str | None = None, base_url: str | None = None):
+        self.api_key = api_key
+        self.base_url = base_url or 'https://api.d-id.com'
+        if not self.api_key:
+            raise ValueError('API key is required')
+
+    def _prepare_headers(self, idempotency_key: str | None = None):
+        headers = {'Content-Type': 'application/json', 'Authorization': f'Basic {self.api_key}'}
+        if idempotency_key:
+            headers['Idempotency-Key'] = idempotency_key
+        return headers
+
+    def _request(
+        self,
+        method: str,
+        url: str,
+        data: Mapping[str, Any] | None = None,
+        headers: Mapping[str, str] | None = None,
+        retries: int = 3,
+        backoff_factor: float = 0.3,
+    ) -> Mapping[str, Any] | None:
+        for i in range(retries):
+            try:
+                response = requests.request(method, url, json=data, headers=headers)
+                response.raise_for_status()
+                return response.json()
+            except requests.exceptions.RequestException as e:
+                if i < retries - 1 and isinstance(e, HTTPError) and e.response.status_code >= 500:
+                    time.sleep(backoff_factor * (2**i))
+                else:
+                    raise
+        return None
+
+    def talks(self, wait: bool = True, poll_interval: int = 5, idempotency_key: str | None = None, **kwargs):
+        endpoint = f'{self.base_url}/talks'
+        headers = self._prepare_headers(idempotency_key)
+        data = kwargs['params']
+        logger.debug(f'Send request to {endpoint=} body={data}')
+        response = self._request('POST', endpoint, data, headers)
+        if response is None:
+            raise HTTPError('Failed to initiate D-ID talks after multiple retries')
+        id: str = response['id']
+        if wait:
+            return self._monitor_job_status(id=id, target='talks', poll_interval=poll_interval)
+        return id
+
+    def animations(self, wait: bool = True, poll_interval: int = 5, idempotency_key: str | None = None, **kwargs):
+        endpoint = f'{self.base_url}/animations'
+        headers = self._prepare_headers(idempotency_key)
+        data = kwargs['params']
+        logger.debug(f'Send request to {endpoint=} body={data}')
+        response = self._request('POST', endpoint, data, headers)
+        if response is None:
+            raise HTTPError('Failed to initiate D-ID talks after multiple retries')
+        id: str = response['id']
+        if wait:
+            return self._monitor_job_status(target='animations', id=id, poll_interval=poll_interval)
+        return id
+
+    def check_did_status(self, target: str, id: str):
+        endpoint = f'{self.base_url}/{target}/{id}'
+        headers = self._prepare_headers()
+        response = self._request('GET', endpoint, headers=headers)
+        if response is None:
+            raise HTTPError(f'Failed to check status for talks {id} after multiple retries')
+        return response
+
+    def _monitor_job_status(self, target: str, id: str, poll_interval: int):
+        while True:
+            status = self.check_did_status(target=target, id=id)
+            if status['status'] == 'done':
+                return status
+            elif status['status'] == 'error' or status['status'] == 'rejected':
+                raise HTTPError(f'Talks {id} failed: {status["status"]} {status.get("error",{}).get("description")}')
+            time.sleep(poll_interval)

+ 49 - 0
api/core/tools/provider/builtin/did/tools/animations.py

@@ -0,0 +1,49 @@
+import json
+from typing import Any, Union
+
+from core.tools.entities.tool_entities import ToolInvokeMessage
+from core.tools.provider.builtin.did.did_appx import DIDApp
+from core.tools.tool.builtin_tool import BuiltinTool
+
+
class AnimationsTool(BuiltinTool):
    """Tool that creates an animated video from a source image via the D-ID /animations API."""

    def _invoke(
        self, user_id: str, tool_parameters: dict[str, Any]
    ) -> Union[ToolInvokeMessage, list[ToolInvokeMessage]]:
        """Build the animations request from tool parameters, submit it, and return the result.

        Raises:
            ValueError: when source_url is missing, or a logo URL is given
                without both logo positions.
        """
        # base_url is optional in the provider credentials (see did.yaml), so use
        # .get(); DIDApp falls back to the public endpoint when it is None.
        app = DIDApp(
            api_key=self.runtime.credentials['did_api_key'],
            base_url=self.runtime.credentials.get('base_url'),
        )

        config = {
            'stitch': tool_parameters.get('stitch', True),
            'mute': tool_parameters.get('mute'),
            'result_format': tool_parameters.get('result_format') or 'mp4',
            # Watermark settings declared in animations.yaml; previously they were
            # validated below but never forwarded to the API.
            'logo_url': tool_parameters.get('logo_url'),
            'logo_x': tool_parameters.get('logo_x'),
            'logo_y': tool_parameters.get('logo_y'),
        }
        config = {k: v for k, v in config.items() if v is not None and v != ''}

        options = {
            # .get() avoids an opaque KeyError; the explicit check below reports the problem.
            'source_url': tool_parameters.get('source_url'),
            'driver_url': tool_parameters.get('driver_url'),
            'config': config,
        }
        options = {k: v for k, v in options.items() if v is not None and v != ''}

        if not options.get('source_url'):
            raise ValueError('Source URL is required')

        if config.get('logo_url'):
            # A position of 0 is valid, so test for presence rather than truthiness.
            if config.get('logo_x') is None:
                raise ValueError('Logo X position is required when logo URL is provided')
            if config.get('logo_y') is None:
                raise ValueError('Logo Y position is required when logo URL is provided')

        animations_result = app.animations(params=options, wait=True)

        # Check for an empty result BEFORE serializing: json.dumps({}) == '{}' is
        # truthy, so the original post-dump check could never fire.
        if not animations_result:
            return self.create_text_message('D-ID animations request failed.')

        if not isinstance(animations_result, str):
            animations_result = json.dumps(animations_result, ensure_ascii=False, indent=4)

        return self.create_text_message(animations_result)

+ 86 - 0
api/core/tools/provider/builtin/did/tools/animations.yaml

@@ -0,0 +1,86 @@
+identity:
+  name: animations
+  author: Matri Qi
+  label:
+    en_US: Animations
+description:
+  human:
+    en_US: Animations enables to create videos matching head movements, expressions, emotions, and voice from a driver video and image.
+  llm: Animations enables to create videos matching head movements, expressions, emotions, and voice from a driver video and image.
+parameters:
+  - name: source_url
+    type: string
+    required: true
+    label:
+      en_US: source url
+    human_description:
+      en_US: The URL of the source image to be animated by the driver video, or a selection from the list of provided studio actors.
+    llm_description: The URL of the source image to be animated by the driver video, or a selection from the list of provided studio actors.
+    form: llm
+  - name: driver_url
+    type: string
+    required: false
+    label:
+      en_US: driver url
+    human_description:
+      en_US: The URL of the driver video to drive the animation, or a provided driver name from D-ID.
+    form: form
+  - name: mute
+    type: boolean
+    required: false
+    label:
+      en_US: mute
+    human_description:
+      en_US: Mutes the driver sound in the animated video result, defaults to true
+    form: form
+  - name: stitch
+    type: boolean
+    required: false
+    label:
+      en_US: stitch
+    human_description:
+      en_US: If enabled, the driver video will be stitched with the animated head video.
+    form: form
+  - name: logo_url
+    type: string
+    required: false
+    label:
+      en_US: logo url
+    human_description:
+      en_US: The URL of the logo image to be added to the animation video.
+    form: form
+  - name: logo_x
+    type: number
+    required: false
+    label:
+      en_US: logo position x
+    human_description:
+      en_US: The x position of the logo image in the animation video. It's required when logo url is provided.
+    form: form
+  - name: logo_y
+    type: number
+    required: false
+    label:
+      en_US: logo position y
+    human_description:
+      en_US: The y position of the logo image in the animation video. It's required when logo url is provided.
+    form: form
+  - name: result_format
+    type: string
+    default: mp4
+    required: false
+    label:
+      en_US: result format
+    human_description:
+      en_US: The format of the result video.
+    form: form
+    options:
+      - value: mp4
+        label:
+          en_US: mp4
+      - value: gif
+        label:
+          en_US: gif
+      - value: mov
+        label:
+          en_US: mov

+ 65 - 0
api/core/tools/provider/builtin/did/tools/talks.py

@@ -0,0 +1,65 @@
+import json
+from typing import Any, Union
+
+from core.tools.entities.tool_entities import ToolInvokeMessage
+from core.tools.provider.builtin.did.did_appx import DIDApp
+from core.tools.tool.builtin_tool import BuiltinTool
+
+
class TalksTool(BuiltinTool):
    """Tool that creates a talking-head video from a source image via the D-ID /talks API."""

    def _invoke(
        self, user_id: str, tool_parameters: dict[str, Any]
    ) -> Union[ToolInvokeMessage, list[ToolInvokeMessage]]:
        """Build the talks request from tool parameters, submit it, and return the result.

        Raises:
            ValueError: when source_url is missing, or the field required by the
                chosen script type (text_input / audio_url) is absent.
        """
        # base_url is optional in the provider credentials (see did.yaml), so use
        # .get(); DIDApp falls back to the public endpoint when it is None.
        app = DIDApp(
            api_key=self.runtime.credentials['did_api_key'],
            base_url=self.runtime.credentials.get('base_url'),
        )

        # driver_expressions arrives as a JSON-array string (see talks.yaml).
        driver_expressions_str = tool_parameters.get('driver_expressions')
        driver_expressions = json.loads(driver_expressions_str) if driver_expressions_str else None

        script = {
            'type': tool_parameters.get('script_type') or 'text',
            'input': tool_parameters.get('text_input'),
            'audio_url': tool_parameters.get('audio_url'),
            'reduce_noise': tool_parameters.get('audio_reduce_noise', False),
        }
        script = {k: v for k, v in script.items() if v is not None and v != ''}
        config = {
            'stitch': tool_parameters.get('stitch', True),
            'sharpen': tool_parameters.get('sharpen'),
            'fluent': tool_parameters.get('fluent'),
            'result_format': tool_parameters.get('result_format') or 'mp4',
            'pad_audio': tool_parameters.get('pad_audio'),
            'driver_expressions': driver_expressions,
        }
        config = {k: v for k, v in config.items() if v is not None and v != ''}

        options = {
            # .get() avoids an opaque KeyError; the explicit check below reports the problem.
            'source_url': tool_parameters.get('source_url'),
            'driver_url': tool_parameters.get('driver_url'),
            'script': script,
            'config': config,
        }
        options = {k: v for k, v in options.items() if v is not None and v != ''}

        if not options.get('source_url'):
            raise ValueError('Source URL is required')

        # Keep only the fields relevant to the chosen script type, and verify
        # the field that type requires is present.
        if script.get('type') == 'audio':
            script.pop('input', None)
            if not script.get('audio_url'):
                raise ValueError('Audio URL is required for audio script type')

        if script.get('type') == 'text':
            script.pop('audio_url', None)
            script.pop('reduce_noise', None)
            if not script.get('input'):
                raise ValueError('Text input is required for text script type')

        talks_result = app.talks(params=options, wait=True)

        # Check for an empty result BEFORE serializing: json.dumps({}) == '{}' is
        # truthy, so the original post-dump check could never fire.
        if not talks_result:
            return self.create_text_message('D-ID talks request failed.')

        if not isinstance(talks_result, str):
            talks_result = json.dumps(talks_result, ensure_ascii=False, indent=4)

        return self.create_text_message(talks_result)

+ 126 - 0
api/core/tools/provider/builtin/did/tools/talks.yaml

@@ -0,0 +1,126 @@
+identity:
+  name: talks
+  author: Matri Qi
+  label:
+    en_US: Talks
+description:
+  human:
+    en_US: Talks enables the creation of realistic talking head videos from text or audio inputs.
+  llm: Talks enables the creation of realistic talking head videos from text or audio inputs.
+parameters:
+  - name: source_url
+    type: string
+    required: true
+    label:
+      en_US: source url
+    human_description:
+      en_US: The URL of the source image to be animated by the driver video, or a selection from the list of provided studio actors.
+    llm_description: The URL of the source image to be animated by the driver video, or a selection from the list of provided studio actors.
+    form: llm
+  - name: driver_url
+    type: string
+    required: false
+    label:
+      en_US: driver url
+    human_description:
+      en_US: The URL of the driver video to drive the talk, or a provided driver name from D-ID.
+    form: form
+  - name: script_type
+    type: string
+    required: false
+    label:
+      en_US: script type
+    human_description:
+      en_US: The type of the script.
+    form: form
+    options:
+      - value: text
+        label:
+          en_US: text
+      - value: audio
+        label:
+          en_US: audio
+  - name: text_input
+    type: string
+    required: false
+    label:
+      en_US: text input
+    human_description:
+      en_US: The text input to be spoken by the talking head. Required when script type is text.
+    form: form
+  - name: audio_url
+    type: string
+    required: false
+    label:
+      en_US: audio url
+    human_description:
+      en_US: The URL of the audio file to be spoken by the talking head. Required when script type is audio.
+    form: form
+  - name: audio_reduce_noise
+    type: boolean
+    required: false
+    label:
+      en_US: audio reduce noise
+    human_description:
+      en_US: If enabled, the audio will be processed to reduce noise before being spoken by the talking head. It only works when script type is audio.
+    form: form
+  - name: stitch
+    type: boolean
+    required: false
+    label:
+      en_US: stitch
+    human_description:
+      en_US: If enabled, the driver video will be stitched with the talking head video.
+    form: form
+  - name: sharpen
+    type: boolean
+    required: false
+    label:
+      en_US: sharpen
+    human_description:
+      en_US: If enabled, the talking head video will be sharpened.
+    form: form
+  - name: result_format
+    type: string
+    required: false
+    label:
+      en_US: result format
+    human_description:
+      en_US: The format of the result video.
+    form: form
+    options:
+      - value: mp4
+        label:
+          en_US: mp4
+      - value: gif
+        label:
+          en_US: gif
+      - value: mov
+        label:
+          en_US: mov
+  - name: fluent
+    type: boolean
+    required: false
+    label:
+      en_US: fluent
+    human_description:
+      en_US: Interpolate between the last and first frames of the driver video. When used together with pad_audio, this can create a seamless transition between videos of the same driver.
+    form: form
+  - name: pad_audio
+    type: number
+    required: false
+    label:
+      en_US: pad audio
+    human_description:
+      en_US: Pad the audio with silence at the end (given in seconds). This will increase the video duration and the credits it consumes.
+    form: form
+    min: 1
+    max: 60
+  - name: driver_expressions
+    type: string
+    required: false
+    label:
+      en_US: driver expressions
+    human_description:
+      en_US: Timed expressions for the animation. It should be a JSON-array style string. See the D-ID documentation (https://docs.d-id.com/reference/createtalk) for more information.
+    form: form

Alguns arquivos não foram mostrados porque muitos arquivos mudaram nesse diff