Ver código fonte

Feat/tool-D-ID (#6278)

Matri 8 meses atrás
pai
commit
4dfa8eedb8

Diferenças do arquivo suprimidas por serem muito extensas
+ 11 - 0
api/core/tools/provider/builtin/did/_assets/icon.svg


+ 21 - 0
api/core/tools/provider/builtin/did/did.py

@@ -0,0 +1,21 @@
+from core.tools.errors import ToolProviderCredentialValidationError
+from core.tools.provider.builtin.did.tools.talks import TalksTool
+from core.tools.provider.builtin_tool_provider import BuiltinToolProviderController
+
+
class DIDProvider(BuiltinToolProviderController):
    """Tool provider for D-ID; validates credentials by invoking the talks tool."""

    def _validate_credentials(self, credentials: dict) -> None:
        """Verify the supplied D-ID credentials by performing a real talks request.

        Raises:
            ToolProviderCredentialValidationError: if the test invocation fails
                for any reason (bad key, network error, rejected request).
        """
        try:
            # Exercise the talks tool end-to-end with a known-good sample payload;
            # any auth/connectivity failure surfaces as an exception here.
            TalksTool().fork_tool_runtime(
                runtime={"credentials": credentials}
            ).invoke(
                user_id='',
                tool_parameters={
                    "source_url": "https://www.d-id.com/wp-content/uploads/2023/11/Hero-image-1.png",
                    "text_input": "Hello, welcome to use D-ID tool in Dify",
                }
            )
        except Exception as e:
            # Chain the original exception so the root cause stays visible in tracebacks.
            raise ToolProviderCredentialValidationError(str(e)) from e

+ 28 - 0
api/core/tools/provider/builtin/did/did.yaml

@@ -0,0 +1,28 @@
+identity:
+  author: Matri Qi
+  name: did
+  label:
+    en_US: D-ID
+  description:
+    en_US: D-ID is a tool enabling the creation of high-quality, custom videos of Digital Humans from a single image.
+  icon: icon.svg
+  tags:
+    - videos
+credentials_for_provider:
+  did_api_key:
+    type: secret-input
+    required: true
+    label:
+      en_US: D-ID API Key
+    placeholder:
+      en_US: Please input your D-ID API key
+    help:
+      en_US: Get your D-ID API key from your D-ID account settings.
+    url: https://studio.d-id.com/account-settings
+  base_url:
+    type: text-input
+    required: false
+    label:
+      en_US: D-ID server's Base URL
+    placeholder:
+      en_US: https://api.d-id.com

+ 87 - 0
api/core/tools/provider/builtin/did/did_appx.py

@@ -0,0 +1,87 @@
+import logging
+import time
+from collections.abc import Mapping
+from typing import Any
+
+import requests
+from requests.exceptions import HTTPError
+
+logger = logging.getLogger(__name__)
+
+
+class DIDApp:
+    def __init__(self, api_key: str | None = None, base_url: str | None = None):
+        self.api_key = api_key
+        self.base_url = base_url or 'https://api.d-id.com'
+        if not self.api_key:
+            raise ValueError('API key is required')
+
+    def _prepare_headers(self, idempotency_key: str | None = None):
+        headers = {'Content-Type': 'application/json', 'Authorization': f'Basic {self.api_key}'}
+        if idempotency_key:
+            headers['Idempotency-Key'] = idempotency_key
+        return headers
+
+    def _request(
+        self,
+        method: str,
+        url: str,
+        data: Mapping[str, Any] | None = None,
+        headers: Mapping[str, str] | None = None,
+        retries: int = 3,
+        backoff_factor: float = 0.3,
+    ) -> Mapping[str, Any] | None:
+        for i in range(retries):
+            try:
+                response = requests.request(method, url, json=data, headers=headers)
+                response.raise_for_status()
+                return response.json()
+            except requests.exceptions.RequestException as e:
+                if i < retries - 1 and isinstance(e, HTTPError) and e.response.status_code >= 500:
+                    time.sleep(backoff_factor * (2**i))
+                else:
+                    raise
+        return None
+
+    def talks(self, wait: bool = True, poll_interval: int = 5, idempotency_key: str | None = None, **kwargs):
+        endpoint = f'{self.base_url}/talks'
+        headers = self._prepare_headers(idempotency_key)
+        data = kwargs['params']
+        logger.debug(f'Send request to {endpoint=} body={data}')
+        response = self._request('POST', endpoint, data, headers)
+        if response is None:
+            raise HTTPError('Failed to initiate D-ID talks after multiple retries')
+        id: str = response['id']
+        if wait:
+            return self._monitor_job_status(id=id, target='talks', poll_interval=poll_interval)
+        return id
+
+    def animations(self, wait: bool = True, poll_interval: int = 5, idempotency_key: str | None = None, **kwargs):
+        endpoint = f'{self.base_url}/animations'
+        headers = self._prepare_headers(idempotency_key)
+        data = kwargs['params']
+        logger.debug(f'Send request to {endpoint=} body={data}')
+        response = self._request('POST', endpoint, data, headers)
+        if response is None:
+            raise HTTPError('Failed to initiate D-ID talks after multiple retries')
+        id: str = response['id']
+        if wait:
+            return self._monitor_job_status(target='animations', id=id, poll_interval=poll_interval)
+        return id
+
+    def check_did_status(self, target: str, id: str):
+        endpoint = f'{self.base_url}/{target}/{id}'
+        headers = self._prepare_headers()
+        response = self._request('GET', endpoint, headers=headers)
+        if response is None:
+            raise HTTPError(f'Failed to check status for talks {id} after multiple retries')
+        return response
+
+    def _monitor_job_status(self, target: str, id: str, poll_interval: int):
+        while True:
+            status = self.check_did_status(target=target, id=id)
+            if status['status'] == 'done':
+                return status
+            elif status['status'] == 'error' or status['status'] == 'rejected':
+                raise HTTPError(f'Talks {id} failed: {status["status"]} {status.get("error",{}).get("description")}')
+            time.sleep(poll_interval)

+ 49 - 0
api/core/tools/provider/builtin/did/tools/animations.py

@@ -0,0 +1,49 @@
+import json
+from typing import Any, Union
+
+from core.tools.entities.tool_entities import ToolInvokeMessage
+from core.tools.provider.builtin.did.did_appx import DIDApp
+from core.tools.tool.builtin_tool import BuiltinTool
+
+
class AnimationsTool(BuiltinTool):
    """Tool that creates an animated video from a source image via the D-ID /animations API."""

    def _invoke(
        self, user_id: str, tool_parameters: dict[str, Any]
    ) -> Union[ToolInvokeMessage, list[ToolInvokeMessage]]:
        """Build the animations request from tool parameters, submit it, and return the result.

        Raises:
            ValueError: when source_url is missing, or a logo URL is given
                without both logo positions.
        """
        # base_url is optional in the provider credentials (see did.yaml), so use
        # .get(); DIDApp falls back to the public endpoint when it is None.
        app = DIDApp(
            api_key=self.runtime.credentials['did_api_key'],
            base_url=self.runtime.credentials.get('base_url'),
        )

        config = {
            'stitch': tool_parameters.get('stitch', True),
            'mute': tool_parameters.get('mute'),
            'result_format': tool_parameters.get('result_format') or 'mp4',
            # Watermark settings declared in animations.yaml; previously they were
            # validated below but never forwarded to the API.
            'logo_url': tool_parameters.get('logo_url'),
            'logo_x': tool_parameters.get('logo_x'),
            'logo_y': tool_parameters.get('logo_y'),
        }
        config = {k: v for k, v in config.items() if v is not None and v != ''}

        options = {
            # .get() avoids an opaque KeyError; the explicit check below reports the problem.
            'source_url': tool_parameters.get('source_url'),
            'driver_url': tool_parameters.get('driver_url'),
            'config': config,
        }
        options = {k: v for k, v in options.items() if v is not None and v != ''}

        if not options.get('source_url'):
            raise ValueError('Source URL is required')

        if config.get('logo_url'):
            # A position of 0 is valid, so test for presence rather than truthiness.
            if config.get('logo_x') is None:
                raise ValueError('Logo X position is required when logo URL is provided')
            if config.get('logo_y') is None:
                raise ValueError('Logo Y position is required when logo URL is provided')

        animations_result = app.animations(params=options, wait=True)

        # Check for an empty result BEFORE serializing: json.dumps({}) == '{}' is
        # truthy, so the original post-dump check could never fire.
        if not animations_result:
            return self.create_text_message('D-ID animations request failed.')

        if not isinstance(animations_result, str):
            animations_result = json.dumps(animations_result, ensure_ascii=False, indent=4)

        return self.create_text_message(animations_result)

+ 86 - 0
api/core/tools/provider/builtin/did/tools/animations.yaml

@@ -0,0 +1,86 @@
+identity:
+  name: animations
+  author: Matri Qi
+  label:
+    en_US: Animations
+description:
+  human:
+    en_US: Animations enables to create videos matching head movements, expressions, emotions, and voice from a driver video and image.
+  llm: Animations enables to create videos matching head movements, expressions, emotions, and voice from a driver video and image.
+parameters:
+  - name: source_url
+    type: string
+    required: true
+    label:
+      en_US: source url
+    human_description:
+      en_US: The URL of the source image to be animated by the driver video, or a selection from the list of provided studio actors.
+    llm_description: The URL of the source image to be animated by the driver video, or a selection from the list of provided studio actors.
+    form: llm
+  - name: driver_url
+    type: string
+    required: false
+    label:
+      en_US: driver url
+    human_description:
+      en_US: The URL of the driver video to drive the animation, or a provided driver name from D-ID.
+    form: form
+  - name: mute
+    type: boolean
+    required: false
+    label:
+      en_US: mute
+    human_description:
+      en_US: Mutes the driver sound in the animated video result, defaults to true
+    form: form
+  - name: stitch
+    type: boolean
+    required: false
+    label:
+      en_US: stitch
+    human_description:
+      en_US: If enabled, the driver video will be stitched with the animated head video.
+    form: form
+  - name: logo_url
+    type: string
+    required: false
+    label:
+      en_US: logo url
+    human_description:
+      en_US: The URL of the logo image to be added to the animation video.
+    form: form
+  - name: logo_x
+    type: number
+    required: false
+    label:
+      en_US: logo position x
+    human_description:
+      en_US: The x position of the logo image in the animation video. It's required when logo url is provided.
+    form: form
+  - name: logo_y
+    type: number
+    required: false
+    label:
+      en_US: logo position y
+    human_description:
+      en_US: The y position of the logo image in the animation video. It's required when logo url is provided.
+    form: form
+  - name: result_format
+    type: string
+    default: mp4
+    required: false
+    label:
+      en_US: result format
+    human_description:
+      en_US: The format of the result video.
+    form: form
+    options:
+      - value: mp4
+        label:
+          en_US: mp4
+      - value: gif
+        label:
+          en_US: gif
+      - value: mov
+        label:
+          en_US: mov

+ 65 - 0
api/core/tools/provider/builtin/did/tools/talks.py

@@ -0,0 +1,65 @@
+import json
+from typing import Any, Union
+
+from core.tools.entities.tool_entities import ToolInvokeMessage
+from core.tools.provider.builtin.did.did_appx import DIDApp
+from core.tools.tool.builtin_tool import BuiltinTool
+
+
class TalksTool(BuiltinTool):
    """Tool that creates a talking-head video from a source image via the D-ID /talks API."""

    def _invoke(
        self, user_id: str, tool_parameters: dict[str, Any]
    ) -> Union[ToolInvokeMessage, list[ToolInvokeMessage]]:
        """Build the talks request from tool parameters, submit it, and return the result.

        Raises:
            ValueError: when source_url is missing, or the field required by the
                chosen script type (text_input / audio_url) is absent.
        """
        # base_url is optional in the provider credentials (see did.yaml), so use
        # .get(); DIDApp falls back to the public endpoint when it is None.
        app = DIDApp(
            api_key=self.runtime.credentials['did_api_key'],
            base_url=self.runtime.credentials.get('base_url'),
        )

        # driver_expressions arrives as a JSON-array string (see talks.yaml).
        driver_expressions_str = tool_parameters.get('driver_expressions')
        driver_expressions = json.loads(driver_expressions_str) if driver_expressions_str else None

        script = {
            'type': tool_parameters.get('script_type') or 'text',
            'input': tool_parameters.get('text_input'),
            'audio_url': tool_parameters.get('audio_url'),
            'reduce_noise': tool_parameters.get('audio_reduce_noise', False),
        }
        script = {k: v for k, v in script.items() if v is not None and v != ''}
        config = {
            'stitch': tool_parameters.get('stitch', True),
            'sharpen': tool_parameters.get('sharpen'),
            'fluent': tool_parameters.get('fluent'),
            'result_format': tool_parameters.get('result_format') or 'mp4',
            'pad_audio': tool_parameters.get('pad_audio'),
            'driver_expressions': driver_expressions,
        }
        config = {k: v for k, v in config.items() if v is not None and v != ''}

        options = {
            # .get() avoids an opaque KeyError; the explicit check below reports the problem.
            'source_url': tool_parameters.get('source_url'),
            'driver_url': tool_parameters.get('driver_url'),
            'script': script,
            'config': config,
        }
        options = {k: v for k, v in options.items() if v is not None and v != ''}

        if not options.get('source_url'):
            raise ValueError('Source URL is required')

        # Keep only the fields relevant to the chosen script type, and verify
        # the field that type requires is present.
        if script.get('type') == 'audio':
            script.pop('input', None)
            if not script.get('audio_url'):
                raise ValueError('Audio URL is required for audio script type')

        if script.get('type') == 'text':
            script.pop('audio_url', None)
            script.pop('reduce_noise', None)
            if not script.get('input'):
                raise ValueError('Text input is required for text script type')

        talks_result = app.talks(params=options, wait=True)

        # Check for an empty result BEFORE serializing: json.dumps({}) == '{}' is
        # truthy, so the original post-dump check could never fire.
        if not talks_result:
            return self.create_text_message('D-ID talks request failed.')

        if not isinstance(talks_result, str):
            talks_result = json.dumps(talks_result, ensure_ascii=False, indent=4)

        return self.create_text_message(talks_result)

+ 126 - 0
api/core/tools/provider/builtin/did/tools/talks.yaml

@@ -0,0 +1,126 @@
+identity:
+  name: talks
+  author: Matri Qi
+  label:
+    en_US: Talks
+description:
+  human:
+    en_US: Talks enables the creation of realistic talking head videos from text or audio inputs.
+  llm: Talks enables the creation of realistic talking head videos from text or audio inputs.
+parameters:
+  - name: source_url
+    type: string
+    required: true
+    label:
+      en_US: source url
+    human_description:
+      en_US: The URL of the source image to be animated by the driver video, or a selection from the list of provided studio actors.
+    llm_description: The URL of the source image to be animated by the driver video, or a selection from the list of provided studio actors.
+    form: llm
+  - name: driver_url
+    type: string
+    required: false
+    label:
+      en_US: driver url
+    human_description:
+      en_US: The URL of the driver video to drive the talk, or a provided driver name from D-ID.
+    form: form
+  - name: script_type
+    type: string
+    required: false
+    label:
+      en_US: script type
+    human_description:
+      en_US: The type of the script.
+    form: form
+    options:
+      - value: text
+        label:
+          en_US: text
+      - value: audio
+        label:
+          en_US: audio
+  - name: text_input
+    type: string
+    required: false
+    label:
+      en_US: text input
+    human_description:
+      en_US: The text input to be spoken by the talking head. Required when script type is text.
+    form: form
+  - name: audio_url
+    type: string
+    required: false
+    label:
+      en_US: audio url
+    human_description:
+      en_US: The URL of the audio file to be spoken by the talking head. Required when script type is audio.
+    form: form
+  - name: audio_reduce_noise
+    type: boolean
+    required: false
+    label:
+      en_US: audio reduce noise
+    human_description:
+      en_US: If enabled, the audio will be processed to reduce noise before being spoken by the talking head. It only works when script type is audio.
+    form: form
+  - name: stitch
+    type: boolean
+    required: false
+    label:
+      en_US: stitch
+    human_description:
+      en_US: If enabled, the driver video will be stitched with the talking head video.
+    form: form
+  - name: sharpen
+    type: boolean
+    required: false
+    label:
+      en_US: sharpen
+    human_description:
+      en_US: If enabled, the talking head video will be sharpened.
+    form: form
+  - name: result_format
+    type: string
+    required: false
+    label:
+      en_US: result format
+    human_description:
+      en_US: The format of the result video.
+    form: form
+    options:
+      - value: mp4
+        label:
+          en_US: mp4
+      - value: gif
+        label:
+          en_US: gif
+      - value: mov
+        label:
+          en_US: mov
+  - name: fluent
+    type: boolean
+    required: false
+    label:
+      en_US: fluent
+    human_description:
+      en_US: Interpolate between the last and first frames of the driver video. When used together with pad_audio, this can create a seamless transition between videos of the same driver.
+    form: form
+  - name: pad_audio
+    type: number
+    required: false
+    label:
+      en_US: pad audio
+    human_description:
+      en_US: Pad the audio with silence at the end (given in seconds). This will increase the video duration and the credits it consumes.
+    form: form
+    min: 1
+    max: 60
+  - name: driver_expressions
+    type: string
+    required: false
+    label:
+      en_US: driver expressions
+    human_description:
+      en_US: Timed expressions for the animation. It should be a JSON-array style string. See the D-ID documentation (https://docs.d-id.com/reference/createtalk) for more information.
+    form: form

Alguns arquivos não foram mostrados porque muitos arquivos mudaram nesse diff