Procházet zdrojové kódy

Add query_prefix + Return TED Transcript URL for Downstream Scraping Tasks (#11090)

Tao Wang před 4 měsíci
rodič
revize
af2461cccc

+ 6 - 0
api/core/tools/provider/builtin/duckduckgo/tools/ddgo_img.py

@@ -18,6 +18,12 @@ class DuckDuckGoImageSearchTool(BuiltinTool):
             "size": tool_parameters.get("size"),
             "max_results": tool_parameters.get("max_results"),
         }
+
+        # Prepend the optional, whitespace-trimmed query_prefix to the search keywords
+        query_prefix = tool_parameters.get("query_prefix", "").strip()
+        final_query = f"{query_prefix} {query_dict['keywords']}".strip()
+        query_dict["keywords"] = final_query
+
         response = DDGS().images(**query_dict)
         markdown_result = "\n\n"
         json_result = []

+ 11 - 0
api/core/tools/provider/builtin/duckduckgo/tools/ddgo_img.yaml

@@ -86,3 +86,14 @@ parameters:
       en_US: The size of the image to be searched.
       zh_Hans: 要搜索的图片的大小
     form: form
+  - name: query_prefix
+    label:
+      en_US: Query Prefix
+      zh_Hans: 查询前缀
+    type: string
+    required: false
+    default: ""
+    form: form
+    human_description:
+      en_US: Specific Search e.g. "site:unsplash.com"
+      zh_Hans: 定向搜索 e.g. "site:unsplash.com"

+ 7 - 1
api/core/tools/provider/builtin/duckduckgo/tools/ddgo_news.py

@@ -7,7 +7,7 @@ from core.tools.entities.tool_entities import ToolInvokeMessage
 from core.tools.tool.builtin_tool import BuiltinTool
 
 SUMMARY_PROMPT = """
-User's query: 
+User's query:
 {query}
 
 Here are the news results:
@@ -30,6 +30,12 @@ class DuckDuckGoNewsSearchTool(BuiltinTool):
             "safesearch": "moderate",
             "region": "wt-wt",
         }
+
+        # Prepend the optional, whitespace-trimmed query_prefix to the search keywords
+        query_prefix = tool_parameters.get("query_prefix", "").strip()
+        final_query = f"{query_prefix} {query_dict['keywords']}".strip()
+        query_dict["keywords"] = final_query
+
         try:
             response = list(DDGS().news(**query_dict))
             if not response:

+ 11 - 0
api/core/tools/provider/builtin/duckduckgo/tools/ddgo_news.yaml

@@ -69,3 +69,14 @@ parameters:
       en_US: Whether to pass the news results to llm for summarization.
       zh_Hans: 是否需要将新闻结果传给大模型总结
     form: form
+  - name: query_prefix
+    label:
+      en_US: Query Prefix
+      zh_Hans: 查询前缀
+    type: string
+    required: false
+    default: ""
+    form: form
+    human_description:
+      en_US: Specific Search e.g. "site:msn.com"
+      zh_Hans: 定向搜索 e.g. "site:msn.com"

+ 7 - 2
api/core/tools/provider/builtin/duckduckgo/tools/ddgo_search.py

@@ -7,7 +7,7 @@ from core.tools.entities.tool_entities import ToolInvokeMessage
 from core.tools.tool.builtin_tool import BuiltinTool
 
 SUMMARY_PROMPT = """
-User's query: 
+User's query:
 {query}
 
 Here is the search engine result:
@@ -26,7 +26,12 @@ class DuckDuckGoSearchTool(BuiltinTool):
         query = tool_parameters.get("query")
         max_results = tool_parameters.get("max_results", 5)
         require_summary = tool_parameters.get("require_summary", False)
-        response = DDGS().text(query, max_results=max_results)
+
+        # Prepend the optional query_prefix for the search call only; the summary below still receives the original query
+        query_prefix = tool_parameters.get("query_prefix", "").strip()
+        final_query = f"{query_prefix} {query}".strip()
+
+        response = DDGS().text(final_query, max_results=max_results)
         if require_summary:
             results = "\n".join([res.get("body") for res in response])
             results = self.summary_results(user_id=user_id, content=results, query=query)

+ 11 - 0
api/core/tools/provider/builtin/duckduckgo/tools/ddgo_search.yaml

@@ -39,3 +39,14 @@ parameters:
       en_US: Whether to pass the search results to llm for summarization.
       zh_Hans: 是否需要将搜索结果传给大模型总结
     form: form
+  - name: query_prefix
+    label:
+      en_US: Query Prefix
+      zh_Hans: 查询前缀
+    type: string
+    required: false
+    default: ""
+    form: form
+    human_description:
+      en_US: Specific Search e.g. "site:wikipedia.org"
+      zh_Hans: 定向搜索 e.g. "site:wikipedia.org"

+ 19 - 3
api/core/tools/provider/builtin/duckduckgo/tools/ddgo_video.py

@@ -24,7 +24,7 @@ max-width: 100%; border-radius: 8px;">
 
     def _invoke(self, user_id: str, tool_parameters: dict[str, Any]) -> list[ToolInvokeMessage]:
         query_dict = {
-            "keywords": tool_parameters.get("query"),
+            "keywords": tool_parameters.get("query"),  # LLM's query
             "region": tool_parameters.get("region", "wt-wt"),
             "safesearch": tool_parameters.get("safesearch", "moderate"),
             "timelimit": tool_parameters.get("timelimit"),
@@ -40,6 +40,12 @@ max-width: 100%; border-radius: 8px;">
         # Get proxy URL from parameters
         proxy_url = tool_parameters.get("proxy_url", "").strip()
 
+        query_prefix = tool_parameters.get("query_prefix", "").strip()
+        final_query = f"{query_prefix} {query_dict['keywords']}".strip()
+
+        # Update the keywords in query_dict with the final_query
+        query_dict["keywords"] = final_query
+
         response = DDGS().videos(**query_dict)
 
         # Create HTML result with embedded iframes
@@ -51,9 +57,13 @@ max-width: 100%; border-radius: 8px;">
             embed_html = res.get("embed_html", "")
             description = res.get("description", "")
             content_url = res.get("content", "")
+            transcript_url = None
 
             # Handle TED.com videos
-            if not embed_html and "ted.com/talks" in content_url:
+            if "ted.com/talks" in content_url:
+                # Create transcript URL
+                transcript_url = f"{content_url}/transcript"
+                # Create embed URL
                 embed_url = content_url.replace("www.ted.com", "embed.ted.com")
                 if proxy_url:
                     embed_url = f"{proxy_url}{embed_url}"
@@ -68,8 +78,14 @@ max-width: 100%; border-radius: 8px;">
 
             markdown_result += f"{title}\n\n"
             markdown_result += f"{embed_html}\n\n"
+            if description:
+                markdown_result += f"{description}\n\n"
             markdown_result += "---\n\n"
 
-            json_result.append(self.create_json_message(res))
+            # Add transcript_url to the JSON result if available
+            result_dict = res.copy()
+            if transcript_url:
+                result_dict["transcript_url"] = transcript_url
+            json_result.append(self.create_json_message(result_dict))
 
         return [self.create_text_message(markdown_result)] + json_result

+ 11 - 0
api/core/tools/provider/builtin/duckduckgo/tools/ddgo_video.yaml

@@ -95,3 +95,14 @@ parameters:
       en_US: Proxy URL
       zh_Hans: 视频代理地址
     form: form
+  - name: query_prefix
+    label:
+      en_US: Query Prefix
+      zh_Hans: 查询前缀
+    type: string
+    required: false
+    default: ""
+    form: form
+    human_description:
+      en_US: Specific Search e.g. "site:www.ted.com"
+      zh_Hans: 定向搜索 e.g. "site:www.ted.com"