فهرست منبع

feat: optimize generation of conversation title (#1075)

takatost 1 سال پیش
والد
کامیت
df6604a734
3 فایلهای تغییر یافته به همراه 73 افزوده شده و 10 حذف شده
  1. 2 1
      .github/workflows/check_no_chinese_comments.py
  2. 10 2
      api/core/generator/llm_generator.py
  3. 61 7
      api/core/prompt/prompts.py

+ 2 - 1
.github/workflows/check_no_chinese_comments.py

@@ -20,7 +20,8 @@ def check_file_for_chinese_comments(file_path):
 def main():
     has_chinese = False
     excluded_files = ["model_template.py", 'stopwords.py', 'commands.py',
-                      'indexing_runner.py', 'web_reader_tool.py', 'spark_provider.py']
+                      'indexing_runner.py', 'web_reader_tool.py', 'spark_provider.py',
+                      'prompts.py']
 
     for root, _, files in os.walk("."):
         for file in files:

+ 10 - 2
api/core/generator/llm_generator.py

@@ -1,3 +1,4 @@
+import json
 import logging
 
 from langchain.schema import OutputParserException
@@ -22,18 +23,25 @@ class LLMGenerator:
         if len(query) > 2000:
             query = query[:300] + "...[TRUNCATED]..." + query[-300:]
 
-        prompt = prompt.format(query=query)
+        query = query.replace("\n", " ")
+
+        prompt += query + "\n"
 
         model_instance = ModelFactory.get_text_generation_model(
             tenant_id=tenant_id,
             model_kwargs=ModelKwargs(
-                max_tokens=50
+                temperature=1,
+                max_tokens=100
             )
         )
 
         prompts = [PromptMessage(content=prompt)]
         response = model_instance.run(prompts)
         answer = response.content
+
+        result_dict = json.loads(answer)
+        answer = result_dict['Your Output']
+
         return answer.strip()
 
     @classmethod

+ 61 - 7
api/core/prompt/prompts.py

@@ -1,10 +1,64 @@
-CONVERSATION_TITLE_PROMPT = (
-    "Human:{query}\n-----\n"
-    "Help me summarize the intent of what the human said and provide a title, the title should not exceed 20 words.\n"
-    "If what the human said is conducted in English, you should only return an English title.\n" 
-    "If what the human said is conducted in Chinese, you should only return a Chinese title.\n"
-    "title:"
-)
+# Written by YORKI MINAKO🤡
+CONVERSATION_TITLE_PROMPT = """You need to decompose the user's input into "subject" and "intention" in order to accurately figure out what the user's input language actually is. 
+Notice: the language type user using is abundant, can be English, Chinese, Español, Arabic, Japanese, and etc.
+MAKE SURE your output is the SAME language as the user's input!
+Your output is restricted only to: (Input language) Intention + Subject(short as possible)
+
+Tip: When the user's question is directed at you (the language model), you can add an emoji to make it more fun.
+
+
+example 1:
+User Input: hi, yesterday i had some burgers.
+{
+  "Language Type": "The user's input is pure English",
+  "Your Reasoning": "The language of my output must be pure English.",
+  "Your Output": "sharing yesterday's food"
+}
+
+example 2:
+User Input: hello
+{
+  "Language Type": "The user's input is written in pure English",
+  "Your Reasoning": "The language of my output must be pure English.",
+  "Your Output": "Greeting myself☺️"
+}
+
+
+example 3:
+User Input: why mmap file: oom
+{
+  "Language Type": "The user's input is written in pure English",
+  "Your Reasoning": "The language of my output must be pure English.",
+  "Your Output": "Asking about the reason for mmap file: oom"
+}
+
+
+example 4:
+User Input: www.convinceme.yesterday-you-ate-seafood.tv讲了什么?
+{
+  "Language Type": "The user's input English-Chinese mixed",
+  "Your Reasoning": "The English-part is an URL, the main intention is still written in Chinese, so the language of my output must be using Chinese.",
+  "Your Output": "询问网站www.convinceme.yesterday-you-ate-seafood.tv"
+}
+
+example 5:
+User Input: why小红的年龄is老than小明?
+{
+  "Language Type": "The user's input is English-Chinese mixed",
+  "Your Reasoning": "The English parts are subjective particles, the main intention is written in Chinese, besides, Chinese occupies a greater \"actual meaning\" than English, so the language of my output must be using Chinese.",
+  "Your Output": "询问小红和小明的年龄"
+}
+
+example 6:
+User Input: yo, 你今天咋样?
+{
+  "Language Type": "The user's input is English-Chinese mixed",
+  "Your Reasoning": "The English-part is a subjective particle, the main intention is written in Chinese, so the language of my output must be using Chinese.",
+  "Your Output": "查询今日我的状态☺️"
+}
+
+User Input: 
+"""
 
 CONVERSATION_SUMMARY_PROMPT = (
     "Please generate a short summary of the following conversation.\n"