|
@@ -2,27 +2,19 @@ import logging
|
|
import re
|
|
import re
|
|
from typing import Optional, List, Union, Tuple
|
|
from typing import Optional, List, Union, Tuple
|
|
|
|
|
|
-from langchain.base_language import BaseLanguageModel
|
|
|
|
-from langchain.callbacks.base import BaseCallbackHandler
|
|
|
|
-from langchain.chat_models.base import BaseChatModel
|
|
|
|
-from langchain.llms import BaseLLM
|
|
|
|
-from langchain.schema import BaseMessage, HumanMessage
|
|
|
|
|
|
+from langchain.schema import BaseMessage
|
|
from requests.exceptions import ChunkedEncodingError
|
|
from requests.exceptions import ChunkedEncodingError
|
|
|
|
|
|
from core.agent.agent_executor import AgentExecuteResult, PlanningStrategy
|
|
from core.agent.agent_executor import AgentExecuteResult, PlanningStrategy
|
|
from core.callback_handler.main_chain_gather_callback_handler import MainChainGatherCallbackHandler
|
|
from core.callback_handler.main_chain_gather_callback_handler import MainChainGatherCallbackHandler
|
|
-from core.constant import llm_constant
|
|
|
|
from core.callback_handler.llm_callback_handler import LLMCallbackHandler
|
|
from core.callback_handler.llm_callback_handler import LLMCallbackHandler
|
|
-from core.callback_handler.std_out_callback_handler import DifyStreamingStdOutCallbackHandler, \
|
|
|
|
- DifyStdOutCallbackHandler
|
|
|
|
from core.conversation_message_task import ConversationMessageTask, ConversationTaskStoppedException
|
|
from core.conversation_message_task import ConversationMessageTask, ConversationTaskStoppedException
|
|
-from core.llm.error import LLMBadRequestError
|
|
|
|
-from core.llm.fake import FakeLLM
|
|
|
|
-from core.llm.llm_builder import LLMBuilder
|
|
|
|
-from core.llm.streamable_chat_open_ai import StreamableChatOpenAI
|
|
|
|
-from core.llm.streamable_open_ai import StreamableOpenAI
|
|
|
|
|
|
+from core.model_providers.error import LLMBadRequestError
|
|
from core.memory.read_only_conversation_token_db_buffer_shared_memory import \
|
|
from core.memory.read_only_conversation_token_db_buffer_shared_memory import \
|
|
ReadOnlyConversationTokenDBBufferSharedMemory
|
|
ReadOnlyConversationTokenDBBufferSharedMemory
|
|
|
|
+from core.model_providers.model_factory import ModelFactory
|
|
|
|
+from core.model_providers.models.entity.message import PromptMessage, to_prompt_messages
|
|
|
|
+from core.model_providers.models.llm.base import BaseLLM
|
|
from core.orchestrator_rule_parser import OrchestratorRuleParser
|
|
from core.orchestrator_rule_parser import OrchestratorRuleParser
|
|
from core.prompt.prompt_builder import PromptBuilder
|
|
from core.prompt.prompt_builder import PromptBuilder
|
|
from core.prompt.prompt_template import JinjaPromptTemplate
|
|
from core.prompt.prompt_template import JinjaPromptTemplate
|
|
@@ -51,12 +43,10 @@ class Completion:
|
|
|
|
|
|
inputs = conversation.inputs
|
|
inputs = conversation.inputs
|
|
|
|
|
|
- rest_tokens_for_context_and_memory = cls.get_validate_rest_tokens(
|
|
|
|
- mode=app.mode,
|
|
|
|
|
|
+ final_model_instance = ModelFactory.get_text_generation_model_from_model_config(
|
|
tenant_id=app.tenant_id,
|
|
tenant_id=app.tenant_id,
|
|
- app_model_config=app_model_config,
|
|
|
|
- query=query,
|
|
|
|
- inputs=inputs
|
|
|
|
|
|
+ model_config=app_model_config.model_dict,
|
|
|
|
+ streaming=streaming
|
|
)
|
|
)
|
|
|
|
|
|
conversation_message_task = ConversationMessageTask(
|
|
conversation_message_task = ConversationMessageTask(
|
|
@@ -68,10 +58,17 @@ class Completion:
|
|
is_override=is_override,
|
|
is_override=is_override,
|
|
inputs=inputs,
|
|
inputs=inputs,
|
|
query=query,
|
|
query=query,
|
|
- streaming=streaming
|
|
|
|
|
|
+ streaming=streaming,
|
|
|
|
+ model_instance=final_model_instance
|
|
)
|
|
)
|
|
|
|
|
|
- chain_callback = MainChainGatherCallbackHandler(conversation_message_task)
|
|
|
|
|
|
+ rest_tokens_for_context_and_memory = cls.get_validate_rest_tokens(
|
|
|
|
+ mode=app.mode,
|
|
|
|
+ model_instance=final_model_instance,
|
|
|
|
+ app_model_config=app_model_config,
|
|
|
|
+ query=query,
|
|
|
|
+ inputs=inputs
|
|
|
|
+ )
|
|
|
|
|
|
# init orchestrator rule parser
|
|
# init orchestrator rule parser
|
|
orchestrator_rule_parser = OrchestratorRuleParser(
|
|
orchestrator_rule_parser = OrchestratorRuleParser(
|
|
@@ -80,6 +77,7 @@ class Completion:
|
|
)
|
|
)
|
|
|
|
|
|
# parse sensitive_word_avoidance_chain
|
|
# parse sensitive_word_avoidance_chain
|
|
|
|
+ chain_callback = MainChainGatherCallbackHandler(conversation_message_task)
|
|
sensitive_word_avoidance_chain = orchestrator_rule_parser.to_sensitive_word_avoidance_chain([chain_callback])
|
|
sensitive_word_avoidance_chain = orchestrator_rule_parser.to_sensitive_word_avoidance_chain([chain_callback])
|
|
if sensitive_word_avoidance_chain:
|
|
if sensitive_word_avoidance_chain:
|
|
query = sensitive_word_avoidance_chain.run(query)
|
|
query = sensitive_word_avoidance_chain.run(query)
|
|
@@ -102,15 +100,14 @@ class Completion:
|
|
# run the final llm
|
|
# run the final llm
|
|
try:
|
|
try:
|
|
cls.run_final_llm(
|
|
cls.run_final_llm(
|
|
- tenant_id=app.tenant_id,
|
|
|
|
|
|
+ model_instance=final_model_instance,
|
|
mode=app.mode,
|
|
mode=app.mode,
|
|
app_model_config=app_model_config,
|
|
app_model_config=app_model_config,
|
|
query=query,
|
|
query=query,
|
|
inputs=inputs,
|
|
inputs=inputs,
|
|
agent_execute_result=agent_execute_result,
|
|
agent_execute_result=agent_execute_result,
|
|
conversation_message_task=conversation_message_task,
|
|
conversation_message_task=conversation_message_task,
|
|
- memory=memory,
|
|
|
|
- streaming=streaming
|
|
|
|
|
|
+ memory=memory
|
|
)
|
|
)
|
|
except ConversationTaskStoppedException:
|
|
except ConversationTaskStoppedException:
|
|
return
|
|
return
|
|
@@ -121,31 +118,20 @@ class Completion:
|
|
return
|
|
return
|
|
|
|
|
|
@classmethod
|
|
@classmethod
|
|
- def run_final_llm(cls, tenant_id: str, mode: str, app_model_config: AppModelConfig, query: str, inputs: dict,
|
|
|
|
|
|
+ def run_final_llm(cls, model_instance: BaseLLM, mode: str, app_model_config: AppModelConfig, query: str, inputs: dict,
|
|
agent_execute_result: Optional[AgentExecuteResult],
|
|
agent_execute_result: Optional[AgentExecuteResult],
|
|
conversation_message_task: ConversationMessageTask,
|
|
conversation_message_task: ConversationMessageTask,
|
|
- memory: Optional[ReadOnlyConversationTokenDBBufferSharedMemory], streaming: bool):
|
|
|
|
|
|
+ memory: Optional[ReadOnlyConversationTokenDBBufferSharedMemory]):
|
|
# When no extra pre prompt is specified,
|
|
# When no extra pre prompt is specified,
|
|
# the output of the agent can be used directly as the main output content without calling LLM again
|
|
# the output of the agent can be used directly as the main output content without calling LLM again
|
|
|
|
+ fake_response = None
|
|
if not app_model_config.pre_prompt and agent_execute_result and agent_execute_result.output \
|
|
if not app_model_config.pre_prompt and agent_execute_result and agent_execute_result.output \
|
|
and agent_execute_result.strategy != PlanningStrategy.ROUTER:
|
|
and agent_execute_result.strategy != PlanningStrategy.ROUTER:
|
|
- final_llm = FakeLLM(response=agent_execute_result.output,
|
|
|
|
- origin_llm=agent_execute_result.configuration.llm,
|
|
|
|
- streaming=streaming)
|
|
|
|
- final_llm.callbacks = cls.get_llm_callbacks(final_llm, streaming, conversation_message_task)
|
|
|
|
- response = final_llm.generate([[HumanMessage(content=query)]])
|
|
|
|
- return response
|
|
|
|
-
|
|
|
|
- final_llm = LLMBuilder.to_llm_from_model(
|
|
|
|
- tenant_id=tenant_id,
|
|
|
|
- model=app_model_config.model_dict,
|
|
|
|
- streaming=streaming
|
|
|
|
- )
|
|
|
|
|
|
+ fake_response = agent_execute_result.output
|
|
|
|
|
|
# get llm prompt
|
|
# get llm prompt
|
|
- prompt, stop_words = cls.get_main_llm_prompt(
|
|
|
|
|
|
+ prompt_messages, stop_words = cls.get_main_llm_prompt(
|
|
mode=mode,
|
|
mode=mode,
|
|
- llm=final_llm,
|
|
|
|
model=app_model_config.model_dict,
|
|
model=app_model_config.model_dict,
|
|
pre_prompt=app_model_config.pre_prompt,
|
|
pre_prompt=app_model_config.pre_prompt,
|
|
query=query,
|
|
query=query,
|
|
@@ -154,25 +140,26 @@ class Completion:
|
|
memory=memory
|
|
memory=memory
|
|
)
|
|
)
|
|
|
|
|
|
- final_llm.callbacks = cls.get_llm_callbacks(final_llm, streaming, conversation_message_task)
|
|
|
|
-
|
|
|
|
cls.recale_llm_max_tokens(
|
|
cls.recale_llm_max_tokens(
|
|
- final_llm=final_llm,
|
|
|
|
- model=app_model_config.model_dict,
|
|
|
|
- prompt=prompt,
|
|
|
|
- mode=mode
|
|
|
|
|
|
+ model_instance=model_instance,
|
|
|
|
+ prompt_messages=prompt_messages,
|
|
)
|
|
)
|
|
|
|
|
|
- response = final_llm.generate([prompt], stop_words)
|
|
|
|
|
|
+ response = model_instance.run(
|
|
|
|
+ messages=prompt_messages,
|
|
|
|
+ stop=stop_words,
|
|
|
|
+ callbacks=[LLMCallbackHandler(model_instance, conversation_message_task)],
|
|
|
|
+ fake_response=fake_response
|
|
|
|
+ )
|
|
|
|
|
|
return response
|
|
return response
|
|
|
|
|
|
@classmethod
|
|
@classmethod
|
|
- def get_main_llm_prompt(cls, mode: str, llm: BaseLanguageModel, model: dict,
|
|
|
|
|
|
+ def get_main_llm_prompt(cls, mode: str, model: dict,
|
|
pre_prompt: str, query: str, inputs: dict,
|
|
pre_prompt: str, query: str, inputs: dict,
|
|
agent_execute_result: Optional[AgentExecuteResult],
|
|
agent_execute_result: Optional[AgentExecuteResult],
|
|
memory: Optional[ReadOnlyConversationTokenDBBufferSharedMemory]) -> \
|
|
memory: Optional[ReadOnlyConversationTokenDBBufferSharedMemory]) -> \
|
|
- Tuple[Union[str | List[BaseMessage]], Optional[List[str]]]:
|
|
|
|
|
|
+ Tuple[List[PromptMessage], Optional[List[str]]]:
|
|
if mode == 'completion':
|
|
if mode == 'completion':
|
|
prompt_template = JinjaPromptTemplate.from_template(
|
|
prompt_template = JinjaPromptTemplate.from_template(
|
|
template=("""Use the following context as your learned knowledge, inside <context></context> XML tags.
|
|
template=("""Use the following context as your learned knowledge, inside <context></context> XML tags.
|
|
@@ -200,11 +187,7 @@ And answer according to the language of the user's question.
|
|
**prompt_inputs
|
|
**prompt_inputs
|
|
)
|
|
)
|
|
|
|
|
|
- if isinstance(llm, BaseChatModel):
|
|
|
|
- # use chat llm as completion model
|
|
|
|
- return [HumanMessage(content=prompt_content)], None
|
|
|
|
- else:
|
|
|
|
- return prompt_content, None
|
|
|
|
|
|
+ return [PromptMessage(content=prompt_content)], None
|
|
else:
|
|
else:
|
|
messages: List[BaseMessage] = []
|
|
messages: List[BaseMessage] = []
|
|
|
|
|
|
@@ -249,12 +232,14 @@ And answer according to the language of the user's question.
|
|
inputs=human_inputs
|
|
inputs=human_inputs
|
|
)
|
|
)
|
|
|
|
|
|
- curr_message_tokens = memory.llm.get_num_tokens_from_messages([tmp_human_message])
|
|
|
|
- model_name = model['name']
|
|
|
|
- max_tokens = model.get("completion_params").get('max_tokens')
|
|
|
|
- rest_tokens = llm_constant.max_context_token_length[model_name] \
|
|
|
|
- - max_tokens - curr_message_tokens
|
|
|
|
- rest_tokens = max(rest_tokens, 0)
|
|
|
|
|
|
+ if memory.model_instance.model_rules.max_tokens.max:
|
|
|
|
+ curr_message_tokens = memory.model_instance.get_num_tokens(to_prompt_messages([tmp_human_message]))
|
|
|
|
+ max_tokens = model.get("completion_params").get('max_tokens')
|
|
|
|
+ rest_tokens = memory.model_instance.model_rules.max_tokens.max - max_tokens - curr_message_tokens
|
|
|
|
+ rest_tokens = max(rest_tokens, 0)
|
|
|
|
+ else:
|
|
|
|
+ rest_tokens = 2000
|
|
|
|
+
|
|
histories = cls.get_history_messages_from_memory(memory, rest_tokens)
|
|
histories = cls.get_history_messages_from_memory(memory, rest_tokens)
|
|
human_message_prompt += "\n\n" if human_message_prompt else ""
|
|
human_message_prompt += "\n\n" if human_message_prompt else ""
|
|
human_message_prompt += "Here is the chat histories between human and assistant, " \
|
|
human_message_prompt += "Here is the chat histories between human and assistant, " \
|
|
@@ -274,17 +259,7 @@ And answer according to the language of the user's question.
|
|
for message in messages:
|
|
for message in messages:
|
|
message.content = re.sub(r'<\|.*?\|>', '', message.content)
|
|
message.content = re.sub(r'<\|.*?\|>', '', message.content)
|
|
|
|
|
|
- return messages, ['\nHuman:', '</histories>']
|
|
|
|
-
|
|
|
|
- @classmethod
|
|
|
|
- def get_llm_callbacks(cls, llm: BaseLanguageModel,
|
|
|
|
- streaming: bool,
|
|
|
|
- conversation_message_task: ConversationMessageTask) -> List[BaseCallbackHandler]:
|
|
|
|
- llm_callback_handler = LLMCallbackHandler(llm, conversation_message_task)
|
|
|
|
- if streaming:
|
|
|
|
- return [llm_callback_handler, DifyStreamingStdOutCallbackHandler()]
|
|
|
|
- else:
|
|
|
|
- return [llm_callback_handler, DifyStdOutCallbackHandler()]
|
|
|
|
|
|
+ return to_prompt_messages(messages), ['\nHuman:', '</histories>']
|
|
|
|
|
|
@classmethod
|
|
@classmethod
|
|
def get_history_messages_from_memory(cls, memory: ReadOnlyConversationTokenDBBufferSharedMemory,
|
|
def get_history_messages_from_memory(cls, memory: ReadOnlyConversationTokenDBBufferSharedMemory,
|
|
@@ -300,15 +275,15 @@ And answer according to the language of the user's question.
|
|
conversation: Conversation,
|
|
conversation: Conversation,
|
|
**kwargs) -> ReadOnlyConversationTokenDBBufferSharedMemory:
|
|
**kwargs) -> ReadOnlyConversationTokenDBBufferSharedMemory:
|
|
# only for calc token in memory
|
|
# only for calc token in memory
|
|
- memory_llm = LLMBuilder.to_llm_from_model(
|
|
|
|
|
|
+ memory_model_instance = ModelFactory.get_text_generation_model_from_model_config(
|
|
tenant_id=tenant_id,
|
|
tenant_id=tenant_id,
|
|
- model=app_model_config.model_dict
|
|
|
|
|
|
+ model_config=app_model_config.model_dict
|
|
)
|
|
)
|
|
|
|
|
|
# use llm config from conversation
|
|
# use llm config from conversation
|
|
memory = ReadOnlyConversationTokenDBBufferSharedMemory(
|
|
memory = ReadOnlyConversationTokenDBBufferSharedMemory(
|
|
conversation=conversation,
|
|
conversation=conversation,
|
|
- llm=memory_llm,
|
|
|
|
|
|
+ model_instance=memory_model_instance,
|
|
max_token_limit=kwargs.get("max_token_limit", 2048),
|
|
max_token_limit=kwargs.get("max_token_limit", 2048),
|
|
memory_key=kwargs.get("memory_key", "chat_history"),
|
|
memory_key=kwargs.get("memory_key", "chat_history"),
|
|
return_messages=kwargs.get("return_messages", True),
|
|
return_messages=kwargs.get("return_messages", True),
|
|
@@ -320,21 +295,20 @@ And answer according to the language of the user's question.
|
|
return memory
|
|
return memory
|
|
|
|
|
|
@classmethod
|
|
@classmethod
|
|
- def get_validate_rest_tokens(cls, mode: str, tenant_id: str, app_model_config: AppModelConfig,
|
|
|
|
|
|
+ def get_validate_rest_tokens(cls, mode: str, model_instance: BaseLLM, app_model_config: AppModelConfig,
|
|
query: str, inputs: dict) -> int:
|
|
query: str, inputs: dict) -> int:
|
|
- llm = LLMBuilder.to_llm_from_model(
|
|
|
|
- tenant_id=tenant_id,
|
|
|
|
- model=app_model_config.model_dict
|
|
|
|
- )
|
|
|
|
|
|
+ model_limited_tokens = model_instance.model_rules.max_tokens.max
|
|
|
|
+ max_tokens = model_instance.get_model_kwargs().max_tokens
|
|
|
|
|
|
- model_name = app_model_config.model_dict.get("name")
|
|
|
|
- model_limited_tokens = llm_constant.max_context_token_length[model_name]
|
|
|
|
- max_tokens = app_model_config.model_dict.get("completion_params").get('max_tokens')
|
|
|
|
|
|
+ if model_limited_tokens is None:
|
|
|
|
+ return -1
|
|
|
|
+
|
|
|
|
+ if max_tokens is None:
|
|
|
|
+ max_tokens = 0
|
|
|
|
|
|
# get prompt without memory and context
|
|
# get prompt without memory and context
|
|
- prompt, _ = cls.get_main_llm_prompt(
|
|
|
|
|
|
+ prompt_messages, _ = cls.get_main_llm_prompt(
|
|
mode=mode,
|
|
mode=mode,
|
|
- llm=llm,
|
|
|
|
model=app_model_config.model_dict,
|
|
model=app_model_config.model_dict,
|
|
pre_prompt=app_model_config.pre_prompt,
|
|
pre_prompt=app_model_config.pre_prompt,
|
|
query=query,
|
|
query=query,
|
|
@@ -343,9 +317,7 @@ And answer according to the language of the user's question.
|
|
memory=None
|
|
memory=None
|
|
)
|
|
)
|
|
|
|
|
|
- prompt_tokens = llm.get_num_tokens(prompt) if isinstance(prompt, str) \
|
|
|
|
- else llm.get_num_tokens_from_messages(prompt)
|
|
|
|
-
|
|
|
|
|
|
+ prompt_tokens = model_instance.get_num_tokens(prompt_messages)
|
|
rest_tokens = model_limited_tokens - max_tokens - prompt_tokens
|
|
rest_tokens = model_limited_tokens - max_tokens - prompt_tokens
|
|
if rest_tokens < 0:
|
|
if rest_tokens < 0:
|
|
raise LLMBadRequestError("Query or prefix prompt is too long, you can reduce the prefix prompt, "
|
|
raise LLMBadRequestError("Query or prefix prompt is too long, you can reduce the prefix prompt, "
|
|
@@ -354,36 +326,40 @@ And answer according to the language of the user's question.
|
|
return rest_tokens
|
|
return rest_tokens
|
|
|
|
|
|
@classmethod
|
|
@classmethod
|
|
- def recale_llm_max_tokens(cls, final_llm: BaseLanguageModel, model: dict,
|
|
|
|
- prompt: Union[str, List[BaseMessage]], mode: str):
|
|
|
|
|
|
+ def recale_llm_max_tokens(cls, model_instance: BaseLLM, prompt_messages: List[PromptMessage]):
|
|
# recalc max_tokens if sum(prompt_token + max_tokens) over model token limit
|
|
# recalc max_tokens if sum(prompt_token + max_tokens) over model token limit
|
|
- model_name = model.get("name")
|
|
|
|
- model_limited_tokens = llm_constant.max_context_token_length[model_name]
|
|
|
|
- max_tokens = model.get("completion_params").get('max_tokens')
|
|
|
|
|
|
+ model_limited_tokens = model_instance.model_rules.max_tokens.max
|
|
|
|
+ max_tokens = model_instance.get_model_kwargs().max_tokens
|
|
|
|
|
|
- if mode == 'completion' and isinstance(final_llm, BaseLLM):
|
|
|
|
- prompt_tokens = final_llm.get_num_tokens(prompt)
|
|
|
|
- else:
|
|
|
|
- prompt_tokens = final_llm.get_num_tokens_from_messages(prompt)
|
|
|
|
|
|
+ if model_limited_tokens is None:
|
|
|
|
+ return
|
|
|
|
+
|
|
|
|
+ if max_tokens is None:
|
|
|
|
+ max_tokens = 0
|
|
|
|
+
|
|
|
|
+ prompt_tokens = model_instance.get_num_tokens(prompt_messages)
|
|
|
|
|
|
if prompt_tokens + max_tokens > model_limited_tokens:
|
|
if prompt_tokens + max_tokens > model_limited_tokens:
|
|
max_tokens = max(model_limited_tokens - prompt_tokens, 16)
|
|
max_tokens = max(model_limited_tokens - prompt_tokens, 16)
|
|
- final_llm.max_tokens = max_tokens
|
|
|
|
|
|
+
|
|
|
|
+ # update model instance max tokens
|
|
|
|
+ model_kwargs = model_instance.get_model_kwargs()
|
|
|
|
+ model_kwargs.max_tokens = max_tokens
|
|
|
|
+ model_instance.set_model_kwargs(model_kwargs)
|
|
|
|
|
|
@classmethod
|
|
@classmethod
|
|
def generate_more_like_this(cls, task_id: str, app: App, message: Message, pre_prompt: str,
|
|
def generate_more_like_this(cls, task_id: str, app: App, message: Message, pre_prompt: str,
|
|
app_model_config: AppModelConfig, user: Account, streaming: bool):
|
|
app_model_config: AppModelConfig, user: Account, streaming: bool):
|
|
|
|
|
|
- llm = LLMBuilder.to_llm_from_model(
|
|
|
|
|
|
+ final_model_instance = ModelFactory.get_text_generation_model_from_model_config(
|
|
tenant_id=app.tenant_id,
|
|
tenant_id=app.tenant_id,
|
|
- model=app_model_config.model_dict,
|
|
|
|
|
|
+ model_config=app_model_config.model_dict,
|
|
streaming=streaming
|
|
streaming=streaming
|
|
)
|
|
)
|
|
|
|
|
|
# get llm prompt
|
|
# get llm prompt
|
|
- original_prompt, _ = cls.get_main_llm_prompt(
|
|
|
|
|
|
+ old_prompt_messages, _ = cls.get_main_llm_prompt(
|
|
mode="completion",
|
|
mode="completion",
|
|
- llm=llm,
|
|
|
|
model=app_model_config.model_dict,
|
|
model=app_model_config.model_dict,
|
|
pre_prompt=pre_prompt,
|
|
pre_prompt=pre_prompt,
|
|
query=message.query,
|
|
query=message.query,
|
|
@@ -395,10 +371,9 @@ And answer according to the language of the user's question.
|
|
original_completion = message.answer.strip()
|
|
original_completion = message.answer.strip()
|
|
|
|
|
|
prompt = MORE_LIKE_THIS_GENERATE_PROMPT
|
|
prompt = MORE_LIKE_THIS_GENERATE_PROMPT
|
|
- prompt = prompt.format(prompt=original_prompt, original_completion=original_completion)
|
|
|
|
|
|
+ prompt = prompt.format(prompt=old_prompt_messages[0].content, original_completion=original_completion)
|
|
|
|
|
|
- if isinstance(llm, BaseChatModel):
|
|
|
|
- prompt = [HumanMessage(content=prompt)]
|
|
|
|
|
|
+ prompt_messages = [PromptMessage(content=prompt)]
|
|
|
|
|
|
conversation_message_task = ConversationMessageTask(
|
|
conversation_message_task = ConversationMessageTask(
|
|
task_id=task_id,
|
|
task_id=task_id,
|
|
@@ -408,16 +383,16 @@ And answer according to the language of the user's question.
|
|
inputs=message.inputs,
|
|
inputs=message.inputs,
|
|
query=message.query,
|
|
query=message.query,
|
|
is_override=True if message.override_model_configs else False,
|
|
is_override=True if message.override_model_configs else False,
|
|
- streaming=streaming
|
|
|
|
|
|
+ streaming=streaming,
|
|
|
|
+ model_instance=final_model_instance
|
|
)
|
|
)
|
|
|
|
|
|
- llm.callbacks = cls.get_llm_callbacks(llm, streaming, conversation_message_task)
|
|
|
|
-
|
|
|
|
cls.recale_llm_max_tokens(
|
|
cls.recale_llm_max_tokens(
|
|
- final_llm=llm,
|
|
|
|
- model=app_model_config.model_dict,
|
|
|
|
- prompt=prompt,
|
|
|
|
- mode='completion'
|
|
|
|
|
|
+ model_instance=final_model_instance,
|
|
|
|
+ prompt_messages=prompt_messages
|
|
)
|
|
)
|
|
|
|
|
|
- llm.generate([prompt])
|
|
|
|
|
|
+ final_model_instance.run(
|
|
|
|
+ messages=prompt_messages,
|
|
|
|
+ callbacks=[LLMCallbackHandler(final_model_instance, conversation_message_task)]
|
|
|
|
+ )
|