dataset_tool_builder.py 3.2 KB

1234567891011121314151617181920212223242526272829303132333435363738394041424344454647484950515253545556575859606162636465666768697071727374757677787980818283
  1. from typing import Optional
  2. from langchain.callbacks import CallbackManager
  3. from llama_index.langchain_helpers.agents import IndexToolConfig
  4. from core.callback_handler.dataset_tool_callback_handler import DatasetToolCallbackHandler
  5. from core.callback_handler.index_tool_callback_handler import DatasetIndexToolCallbackHandler
  6. from core.callback_handler.std_out_callback_handler import DifyStdOutCallbackHandler
  7. from core.index.keyword_table_index import KeywordTableIndex
  8. from core.index.vector_index import VectorIndex
  9. from core.prompt.prompts import QUERY_KEYWORD_EXTRACT_TEMPLATE
  10. from core.tool.llama_index_tool import EnhanceLlamaIndexTool
  11. from extensions.ext_database import db
  12. from models.dataset import Dataset
  13. class DatasetToolBuilder:
  14. @classmethod
  15. def build_dataset_tool(cls, tenant_id: str, dataset_id: str,
  16. response_mode: str = "no_synthesizer",
  17. callback_handler: Optional[DatasetToolCallbackHandler] = None):
  18. # get dataset from dataset id
  19. dataset = db.session.query(Dataset).filter(
  20. Dataset.tenant_id == tenant_id,
  21. Dataset.id == dataset_id
  22. ).first()
  23. if not dataset:
  24. return None
  25. if dataset.indexing_technique == "economy":
  26. # use keyword table query
  27. index = KeywordTableIndex(dataset=dataset).query_index
  28. if not index:
  29. return None
  30. query_kwargs = {
  31. "mode": "default",
  32. "response_mode": response_mode,
  33. "query_keyword_extract_template": QUERY_KEYWORD_EXTRACT_TEMPLATE,
  34. "max_keywords_per_query": 5,
  35. # If num_chunks_per_query is too large,
  36. # it will slow down the synthesis process due to multiple iterations of refinement.
  37. "num_chunks_per_query": 2
  38. }
  39. else:
  40. index = VectorIndex(dataset=dataset).query_index
  41. if not index:
  42. return None
  43. query_kwargs = {
  44. "mode": "default",
  45. "response_mode": response_mode,
  46. # If top_k is too large,
  47. # it will slow down the synthesis process due to multiple iterations of refinement.
  48. "similarity_top_k": 2
  49. }
  50. # fulfill description when it is empty
  51. description = dataset.description
  52. if not description:
  53. description = 'useful for when you want to answer queries about the ' + dataset.name
  54. index_tool_config = IndexToolConfig(
  55. index=index,
  56. name=f"dataset-{dataset_id}",
  57. description=description,
  58. index_query_kwargs=query_kwargs,
  59. tool_kwargs={
  60. "callback_manager": CallbackManager([callback_handler, DifyStdOutCallbackHandler()])
  61. },
  62. # tool_kwargs={"return_direct": True},
  63. # return_direct: Whether to return LLM results directly or process the output data with an Output Parser
  64. )
  65. index_callback_handler = DatasetIndexToolCallbackHandler(dataset_id=dataset_id)
  66. return EnhanceLlamaIndexTool.from_tool_config(
  67. tool_config=index_tool_config,
  68. callback_handler=index_callback_handler
  69. )