|
@@ -1,16 +1,187 @@
|
|
|
+import json
|
|
|
+import time
|
|
|
+import urllib.error
|
|
|
+import urllib.parse
|
|
|
+import urllib.request
|
|
|
from typing import Any
|
|
|
|
|
|
-from langchain.tools import PubmedQueryRun
|
|
|
from pydantic import BaseModel, Field
|
|
|
|
|
|
from core.tools.entities.tool_entities import ToolInvokeMessage
|
|
|
from core.tools.tool.builtin_tool import BuiltinTool
|
|
|
|
|
|
|
|
|
+class PubMedAPIWrapper(BaseModel):
|
|
|
+ """
|
|
|
+ Wrapper around PubMed API.
|
|
|
+
|
|
|
+ This wrapper will use the PubMed API to conduct searches and fetch
|
|
|
+ document summaries. By default, it will return the document summaries
|
|
|
+ of the top-k results of an input search.
|
|
|
+
|
|
|
+ Parameters:
|
|
|
+ top_k_results: number of the top-scored document used for the PubMed tool
|
|
|
+ load_max_docs: a limit to the number of loaded documents
|
|
|
+ load_all_available_meta:
|
|
|
+ if True: the `metadata` of the loaded Documents gets all available meta info
|
|
|
+ (see https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESearch)
|
|
|
+ if False: the `metadata` gets only the most informative fields.
|
|
|
+ """
|
|
|
+
|
|
|
+ base_url_esearch = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?"
|
|
|
+ base_url_efetch = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?"
|
|
|
+ max_retry = 5
|
|
|
+ sleep_time = 0.2
|
|
|
+
|
|
|
+ # Default values for the parameters
|
|
|
+ top_k_results: int = 3
|
|
|
+ load_max_docs: int = 25
|
|
|
+ ARXIV_MAX_QUERY_LENGTH = 300
|
|
|
+ doc_content_chars_max: int = 2000
|
|
|
+ load_all_available_meta: bool = False
|
|
|
+ email: str = "your_email@example.com"
|
|
|
+
|
|
|
+ def run(self, query: str) -> str:
|
|
|
+ """
|
|
|
+ Run PubMed search and get the article meta information.
|
|
|
+ See https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.ESearch
|
|
|
+ It uses only the most informative fields of article meta information.
|
|
|
+ """
|
|
|
+
|
|
|
+ try:
|
|
|
+ # Retrieve the top-k results for the query
|
|
|
+ docs = [
|
|
|
+ f"Published: {result['pub_date']}\nTitle: {result['title']}\n"
|
|
|
+ f"Summary: {result['summary']}"
|
|
|
+ for result in self.load(query[: self.ARXIV_MAX_QUERY_LENGTH])
|
|
|
+ ]
|
|
|
+
|
|
|
+ # Join the results and limit the character count
|
|
|
+ return (
|
|
|
+ "\n\n".join(docs)[:self.doc_content_chars_max]
|
|
|
+ if docs
|
|
|
+ else "No good PubMed Result was found"
|
|
|
+ )
|
|
|
+ except Exception as ex:
|
|
|
+ return f"PubMed exception: {ex}"
|
|
|
+
|
|
|
+ def load(self, query: str) -> list[dict]:
|
|
|
+ """
|
|
|
+ Search PubMed for documents matching the query.
|
|
|
+ Return a list of dictionaries containing the document metadata.
|
|
|
+ """
|
|
|
+
|
|
|
+ url = (
|
|
|
+ self.base_url_esearch
|
|
|
+ + "db=pubmed&term="
|
|
|
+            + urllib.parse.quote(query)
|
|
|
+ + f"&retmode=json&retmax={self.top_k_results}&usehistory=y"
|
|
|
+ )
|
|
|
+ result = urllib.request.urlopen(url)
|
|
|
+ text = result.read().decode("utf-8")
|
|
|
+ json_text = json.loads(text)
|
|
|
+
|
|
|
+ articles = []
|
|
|
+ webenv = json_text["esearchresult"]["webenv"]
|
|
|
+ for uid in json_text["esearchresult"]["idlist"]:
|
|
|
+ article = self.retrieve_article(uid, webenv)
|
|
|
+ articles.append(article)
|
|
|
+
|
|
|
+ # Convert the list of articles to a JSON string
|
|
|
+ return articles
|
|
|
+
|
|
|
+ def retrieve_article(self, uid: str, webenv: str) -> dict:
|
|
|
+ url = (
|
|
|
+ self.base_url_efetch
|
|
|
+ + "db=pubmed&retmode=xml&id="
|
|
|
+ + uid
|
|
|
+ + "&webenv="
|
|
|
+ + webenv
|
|
|
+ )
|
|
|
+
|
|
|
+ retry = 0
|
|
|
+ while True:
|
|
|
+ try:
|
|
|
+ result = urllib.request.urlopen(url)
|
|
|
+ break
|
|
|
+ except urllib.error.HTTPError as e:
|
|
|
+ if e.code == 429 and retry < self.max_retry:
|
|
|
+ # Too Many Requests error
|
|
|
+ # wait for an exponentially increasing amount of time
|
|
|
+ print(
|
|
|
+ f"Too Many Requests, "
|
|
|
+ f"waiting for {self.sleep_time:.2f} seconds..."
|
|
|
+ )
|
|
|
+ time.sleep(self.sleep_time)
|
|
|
+ self.sleep_time *= 2
|
|
|
+ retry += 1
|
|
|
+ else:
|
|
|
+ raise e
|
|
|
+
|
|
|
+ xml_text = result.read().decode("utf-8")
|
|
|
+
|
|
|
+ # Get title
|
|
|
+ title = ""
|
|
|
+ if "<ArticleTitle>" in xml_text and "</ArticleTitle>" in xml_text:
|
|
|
+ start_tag = "<ArticleTitle>"
|
|
|
+ end_tag = "</ArticleTitle>"
|
|
|
+ title = xml_text[
|
|
|
+ xml_text.index(start_tag) + len(start_tag) : xml_text.index(end_tag)
|
|
|
+ ]
|
|
|
+
|
|
|
+ # Get abstract
|
|
|
+ abstract = ""
|
|
|
+ if "<AbstractText>" in xml_text and "</AbstractText>" in xml_text:
|
|
|
+ start_tag = "<AbstractText>"
|
|
|
+ end_tag = "</AbstractText>"
|
|
|
+ abstract = xml_text[
|
|
|
+ xml_text.index(start_tag) + len(start_tag) : xml_text.index(end_tag)
|
|
|
+ ]
|
|
|
+
|
|
|
+ # Get publication date
|
|
|
+ pub_date = ""
|
|
|
+ if "<PubDate>" in xml_text and "</PubDate>" in xml_text:
|
|
|
+ start_tag = "<PubDate>"
|
|
|
+ end_tag = "</PubDate>"
|
|
|
+ pub_date = xml_text[
|
|
|
+ xml_text.index(start_tag) + len(start_tag) : xml_text.index(end_tag)
|
|
|
+ ]
|
|
|
+
|
|
|
+ # Return article as dictionary
|
|
|
+ article = {
|
|
|
+ "uid": uid,
|
|
|
+ "title": title,
|
|
|
+ "summary": abstract,
|
|
|
+ "pub_date": pub_date,
|
|
|
+ }
|
|
|
+ return article
|
|
|
+
|
|
|
+
|
|
|
+class PubmedQueryRun(BaseModel):
|
|
|
+ """Tool that searches the PubMed API."""
|
|
|
+
|
|
|
+ name = "PubMed"
|
|
|
+ description = (
|
|
|
+ "A wrapper around PubMed.org "
|
|
|
+        "Useful for when you need to answer questions about medicine, health, "
|


|
+        "biology, genetics, and other biomedical and life science topics "
|


|
+        "from biomedical literature "
|
|
|
+ "from scientific articles on PubMed.org. "
|
|
|
+ "Input should be a search query."
|
|
|
+ )
|
|
|
+ api_wrapper: PubMedAPIWrapper = Field(default_factory=PubMedAPIWrapper)
|
|
|
+
|
|
|
+ def _run(
|
|
|
+ self,
|
|
|
+ query: str,
|
|
|
+ ) -> str:
|
|
|
+        """Use the PubMed tool."""
|
|
|
+ return self.api_wrapper.run(query)
|
|
|
+
|
|
|
+
|
|
|
class PubMedInput(BaseModel):
|
|
|
query: str = Field(..., description="Search query.")
|
|
|
|
|
|
-
|
|
|
class PubMedSearchTool(BuiltinTool):
|
|
|
"""
|
|
|
Tool for performing a search using PubMed search engine.
|
|
@@ -34,7 +205,7 @@ class PubMedSearchTool(BuiltinTool):
|
|
|
|
|
|
tool = PubmedQueryRun(args_schema=PubMedInput)
|
|
|
|
|
|
- result = tool.run(query)
|
|
|
+ result = tool._run(query)
|
|
|
|
|
|
return self.create_text_message(self.summary(user_id=user_id, content=result))
|
|
|
|