website_service.py

import datetime
import json
from typing import Any

import requests
from flask_login import current_user  # type: ignore

from core.helper import encrypter
from core.rag.extractor.firecrawl.firecrawl_app import FirecrawlApp
from extensions.ext_redis import redis_client
from extensions.ext_storage import storage
from services.auth.api_key_auth_service import ApiKeyAuthService


class WebsiteService:
    @classmethod
    def document_create_args_validate(cls, args: dict):
        if "url" not in args or not args["url"]:
            raise ValueError("url is required")
        if "options" not in args or not args["options"]:
            raise ValueError("options is required")
        if "limit" not in args["options"] or not args["options"]["limit"]:
            raise ValueError("limit is required")
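
    # Illustrative only: the validator expects the same payload shape the crawl
    # endpoints pass in, for example
    #   WebsiteService.document_create_args_validate(
    #       {"url": "https://example.com", "options": {"limit": 10, "crawl_sub_pages": True}}
    #   )
    # A missing or empty "url", "options", or "options['limit']" raises ValueError.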

    @classmethod
    def crawl_url(cls, args: dict) -> dict:
        provider = args.get("provider", "")
        url = args.get("url")
        # default to an empty dict so option lookups below don't fail when "options" is missing
        options = args.get("options", {})
        credentials = ApiKeyAuthService.get_auth_credentials(current_user.current_tenant_id, "website", provider)
        if provider == "firecrawl":
            # decrypt api_key
            api_key = encrypter.decrypt_token(
                tenant_id=current_user.current_tenant_id, token=credentials.get("config").get("api_key")
            )
            firecrawl_app = FirecrawlApp(api_key=api_key, base_url=credentials.get("config").get("base_url", None))
            crawl_sub_pages = options.get("crawl_sub_pages", False)
            only_main_content = options.get("only_main_content", False)
            if not crawl_sub_pages:
                params = {
                    "includes": [],
                    "excludes": [],
                    "generateImgAltText": True,
                    "limit": 1,
                    "scrapeOptions": {"onlyMainContent": only_main_content},
                }
            else:
                includes = options.get("includes").split(",") if options.get("includes") else []
                excludes = options.get("excludes").split(",") if options.get("excludes") else []
                params = {
                    "includes": includes,
                    "excludes": excludes,
                    "generateImgAltText": True,
                    "limit": options.get("limit", 1),
                    "scrapeOptions": {"onlyMainContent": only_main_content},
                }
                if options.get("max_depth"):
                    params["maxDepth"] = options.get("max_depth")
            job_id = firecrawl_app.crawl_url(url, params)
            website_crawl_time_cache_key = f"website_crawl_{job_id}"
            time = str(datetime.datetime.now().timestamp())
            redis_client.setex(website_crawl_time_cache_key, 3600, time)
            return {"status": "active", "job_id": job_id}
        elif provider == "jinareader":
            api_key = encrypter.decrypt_token(
                tenant_id=current_user.current_tenant_id, token=credentials.get("config").get("api_key")
            )
            crawl_sub_pages = options.get("crawl_sub_pages", False)
            if not crawl_sub_pages:
                response = requests.get(
                    f"https://r.jina.ai/{url}",
                    headers={"Accept": "application/json", "Authorization": f"Bearer {api_key}"},
                )
                if response.json().get("code") != 200:
                    raise ValueError("Failed to crawl")
                return {"status": "active", "data": response.json().get("data")}
            else:
                response = requests.post(
                    "https://adaptivecrawl-kir3wx7b3a-uc.a.run.app",
                    json={
                        "url": url,
                        "maxPages": options.get("limit", 1),
                        "useSitemap": options.get("use_sitemap", True),
                    },
                    headers={
                        "Content-Type": "application/json",
                        "Authorization": f"Bearer {api_key}",
                    },
                )
                if response.json().get("code") != 200:
                    raise ValueError("Failed to crawl")
                return {"status": "active", "job_id": response.json().get("data", {}).get("taskId")}
        else:
            raise ValueError("Invalid provider")
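
    # A minimal usage sketch (not part of this module), assuming the tenant has
    # Firecrawl credentials configured for the "website" auth category:
    #   result = WebsiteService.crawl_url(
    #       {
    #           "provider": "firecrawl",
    #           "url": "https://example.com",
    #           "options": {"crawl_sub_pages": True, "limit": 5, "only_main_content": True},
    #       }
    #   )
    #   job_id = result["job_id"]  # "jinareader" single-page crawls return "data" instead of a job id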

    @classmethod
    def get_crawl_status(cls, job_id: str, provider: str) -> dict:
        credentials = ApiKeyAuthService.get_auth_credentials(current_user.current_tenant_id, "website", provider)
        if provider == "firecrawl":
            # decrypt api_key
            api_key = encrypter.decrypt_token(
                tenant_id=current_user.current_tenant_id, token=credentials.get("config").get("api_key")
            )
            firecrawl_app = FirecrawlApp(api_key=api_key, base_url=credentials.get("config").get("base_url", None))
            result = firecrawl_app.check_crawl_status(job_id)
            crawl_status_data = {
                "status": result.get("status", "active"),
                "job_id": job_id,
                "total": result.get("total", 0),
                "current": result.get("current", 0),
                "data": result.get("data", []),
            }
            if crawl_status_data["status"] == "completed":
                website_crawl_time_cache_key = f"website_crawl_{job_id}"
                start_time = redis_client.get(website_crawl_time_cache_key)
                if start_time:
                    end_time = datetime.datetime.now().timestamp()
                    time_consuming = abs(end_time - float(start_time))
                    crawl_status_data["time_consuming"] = f"{time_consuming:.2f}"
                    redis_client.delete(website_crawl_time_cache_key)
        elif provider == "jinareader":
            api_key = encrypter.decrypt_token(
                tenant_id=current_user.current_tenant_id, token=credentials.get("config").get("api_key")
            )
            response = requests.post(
                "https://adaptivecrawlstatus-kir3wx7b3a-uc.a.run.app",
                headers={"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"},
                json={"taskId": job_id},
            )
            data = response.json().get("data", {})
            crawl_status_data = {
                "status": data.get("status", "active"),
                "job_id": job_id,
                "total": len(data.get("urls", [])),
                "current": len(data.get("processed", [])) + len(data.get("failed", [])),
                "data": [],
                "time_consuming": data.get("duration", 0) / 1000,
            }
            if crawl_status_data["status"] == "completed":
                response = requests.post(
                    "https://adaptivecrawlstatus-kir3wx7b3a-uc.a.run.app",
                    headers={"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"},
                    json={"taskId": job_id, "urls": list(data.get("processed", {}).keys())},
                )
                data = response.json().get("data", {})
                formatted_data = [
                    {
                        "title": item.get("data", {}).get("title"),
                        "source_url": item.get("data", {}).get("url"),
                        "description": item.get("data", {}).get("description"),
                        "markdown": item.get("data", {}).get("content"),
                    }
                    for item in data.get("processed", {}).values()
                ]
                crawl_status_data["data"] = formatted_data
        else:
            raise ValueError("Invalid provider")
        return crawl_status_data
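
    # Callers typically poll this until "status" becomes "completed". A hedged
    # sketch (the loop and interval are illustrative, not part of this service):
    #   status = WebsiteService.get_crawl_status(job_id, "firecrawl")
    #   while status["status"] != "completed":
    #       time.sleep(5)  # assumes `import time` at the call site
    #       status = WebsiteService.get_crawl_status(job_id, "firecrawl")
    #   pages = status["data"]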

    @classmethod
    def get_crawl_url_data(cls, job_id: str, provider: str, url: str, tenant_id: str) -> dict[Any, Any] | None:
        credentials = ApiKeyAuthService.get_auth_credentials(tenant_id, "website", provider)
        # decrypt api_key
        api_key = encrypter.decrypt_token(tenant_id=tenant_id, token=credentials.get("config").get("api_key"))
        # FIXME data is redefined too many times here, use Any to ease the type checking, fix it later
        data: Any = None
        if provider == "firecrawl":
            file_key = "website_files/" + job_id + ".txt"
            if storage.exists(file_key):
                d = storage.load_once(file_key)
                if d:
                    data = json.loads(d.decode("utf-8"))
            else:
                firecrawl_app = FirecrawlApp(api_key=api_key, base_url=credentials.get("config").get("base_url", None))
                result = firecrawl_app.check_crawl_status(job_id)
                if result.get("status") != "completed":
                    raise ValueError("Crawl job is not completed")
                data = result.get("data")
            if data:
                for item in data:
                    if item.get("source_url") == url:
                        return dict(item)
            return None
        elif provider == "jinareader":
            if not job_id:
                response = requests.get(
                    f"https://r.jina.ai/{url}",
                    headers={"Accept": "application/json", "Authorization": f"Bearer {api_key}"},
                )
                if response.json().get("code") != 200:
                    raise ValueError("Failed to crawl")
                return dict(response.json().get("data", {}))
            else:
                api_key = encrypter.decrypt_token(tenant_id=tenant_id, token=credentials.get("config").get("api_key"))
                response = requests.post(
                    "https://adaptivecrawlstatus-kir3wx7b3a-uc.a.run.app",
                    headers={"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"},
                    json={"taskId": job_id},
                )
                data = response.json().get("data", {})
                if data.get("status") != "completed":
                    raise ValueError("Crawl job is not completed")
                response = requests.post(
                    "https://adaptivecrawlstatus-kir3wx7b3a-uc.a.run.app",
                    headers={"Content-Type": "application/json", "Authorization": f"Bearer {api_key}"},
                    json={"taskId": job_id, "urls": list(data.get("processed", {}).keys())},
                )
                data = response.json().get("data", {})
                for item in data.get("processed", {}).values():
                    if item.get("data", {}).get("url") == url:
                        return dict(item.get("data", {}))
                return None
        else:
            raise ValueError("Invalid provider")
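
    # Illustrative lookup of a single page from a finished crawl; job_id and url
    # would come from the crawl result above, tenant_id from the caller's context:
    #   page = WebsiteService.get_crawl_url_data(job_id, "firecrawl", "https://example.com/docs", tenant_id)
    #   if page:
    #       markdown = page.get("markdown")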

    @classmethod
    def get_scrape_url_data(cls, provider: str, url: str, tenant_id: str, only_main_content: bool) -> dict:
        credentials = ApiKeyAuthService.get_auth_credentials(tenant_id, "website", provider)
        if provider == "firecrawl":
            # decrypt api_key
            api_key = encrypter.decrypt_token(tenant_id=tenant_id, token=credentials.get("config").get("api_key"))
            firecrawl_app = FirecrawlApp(api_key=api_key, base_url=credentials.get("config").get("base_url", None))
            params = {"onlyMainContent": only_main_content}
            result = firecrawl_app.scrape_url(url, params)
            return result
        else:
            raise ValueError("Invalid provider")
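
    # Single-page scraping is only implemented for the Firecrawl provider; a hedged example:
    #   content = WebsiteService.get_scrape_url_data(
    #       "firecrawl", "https://example.com", tenant_id, only_main_content=True
    #   )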