|
@@ -0,0 +1,237 @@
|
|
|
+import os
|
|
|
+from typing import Literal, Optional, TypedDict
|
|
|
+
|
|
|
+import requests
|
|
|
+
|
|
|
+
|
|
|
# Named aliases for the two closed string sets used by the request options.
_RequestMode = Literal["http", "chrome", "smart"]
_ReturnFormat = Literal["raw", "markdown", "html2text", "text", "bytes"]


class RequestParamsDict(TypedDict, total=False):
    """
    Typed dictionary of the options that can accompany a Spider request.

    Declared with ``total=False``, so every key may be left out; each value is
    additionally ``Optional`` and may therefore be ``None``.
    """

    url: Optional[str]
    request: Optional[_RequestMode]
    limit: Optional[int]
    return_format: Optional[_ReturnFormat]
    tld: Optional[bool]
    depth: Optional[int]
    cache: Optional[bool]
    budget: Optional[dict[str, int]]
    locale: Optional[str]
    cookies: Optional[str]
    stealth: Optional[bool]
    headers: Optional[dict[str, str]]
    anti_bot: Optional[bool]
    metadata: Optional[bool]
    viewport: Optional[dict[str, int]]
    encoding: Optional[str]
    subdomains: Optional[bool]
    user_agent: Optional[str]
    store_data: Optional[bool]
    gpt_config: Optional[list[str]]
    fingerprint: Optional[bool]
    storageless: Optional[bool]
    readability: Optional[bool]
    proxy_enabled: Optional[bool]
    respect_robots: Optional[bool]
    query_selector: Optional[str]
    full_resources: Optional[bool]
    request_timeout: Optional[int]
    run_in_background: Optional[bool]
    skip_config_checks: Optional[bool]
|
|
|
+
|
|
|
+
|
|
|
class Spider:
    """
    Client for the Spider API served at https://api.spider.cloud/v1.

    Requests are sent with the ``requests`` library and carry the API key as a
    bearer token in the ``Authorization`` header.
    """

    # Root URL that every endpoint path is appended to.
    _BASE_URL = "https://api.spider.cloud/v1"

    def __init__(self, api_key: Optional[str] = None):
        """
        Initialize the Spider with an API key.

        :param api_key: A string of the API key for Spider. Defaults to the SPIDER_API_KEY environment variable.
        :raises ValueError: If no API key is provided (missing or empty).
        """
        self.api_key = api_key or os.getenv("SPIDER_API_KEY")
        # An empty SPIDER_API_KEY yields "" rather than None, so test for
        # falsiness instead of identity with None.
        if not self.api_key:
            raise ValueError("No API key provided")

    def api_post(
        self,
        endpoint: str,
        data: dict,
        stream: bool,
        content_type: str = "application/json",
    ):
        """
        Send a POST request to the specified API endpoint.

        :param endpoint: The API endpoint to which the POST request is sent.
        :param data: The data (dictionary) to be sent as the JSON body.
        :param stream: Boolean indicating if the response should be streamed.
        :param content_type: Value of the Content-Type request header.
        :return: The JSON response or the raw response stream if stream is True.
        :raises Exception: If stream is False and the status code is not 200.
        """
        headers = self._prepare_headers(content_type)
        response = self._post_request(
            f"{self._BASE_URL}/{endpoint}", data, headers, stream
        )

        if stream:
            # The raw response is handed back as-is; its status code is not
            # inspected here, so streaming callers must check it themselves.
            return response
        elif response.status_code == 200:
            return response.json()
        else:
            self._handle_error(response, f"post to {endpoint}")

    def api_get(
        self, endpoint: str, stream: bool, content_type: str = "application/json"
    ):
        """
        Send a GET request to the specified endpoint.

        :param endpoint: The API endpoint from which to retrieve data.
        :param stream: Forwarded to the underlying GET request; the body is
            decoded as JSON regardless of this flag.
        :param content_type: Value of the Content-Type request header.
        :return: The JSON decoded response.
        :raises Exception: If the status code is not 200.
        """
        headers = self._prepare_headers(content_type)
        response = self._get_request(f"{self._BASE_URL}/{endpoint}", headers, stream)
        if response.status_code == 200:
            return response.json()
        else:
            self._handle_error(response, f"get from {endpoint}")

    def get_credits(self):
        """
        Retrieve the account's remaining credits.

        :return: JSON response of a non-streamed GET to the ``credits`` endpoint.
        """
        return self.api_get("credits", stream=False)

    def scrape_url(
        self,
        url: str,
        params: Optional["RequestParamsDict"] = None,
        stream: bool = False,
        content_type: str = "application/json",
    ):
        """
        Scrape data from the specified URL.

        Posts to the ``crawl`` endpoint with ``limit`` forced to 1 and
        ``return_format`` defaulting to ``"markdown"``.

        :param url: The URL from which to scrape data.
        :param params: Optional dictionary of additional parameters for the scrape request.
            It is not modified.
        :param stream: Boolean indicating if the response should be streamed. Defaults to False.
        :param content_type: Value of the Content-Type request header.
        :return: JSON response or the raw response stream if streaming enabled.
        """
        payload = self._build_payload(url, params, {"return_format": "markdown"})
        # A scrape is a crawl restricted to one page, whatever the caller passed.
        payload["limit"] = 1
        return self.api_post("crawl", payload, stream, content_type)

    def crawl_url(
        self,
        url: str,
        params: Optional["RequestParamsDict"] = None,
        stream: bool = False,
        content_type: str = "application/json",
    ):
        """
        Start crawling at the specified URL.

        ``return_format`` defaults to ``"markdown"`` when the caller does not set it.

        :param url: The URL to begin crawling.
        :param params: Optional dictionary with additional parameters to customize the crawl.
            It is not modified.
        :param stream: Boolean indicating if the response should be streamed. Defaults to False.
        :param content_type: Value of the Content-Type request header.
        :return: JSON response or the raw response stream if streaming enabled.
        """
        payload = self._build_payload(url, params, {"return_format": "markdown"})
        return self.api_post("crawl", payload, stream, content_type)

    def links(
        self,
        url: str,
        params: Optional["RequestParamsDict"] = None,
        stream: bool = False,
        content_type: str = "application/json",
    ):
        """
        Retrieve links from the specified URL.

        :param url: The URL from which to extract links.
        :param params: Optional parameters for the link retrieval request.
        :param stream: Boolean indicating if the response should be streamed. Defaults to False.
        :param content_type: Value of the Content-Type request header.
        :return: JSON response or the raw response stream if streaming enabled.
        """
        return self.api_post(
            "links", self._build_payload(url, params), stream, content_type
        )

    def extract_contacts(
        self,
        url: str,
        params: Optional["RequestParamsDict"] = None,
        stream: bool = False,
        content_type: str = "application/json",
    ):
        """
        Extract contact information from the specified URL.

        :param url: The URL from which to extract contact information.
        :param params: Optional parameters for the contact extraction.
        :param stream: Boolean indicating if the response should be streamed. Defaults to False.
        :param content_type: Value of the Content-Type request header.
        :return: JSON response or the raw response stream if streaming enabled.
        """
        return self.api_post(
            "pipeline/extract-contacts",
            self._build_payload(url, params),
            stream,
            content_type,
        )

    def label(
        self,
        url: str,
        params: Optional["RequestParamsDict"] = None,
        stream: bool = False,
        content_type: str = "application/json",
    ):
        """
        Apply labeling to data extracted from the specified URL.

        :param url: The URL to label data from.
        :param params: Optional parameters to guide the labeling process.
        :param stream: Boolean indicating if the response should be streamed. Defaults to False.
        :param content_type: Value of the Content-Type request header.
        :return: JSON response or the raw response stream if streaming enabled.
        """
        return self.api_post(
            "pipeline/label", self._build_payload(url, params), stream, content_type
        )

    @staticmethod
    def _build_payload(url, params, defaults=None):
        """
        Merge ``url``, ``defaults`` and ``params`` into a new request body.

        Later sources win: ``params`` overrides ``defaults``, and both override
        the ``url`` key. Neither input mapping is modified.
        """
        return {"url": url, **(defaults or {}), **(params or {})}

    def _prepare_headers(self, content_type: str = "application/json"):
        """Build the header dict (content type, bearer auth, user agent) for a request."""
        return {
            "Content-Type": content_type,
            "Authorization": f"Bearer {self.api_key}",
            "User-Agent": "Spider-Client/0.0.27",
        }

    def _post_request(self, url: str, data, headers, stream=False):
        """POST ``data`` as a JSON body to ``url``."""
        return requests.post(url, headers=headers, json=data, stream=stream)

    def _get_request(self, url: str, headers, stream=False):
        """GET ``url`` with the given headers."""
        return requests.get(url, headers=headers, stream=stream)

    def _delete_request(self, url: str, headers, stream=False):
        """DELETE ``url`` with the given headers."""
        return requests.delete(url, headers=headers, stream=stream)

    def _handle_error(self, response, action):
        """
        Raise an exception describing a failed request; never returns.

        For status codes 402, 409 and 500 the message includes the ``error``
        field of the JSON body when one can be read.

        :param response: The response object of the failed request.
        :param action: Short description of the attempted action, used in the message.
        :raises Exception: Always.
        """
        status = response.status_code
        if status in (402, 409, 500):
            try:
                body = response.json()
            except ValueError:
                # The error body is not valid JSON; fall back to the generic
                # text so the status code is still reported.
                body = None
            if isinstance(body, dict):
                error_message = body.get("error", "Unknown error occurred")
            else:
                error_message = "Unknown error occurred"
            raise Exception(
                f"Failed to {action}. Status code: {status}. Error: {error_message}"
            )
        raise Exception(
            f"Unexpected error occurred while trying to {action}. Status code: {status}"
        )
|