@@ -1,98 +1,93 @@
 import time
+from collections.abc import Mapping
+from typing import Any

 import requests
+from requests.exceptions import HTTPError


 class FirecrawlApp:
-    def __init__(self, api_key=None, base_url=None):
+    def __init__(self, api_key: str | None = None, base_url: str | None = None):
         self.api_key = api_key
         self.base_url = base_url or 'https://api.firecrawl.dev'
-        if self.api_key is None and self.base_url == 'https://api.firecrawl.dev':
-            raise ValueError('No API key provided')
+        if not self.api_key:
+            raise ValueError("API key is required")

-    def scrape_url(self, url, params=None) -> dict:
+    def _prepare_headers(self, idempotency_key: str | None = None):
         headers = {
             'Content-Type': 'application/json',
             'Authorization': f'Bearer {self.api_key}'
         }
-        json_data = {'url': url}
-        if params:
-            json_data.update(params)
-        response = requests.post(
-            f'{self.base_url}/v0/scrape',
-            headers=headers,
-            json=json_data
-        )
-        if response.status_code == 200:
-            response = response.json()
-            if response['success'] == True:
-                return response['data']
-            else:
-                raise Exception(f'Failed to scrape URL. Error: {response["error"]}')
+        if idempotency_key:
+            headers['Idempotency-Key'] = idempotency_key
+        return headers

-        elif response.status_code in [402, 409, 500]:
-            error_message = response.json().get('error', 'Unknown error occurred')
-            raise Exception(f'Failed to scrape URL. Status code: {response.status_code}. Error: {error_message}')
-        else:
-            raise Exception(f'Failed to scrape URL. Status code: {response.status_code}')
+    def _request(
+        self,
+        method: str,
+        url: str,
+        data: Mapping[str, Any] | None = None,
+        headers: Mapping[str, str] | None = None,
+        retries: int = 3,
+        backoff_factor: float = 0.3,
+    ) -> Mapping[str, Any] | None:
+        for i in range(retries):
+            try:
+                response = requests.request(method, url, json=data, headers=headers)
+                response.raise_for_status()
+                return response.json()
+            except requests.exceptions.RequestException:
+                if i < retries - 1:
+                    time.sleep(backoff_factor * (2 ** i))
+                else:
+                    raise
+        return None

-    def crawl_url(self, url, params=None, wait_until_done=True, timeout=2) -> str:
+    def scrape_url(self, url: str, **kwargs):
+        endpoint = f'{self.base_url}/v0/scrape'
         headers = self._prepare_headers()
-        json_data = {'url': url}
-        if params:
-            json_data.update(params)
-        response = self._post_request(f'{self.base_url}/v0/crawl', json_data, headers)
-        if response.status_code == 200:
-            job_id = response.json().get('jobId')
-            if wait_until_done:
-                return self._monitor_job_status(job_id, headers, timeout)
-            else:
-                return {'jobId': job_id}
-        else:
-            self._handle_error(response, 'start crawl job')
+        data = {'url': url, **kwargs}
+        response = self._request('POST', endpoint, data, headers)
+        if response is None:
+            raise HTTPError("Failed to scrape URL after multiple retries")
+        return response

-    def check_crawl_status(self, job_id) -> dict:
+    def search(self, query: str, **kwargs):
+        endpoint = f'{self.base_url}/v0/search'
         headers = self._prepare_headers()
-        response = self._get_request(f'{self.base_url}/v0/crawl/status/{job_id}', headers)
-        if response.status_code == 200:
-            return response.json()
-        else:
-            self._handle_error(response, 'check crawl status')
-
-    def _prepare_headers(self):
-        return {
-            'Content-Type': 'application/json',
-            'Authorization': f'Bearer {self.api_key}'
-        }
+        data = {'query': query, **kwargs}
+        response = self._request('POST', endpoint, data, headers)
+        if response is None:
+            raise HTTPError("Failed to perform search after multiple retries")
+        return response

-    def _post_request(self, url, data, headers):
-        return requests.post(url, headers=headers, json=data)
+    def crawl_url(
+        self, url: str, wait: bool = False, poll_interval: int = 5, idempotency_key: str | None = None, **kwargs
+    ):
+        endpoint = f'{self.base_url}/v0/crawl'
+        headers = self._prepare_headers(idempotency_key)
+        data = {'url': url, **kwargs}
+        response = self._request('POST', endpoint, data, headers)
+        if response is None:
+            raise HTTPError("Failed to initiate crawl after multiple retries")
+        job_id: str = response['jobId']
+        if wait:
+            return self._monitor_job_status(job_id=job_id, poll_interval=poll_interval)
+        return job_id

-    def _get_request(self, url, headers):
-        return requests.get(url, headers=headers)
+    def check_crawl_status(self, job_id: str):
+        endpoint = f'{self.base_url}/v0/crawl/status/{job_id}'
+        headers = self._prepare_headers()
+        response = self._request('GET', endpoint, headers=headers)
+        if response is None:
+            raise HTTPError(f"Failed to check status for job {job_id} after multiple retries")
+        return response

-    def _monitor_job_status(self, job_id, headers, timeout):
+    def _monitor_job_status(self, job_id: str, poll_interval: int):
         while True:
-            status_response = self._get_request(f'{self.base_url}/v0/crawl/status/{job_id}', headers)
-            if status_response.status_code == 200:
-                status_data = status_response.json()
-                if status_data['status'] == 'completed':
-                    if 'data' in status_data:
-                        return status_data['data']
-                    else:
-                        raise Exception('Crawl job completed but no data was returned')
-                elif status_data['status'] in ['active', 'paused', 'pending', 'queued']:
-                    if timeout < 2:
-                        timeout = 2
-                    time.sleep(timeout) # Wait for the specified timeout before checking again
-                else:
-                    raise Exception(f'Crawl job failed or was stopped. Status: {status_data["status"]}')
-            else:
-                self._handle_error(status_response, 'check crawl status')
-
-    def _handle_error(self, response, action):
-        if response.status_code in [402, 409, 500]:
-            error_message = response.json().get('error', 'Unknown error occurred')
-            raise Exception(f'Failed to {action}. Status code: {response.status_code}. Error: {error_message}')
-        else:
-            raise Exception(f'Unexpected error occurred while trying to {action}. Status code: {response.status_code}')
+            status = self.check_crawl_status(job_id)
+            if status['status'] == 'completed':
+                return status
+            elif status['status'] == 'failed':
+                raise HTTPError(f'Job {job_id} failed: {status["error"]}')
+            time.sleep(poll_interval)
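
A note on the new _request helper for reviewers: the retry loop sleeps backoff_factor * 2 ** i seconds between attempts, so with the defaults (retries=3, backoff_factor=0.3) a persistently failing call waits 0.3 s, then 0.6 s, and the third failure re-raises. A quick sketch of that schedule (plain Python, not part of the patch):

# Delays produced by the _request retry loop with the default arguments.
retries, backoff_factor = 3, 0.3
delays = [backoff_factor * (2 ** i) for i in range(retries - 1)]
print(delays)  # [0.3, 0.6] -- the final failed attempt re-raises instead of sleeping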
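And a minimal usage sketch of the refactored client, assuming the package exposes FirecrawlApp as its top-level import; the API key, URLs, and idempotency key below are placeholders, not part of this diff:

from firecrawl import FirecrawlApp  # assumed import path

app = FirecrawlApp(api_key='fc-YOUR-KEY')  # raises ValueError if no key is given

# Scrape a single page; extra keyword arguments are merged into the JSON body.
page = app.scrape_url('https://example.com')

# Start a crawl and poll manually: crawl_url returns the job id by default.
job_id = app.crawl_url('https://example.com')
print(app.check_crawl_status(job_id))

# Or block until the crawl completes or fails, checking every 10 seconds and
# sending an Idempotency-Key header so a retried POST is safe to repeat.
result = app.crawl_url('https://example.com', wait=True, poll_interval=10,
                       idempotency_key='2f6c54f2-example')

One behavior change worth flagging: scrape_url now returns the full parsed response body rather than response['data'], so existing callers may need to adjust.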