@@ -11,11 +11,10 @@ from contextlib import contextmanager
 from urllib.parse import unquote
 
 import cloudscraper
-import requests
 from bs4 import BeautifulSoup, CData, Comment, NavigableString
-from newspaper import Article
 from regex import regex
 
+from core.helper import ssrf_proxy
 from core.rag.extractor import extract_processor
 from core.rag.extractor.extract_processor import ExtractProcessor
 
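Note on the import changes: `requests` is dropped in favor of the internal `core.helper.ssrf_proxy` wrapper so that every outbound fetch is SSRF-guarded, and `newspaper` goes away because the newspaper3k fallback is deleted below. The diff does not show `ssrf_proxy` itself; what follows is only a minimal sketch of what such a helper could look like, assuming it wraps httpx (the `follow_redirects` kwarg used below is httpx's spelling) and routes traffic through a configurable egress proxy. The env-var name is hypothetical.

```python
# Hypothetical sketch of an SSRF-safe HTTP helper in the spirit of
# core.helper.ssrf_proxy; the real module's internals are not part of this diff.
import os

import httpx

# Illustrative: a vetted egress proxy that enforces the allow/deny policy.
SSRF_PROXY_URL = os.getenv("SSRF_PROXY_URL")  # hypothetical variable name


def make_request(method: str, url: str, **kwargs):
    # Single choke point: individual call sites cannot bypass the proxy.
    # (`proxy=` is the httpx >= 0.26 spelling; older versions used `proxies=`.)
    with httpx.Client(proxy=SSRF_PROXY_URL) as client:
        return client.request(method, url, **kwargs)


def get(url: str, **kwargs):
    return make_request("GET", url, **kwargs)


def head(url: str, **kwargs):
    return make_request("HEAD", url, **kwargs)
```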
@@ -45,7 +44,7 @@ def get_url(url: str, user_agent: str = None) -> str:
 
     main_content_type = None
     supported_content_types = extract_processor.SUPPORT_URL_CONTENT_TYPES + ["text/html"]
-    response = requests.head(url, headers=headers, allow_redirects=True, timeout=(5, 10))
+    response = ssrf_proxy.head(url, headers=headers, follow_redirects=True, timeout=(5, 10))
 
     if response.status_code == 200:
         # check content-type
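The HEAD probe keeps its short requests-style `(connect, read)` timeout tuple while switching to `follow_redirects=True`, which is the httpx spelling (requests calls it `allow_redirects`). An httpx-based wrapper would have to normalize that 2-tuple itself, since httpx's timeout type does not accept one; a hedged sketch of the conversion such a helper might perform:

```python
import httpx


def _normalize_timeout(timeout):
    # Hypothetical helper: map a requests-style (connect, read) 2-tuple onto
    # httpx.Timeout, reusing the read/connect values for write/pool budgets.
    if isinstance(timeout, tuple) and len(timeout) == 2:
        connect, read = timeout
        return httpx.Timeout(connect=connect, read=read, write=read, pool=connect)
    return timeout
```

This keeps the cheap HEAD check on a tight budget (5s connect, 10s read) before committing to the far longer GET in the next hunk.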
@@ -67,10 +66,11 @@ def get_url(url: str, user_agent: str = None) -> str:
         if main_content_type in extract_processor.SUPPORT_URL_CONTENT_TYPES:
             return ExtractProcessor.load_from_url(url, return_text=True)
 
-        response = requests.get(url, headers=headers, allow_redirects=True, timeout=(120, 300))
+        response = ssrf_proxy.get(url, headers=headers, follow_redirects=True, timeout=(120, 300))
     elif response.status_code == 403:
         scraper = cloudscraper.create_scraper()
-        response = scraper.get(url, headers=headers, allow_redirects=True, timeout=(120, 300))
+        scraper.perform_request = ssrf_proxy.make_request
+        response = scraper.get(url, headers=headers, follow_redirects=True, timeout=(120, 300))
 
     if response.status_code != 200:
         return "URL returned status code {}.".format(response.status_code)
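The 403 branch is the subtle part of this hunk: cloudscraper funnels its outbound calls through the scraper's `perform_request()` method, so assigning `ssrf_proxy.make_request` to that attribute on the instance shadows the bound method and sends the Cloudflare-bypass traffic through the SSRF guard as well. A toy illustration of the instance-attribute pattern (names here are illustrative, not cloudscraper's internals):

```python
class Client:
    def perform_request(self, method, url, **kwargs):
        return f"direct {method} {url}"


def proxied_request(method, url, **kwargs):
    # A plain function assigned to the instance is called without `self`,
    # matching a (method, url, **kwargs) shape like ssrf_proxy.make_request.
    return f"proxied {method} {url}"


c = Client()
c.perform_request = proxied_request  # instance attribute shadows the method
assert c.perform_request("GET", "https://example.com").startswith("proxied")
```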
@@ -78,7 +78,7 @@ def get_url(url: str, user_agent: str = None) -> str:
     a = extract_using_readabilipy(response.text)
 
     if not a['plain_text'] or not a['plain_text'].strip():
-        return get_url_from_newspaper3k(url)
+        return ''
 
     res = FULL_TEMPLATE.format(
         title=a['title'],
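With newspaper3k gone, an empty readability result now returns `''` instead of retrying with a second extractor, so callers must treat an empty string as "no extractable content" rather than an error. A hypothetical caller-side guard:

```python
content = get_url("https://example.com/article")
if not content:
    # Hypothetical handling: readability found nothing and there is no
    # newspaper3k fallback anymore, so degrade gracefully.
    content = "No readable content found at this URL."
```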
@@ -91,23 +91,6 @@ def get_url(url: str, user_agent: str = None) -> str:
     return res
 
 
-def get_url_from_newspaper3k(url: str) -> str:
-
-    a = Article(url)
-    a.download()
-    a.parse()
-
-    res = FULL_TEMPLATE.format(
-        title=a.title,
-        authors=a.authors,
-        publish_date=a.publish_date,
-        top_image=a.top_image,
-        text=a.text,
-    )
-
-    return res
-
-
 def extract_using_readabilipy(html):
     with tempfile.NamedTemporaryFile(delete=False, mode='w+') as f_html:
         f_html.write(html)
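For context, the surviving `extract_using_readabilipy` helper round-trips the fetched HTML through a temp file, in the style of readabilipy's Readability.js bridge. Had a direct dependency on the readabilipy package been acceptable, roughly the same result is available in one call; a sketch for comparison only, since the in-repo helper's output shape (e.g. `plain_text` as a single string) evidently differs from the package's:

```python
from readabilipy import simple_json_from_html_string

html = "<html><body><article><h1>Title</h1><p>Body text.</p></article></body></html>"

# use_readability=False selects the pure-Python parser (no Node.js needed);
# the result is a dict with keys such as 'title', 'byline', 'content' and
# 'plain_text' (here a list of paragraph dicts rather than one string).
article = simple_json_from_html_string(html, use_readability=False)
print(article["title"])
```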