|
@@ -2,6 +2,7 @@ import mimetypes
|
|
|
from collections.abc import Mapping, Sequence
|
|
|
from typing import Any
|
|
|
|
|
|
+import httpx
|
|
|
from sqlalchemy import select
|
|
|
|
|
|
from constants import AUDIO_EXTENSIONS, DOCUMENT_EXTENSIONS, IMAGE_EXTENSIONS, VIDEO_EXTENSIONS
|
|
@@ -154,7 +155,7 @@ def _build_from_local_file(
|
|
|
file = File(
|
|
|
id=mapping.get("id"),
|
|
|
filename=row.name,
|
|
|
- extension=row.extension,
|
|
|
+ extension="." + row.extension,
|
|
|
mime_type=row.mime_type,
|
|
|
tenant_id=tenant_id,
|
|
|
type=file_type,
|
|
@@ -177,25 +178,29 @@ def _build_from_remote_url(
|
|
|
url = mapping.get("url")
|
|
|
if not url:
|
|
|
raise ValueError("Invalid file url")
|
|
|
- resp = ssrf_proxy.head(url, follow_redirects=True)
|
|
|
- resp.raise_for_status()
|
|
|
|
|
|
- # Try to extract filename from response headers or URL
|
|
|
- content_disposition = resp.headers.get("Content-Disposition")
|
|
|
- if content_disposition:
|
|
|
- filename = content_disposition.split("filename=")[-1].strip('"')
|
|
|
+ resp = ssrf_proxy.head(url, follow_redirects=True)
|
|
|
+ if resp.status_code == httpx.codes.OK:
|
|
|
+ # Try to extract filename from response headers or URL
|
|
|
+ content_disposition = resp.headers.get("Content-Disposition")
|
|
|
+ if content_disposition:
|
|
|
+ filename = content_disposition.split("filename=")[-1].strip('"')
|
|
|
+ else:
|
|
|
+ filename = url.split("/")[-1].split("?")[0]
|
|
|
+ # Read the file size and MIME type reported by the response headers
|
|
|
+ file_size = int(resp.headers.get("Content-Length", -1))
|
|
|
+ mime_type = str(resp.headers.get("Content-Type", ""))
|
|
|
else:
|
|
|
- filename = url.split("/")[-1].split("?")[0]
|
|
|
+ filename = ""
|
|
|
+ file_size = -1
|
|
|
+ mime_type = ""
|
|
|
+
|
|
|
# If filename is empty, set a default one
|
|
|
if not filename:
|
|
|
filename = "unknown_file"
|
|
|
-
|
|
|
# Determine file extension
|
|
|
extension = "." + filename.split(".")[-1] if "." in filename else ".bin"
|
|
|
|
|
|
- # Create the File object
|
|
|
- file_size = int(resp.headers.get("Content-Length", -1))
|
|
|
- mime_type = str(resp.headers.get("Content-Type", ""))
|
|
|
if not mime_type:
|
|
|
mime_type, _ = mimetypes.guess_type(url)
|
|
|
file = File(
|