mirror of
https://github.com/langgenius/dify.git
synced 2026-05-27 12:26:15 +08:00
346 lines
12 KiB
Python
346 lines
12 KiB
Python
"""Unified remote-file retrieval with Dify signed file URL resolution.
|
|
|
|
Use this module for backend workflows whose intent is to fetch remote file content
|
|
or remote file metadata from a URL, even when the URL originally came from a user
|
|
upload, a workflow variable, a tool/datasource file, or an app DSL. GET/HEAD
|
|
requests can resolve Dify-signed file URLs locally through DB + storage before
|
|
falling back to the SSRF-protected network client.
|
|
|
|
Use `core.helper.ssrf_proxy` directly only for generic outbound HTTP where the
|
|
URL is not being treated as a remote file, such as HTTP Request nodes, external
|
|
API integrations, auth discovery, or user-configured tool calls. Those calls must
|
|
stay as real network requests and should not reinterpret Dify file URLs as stored
|
|
files.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import base64
|
|
import hashlib
|
|
import hmac
|
|
import re
|
|
import time
|
|
import urllib.parse
|
|
from dataclasses import dataclass
|
|
from typing import Any, Literal
|
|
|
|
import httpx
|
|
|
|
from configs import dify_config
|
|
from core.app.file_access import DatabaseFileAccessController
|
|
from core.db.session_factory import session_factory
|
|
from core.helper import ssrf_proxy
|
|
from core.helper.ssrf_proxy import (
|
|
SSRF_DEFAULT_MAX_RETRIES,
|
|
_to_graphon_http_response,
|
|
max_retries_exceeded_error,
|
|
request_error,
|
|
)
|
|
from extensions.ext_storage import storage
|
|
from models import ToolFile, UploadFile
|
|
|
|
_UPLOAD_FILE_PATH_PATTERN = re.compile(
|
|
r"^/files/(?P<file_id>[a-fA-F0-9-]+)/(?P<preview_kind>file-preview|image-preview)$"
|
|
)
|
|
_TOOL_FILE_PATH_PATTERN = re.compile(r"^/files/tools/(?P<file_id>[a-fA-F0-9-]+)(?P<extension>\.[^/]*)?$")
|
|
_DATASOURCE_FILE_PATH_PATTERN = re.compile(r"^/files/datasources/(?P<file_id>[a-fA-F0-9-]+)(?P<extension>\.[^/]*)?$")
|
|
|
|
_file_access_controller = DatabaseFileAccessController()
|
|
|
|
|
|
@dataclass(frozen=True)
|
|
class _SignedFileUrl:
|
|
file_id: str
|
|
preview_kind: Literal["file-preview", "image-preview"]
|
|
record_kind: Literal["upload", "tool", "datasource"]
|
|
|
|
|
|
def make_request(method: str, url: str, max_retries: int = SSRF_DEFAULT_MAX_RETRIES, **kwargs: Any) -> httpx.Response:
|
|
"""Fetch remote file content or metadata.
|
|
|
|
GET and HEAD requests for Dify-owned signed file URLs are served from local
|
|
storage. Every other request is delegated unchanged to the SSRF proxy.
|
|
"""
|
|
|
|
normalized_method = method.upper()
|
|
if normalized_method == "GET":
|
|
response = _resolve_dify_signed_file_url("GET", url)
|
|
if response is not None:
|
|
return response
|
|
if normalized_method == "HEAD":
|
|
response = _resolve_dify_signed_file_url("HEAD", url)
|
|
if response is not None:
|
|
return response
|
|
return ssrf_proxy.make_request(method=method, url=url, max_retries=max_retries, **kwargs)
|
|
|
|
|
|
class GraphonRemoteFileFetcher:
|
|
"""Graphon HTTP-client adapter backed by the unified remote-file fetcher.
|
|
|
|
Graphon requires method-specific HTTP client methods, while regular Dify
|
|
call sites should use `make_request` directly.
|
|
"""
|
|
|
|
@property
|
|
def max_retries_exceeded_error(self) -> type[Exception]:
|
|
return max_retries_exceeded_error
|
|
|
|
@property
|
|
def request_error(self) -> type[Exception]:
|
|
return request_error
|
|
|
|
def get(self, url: str, max_retries: int = SSRF_DEFAULT_MAX_RETRIES, **kwargs: Any):
|
|
return _to_graphon_http_response(make_request("GET", url=url, max_retries=max_retries, **kwargs))
|
|
|
|
def head(self, url: str, max_retries: int = SSRF_DEFAULT_MAX_RETRIES, **kwargs: Any):
|
|
return _to_graphon_http_response(make_request("HEAD", url=url, max_retries=max_retries, **kwargs))
|
|
|
|
def post(self, url: str, max_retries: int = SSRF_DEFAULT_MAX_RETRIES, **kwargs: Any):
|
|
return _to_graphon_http_response(make_request("POST", url=url, max_retries=max_retries, **kwargs))
|
|
|
|
def put(self, url: str, max_retries: int = SSRF_DEFAULT_MAX_RETRIES, **kwargs: Any):
|
|
return _to_graphon_http_response(make_request("PUT", url=url, max_retries=max_retries, **kwargs))
|
|
|
|
def delete(self, url: str, max_retries: int = SSRF_DEFAULT_MAX_RETRIES, **kwargs: Any):
|
|
return _to_graphon_http_response(make_request("DELETE", url=url, max_retries=max_retries, **kwargs))
|
|
|
|
def patch(self, url: str, max_retries: int = SSRF_DEFAULT_MAX_RETRIES, **kwargs: Any):
|
|
return _to_graphon_http_response(make_request("PATCH", url=url, max_retries=max_retries, **kwargs))
|
|
|
|
|
|
def _resolve_dify_signed_file_url(method: Literal["GET", "HEAD"], url: str) -> httpx.Response | None:
|
|
parsed_url = urllib.parse.urlparse(url)
|
|
if not _is_dify_file_origin(parsed_url):
|
|
return None
|
|
|
|
signed_file_url = _parse_signed_file_path(parsed_url.path)
|
|
if signed_file_url is None:
|
|
return None
|
|
|
|
query = urllib.parse.parse_qs(parsed_url.query, keep_blank_values=True)
|
|
timestamp = _single_query_value(query, "timestamp")
|
|
nonce = _single_query_value(query, "nonce")
|
|
sign = _single_query_value(query, "sign")
|
|
if timestamp is None or nonce is None or sign is None:
|
|
return None
|
|
|
|
if not _verify_signed_file_url(
|
|
signed_file_url=signed_file_url,
|
|
timestamp=timestamp,
|
|
nonce=nonce,
|
|
sign=sign,
|
|
):
|
|
return None
|
|
|
|
if signed_file_url.record_kind == "upload":
|
|
return _build_upload_file_response(method=method, url=url, file_id=signed_file_url.file_id)
|
|
if signed_file_url.record_kind == "tool":
|
|
return _build_tool_file_response(method=method, url=url, file_id=signed_file_url.file_id)
|
|
return _build_datasource_file_response(method=method, url=url, file_id=signed_file_url.file_id)
|
|
|
|
|
|
def _parse_signed_file_path(path: str) -> _SignedFileUrl | None:
|
|
upload_match = _UPLOAD_FILE_PATH_PATTERN.match(path)
|
|
if upload_match:
|
|
preview_kind: Literal["file-preview", "image-preview"]
|
|
if upload_match.group("preview_kind") == "image-preview":
|
|
preview_kind = "image-preview"
|
|
else:
|
|
preview_kind = "file-preview"
|
|
|
|
return _SignedFileUrl(
|
|
file_id=upload_match.group("file_id"),
|
|
preview_kind=preview_kind,
|
|
record_kind="upload",
|
|
)
|
|
|
|
tool_match = _TOOL_FILE_PATH_PATTERN.match(path)
|
|
if tool_match:
|
|
return _SignedFileUrl(
|
|
file_id=tool_match.group("file_id"),
|
|
preview_kind="file-preview",
|
|
record_kind="tool",
|
|
)
|
|
|
|
datasource_match = _DATASOURCE_FILE_PATH_PATTERN.match(path)
|
|
if datasource_match:
|
|
return _SignedFileUrl(
|
|
file_id=datasource_match.group("file_id"),
|
|
preview_kind="file-preview",
|
|
record_kind="datasource",
|
|
)
|
|
|
|
return None
|
|
|
|
|
|
def _is_dify_file_origin(parsed_url: urllib.parse.ParseResult) -> bool:
|
|
if parsed_url.scheme not in {"http", "https"} or not parsed_url.hostname:
|
|
return False
|
|
|
|
url_origin = _origin_parts(parsed_url)
|
|
if url_origin is None:
|
|
return False
|
|
|
|
allowed_origins = {
|
|
origin
|
|
for configured_url in [dify_config.FILES_URL, dify_config.INTERNAL_FILES_URL]
|
|
if configured_url and (origin := _origin_parts(urllib.parse.urlparse(configured_url))) is not None
|
|
}
|
|
return url_origin in allowed_origins
|
|
|
|
|
|
def _origin_parts(parsed_url: urllib.parse.ParseResult) -> tuple[str, str, int] | None:
|
|
if parsed_url.scheme not in {"http", "https"} or not parsed_url.hostname:
|
|
return None
|
|
try:
|
|
port = parsed_url.port
|
|
except ValueError:
|
|
return None
|
|
return parsed_url.scheme, parsed_url.hostname.lower(), port or _default_port(parsed_url.scheme)
|
|
|
|
|
|
def _default_port(scheme: str) -> int:
|
|
return 443 if scheme == "https" else 80
|
|
|
|
|
|
def _single_query_value(query: dict[str, list[str]], key: str) -> str | None:
|
|
values = query.get(key)
|
|
if not values or len(values) != 1:
|
|
return None
|
|
return values[0]
|
|
|
|
|
|
def _verify_signed_file_url(
|
|
*,
|
|
signed_file_url: _SignedFileUrl,
|
|
timestamp: str,
|
|
nonce: str,
|
|
sign: str,
|
|
) -> bool:
|
|
try:
|
|
current_time = int(time.time())
|
|
signed_at = int(timestamp)
|
|
except ValueError:
|
|
return False
|
|
|
|
if current_time - signed_at > dify_config.FILES_ACCESS_TIMEOUT:
|
|
return False
|
|
|
|
payload = f"{signed_file_url.preview_kind}|{signed_file_url.file_id}|{timestamp}|{nonce}"
|
|
recalculated = hmac.new(dify_config.SECRET_KEY.encode(), payload.encode(), hashlib.sha256).digest()
|
|
expected = base64.urlsafe_b64encode(recalculated).decode()
|
|
return hmac.compare_digest(sign, expected)
|
|
|
|
|
|
def _build_upload_file_response(*, method: Literal["GET", "HEAD"], url: str, file_id: str) -> httpx.Response:
|
|
with session_factory.create_session() as session:
|
|
upload_file = _file_access_controller.get_upload_file(session=session, file_id=file_id)
|
|
if upload_file is None:
|
|
return _build_response(method=method, url=url, status_code=404)
|
|
|
|
content = b"" if method == "HEAD" else storage.load_once(upload_file.key)
|
|
return _build_response(
|
|
method=method,
|
|
url=url,
|
|
status_code=200,
|
|
content=content,
|
|
content_length=upload_file.size,
|
|
content_type=upload_file.mime_type,
|
|
filename=upload_file.name,
|
|
)
|
|
|
|
|
|
def _build_tool_file_response(*, method: Literal["GET", "HEAD"], url: str, file_id: str) -> httpx.Response:
|
|
with session_factory.create_session() as session:
|
|
tool_file = _file_access_controller.get_tool_file(session=session, file_id=file_id)
|
|
if tool_file is None:
|
|
return _build_response(method=method, url=url, status_code=404)
|
|
|
|
content = b"" if method == "HEAD" else storage.load_once(tool_file.file_key)
|
|
return _build_response(
|
|
method=method,
|
|
url=url,
|
|
status_code=200,
|
|
content=content,
|
|
content_length=tool_file.size,
|
|
content_type=tool_file.mimetype,
|
|
filename=tool_file.name,
|
|
)
|
|
|
|
|
|
def _build_datasource_file_response(*, method: Literal["GET", "HEAD"], url: str, file_id: str) -> httpx.Response:
|
|
with session_factory.create_session() as session:
|
|
upload_file = _file_access_controller.get_upload_file(session=session, file_id=file_id)
|
|
if upload_file is not None:
|
|
return _build_upload_file_record_response(method=method, url=url, upload_file=upload_file)
|
|
|
|
tool_file = _file_access_controller.get_tool_file(session=session, file_id=file_id)
|
|
if tool_file is not None:
|
|
return _build_tool_file_record_response(method=method, url=url, tool_file=tool_file)
|
|
|
|
return _build_response(method=method, url=url, status_code=404)
|
|
|
|
|
|
def _build_upload_file_record_response(
|
|
*,
|
|
method: Literal["GET", "HEAD"],
|
|
url: str,
|
|
upload_file: UploadFile,
|
|
) -> httpx.Response:
|
|
content = b"" if method == "HEAD" else storage.load_once(upload_file.key)
|
|
return _build_response(
|
|
method=method,
|
|
url=url,
|
|
status_code=200,
|
|
content=content,
|
|
content_length=upload_file.size,
|
|
content_type=upload_file.mime_type,
|
|
filename=upload_file.name,
|
|
)
|
|
|
|
|
|
def _build_tool_file_record_response(
|
|
*,
|
|
method: Literal["GET", "HEAD"],
|
|
url: str,
|
|
tool_file: ToolFile,
|
|
) -> httpx.Response:
|
|
content = b"" if method == "HEAD" else storage.load_once(tool_file.file_key)
|
|
return _build_response(
|
|
method=method,
|
|
url=url,
|
|
status_code=200,
|
|
content=content,
|
|
content_length=tool_file.size,
|
|
content_type=tool_file.mimetype,
|
|
filename=tool_file.name,
|
|
)
|
|
|
|
|
|
def _build_response(
|
|
*,
|
|
method: Literal["GET", "HEAD"],
|
|
url: str,
|
|
status_code: int,
|
|
content: bytes = b"",
|
|
content_length: int | None = None,
|
|
content_type: str | None = None,
|
|
filename: str | None = None,
|
|
) -> httpx.Response:
|
|
headers: dict[str, str] = {}
|
|
if content_type:
|
|
headers["Content-Type"] = content_type
|
|
if content_length is not None and content_length >= 0:
|
|
headers["Content-Length"] = str(content_length)
|
|
if filename:
|
|
headers["Content-Disposition"] = f"attachment; filename*=UTF-8''{urllib.parse.quote(filename)}"
|
|
return httpx.Response(
|
|
status_code=status_code,
|
|
headers=headers,
|
|
content=content,
|
|
request=httpx.Request(method, url),
|
|
)
|
|
|
|
|
|
graphon_remote_file_fetcher = GraphonRemoteFileFetcher()
|