Fix: validate URL scheme and resolved IP before crawling to prevent SSRF (#14090)

### What problem does this PR solve?

The `POST /upload_info?url=<url>` endpoint accepted a user-supplied URL
and passed it straight to `AsyncWebCrawler` with no validation: no
restrictions on URL scheme, destination hostname, or resolved IP
address. Any authenticated user could therefore instruct the server to
make outbound HTTP requests to internal infrastructure, including RFC
1918 private networks, loopback addresses, and cloud metadata services
such as `http://169.254.169.254`, effectively using the server as a
proxy for internal network reconnaissance or credential theft.
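
To make the impact concrete, here is a hypothetical reproduction against the unpatched endpoint. Only the `upload_info` path and the `url` query parameter come from this PR; the host, route prefix, and auth header are placeholders:

```python
import requests

# Hypothetical request an authenticated user could have sent before this
# fix: the server crawls the cloud metadata service on the caller's
# behalf. Host, route prefix, and token below are placeholders.
resp = requests.post(
    "http://ragflow.internal.example/v1/document/upload_info",
    params={"url": "http://169.254.169.254/latest/meta-data/"},
    headers={"Authorization": "Bearer <token>"},
)
print(resp.status_code, resp.text)  # pre-fix: contents fetched from the metadata service
```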

This PR adds an SSRF guard, `assert_url_is_safe` in
`common/ssrf_guard.py`, that runs before any crawl is initiated. It
enforces an allowlist of safe schemes (`http`/`https`), resolves the
hostname at validation time, and rejects any URL whose resolved IP falls
within a private or reserved network range.
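
For context, a minimal sketch of the kind of guard described above, using only the Python standard library. Only the entry-point name `assert_url_is_safe` comes from the diff below; the body is an illustration of the stated checks (scheme allowlist, resolve-then-check), not the actual contents of `common/ssrf_guard.py`:

```python
import ipaddress
import socket
from urllib.parse import urlparse

_ALLOWED_SCHEMES = {"http", "https"}

def assert_url_is_safe(url: str) -> None:
    """Raise ValueError if `url` should not be crawled server-side."""
    parsed = urlparse(url)
    if parsed.scheme.lower() not in _ALLOWED_SCHEMES:
        raise ValueError(f"disallowed URL scheme: {parsed.scheme!r}")
    if not parsed.hostname:
        raise ValueError("URL has no hostname")

    # Resolve at validation time so a hostname whose DNS record points at
    # an internal address (e.g. an A record for 127.0.0.1) is caught too.
    try:
        infos = socket.getaddrinfo(parsed.hostname, parsed.port or 80,
                                   type=socket.SOCK_STREAM)
    except socket.gaierror as exc:
        raise ValueError(f"cannot resolve host {parsed.hostname!r}") from exc

    for _, _, _, _, sockaddr in infos:
        ip = ipaddress.ip_address(sockaddr[0])
        # is_private covers RFC 1918 and loopback; is_link_local covers
        # 169.254.0.0/16, the range used by cloud metadata services.
        if (ip.is_private or ip.is_loopback or ip.is_link_local
                or ip.is_reserved or ip.is_multicast or ip.is_unspecified):
            raise ValueError(f"{url} resolves to forbidden address {ip}")
```

One caveat that applies to any resolve-then-fetch guard: DNS can change between validation and the crawl (DNS rebinding), so a hardened implementation also pins the resolved IP for the actual request.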

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
Author: Xing Hong
Date: 2026-04-25 15:30:15 +09:00 (committed by GitHub)
Parent: 78188ce9e9
Commit: fb95136f39
10 changed files with 485 additions and 109 deletions


@@ -43,6 +43,7 @@
 from common import settings
 from common.constants import SANDBOX_ARTIFACT_BUCKET, ParserType, RetCode, TaskStatus
 from common.file_utils import get_project_base_directory
 from common.misc_utils import get_uuid, thread_pool_exec
+from common.ssrf_guard import assert_url_is_safe
 from deepdoc.parser.html_parser import RAGFlowHtmlParser
 from rag.nlp import search
@@ -333,6 +334,7 @@ async def run():
     except Exception as e:
         return server_error_response(e)
 
+
 @manager.route("/get/<doc_id>", methods=["GET"]) # noqa: F821
 @login_required
 async def get(doc_id):
@@ -581,6 +583,7 @@ async def upload_info():
     try:
         if url and not file_objs:
+            assert_url_is_safe(url)
             return get_json_result(data=FileService.upload_info(current_user.id, None, url))
         if len(file_objs) == 1:
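
A quick way to exercise the guard (a sketch, assuming `assert_url_is_safe` raises on rejection, as its call-before-crawl placement implies; the exact exception type is not shown in this diff, so the tests only assert that some exception is raised):

```python
import pytest

from common.ssrf_guard import assert_url_is_safe

@pytest.mark.parametrize("bad_url", [
    "file:///etc/passwd",                         # scheme not in allowlist
    "ftp://example.com/pub",                      # scheme not in allowlist
    "http://127.0.0.1:9380/",                     # loopback
    "http://169.254.169.254/latest/meta-data/",   # cloud metadata service
    "http://10.0.0.5/",                           # RFC 1918 private range
])
def test_unsafe_urls_are_rejected(bad_url):
    with pytest.raises(Exception):
        assert_url_is_safe(bad_url)

def test_public_url_passes():
    # Requires network/DNS; example.com resolves to a public address.
    assert_url_is_safe("https://example.com/")
```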