Files
ragflow/api/apps/document_app.py
Xing Hong fb95136f39 Fix: validate URL scheme and resolved IP before crawling to prevent SSRF (#14090)
### What problem does this PR solve?

The POST /upload_info?url=<url> endpoint accepted a user-supplied URL
and passed it directly to AsyncWebCrawler without any validation. There
were no restrictions on URL scheme, destination hostname, or resolved IP
address. This allowed any authenticated user to instruct the server to
make outbound HTTP requests to internal infrastructure — including RFC
1918 private networks, loopback addresses, and cloud metadata services
such as http://169.254.169.254 — effectively using the server as a proxy
for internal network reconnaissance or credential theft.

This PR adds an SSRF guard (_validate_url_for_crawl) that runs before
any crawl is initiated. It enforces an allowlist of safe schemes
(http/https), resolves the hostname at validation time, and rejects any
URL whose resolved IP falls within a private or reserved network range.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
2026-04-25 14:30:15 +08:00

596 lines
24 KiB
Python

#
# Copyright 2026 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License
#
import os.path
import re
from pathlib import Path, PurePosixPath, PureWindowsPath
from quart import make_response, request
from api.apps import current_user, login_required
from api.common.check_team_permission import check_kb_team_permission
from api.constants import FILE_NAME_LEN_LIMIT, IMG_BASE64_PREFIX
from api.db import FileType
from api.db.db_models import Task
from api.db.services import duplicate_name
from api.db.services.document_service import DocumentService, doc_upload_and_parse
from api.db.services.file2document_service import File2DocumentService
from api.db.services.file_service import FileService
from api.db.services.knowledgebase_service import KnowledgebaseService
from api.db.services.task_service import TaskService, cancel_all_task_of
from api.utils.api_utils import (
get_data_error_result,
get_json_result,
get_request_json,
server_error_response,
validate_request,
)
from api.utils.file_utils import filename_type, thumbnail
from api.utils.web_utils import CONTENT_TYPE_MAP, apply_safe_file_response_headers, html2pdf, is_valid_url
from common import settings
from common.constants import SANDBOX_ARTIFACT_BUCKET, ParserType, RetCode, TaskStatus
from common.file_utils import get_project_base_directory
from common.misc_utils import get_uuid, thread_pool_exec
from common.ssrf_guard import assert_url_is_safe
from deepdoc.parser.html_parser import RAGFlowHtmlParser
from rag.nlp import search
def _is_safe_download_filename(name: str) -> bool:
if not name or name in {".", ".."}:
return False
if "\x00" in name or len(name) > 255:
return False
if name != PurePosixPath(name).name:
return False
if name != PureWindowsPath(name).name:
return False
return True
@manager.route("/web_crawl", methods=["POST"]) # noqa: F821
@login_required
@validate_request("kb_id", "name", "url")
async def web_crawl():
form = await request.form
kb_id = form.get("kb_id")
if not kb_id:
return get_json_result(data=False, message='Lack of "KB ID"', code=RetCode.ARGUMENT_ERROR)
name = form.get("name")
url = form.get("url")
if not is_valid_url(url):
return get_json_result(data=False, message="The URL format is invalid", code=RetCode.ARGUMENT_ERROR)
e, kb = KnowledgebaseService.get_by_id(kb_id)
if not e:
raise LookupError("Can't find this dataset!")
if not check_kb_team_permission(kb, current_user.id):
return get_json_result(data=False, message="No authorization.", code=RetCode.AUTHENTICATION_ERROR)
blob = html2pdf(url)
if not blob:
return server_error_response(ValueError("Download failure."))
root_folder = FileService.get_root_folder(current_user.id)
pf_id = root_folder["id"]
FileService.init_knowledgebase_docs(pf_id, current_user.id)
kb_root_folder = FileService.get_kb_folder(current_user.id)
kb_folder = FileService.new_a_file_from_kb(kb.tenant_id, kb.name, kb_root_folder["id"])
try:
filename = duplicate_name(DocumentService.query, name=name + ".pdf", kb_id=kb.id)
filetype = filename_type(filename)
if filetype == FileType.OTHER.value:
raise RuntimeError("This type of file has not been supported yet!")
location = filename
while settings.STORAGE_IMPL.obj_exist(kb_id, location):
location += "_"
settings.STORAGE_IMPL.put(kb_id, location, blob)
doc = {
"id": get_uuid(),
"kb_id": kb.id,
"parser_id": kb.parser_id,
"parser_config": kb.parser_config,
"created_by": current_user.id,
"type": filetype,
"name": filename,
"location": location,
"size": len(blob),
"thumbnail": thumbnail(filename, blob),
"suffix": Path(filename).suffix.lstrip("."),
}
if doc["type"] == FileType.VISUAL:
doc["parser_id"] = ParserType.PICTURE.value
if doc["type"] == FileType.AURAL:
doc["parser_id"] = ParserType.AUDIO.value
if re.search(r"\.(ppt|pptx|pages)$", filename):
doc["parser_id"] = ParserType.PRESENTATION.value
if re.search(r"\.(eml)$", filename):
doc["parser_id"] = ParserType.EMAIL.value
DocumentService.insert(doc)
FileService.add_file_from_kb(doc, kb_folder["id"], kb.tenant_id)
except Exception as e:
return server_error_response(e)
return get_json_result(data=True)
@manager.route("/create", methods=["POST"]) # noqa: F821
@login_required
@validate_request("name", "kb_id")
async def create():
req = await get_request_json()
kb_id = req["kb_id"]
if not kb_id:
return get_json_result(data=False, message='Lack of "KB ID"', code=RetCode.ARGUMENT_ERROR)
if len(req["name"].encode("utf-8")) > FILE_NAME_LEN_LIMIT:
return get_json_result(data=False, message=f"File name must be {FILE_NAME_LEN_LIMIT} bytes or less.", code=RetCode.ARGUMENT_ERROR)
if req["name"].strip() == "":
return get_json_result(data=False, message="File name can't be empty.", code=RetCode.ARGUMENT_ERROR)
req["name"] = req["name"].strip()
try:
e, kb = KnowledgebaseService.get_by_id(kb_id)
if not e:
return get_data_error_result(message="Can't find this dataset!")
if DocumentService.query(name=req["name"], kb_id=kb_id):
return get_data_error_result(message="Duplicated document name in the same dataset.")
kb_root_folder = FileService.get_kb_folder(kb.tenant_id)
if not kb_root_folder:
return get_data_error_result(message="Cannot find the root folder.")
kb_folder = FileService.new_a_file_from_kb(
kb.tenant_id,
kb.name,
kb_root_folder["id"],
)
if not kb_folder:
return get_data_error_result(message="Cannot find the kb folder for this file.")
doc = DocumentService.insert(
{
"id": get_uuid(),
"kb_id": kb.id,
"parser_id": kb.parser_id,
"pipeline_id": kb.pipeline_id,
"parser_config": kb.parser_config,
"created_by": current_user.id,
"type": FileType.VIRTUAL,
"name": req["name"],
"suffix": Path(req["name"]).suffix.lstrip("."),
"location": "",
"size": 0,
}
)
FileService.add_file_from_kb(doc.to_dict(), kb_folder["id"], kb.tenant_id)
return get_json_result(data=doc.to_json())
except Exception as e:
return server_error_response(e)
@manager.route("/thumbnails", methods=["GET"]) # noqa: F821
# @login_required
def thumbnails():
doc_ids = request.args.getlist("doc_ids")
if not doc_ids:
return get_json_result(data=False, message='Lack of "Document ID"', code=RetCode.ARGUMENT_ERROR)
try:
docs = DocumentService.get_thumbnails(doc_ids)
for doc_item in docs:
if doc_item["thumbnail"] and not doc_item["thumbnail"].startswith(IMG_BASE64_PREFIX):
doc_item["thumbnail"] = f"/v1/document/image/{doc_item['kb_id']}-{doc_item['thumbnail']}"
return get_json_result(data={d["id"]: d["thumbnail"] for d in docs})
except Exception as e:
return server_error_response(e)
@manager.route("/change_status", methods=["POST"]) # noqa: F821
@login_required
@validate_request("doc_ids", "status")
async def change_status():
req = await get_request_json()
doc_ids = req.get("doc_ids", [])
status = str(req.get("status", ""))
if status not in ["0", "1"]:
return get_json_result(data=False, message='"Status" must be either 0 or 1!', code=RetCode.ARGUMENT_ERROR)
result = {}
has_error = False
for doc_id in doc_ids:
if not DocumentService.accessible(doc_id, current_user.id):
result[doc_id] = {"error": "No authorization."}
has_error = True
continue
try:
e, doc = DocumentService.get_by_id(doc_id)
if not e:
result[doc_id] = {"error": "No authorization."}
has_error = True
continue
e, kb = KnowledgebaseService.get_by_id(doc.kb_id)
if not e:
result[doc_id] = {"error": "Can't find this dataset!"}
has_error = True
continue
current_status = str(doc.status)
if current_status == status:
result[doc_id] = {"status": status}
continue
if not DocumentService.update_by_id(doc_id, {"status": str(status)}):
result[doc_id] = {"error": "Database error (Document update)!"}
has_error = True
continue
status_int = int(status)
if getattr(doc, "chunk_num", 0) > 0:
try:
ok = settings.docStoreConn.update(
{"doc_id": doc_id},
{"available_int": status_int},
search.index_name(kb.tenant_id),
doc.kb_id,
)
except Exception as exc:
msg = str(exc)
if "3022" in msg:
result[doc_id] = {"error": "Document store table missing."}
else:
result[doc_id] = {"error": f"Document store update failed: {msg}"}
has_error = True
continue
if not ok:
result[doc_id] = {"error": "Database error (docStore update)!"}
has_error = True
continue
result[doc_id] = {"status": status}
except Exception as e:
result[doc_id] = {"error": f"Internal server error: {str(e)}"}
has_error = True
if has_error:
return get_json_result(data=result, message="Partial failure", code=RetCode.SERVER_ERROR)
return get_json_result(data=result)
@manager.route("/run", methods=["POST"]) # noqa: F821
@login_required
@validate_request("doc_ids", "run")
async def run():
req = await get_request_json()
uid = current_user.id
try:
def _run_sync():
for doc_id in req["doc_ids"]:
if not DocumentService.accessible(doc_id, uid):
return get_json_result(data=False, message="No authorization.", code=RetCode.AUTHENTICATION_ERROR)
kb_table_num_map = {}
for id in req["doc_ids"]:
info = {"run": str(req["run"]), "progress": 0}
if str(req["run"]) == TaskStatus.RUNNING.value and req.get("delete", False):
info["progress_msg"] = ""
info["chunk_num"] = 0
info["token_num"] = 0
tenant_id = DocumentService.get_tenant_id(id)
if not tenant_id:
return get_data_error_result(message="Tenant not found!")
e, doc = DocumentService.get_by_id(id)
if not e:
return get_data_error_result(message="Document not found!")
if str(req["run"]) == TaskStatus.CANCEL.value:
tasks = list(TaskService.query(doc_id=id))
has_unfinished_task = any((task.progress or 0) < 1 for task in tasks)
if str(doc.run) in [TaskStatus.RUNNING.value, TaskStatus.CANCEL.value] or has_unfinished_task:
cancel_all_task_of(id)
else:
return get_data_error_result(message="Cannot cancel a task that is not in RUNNING status")
if all([("delete" not in req or req["delete"]), str(req["run"]) == TaskStatus.RUNNING.value, str(doc.run) == TaskStatus.DONE.value]):
DocumentService.clear_chunk_num_when_rerun(doc.id)
DocumentService.update_by_id(id, info)
if req.get("delete", False):
TaskService.filter_delete([Task.doc_id == id])
if settings.docStoreConn.index_exist(search.index_name(tenant_id), doc.kb_id):
settings.docStoreConn.delete({"doc_id": id}, search.index_name(tenant_id), doc.kb_id)
if str(req["run"]) == TaskStatus.RUNNING.value:
if req.get("apply_kb"):
e, kb = KnowledgebaseService.get_by_id(doc.kb_id)
if not e:
raise LookupError("Can't find this dataset!")
doc.parser_config["llm_id"] = kb.parser_config.get("llm_id")
doc.parser_config["enable_metadata"] = kb.parser_config.get("enable_metadata", False)
doc.parser_config["metadata"] = kb.parser_config.get("metadata", {})
DocumentService.update_parser_config(doc.id, doc.parser_config)
doc_dict = doc.to_dict()
DocumentService.run(tenant_id, doc_dict, kb_table_num_map)
return get_json_result(data=True)
return await thread_pool_exec(_run_sync)
except Exception as e:
return server_error_response(e)
@manager.route("/get/<doc_id>", methods=["GET"]) # noqa: F821
@login_required
async def get(doc_id):
try:
e, doc = DocumentService.get_by_id(doc_id)
if not e:
return get_data_error_result(message="Document not found!")
b, n = File2DocumentService.get_storage_address(doc_id=doc_id)
data = await thread_pool_exec(settings.STORAGE_IMPL.get, b, n)
response = await make_response(data)
ext = re.search(r"\.([^.]+)$", doc.name.lower())
ext = ext.group(1) if ext else None
content_type = None
if ext:
fallback_prefix = "image" if doc.type == FileType.VISUAL.value else "application"
content_type = CONTENT_TYPE_MAP.get(ext, f"{fallback_prefix}/{ext}")
apply_safe_file_response_headers(response, content_type, ext)
return response
except Exception as e:
return server_error_response(e)
@manager.route("/download/<attachment_id>", methods=["GET"]) # noqa: F821
@login_required
async def download_attachment(attachment_id):
try:
ext = request.args.get("ext", "markdown")
data = await thread_pool_exec(settings.STORAGE_IMPL.get, current_user.id, attachment_id)
response = await make_response(data)
content_type = CONTENT_TYPE_MAP.get(ext, f"application/{ext}")
apply_safe_file_response_headers(response, content_type, ext)
return response
except Exception as e:
return server_error_response(e)
@manager.route("/change_parser", methods=["POST"]) # noqa: F821
@login_required
@validate_request("doc_id")
async def change_parser():
req = await get_request_json()
if not DocumentService.accessible(req["doc_id"], current_user.id):
return get_json_result(data=False, message="No authorization.", code=RetCode.AUTHENTICATION_ERROR)
e, doc = DocumentService.get_by_id(req["doc_id"])
if not e:
return get_data_error_result(message="Document not found!")
def reset_doc():
nonlocal doc
e = DocumentService.update_by_id(doc.id, {"pipeline_id": req["pipeline_id"], "parser_id": req["parser_id"], "progress": 0, "progress_msg": "", "run": TaskStatus.UNSTART.value})
if not e:
return get_data_error_result(message="Document not found!")
if doc.token_num > 0:
e = DocumentService.increment_chunk_num(doc.id, doc.kb_id, doc.token_num * -1, doc.chunk_num * -1, doc.process_duration * -1)
if not e:
return get_data_error_result(message="Document not found!")
tenant_id = DocumentService.get_tenant_id(req["doc_id"])
if not tenant_id:
return get_data_error_result(message="Tenant not found!")
DocumentService.delete_chunk_images(doc, tenant_id)
if settings.docStoreConn.index_exist(search.index_name(tenant_id), doc.kb_id):
settings.docStoreConn.delete({"doc_id": doc.id}, search.index_name(tenant_id), doc.kb_id)
return None
try:
if "pipeline_id" in req and req["pipeline_id"] != "":
if doc.pipeline_id == req["pipeline_id"]:
return get_json_result(data=True)
DocumentService.update_by_id(doc.id, {"pipeline_id": req["pipeline_id"]})
reset_doc()
return get_json_result(data=True)
if doc.parser_id.lower() == req["parser_id"].lower():
if "parser_config" in req:
if req["parser_config"] == doc.parser_config:
return get_json_result(data=True)
else:
return get_json_result(data=True)
if (doc.type == FileType.VISUAL and req["parser_id"] != "picture") or (re.search(r"\.(ppt|pptx|pages)$", doc.name) and req["parser_id"] != "presentation"):
return get_data_error_result(message="Not supported yet!")
if "parser_config" in req:
DocumentService.update_parser_config(doc.id, req["parser_config"])
reset_doc()
return get_json_result(data=True)
except Exception as e:
return server_error_response(e)
@manager.route("/image/<image_id>", methods=["GET"]) # noqa: F821
# @login_required
async def get_image(image_id):
try:
arr = image_id.split("-")
if len(arr) != 2:
return get_data_error_result(message="Image not found.")
bkt, nm = image_id.split("-")
data = await thread_pool_exec(settings.STORAGE_IMPL.get, bkt, nm)
response = await make_response(data)
response.headers.set("Content-Type", "image/JPEG")
return response
except Exception as e:
return server_error_response(e)
ARTIFACT_CONTENT_TYPES = {
".png": "image/png",
".jpg": "image/jpeg",
".jpeg": "image/jpeg",
".svg": "image/svg+xml",
".pdf": "application/pdf",
".csv": "text/csv",
".json": "application/json",
".html": "text/html",
}
@manager.route("/artifact/<filename>", methods=["GET"]) # noqa: F821
@login_required
async def get_artifact(filename):
try:
bucket = SANDBOX_ARTIFACT_BUCKET
# Validate filename: must be uuid hex + allowed extension, nothing else
basename = os.path.basename(filename)
if basename != filename or "/" in filename or "\\" in filename:
return get_data_error_result(message="Invalid filename.")
ext = os.path.splitext(basename)[1].lower()
if ext not in ARTIFACT_CONTENT_TYPES:
return get_data_error_result(message="Invalid file type.")
data = await thread_pool_exec(settings.STORAGE_IMPL.get, bucket, basename)
if not data:
return get_data_error_result(message="Artifact not found.")
content_type = ARTIFACT_CONTENT_TYPES.get(ext, "application/octet-stream")
response = await make_response(data)
safe_filename = re.sub(r"[^\w.\-]", "_", basename)
apply_safe_file_response_headers(response, content_type, ext)
if not response.headers.get("Content-Disposition"):
response.headers.set("Content-Disposition", f'inline; filename="{safe_filename}"')
return response
except Exception as e:
return server_error_response(e)
@manager.route("/upload_and_parse", methods=["POST"]) # noqa: F821
@login_required
@validate_request("conversation_id")
async def upload_and_parse():
files = await request.files
if "file" not in files:
return get_json_result(data=False, message="No file part!", code=RetCode.ARGUMENT_ERROR)
file_objs = files.getlist("file")
for file_obj in file_objs:
if file_obj.filename == "":
return get_json_result(data=False, message="No file selected!", code=RetCode.ARGUMENT_ERROR)
form = await request.form
doc_ids = doc_upload_and_parse(form.get("conversation_id"), file_objs, current_user.id)
return get_json_result(data=doc_ids)
@manager.route("/parse", methods=["POST"]) # noqa: F821
@login_required
async def parse():
req = await get_request_json()
url = req.get("url", "")
if url:
if not is_valid_url(url):
return get_json_result(data=False, message="The URL format is invalid", code=RetCode.ARGUMENT_ERROR)
download_path = os.path.join(get_project_base_directory(), "logs/downloads")
os.makedirs(download_path, exist_ok=True)
from seleniumwire.webdriver import Chrome, ChromeOptions
options = ChromeOptions()
options.add_argument("--headless")
options.add_argument("--disable-gpu")
options.add_argument("--no-sandbox")
options.add_argument("--disable-dev-shm-usage")
options.add_experimental_option("prefs", {"download.default_directory": download_path, "download.prompt_for_download": False, "download.directory_upgrade": True, "safebrowsing.enabled": True})
driver = Chrome(options=options)
driver.get(url)
res_headers = [r.response.headers for r in driver.requests if r and r.response]
if len(res_headers) > 1:
sections = RAGFlowHtmlParser().parser_txt(driver.page_source)
driver.quit()
return get_json_result(data="\n".join(sections))
class File:
filename: str
filepath: str
def __init__(self, filename, filepath):
self.filename = filename
self.filepath = filepath
def read(self):
with open(self.filepath, "rb") as f:
return f.read()
r = re.search(r"filename=\"([^\"]+)\"", str(res_headers))
if not r or not r.group(1):
return get_json_result(data=False, message="Can't not identify downloaded file", code=RetCode.ARGUMENT_ERROR)
filename = r.group(1).strip()
if not _is_safe_download_filename(filename):
return get_json_result(data=False, message="Invalid downloaded filename", code=RetCode.ARGUMENT_ERROR)
filepath = os.path.join(download_path, filename)
f = File(filename, filepath)
txt = FileService.parse_docs([f], current_user.id)
return get_json_result(data=txt)
files = await request.files
if "file" not in files:
return get_json_result(data=False, message="No file part!", code=RetCode.ARGUMENT_ERROR)
file_objs = files.getlist("file")
txt = FileService.parse_docs(file_objs, current_user.id)
return get_json_result(data=txt)
@manager.route("/upload_info", methods=["POST"]) # noqa: F821
@login_required
async def upload_info():
files = await request.files
file_objs = files.getlist("file") if files and files.get("file") else []
url = request.args.get("url")
if file_objs and url:
return get_json_result(
data=False,
message="Provide either multipart file(s) or ?url=..., not both.",
code=RetCode.BAD_REQUEST,
)
if not file_objs and not url:
return get_json_result(
data=False,
message="Missing input: provide multipart file(s) or url",
code=RetCode.BAD_REQUEST,
)
try:
if url and not file_objs:
assert_url_is_safe(url)
return get_json_result(data=FileService.upload_info(current_user.id, None, url))
if len(file_objs) == 1:
return get_json_result(data=FileService.upload_info(current_user.id, file_objs[0], None))
results = [FileService.upload_info(current_user.id, f, None) for f in file_objs]
return get_json_result(data=results)
except Exception as e:
return server_error_response(e)