Fix: LFI vulnerability in document parsing API (#13196)

### What problem does this PR solve?

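Fix a local file inclusion (LFI) vulnerability in the document parsing API: the filename extracted from the downloaded response's headers is now validated as a bare file name before it is joined onto the download path.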

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
Yongteng Lei
2026-02-25 09:47:39 +08:00
committed by GitHub
parent f4cbdc3a3b
commit 72b89304c1
2 changed files with 19 additions and 3 deletions

View File

@ -17,7 +17,7 @@ import json
import os.path
import pathlib
import re
from pathlib import Path
from pathlib import Path, PurePosixPath, PureWindowsPath
from quart import request, make_response
from api.apps import current_user, login_required
from api.common.check_team_permission import check_kb_team_permission
@ -50,6 +50,18 @@ from rag.nlp import search, rag_tokenizer
from common import settings
def _is_safe_download_filename(name: str) -> bool:
if not name or name in {".", ".."}:
return False
if "\x00" in name or len(name) > 255:
return False
if name != PurePosixPath(name).name:
return False
if name != PureWindowsPath(name).name:
return False
return True
@manager.route("/upload", methods=["POST"]) # noqa: F821
@login_required
@validate_request("kb_id")
@ -874,7 +886,11 @@ async def parse():
r = re.search(r"filename=\"([^\"]+)\"", str(res_headers))
if not r or not r.group(1):
return get_json_result(data=False, message="Can't not identify downloaded file", code=RetCode.ARGUMENT_ERROR)
f = File(r.group(1), os.path.join(download_path, r.group(1)))
filename = r.group(1).strip()
if not _is_safe_download_filename(filename):
return get_json_result(data=False, message="Invalid downloaded filename", code=RetCode.ARGUMENT_ERROR)
filepath = os.path.join(download_path, filename)
f = File(filename, filepath)
txt = FileService.parse_docs([f], current_user.id)
return get_json_result(data=txt)

View File

@ -19,7 +19,7 @@ from rag.nlp import find_codec
def get_text(fnm: str, binary=None) -> str:
txt = ""
if binary:
if binary is not None:
encoding = find_codec(binary)
txt = binary.decode(encoding, errors="ignore")
else: