Fix: LFI vulnerability in document parsing API (#13196)

### What problem does this PR solve?

Fix an LFI (local file inclusion) vulnerability in the document parsing API.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
2026-04-23 20:26:11 +08:00 · 2026-02-25 09:47:39 +08:00
parent f4cbdc3a3b
commit 72b89304c1
2 changed files with 19 additions and 3 deletions
--- a/api/apps/document_app.py
+++ b/api/apps/document_app.py
@@ -17,7 +17,7 @@ import json
 import os.path
 import pathlib
 import re
-from pathlib import Path
+from pathlib import Path, PurePosixPath, PureWindowsPath
 from quart import request, make_response
 from api.apps import current_user, login_required
 from api.common.check_team_permission import check_kb_team_permission
@@ -50,6 +50,18 @@ from rag.nlp import search, rag_tokenizer
 from common import settings


+def _is_safe_download_filename(name: str) -> bool:  # True only when `name` is a single bare path component
+    if not name or name in {".", ".."}:  # empty, current-dir and parent-dir names are never valid
+        return False
+    if "\x00" in name or len(name) > 255:  # no NUL characters; cap length at 255 characters
+        return False
+    if name != PurePosixPath(name).name:  # must equal its own final component under POSIX path rules
+        return False
+    if name != PureWindowsPath(name).name:  # same check under Windows path rules, whatever the host OS
+        return False
+    return True
+
+
@manager.route("/upload", methods=["POST"])  # noqa: F821
@login_required
@validate_request("kb_id")
@@ -874,7 +886,11 @@ async def parse():
        r = re.search(r"filename=\"([^\"]+)\"", str(res_headers))
        if not r or not r.group(1):
            return get_json_result(data=False, message="Can't not identify downloaded file", code=RetCode.ARGUMENT_ERROR)
-        f = File(r.group(1), os.path.join(download_path, r.group(1)))
+        filename = r.group(1).strip()
+        if not _is_safe_download_filename(filename):
+            return get_json_result(data=False, message="Invalid downloaded filename", code=RetCode.ARGUMENT_ERROR)
+        filepath = os.path.join(download_path, filename)
+        f = File(filename, filepath)
        txt = FileService.parse_docs([f], current_user.id)
        return get_json_result(data=txt)

--- a/deepdoc/parser/utils.py
+++ b/deepdoc/parser/utils.py
@@ -19,7 +19,7 @@ from rag.nlp import find_codec

 def get_text(fnm: str, binary=None) -> str:
    txt = ""
-    if binary:
+    if binary is not None:
        encoding = find_codec(binary)
        txt = binary.decode(encoding, errors="ignore")
    else: