mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-05-03 08:47:48 +08:00
feat(api/utils): Harden file_utils for robustness and edge cases (#12915)
## Summary Improves robustness and edge-case handling in `api.utils.file_utils` to avoid crashes, DoS/OOM risks, and timeouts when processing user-provided filenames, paths, and file blobs. ## Changes ### Resource limits & timeouts - **`MAX_BLOB_SIZE_THUMBNAIL`** (50 MiB) and **`MAX_BLOB_SIZE_PDF`** (100 MiB) to reject oversized inputs before thumbnail/PDF processing. - **`GHOSTSCRIPT_TIMEOUT_SEC`** (120 s) for `repair_pdf_with_ghostscript` subprocess to avoid hangs on malicious or broken PDFs. ### `filename_type` - Handles `None`, empty string, non-string (e.g. int/list), and path-only input via new **`_normalize_filename_for_type()`**. - Uses basename for type detection (e.g. `a/b/c.pdf` → PDF). - Enforces **`FILE_NAME_LEN_LIMIT`**; invalid input returns `FileType.OTHER`. ### `thumbnail_img` - Rejects `None`/empty/oversized blob and invalid filename; returns `None` instead of raising. - Wraps PDF, image, and PPT handling in try/except so corrupt or malformed files return `None`. - Ensures PDF has pages and PPT has slides before use. - Normalizes PIL image mode (RGBA/P/LA → RGB) for safe PNG export. ### `repair_pdf_with_ghostscript` - Handles `None`/empty input; skips repair when input size exceeds limit. - Uses `subprocess.run(..., timeout=GHOSTSCRIPT_TIMEOUT_SEC)` and catches `TimeoutExpired`. - Returns original bytes when Ghostscript output is empty. ### `read_potential_broken_pdf` - `None` → `b""`; non–sequence-like (no `len`) → `b""`; empty → return as-is. - Oversized blob returned as-is (no repair) to avoid DoS. ### `sanitize_path` - Explicit `None` and non-string check; strips whitespace before normalizing. ## Testing - **`test/unit_test/utils/test_api_file_utils.py`** added with 36 unit tests covering the above behavior (filename_type, sanitize_path, read_potential_broken_pdf, thumbnail_img, thumbnail, repair_pdf_with_ghostscript, constants). - All tests pass. --------- Co-authored-by: Gittensor Miner <miner@gittensor.io>
This commit is contained in:
@ -17,6 +17,7 @@
|
||||
|
||||
# Standard library imports
|
||||
import base64
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
import subprocess
|
||||
@ -29,16 +30,37 @@ import pdfplumber
|
||||
from PIL import Image
|
||||
|
||||
# Local imports
|
||||
from api.constants import IMG_BASE64_PREFIX
|
||||
from api.constants import FILE_NAME_LEN_LIMIT, IMG_BASE64_PREFIX
|
||||
from api.db import FileType
|
||||
|
||||
# Robustness and resource limits: reject oversized inputs to avoid DoS and OOM.
|
||||
MAX_BLOB_SIZE_THUMBNAIL = 50 * 1024 * 1024 # 50 MiB for thumbnail generation
|
||||
MAX_BLOB_SIZE_PDF = 100 * 1024 * 1024 # 100 MiB for PDF repair / read
|
||||
GHOSTSCRIPT_TIMEOUT_SEC = 120 # Timeout for Ghostscript subprocess
|
||||
|
||||
LOCK_KEY_pdfplumber = "global_shared_lock_pdfplumber"
|
||||
if LOCK_KEY_pdfplumber not in sys.modules:
|
||||
sys.modules[LOCK_KEY_pdfplumber] = threading.Lock()
|
||||
|
||||
|
||||
def _normalize_filename_for_type(filename):
|
||||
"""Extract a safe basename for type detection. Returns (normalized_str, True) or ("", False)."""
|
||||
if filename is None:
|
||||
return "", False
|
||||
if not isinstance(filename, str):
|
||||
return "", False
|
||||
base = os.path.basename(filename).strip()
|
||||
if not base or len(base) > FILE_NAME_LEN_LIMIT:
|
||||
return "", False
|
||||
return base.lower(), True
|
||||
|
||||
|
||||
def filename_type(filename):
|
||||
filename = filename.lower()
|
||||
"""Return file type from extension. Handles None, empty, path-only, and oversized names."""
|
||||
normalized, ok = _normalize_filename_for_type(filename)
|
||||
if not ok:
|
||||
return FileType.OTHER.value
|
||||
filename = normalized
|
||||
if re.match(r".*\.pdf$", filename):
|
||||
return FileType.PDF.value
|
||||
|
||||
@ -56,34 +78,68 @@ def filename_type(filename):
|
||||
|
||||
def thumbnail_img(filename, blob):
|
||||
"""
|
||||
MySQL LongText max length is 65535
|
||||
Generate thumbnail image bytes for PDF, image, or PPT. MySQL LongText max length is 65535.
|
||||
|
||||
Robustness and edge cases:
|
||||
- Rejects None, empty, or oversized blob to avoid DoS/OOM.
|
||||
- Uses basename for type detection (handles paths like "a/b/c.pdf").
|
||||
- Catches corrupt or malformed files and returns None instead of raising.
|
||||
- Normalizes PIL image mode (e.g. RGBA -> RGB) for safe PNG export.
|
||||
"""
|
||||
filename = filename.lower()
|
||||
if blob is None:
|
||||
return None
|
||||
try:
|
||||
blob_len = len(blob)
|
||||
except TypeError:
|
||||
return None
|
||||
if blob_len == 0 or blob_len > MAX_BLOB_SIZE_THUMBNAIL:
|
||||
return None
|
||||
|
||||
normalized, ok = _normalize_filename_for_type(filename)
|
||||
if not ok:
|
||||
return None
|
||||
filename = normalized
|
||||
|
||||
if re.match(r".*\.pdf$", filename):
|
||||
with sys.modules[LOCK_KEY_pdfplumber]:
|
||||
pdf = pdfplumber.open(BytesIO(blob))
|
||||
try:
|
||||
with sys.modules[LOCK_KEY_pdfplumber]:
|
||||
pdf = pdfplumber.open(BytesIO(blob))
|
||||
if not pdf.pages:
|
||||
pdf.close()
|
||||
return None
|
||||
buffered = BytesIO()
|
||||
resolution = 32
|
||||
img = None
|
||||
for _ in range(10):
|
||||
pdf.pages[0].to_image(resolution=resolution).annotated.save(buffered, format="png")
|
||||
img = buffered.getvalue()
|
||||
if len(img) >= 64000 and resolution >= 2:
|
||||
resolution = resolution / 2
|
||||
buffered = BytesIO()
|
||||
else:
|
||||
break
|
||||
pdf.close()
|
||||
return img
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
if re.match(r".*\.(jpg|jpeg|png|tif|gif|icon|ico|webp)$", filename):
|
||||
try:
|
||||
image = Image.open(BytesIO(blob))
|
||||
image.load()
|
||||
if image.mode in ("RGBA", "P", "LA"):
|
||||
image = image.convert("RGB")
|
||||
image.thumbnail((30, 30))
|
||||
buffered = BytesIO()
|
||||
resolution = 32
|
||||
img = None
|
||||
for _ in range(10):
|
||||
# https://github.com/jsvine/pdfplumber?tab=readme-ov-file#creating-a-pageimage-with-to_image
|
||||
pdf.pages[0].to_image(resolution=resolution).annotated.save(buffered, format="png")
|
||||
img = buffered.getvalue()
|
||||
if len(img) >= 64000 and resolution >= 2:
|
||||
resolution = resolution / 2
|
||||
buffered = BytesIO()
|
||||
else:
|
||||
break
|
||||
pdf.close()
|
||||
return img
|
||||
image.save(buffered, format="png")
|
||||
return buffered.getvalue()
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
# PPT/PPTX thumbnail would require a licensed library; skip and return None.
|
||||
if re.match(r".*\.(ppt|pptx)$", filename):
|
||||
return None
|
||||
|
||||
elif re.match(r".*\.(jpg|jpeg|png|tif|gif|icon|ico|webp)$", filename):
|
||||
image = Image.open(BytesIO(blob))
|
||||
image.thumbnail((30, 30))
|
||||
buffered = BytesIO()
|
||||
image.save(buffered, format="png")
|
||||
return buffered.getvalue()
|
||||
return None
|
||||
|
||||
|
||||
@ -96,6 +152,12 @@ def thumbnail(filename, blob):
|
||||
|
||||
|
||||
def repair_pdf_with_ghostscript(input_bytes):
|
||||
"""Attempt to repair corrupt PDF bytes via Ghostscript. Returns original bytes on failure or timeout."""
|
||||
if input_bytes is None or len(input_bytes) == 0:
|
||||
return input_bytes if input_bytes is not None else b""
|
||||
if len(input_bytes) > MAX_BLOB_SIZE_PDF:
|
||||
return input_bytes
|
||||
|
||||
if shutil.which("gs") is None:
|
||||
return input_bytes
|
||||
|
||||
@ -112,22 +174,46 @@ def repair_pdf_with_ghostscript(input_bytes):
|
||||
temp_in.name,
|
||||
]
|
||||
try:
|
||||
proc = subprocess.run(cmd, capture_output=True, text=True)
|
||||
proc = subprocess.run(
|
||||
cmd,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=GHOSTSCRIPT_TIMEOUT_SEC,
|
||||
)
|
||||
if proc.returncode != 0:
|
||||
return input_bytes
|
||||
temp_out.seek(0)
|
||||
repaired_bytes = temp_out.read()
|
||||
if not repaired_bytes:
|
||||
return input_bytes
|
||||
return repaired_bytes
|
||||
except subprocess.TimeoutExpired:
|
||||
return input_bytes
|
||||
except Exception:
|
||||
return input_bytes
|
||||
|
||||
temp_out.seek(0)
|
||||
repaired_bytes = temp_out.read()
|
||||
|
||||
return repaired_bytes
|
||||
|
||||
|
||||
def read_potential_broken_pdf(blob):
|
||||
def try_open(blob):
|
||||
"""
|
||||
Return PDF bytes, optionally repaired via Ghostscript if initially unreadable.
|
||||
|
||||
Edge cases and robustness:
|
||||
- None blob returns b"" to avoid callers receiving None.
|
||||
- Empty blob returned as-is.
|
||||
- Oversized blob (> MAX_BLOB_SIZE_PDF) returned as-is without repair to avoid DoS.
|
||||
"""
|
||||
if blob is None:
|
||||
return b""
|
||||
try:
|
||||
blob_len = len(blob)
|
||||
except TypeError:
|
||||
return b""
|
||||
if blob_len == 0:
|
||||
return blob
|
||||
|
||||
def try_open(data):
|
||||
try:
|
||||
with pdfplumber.open(BytesIO(blob)) as pdf:
|
||||
with pdfplumber.open(BytesIO(data)) as pdf:
|
||||
if pdf.pages:
|
||||
return True
|
||||
except Exception:
|
||||
@ -137,6 +223,9 @@ def read_potential_broken_pdf(blob):
|
||||
if try_open(blob):
|
||||
return blob
|
||||
|
||||
if blob_len > MAX_BLOB_SIZE_PDF:
|
||||
return blob
|
||||
|
||||
repaired = repair_pdf_with_ghostscript(blob)
|
||||
if try_open(repaired):
|
||||
return repaired
|
||||
@ -151,7 +240,11 @@ def sanitize_path(raw_path: str | None) -> str:
|
||||
- Strips leading/trailing slashes
|
||||
- Removes '.' and '..' segments
|
||||
- Restricts characters to A-Za-z0-9, underscore, dash, and '/'
|
||||
- Returns "" for None, empty, or non-string input (robustness).
|
||||
"""
|
||||
if raw_path is None or not isinstance(raw_path, str):
|
||||
return ""
|
||||
raw_path = raw_path.strip()
|
||||
if not raw_path:
|
||||
return ""
|
||||
backslash_re = re.compile(r"[\\]+")
|
||||
|
||||
Reference in New Issue
Block a user