feat(api/utils): Harden file_utils for robustness and edge cases (#12915)

## Summary
Improves robustness and edge-case handling in `api.utils.file_utils` to
avoid crashes, DoS/OOM risks, and timeouts when processing user-provided
filenames, paths, and file blobs.

## Changes

### Resource limits & timeouts
- **`MAX_BLOB_SIZE_THUMBNAIL`** (50 MiB) and **`MAX_BLOB_SIZE_PDF`**
(100 MiB) to reject oversized inputs before thumbnail/PDF processing.
- **`GHOSTSCRIPT_TIMEOUT_SEC`** (120 s) for
`repair_pdf_with_ghostscript` subprocess to avoid hangs on malicious or
broken PDFs.

### `filename_type`
- Handles `None`, empty string, non-string (e.g. int/list), and
path-only input via new **`_normalize_filename_for_type()`**.
- Uses basename for type detection (e.g. `a/b/c.pdf` → PDF).
- Enforces **`FILE_NAME_LEN_LIMIT`**; invalid input returns
`FileType.OTHER`.

### `thumbnail_img`
- Rejects `None`/empty/oversized blob and invalid filename; returns
`None` instead of raising.
- Wraps PDF, image, and PPT handling in try/except so corrupt or
malformed files return `None`.
- Ensures PDF has pages and PPT has slides before use.
- Normalizes PIL image mode (RGBA/P/LA → RGB) for safe PNG export.

### `repair_pdf_with_ghostscript`
- Handles `None`/empty input; skips repair when input size exceeds
limit.
- Uses `subprocess.run(..., timeout=GHOSTSCRIPT_TIMEOUT_SEC)` and
catches `TimeoutExpired`.
- Returns original bytes when Ghostscript output is empty.

### `read_potential_broken_pdf`
- `None` → `b""`; non–sequence-like (no `len`) → `b""`; empty → return
as-is.
- Oversized blob returned as-is (no repair) to avoid DoS.

### `sanitize_path`
- Explicit `None` and non-string check; strips whitespace before
normalizing.

## Testing
- **`test/unit_test/utils/test_api_file_utils.py`** added with 36 unit
tests covering the above behavior (filename_type, sanitize_path,
read_potential_broken_pdf, thumbnail_img, thumbnail,
repair_pdf_with_ghostscript, constants).
- All tests pass.

---------

Co-authored-by: Gittensor Miner <miner@gittensor.io>
This commit is contained in:
Phives
2026-02-25 01:34:47 -05:00
committed by GitHub
parent 8ad47bf242
commit 4ceb668d40
3 changed files with 280 additions and 41 deletions

View File

@ -17,6 +17,7 @@
# Standard library imports
import base64
import os
import re
import shutil
import subprocess
@ -29,16 +30,37 @@ import pdfplumber
from PIL import Image
# Local imports
from api.constants import IMG_BASE64_PREFIX
from api.constants import FILE_NAME_LEN_LIMIT, IMG_BASE64_PREFIX
from api.db import FileType
# Robustness and resource limits: reject oversized inputs to avoid DoS and OOM.
MAX_BLOB_SIZE_THUMBNAIL = 50 * 1024 * 1024 # 50 MiB for thumbnail generation
MAX_BLOB_SIZE_PDF = 100 * 1024 * 1024 # 100 MiB for PDF repair / read
GHOSTSCRIPT_TIMEOUT_SEC = 120 # Timeout for Ghostscript subprocess
LOCK_KEY_pdfplumber = "global_shared_lock_pdfplumber"
if LOCK_KEY_pdfplumber not in sys.modules:
sys.modules[LOCK_KEY_pdfplumber] = threading.Lock()
def _normalize_filename_for_type(filename):
"""Extract a safe basename for type detection. Returns (normalized_str, True) or ("", False)."""
if filename is None:
return "", False
if not isinstance(filename, str):
return "", False
base = os.path.basename(filename).strip()
if not base or len(base) > FILE_NAME_LEN_LIMIT:
return "", False
return base.lower(), True
def filename_type(filename):
filename = filename.lower()
"""Return file type from extension. Handles None, empty, path-only, and oversized names."""
normalized, ok = _normalize_filename_for_type(filename)
if not ok:
return FileType.OTHER.value
filename = normalized
if re.match(r".*\.pdf$", filename):
return FileType.PDF.value
@ -56,34 +78,68 @@ def filename_type(filename):
def thumbnail_img(filename, blob):
"""
MySQL LongText max length is 65535
Generate thumbnail image bytes for PDF, image, or PPT. MySQL LongText max length is 65535.
Robustness and edge cases:
- Rejects None, empty, or oversized blob to avoid DoS/OOM.
- Uses basename for type detection (handles paths like "a/b/c.pdf").
- Catches corrupt or malformed files and returns None instead of raising.
- Normalizes PIL image mode (e.g. RGBA -> RGB) for safe PNG export.
"""
filename = filename.lower()
if blob is None:
return None
try:
blob_len = len(blob)
except TypeError:
return None
if blob_len == 0 or blob_len > MAX_BLOB_SIZE_THUMBNAIL:
return None
normalized, ok = _normalize_filename_for_type(filename)
if not ok:
return None
filename = normalized
if re.match(r".*\.pdf$", filename):
with sys.modules[LOCK_KEY_pdfplumber]:
pdf = pdfplumber.open(BytesIO(blob))
try:
with sys.modules[LOCK_KEY_pdfplumber]:
pdf = pdfplumber.open(BytesIO(blob))
if not pdf.pages:
pdf.close()
return None
buffered = BytesIO()
resolution = 32
img = None
for _ in range(10):
pdf.pages[0].to_image(resolution=resolution).annotated.save(buffered, format="png")
img = buffered.getvalue()
if len(img) >= 64000 and resolution >= 2:
resolution = resolution / 2
buffered = BytesIO()
else:
break
pdf.close()
return img
except Exception:
return None
if re.match(r".*\.(jpg|jpeg|png|tif|gif|icon|ico|webp)$", filename):
try:
image = Image.open(BytesIO(blob))
image.load()
if image.mode in ("RGBA", "P", "LA"):
image = image.convert("RGB")
image.thumbnail((30, 30))
buffered = BytesIO()
resolution = 32
img = None
for _ in range(10):
# https://github.com/jsvine/pdfplumber?tab=readme-ov-file#creating-a-pageimage-with-to_image
pdf.pages[0].to_image(resolution=resolution).annotated.save(buffered, format="png")
img = buffered.getvalue()
if len(img) >= 64000 and resolution >= 2:
resolution = resolution / 2
buffered = BytesIO()
else:
break
pdf.close()
return img
image.save(buffered, format="png")
return buffered.getvalue()
except Exception:
return None
# PPT/PPTX thumbnail would require a licensed library; skip and return None.
if re.match(r".*\.(ppt|pptx)$", filename):
return None
elif re.match(r".*\.(jpg|jpeg|png|tif|gif|icon|ico|webp)$", filename):
image = Image.open(BytesIO(blob))
image.thumbnail((30, 30))
buffered = BytesIO()
image.save(buffered, format="png")
return buffered.getvalue()
return None
@ -96,6 +152,12 @@ def thumbnail(filename, blob):
def repair_pdf_with_ghostscript(input_bytes):
"""Attempt to repair corrupt PDF bytes via Ghostscript. Returns original bytes on failure or timeout."""
if input_bytes is None or len(input_bytes) == 0:
return input_bytes if input_bytes is not None else b""
if len(input_bytes) > MAX_BLOB_SIZE_PDF:
return input_bytes
if shutil.which("gs") is None:
return input_bytes
@ -112,22 +174,46 @@ def repair_pdf_with_ghostscript(input_bytes):
temp_in.name,
]
try:
proc = subprocess.run(cmd, capture_output=True, text=True)
proc = subprocess.run(
cmd,
capture_output=True,
text=True,
timeout=GHOSTSCRIPT_TIMEOUT_SEC,
)
if proc.returncode != 0:
return input_bytes
temp_out.seek(0)
repaired_bytes = temp_out.read()
if not repaired_bytes:
return input_bytes
return repaired_bytes
except subprocess.TimeoutExpired:
return input_bytes
except Exception:
return input_bytes
temp_out.seek(0)
repaired_bytes = temp_out.read()
return repaired_bytes
def read_potential_broken_pdf(blob):
def try_open(blob):
"""
Return PDF bytes, optionally repaired via Ghostscript if initially unreadable.
Edge cases and robustness:
- None blob returns b"" to avoid callers receiving None.
- Empty blob returned as-is.
- Oversized blob (> MAX_BLOB_SIZE_PDF) returned as-is without repair to avoid DoS.
"""
if blob is None:
return b""
try:
blob_len = len(blob)
except TypeError:
return b""
if blob_len == 0:
return blob
def try_open(data):
try:
with pdfplumber.open(BytesIO(blob)) as pdf:
with pdfplumber.open(BytesIO(data)) as pdf:
if pdf.pages:
return True
except Exception:
@ -137,6 +223,9 @@ def read_potential_broken_pdf(blob):
if try_open(blob):
return blob
if blob_len > MAX_BLOB_SIZE_PDF:
return blob
repaired = repair_pdf_with_ghostscript(blob)
if try_open(repaired):
return repaired
@ -151,7 +240,11 @@ def sanitize_path(raw_path: str | None) -> str:
- Strips leading/trailing slashes
- Removes '.' and '..' segments
- Restricts characters to A-Za-z0-9, underscore, dash, and '/'
- Returns "" for None, empty, or non-string input (robustness).
"""
if raw_path is None or not isinstance(raw_path, str):
return ""
raw_path = raw_path.strip()
if not raw_path:
return ""
backslash_re = re.compile(r"[\\]+")

View File

@ -166,20 +166,14 @@ async def async_request(
if attempt >= retries:
if not _is_sensitive_url(url):
log_url = _redact_sensitive_url_params(url)
logger.warning(f"async_request exhausted retries for {method}")
logger.warning(f"async_request exhausted retries for {method} {log_url}")
raise
delay = _get_delay(backoff_factor, attempt)
if not _is_sensitive_url(url):
log_url = _redact_sensitive_url_params(url)
logger.warning(
f"async_request attempt {attempt + 1}/{retries + 1} failed for {method}; retrying in {delay:.2f}s"
f"async_request attempt {attempt + 1}/{retries + 1} failed for {method} {log_url}; retrying in {delay:.2f}s"
)
raise
delay = _get_delay(backoff_factor, attempt)
# Avoid including the (potentially sensitive) URL in retry logs.
logger.warning(
f"async_request attempt {attempt + 1}/{retries + 1} failed for {method}; retrying in {delay:.2f}s"
)
await asyncio.sleep(delay)
raise last_exc # pragma: no cover

View File

@ -0,0 +1,152 @@
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""Unit tests for api.utils.file_utils (filename_type, thumbnail_img, sanitize_path, read_potential_broken_pdf)."""
import pytest
from api.db import FileType
from api.utils.file_utils import (
MAX_BLOB_SIZE_PDF,
MAX_BLOB_SIZE_THUMBNAIL,
GHOSTSCRIPT_TIMEOUT_SEC,
filename_type,
thumbnail_img,
thumbnail,
sanitize_path,
read_potential_broken_pdf,
repair_pdf_with_ghostscript,
)
class TestFilenameType:
"""Edge cases and robustness for filename_type."""
@pytest.mark.parametrize("filename,expected", [
("doc.pdf", FileType.PDF.value),
("a.PDF", FileType.PDF.value),
("x.png", FileType.VISUAL.value),
("file.docx", FileType.DOC.value),
("a/b/c.pdf", FileType.PDF.value),
("path/to/file.txt", FileType.DOC.value),
])
def test_valid_filenames(self, filename, expected):
assert filename_type(filename) == expected
@pytest.mark.parametrize("filename", [
None,
"",
" ",
123,
[],
])
def test_invalid_or_empty_returns_other(self, filename):
assert filename_type(filename) == FileType.OTHER.value
def test_path_with_basename_uses_extension(self):
assert filename_type("folder/subfolder/document.pdf") == FileType.PDF.value
class TestSanitizePath:
"""Edge cases for sanitize_path."""
@pytest.mark.parametrize("raw,expected", [
(None, ""),
("", ""),
(" ", ""),
(42, ""),
("a/b", "a/b"),
("a/../b", "a/b"),
("/leading/", "leading"),
("\\mixed\\path", "mixed/path"),
])
def test_sanitize_cases(self, raw, expected):
assert sanitize_path(raw) == expected
class TestReadPotentialBrokenPdf:
"""Edge cases and robustness for read_potential_broken_pdf."""
def test_none_returns_empty_bytes(self):
assert read_potential_broken_pdf(None) == b""
def test_empty_bytes_returns_as_is(self):
assert read_potential_broken_pdf(b"") == b""
def test_non_len_raises_or_returns_empty(self):
class NoLen:
pass
result = read_potential_broken_pdf(NoLen())
assert result == b""
class TestThumbnailImg:
"""Edge cases for thumbnail_img."""
def test_none_blob_returns_none(self):
assert thumbnail_img("x.pdf", None) is None
def test_none_filename_returns_none(self):
assert thumbnail_img(None, b"fake pdf content") is None
def test_empty_blob_returns_none(self):
assert thumbnail_img("x.pdf", b"") is None
def test_empty_filename_returns_none(self):
assert thumbnail_img("", b"x") is None
def test_oversized_blob_returns_none(self):
huge = b"x" * (MAX_BLOB_SIZE_THUMBNAIL + 1)
assert thumbnail_img("x.pdf", huge) is None
class TestThumbnail:
"""thumbnail() wraps thumbnail_img and returns base64 or empty string."""
def test_none_img_returns_empty_string(self):
assert thumbnail("x.xyz", b"garbage") == ""
def test_valid_img_returns_base64_prefix(self):
from api.constants import IMG_BASE64_PREFIX
result = thumbnail("x.png", b"\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x01\x00\x00\x00\x01\x08\x02\x00\x00\x00\x90wS\xde\x00\x00\x00\x0cIDATx\x9cc\xf8\x0f\x00\x00\x01\x01\x00\x05\x18\xd8N\x00\x00\x00\x00IEND\xaeB`\x82")
assert result.startswith(IMG_BASE64_PREFIX) or result == ""
class TestRepairPdfWithGhostscript:
"""repair_pdf_with_ghostscript edge cases."""
def test_none_returns_empty_bytes(self):
assert repair_pdf_with_ghostscript(None) == b""
def test_empty_bytes_returns_empty(self):
assert repair_pdf_with_ghostscript(b"") == b""
def test_oversized_returns_original_without_calling_gs(self):
huge = b"%" * (MAX_BLOB_SIZE_PDF + 1)
result = repair_pdf_with_ghostscript(huge)
assert result == huge
class TestConstants:
"""Resource limit constants are positive and reasonable."""
def test_thumbnail_limit_positive(self):
assert MAX_BLOB_SIZE_THUMBNAIL > 0
def test_pdf_limit_positive(self):
assert MAX_BLOB_SIZE_PDF > 0
def test_gs_timeout_positive(self):
assert GHOSTSCRIPT_TIMEOUT_SEC > 0