Files
ragflow/api/utils/file_utils.py
Daniil Sivak 60ad32a0c2 Feat: support epub parsing (#13650)
Closes #1398

### What problem does this PR solve?

Adds native support for EPUB files. EPUB content is extracted in spine
(reading) order and parsed using the existing HTML parser. No new
dependencies required.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

To check this parser manually:

```python
uv run --python 3.12 python -c "
from deepdoc.parser import EpubParser

with open('$HOME/some_epub_book.epub', 'rb') as f:
  data = f.read()

sections = EpubParser()(None, binary=data, chunk_token_num=512)
print(f'Got {len(sections)} sections')
for i, s in enumerate(sections[:5]):
  print(f'\n--- Section {i} ---')
  print(s[:200])
"
```
2026-03-17 20:14:06 +08:00

262 lines
8.3 KiB
Python

#
# Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Standard library imports
import base64
import os
import re
import shutil
import subprocess
import sys
import tempfile
import threading
from io import BytesIO
import pdfplumber
from PIL import Image
# Local imports
from api.constants import FILE_NAME_LEN_LIMIT, IMG_BASE64_PREFIX
from api.db import FileType
# Robustness and resource limits: reject oversized inputs to avoid DoS and OOM.
MAX_BLOB_SIZE_THUMBNAIL = 50 * 1024 * 1024 # 50 MiB for thumbnail generation
MAX_BLOB_SIZE_PDF = 100 * 1024 * 1024 # 100 MiB for PDF repair / read
GHOSTSCRIPT_TIMEOUT_SEC = 120 # Timeout for Ghostscript subprocess
LOCK_KEY_pdfplumber = "global_shared_lock_pdfplumber"
if LOCK_KEY_pdfplumber not in sys.modules:
sys.modules[LOCK_KEY_pdfplumber] = threading.Lock()
def _normalize_filename_for_type(filename):
"""Extract a safe basename for type detection. Returns (normalized_str, True) or ("", False)."""
if filename is None:
return "", False
if not isinstance(filename, str):
return "", False
base = os.path.basename(filename).strip()
if not base or len(base) > FILE_NAME_LEN_LIMIT:
return "", False
return base.lower(), True
def filename_type(filename):
"""Return file type from extension. Handles None, empty, path-only, and oversized names."""
normalized, ok = _normalize_filename_for_type(filename)
if not ok:
return FileType.OTHER.value
filename = normalized
if re.match(r".*\.pdf$", filename):
return FileType.PDF.value
if re.match(
r".*\.(msg|eml|doc|docx|ppt|pptx|yml|xml|htm|json|jsonl|ldjson|csv|txt|ini|xls|xlsx|wps|rtf|hlp|pages|numbers|key|md|mdx|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|html|sql|epub)$", filename
):
return FileType.DOC.value
if re.match(r".*\.(wav|flac|ape|alac|wavpack|wv|mp3|aac|ogg|vorbis|opus)$", filename):
return FileType.AURAL.value
if re.match(
r".*\.(jpg|jpeg|png|tif|gif|pcx|tga|exif|fpx|svg|psd|cdr|pcd|dxf|ufo|eps|ai|raw|WMF|webp|avif|apng|icon|ico|mpg|mpeg|avi|rm|rmvb|mov|wmv|asf|dat|asx|wvx|mpe|mpa|mp4|avi|mkv)$", filename
):
return FileType.VISUAL.value
return FileType.OTHER.value
def thumbnail_img(filename, blob):
"""
Generate thumbnail image bytes for PDF, image, or PPT. MySQL LongText max length is 65535.
Robustness and edge cases:
- Rejects None, empty, or oversized blob to avoid DoS/OOM.
- Uses basename for type detection (handles paths like "a/b/c.pdf").
- Catches corrupt or malformed files and returns None instead of raising.
- Normalizes PIL image mode (e.g. RGBA -> RGB) for safe PNG export.
"""
if blob is None:
return None
try:
blob_len = len(blob)
except TypeError:
return None
if blob_len == 0 or blob_len > MAX_BLOB_SIZE_THUMBNAIL:
return None
normalized, ok = _normalize_filename_for_type(filename)
if not ok:
return None
filename = normalized
if re.match(r".*\.pdf$", filename):
try:
with sys.modules[LOCK_KEY_pdfplumber]:
pdf = pdfplumber.open(BytesIO(blob))
if not pdf.pages:
pdf.close()
return None
buffered = BytesIO()
resolution = 32
img = None
for _ in range(10):
pdf.pages[0].to_image(resolution=resolution).annotated.save(buffered, format="png")
img = buffered.getvalue()
if len(img) >= 64000 and resolution >= 2:
resolution = resolution / 2
buffered = BytesIO()
else:
break
pdf.close()
return img
except Exception:
return None
if re.match(r".*\.(jpg|jpeg|png|tif|gif|icon|ico|webp)$", filename):
try:
image = Image.open(BytesIO(blob))
image.load()
if image.mode in ("RGBA", "P", "LA"):
image = image.convert("RGB")
image.thumbnail((30, 30))
buffered = BytesIO()
image.save(buffered, format="png")
return buffered.getvalue()
except Exception:
return None
# PPT/PPTX thumbnail would require a licensed library; skip and return None.
if re.match(r".*\.(ppt|pptx)$", filename):
return None
return None
def thumbnail(filename, blob):
img = thumbnail_img(filename, blob)
if img is not None:
return IMG_BASE64_PREFIX + base64.b64encode(img).decode("utf-8")
else:
return ""
def repair_pdf_with_ghostscript(input_bytes):
"""Attempt to repair corrupt PDF bytes via Ghostscript. Returns original bytes on failure or timeout."""
if input_bytes is None or len(input_bytes) == 0:
return input_bytes if input_bytes is not None else b""
if len(input_bytes) > MAX_BLOB_SIZE_PDF:
return input_bytes
if shutil.which("gs") is None:
return input_bytes
with tempfile.NamedTemporaryFile(suffix=".pdf") as temp_in, tempfile.NamedTemporaryFile(suffix=".pdf") as temp_out:
temp_in.write(input_bytes)
temp_in.flush()
cmd = [
"gs",
"-o",
temp_out.name,
"-sDEVICE=pdfwrite",
"-dPDFSETTINGS=/prepress",
temp_in.name,
]
try:
proc = subprocess.run(
cmd,
capture_output=True,
text=True,
timeout=GHOSTSCRIPT_TIMEOUT_SEC,
)
if proc.returncode != 0:
return input_bytes
temp_out.seek(0)
repaired_bytes = temp_out.read()
if not repaired_bytes:
return input_bytes
return repaired_bytes
except subprocess.TimeoutExpired:
return input_bytes
except Exception:
return input_bytes
def read_potential_broken_pdf(blob):
"""
Return PDF bytes, optionally repaired via Ghostscript if initially unreadable.
Edge cases and robustness:
- None blob returns b"" to avoid callers receiving None.
- Empty blob returned as-is.
- Oversized blob (> MAX_BLOB_SIZE_PDF) returned as-is without repair to avoid DoS.
"""
if blob is None:
return b""
try:
blob_len = len(blob)
except TypeError:
return b""
if blob_len == 0:
return blob
def try_open(data):
try:
with pdfplumber.open(BytesIO(data)) as pdf:
if pdf.pages:
return True
except Exception:
return False
return False
if try_open(blob):
return blob
if blob_len > MAX_BLOB_SIZE_PDF:
return blob
repaired = repair_pdf_with_ghostscript(blob)
if try_open(repaired):
return repaired
return blob
def sanitize_path(raw_path: str | None) -> str:
"""Normalize and sanitize a user-provided path segment.
- Converts backslashes to forward slashes
- Strips leading/trailing slashes
- Removes '.' and '..' segments
- Restricts characters to A-Za-z0-9, underscore, dash, and '/'
- Returns "" for None, empty, or non-string input (robustness).
"""
if raw_path is None or not isinstance(raw_path, str):
return ""
raw_path = raw_path.strip()
if not raw_path:
return ""
backslash_re = re.compile(r"[\\]+")
unsafe_re = re.compile(r"[^A-Za-z0-9_\-/]")
normalized = backslash_re.sub("/", raw_path)
normalized = normalized.strip("/")
parts = [seg for seg in normalized.split("/") if seg and seg not in (".", "..")]
sanitized = "/".join(parts)
sanitized = unsafe_re.sub("", sanitized)
return sanitized