mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-03-12 18:49:00 +08:00
**Summary** This PR tackles a significant memory bottleneck when processing image-heavy Word documents. Previously, our pipeline eagerly decoded DOCX images into `PIL.Image` objects, which caused high peak memory usage. To solve this, I've introduced a **lazy-loading approach**: images are now stored as raw blobs and only decoded exactly when and where they are consumed. This successfully reduces the memory footprint while keeping the parsing output completely identical to before. **What's Changed** Instead of a dry file-by-file list, here is the logical breakdown of the updates: * **The Core Abstraction (`lazy_image.py`)**: Introduced `LazyDocxImage` along with helper APIs to handle lazy decoding, image-type checks, and NumPy compatibility. It also supports `.close()` and detached PIL access to ensure safe lifecycle management and prevent memory leaks. * **Pipeline Integration (`naive.py`, `figure_parser.py`, etc.)**: Updated the general DOCX picture extraction to return these new lazy images. Downstream consumers (like the figure/VLM flow and base64 encoding paths) now decode images right at the use site using detached PIL instances, avoiding shared-instance side effects. * **Compatibility Hooks (`operators.py`, `book.py`, etc.)**: Added necessary compatibility conversions so these lazy images flow smoothly through existing merging, filtering, and presentation steps without breaking. **Scope & What is Intentionally Left Out** To keep this PR focused, I have restricted these changes strictly to the **general Word pipeline** and its downstream consumers. The `QA` and `manual` Word parsing pipelines are explicitly **not modified** in this PR. They can be safely migrated to this new lazy-load model in a subsequent, standalone PR. **Design Considerations** I briefly considered adding image compression during processing, but decided against it to avoid any potential quality degradation in the derived outputs. 
I also held off on a massive pipeline re-architecture to avoid overly invasive changes right now. **Validation & Testing** I've tested this to ensure no regressions: * Compared identical DOCX inputs before and after this branch: chunk counts, extracted text, table HTML, and image descriptions match perfectly. * **Confirmed a noticeable drop in peak memory usage when processing image-dense documents.** For a 30MB Word document containing 243 1080p screenshots, memory consumption is reduced by approximately 1.5GB. **Breaking Changes** None.
117 lines
2.9 KiB
Python
117 lines
2.9 KiB
Python
import logging
|
|
from io import BytesIO
|
|
|
|
from PIL import Image
|
|
|
|
from rag.nlp import concat_img
|
|
|
|
|
|
class LazyDocxImage:
    """Lazily decoded image extracted from a DOCX document.

    Holds raw image blobs and defers PIL decoding until the image is actually
    consumed, keeping peak memory low for image-heavy documents.  When several
    blobs are present they are decoded and merged via ``concat_img``.
    """

    def __init__(self, blobs, source=None):
        # Keep only truthy blobs; `blobs` itself may be None.
        self._blobs = [b for b in (blobs or []) if b]
        self.source = source
        # Cached decoded PIL image; populated by to_pil(), released by close().
        self._pil = None

    def __bool__(self):
        # Truthy iff there is at least one blob to decode.
        return bool(self._blobs)

    @staticmethod
    def _close_quietly(pil):
        # Best-effort release: PIL close() may raise on already-closed images.
        try:
            pil.close()
        except Exception:
            pass

    def to_pil(self):
        """Decode (or return the cached) merged PIL image.

        Returns None when no blob decodes successfully.  The result is cached
        on the instance; use to_pil_detached() for an uncached, caller-owned
        copy.
        """
        if self._pil is not None:
            try:
                # Verify the cached image is still usable (not closed).
                self._pil.load()
                return self._pil
            except Exception:
                self._close_quietly(self._pil)
                self._pil = None

        res_img = None
        for blob in self._blobs:
            try:
                image = Image.open(BytesIO(blob)).convert("RGB")
            except Exception as e:
                # A bad blob skips this image only, never the whole document.
                # Lazy %-args: the message is built only if this level is logged.
                logging.info("LazyDocxImage: skip bad image blob: %s", e)
                continue

            if res_img is None:
                res_img = image
                continue

            new_img = concat_img(res_img, image)
            if new_img is not res_img:
                # concat_img produced a fresh image; release both inputs so
                # peak memory stays bounded while merging many blobs.
                self._close_quietly(res_img)
                self._close_quietly(image)
            res_img = new_img

        self._pil = res_img
        return self._pil

    def to_pil_detached(self):
        """Decode and hand ownership of the PIL image to the caller.

        The internal cache is cleared so the caller gets a private instance
        with no shared-state side effects; the caller must close it.
        """
        pil = self.to_pil()
        self._pil = None
        return pil

    def close(self):
        """Release the cached decoded image, if any.  Raw blobs are kept."""
        if self._pil is not None:
            self._close_quietly(self._pil)
            self._pil = None
        return None

    def __getattr__(self, name):
        # Never decode for private/dunder probes: copy/pickle speculatively do
        # getattr(x, "__deepcopy__", None) etc., which previously forced a full
        # image decode and could recurse into to_pil() before _pil exists.
        if name.startswith("_"):
            raise AttributeError(name)
        # Delegate unknown public attributes to the decoded PIL image.
        pil = self.to_pil()
        if pil is None:
            raise AttributeError(name)
        return getattr(pil, name)

    def __array__(self, dtype=None):
        # NumPy interop: decode on demand; an empty array when nothing decodes
        # so np.array(lazy_img) never raises.
        import numpy as np

        pil = self.to_pil()
        if pil is None:
            return np.array([], dtype=dtype)
        return np.array(pil, dtype=dtype)

    def __enter__(self):
        # ``with LazyDocxImage(...) as pil:`` yields the decoded image ...
        return self.to_pil()

    def __exit__(self, exc_type, exc, tb):
        # ... and releases the cached decode on exit, never suppressing errors.
        self.close()
        return False
|
|
|
|
|
|
def ensure_pil_image(img):
    """Return *img* as a PIL.Image.Image, decoding a LazyDocxImage if needed.

    Anything that is neither a PIL image nor a LazyDocxImage yields None.
    """
    if isinstance(img, LazyDocxImage):
        return img.to_pil()
    return img if isinstance(img, Image.Image) else None
|
|
|
|
|
|
def is_image_like(img):
    """Return True if *img* is renderable as an image.

    Accepts both eagerly decoded PIL images and lazy DOCX image wrappers.
    """
    # Single isinstance with a type tuple: one call instead of an `or` chain.
    return isinstance(img, (Image.Image, LazyDocxImage))
|
|
|
|
|
|
def open_image_for_processing(img, *, allow_bytes=False):
    """Normalize *img* for an image-processing step.

    Returns an ``(image, owned)`` pair, where ``owned`` tells the caller
    whether it is responsible for closing the image:

    * PIL image         -> returned as-is, not owned.
    * LazyDocxImage     -> decoded via a detached instance, owned (the lazy
                           wrapper keeps no reference, so no shared-state
                           side effects).
    * bytes-like        -> decoded to RGB when ``allow_bytes`` is True, owned;
                           ``(None, False)`` when the bytes fail to decode.
    * anything else     -> passed through unchanged, not owned.
    """
    if isinstance(img, Image.Image):
        return img, False
    if isinstance(img, LazyDocxImage):
        return img.to_pil_detached(), True
    # memoryview is accepted alongside bytes/bytearray: BytesIO handles it
    # natively, so zero-copy slices of larger buffers work too.
    if allow_bytes and isinstance(img, (bytes, bytearray, memoryview)):
        try:
            pil = Image.open(BytesIO(img)).convert("RGB")
        except Exception as e:
            # Undecodable bytes are non-fatal; lazy %-args defer formatting.
            logging.info("open_image_for_processing: bad bytes: %s", e)
            return None, False
        return pil, True
    return img, False
|