refactor: let excel use lazy image loader (#13558)

### What problem does this PR solve?

Let the Excel parser use the lazy image loader.

### Type of change

- [x] Refactoring

---------

Co-authored-by: Yingfeng <yingfeng.zhang@gmail.com>
2026-04-25 13:05:58 +08:00 · 2026-03-23 21:24:40 +08:00
parent f991cd362e
commit d32967eda8
7 changed files with 25 additions and 23 deletions
--- a/deepdoc/parser/docx_parser.py
+++ b/deepdoc/parser/docx_parser.py
@ -26,7 +26,7 @@ from docx.image.exceptions import (
    UnexpectedEndOfFileError,
    UnrecognizedImageError,
 )
-from rag.utils.lazy_image import LazyDocxImage
+from rag.utils.lazy_image import LazyImage

 class RAGFlowDocxParser:
    def get_picture(self, document, paragraph):
@ -66,7 +66,7 @@ class RAGFlowDocxParser:
                image_blobs.append(image_blob)
        if not image_blobs:
            return None
-        return LazyDocxImage(image_blobs)
+        return LazyImage(image_blobs)


    def __extract_table_content(self, tb):
--- a/deepdoc/parser/excel_parser.py
+++ b/deepdoc/parser/excel_parser.py
@ -18,9 +18,9 @@ from io import BytesIO

 import pandas as pd
 from openpyxl import Workbook, load_workbook
-from PIL import Image

 from rag.nlp import find_codec
+from rag.utils.lazy_image import LazyImage

 # copied from `/openpyxl/cell/cell.py`
 ILLEGAL_CHARACTERS_RE = re.compile(r"[\000-\010]|[\013-\014]|[\016-\037]")
@ -122,7 +122,7 @@ class RAGFlowExcelParser:
        for img in images:
            try:
                img_bytes = img._data()
-                pil_img = Image.open(BytesIO(img_bytes)).convert("RGB")
+                lazy_img = LazyImage([img_bytes])

                anchor = img.anchor
                if hasattr(anchor, "_from") and hasattr(anchor, "_to"):
@ -139,7 +139,7 @@ class RAGFlowExcelParser:

                item = {
                    "sheet": sheetname or ws.title,
-                    "image": pil_img,
+                    "image": lazy_img,
                    "image_description": "",
                    "row_from": r1,
                    "col_from": c1,
--- a/deepdoc/parser/figure_parser.py
+++ b/deepdoc/parser/figure_parser.py
@ -75,7 +75,7 @@ def vision_figure_parser_figure_xlsx_wrapper(images,callback=None, **kwargs):
        vision_model = None
    if vision_model:
        figures_data = [((
-                        img["image"],   # Image.Image
+                        img["image"],   # Image.Image or LazyImage (converted by ensure_pil_image)
                        [img["image_description"]]     # description list (must be list)
                    ),
                    [
--- a/rag/app/book.py
+++ b/rag/app/book.py
@ -27,7 +27,7 @@ from rag.nlp import rag_tokenizer
 from deepdoc.parser import PdfParser, HtmlParser
 from deepdoc.parser.figure_parser import vision_figure_parser_docx_wrapper
 from PIL import Image
-from rag.utils.lazy_image import LazyDocxImage
+from rag.utils.lazy_image import LazyImage


 class Pdf(PdfParser):
@ -89,7 +89,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
        sections = [
            (item[0], item[1] if item[1] is not None else "")
            for item in sections
-            if not isinstance(item[1], (Image.Image, LazyDocxImage))
+            if not isinstance(item[1], (Image.Image, LazyImage))
        ]
        callback(0.8, "Finish parsing.")

--- a/rag/app/table.py
+++ b/rag/app/table.py
@ -115,7 +115,7 @@ class Excel(ExcelParser):
            tables.append(
                (
                    (
-                        img["image"],  # Image.Image
+                        img["image"],  # Image.Image or LazyImage
                        [img["image_description"]]  # description list (must be list)
                    ),
                    [
--- a/rag/nlp/init.py
+++ b/rag/nlp/init.py
@ -1212,18 +1212,17 @@ def docx_question_level(p, bull=-1):


 def concat_img(img1, img2):
-    from rag.utils.lazy_image import ensure_pil_image, LazyDocxImage
+    from rag.utils.lazy_image import ensure_pil_image, LazyImage

-    # Fast path: preserve laziness when both sides are LazyDocxImage or None.
-    if (img1 is None or isinstance(img1, LazyDocxImage)) and \
-       (img2 is None or isinstance(img2, LazyDocxImage)):
+    if (img1 is None or isinstance(img1, LazyImage)) and \
+       (img2 is None or isinstance(img2, LazyImage)):
        if img1 and not img2:
            return img1
        if not img1 and img2:
            return img2
        if not img1 and not img2:
            return None
-        return LazyDocxImage.merge(img1, img2)
+        return LazyImage.merge(img1, img2)

    img1 = ensure_pil_image(img1) or img1
    img2 = ensure_pil_image(img2) or img2
--- a/rag/utils/lazy_image.py
+++ b/rag/utils/lazy_image.py
@ -6,7 +6,7 @@ from PIL import Image
 from rag.nlp import concat_img


-class LazyDocxImage:
+class LazyImage:
    def __init__(self, blobs, source=None):
        self._blobs = [b for b in (blobs or []) if b]
        self.source = source
@ -31,7 +31,7 @@ class LazyDocxImage:
            try:
                image = Image.open(BytesIO(blob)).convert("RGB")
            except Exception as e:
-                logging.info(f"LazyDocxImage: skip bad image blob: {e}")
+                logging.info(f"LazyImage: skip bad image blob: {e}")
                continue

            if res_img is None:
@ -91,33 +91,36 @@ class LazyDocxImage:
    @staticmethod
    def merge(a, b):
        """
-        Merge two LazyDocxImage instances by combining their blob lists.
+        Merge two LazyImage instances by combining their blob lists.
        """
-        a_blobs = a._blobs if isinstance(a, LazyDocxImage) else []
-        b_blobs = b._blobs if isinstance(b, LazyDocxImage) else []
+        a_blobs = a._blobs if isinstance(a, LazyImage) else []
+        b_blobs = b._blobs if isinstance(b, LazyImage) else []
        combined = a_blobs + b_blobs
        if not combined:
            return None
-        merged = LazyDocxImage(combined)
+        merged = LazyImage(combined)
        return merged


+LazyDocxImage = LazyImage
+
+
 def ensure_pil_image(img):
    if isinstance(img, Image.Image):
        return img
-    if isinstance(img, LazyDocxImage):
+    if isinstance(img, LazyImage):
        return img.to_pil()
    return None


 def is_image_like(img):
-    return isinstance(img, Image.Image) or isinstance(img, LazyDocxImage)
+    return isinstance(img, Image.Image) or isinstance(img, LazyImage)


 def open_image_for_processing(img, *, allow_bytes=False):
    if isinstance(img, Image.Image):
        return img, False
-    if isinstance(img, LazyDocxImage):
+    if isinstance(img, LazyImage):
        return img.to_pil_detached(), True
    if allow_bytes and isinstance(img, (bytes, bytearray)):
        try: