refactor: let excel use lazy image loader (#13558)

### What problem does this PR solve?

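Let the Excel parser use the lazy image loader: images embedded in a worksheet are now wrapped as raw bytes in a `LazyImage` instead of being decoded eagerly with PIL at parse time. To make the loader format-neutral, `LazyDocxImage` is renamed to `LazyImage`; the old name is kept as an alias for backward compatibility.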

### Type of change

- [x] Refactoring

---------

Co-authored-by: Yingfeng <yingfeng.zhang@gmail.com>
This commit is contained in:
Stephen Hu
2026-03-23 21:24:40 +08:00
committed by GitHub
parent f991cd362e
commit d32967eda8
7 changed files with 25 additions and 23 deletions

View File

@ -26,7 +26,7 @@ from docx.image.exceptions import (
UnexpectedEndOfFileError,
UnrecognizedImageError,
)
from rag.utils.lazy_image import LazyDocxImage
from rag.utils.lazy_image import LazyImage
class RAGFlowDocxParser:
def get_picture(self, document, paragraph):
@ -66,7 +66,7 @@ class RAGFlowDocxParser:
image_blobs.append(image_blob)
if not image_blobs:
return None
return LazyDocxImage(image_blobs)
return LazyImage(image_blobs)
def __extract_table_content(self, tb):

View File

@ -18,9 +18,9 @@ from io import BytesIO
import pandas as pd
from openpyxl import Workbook, load_workbook
from PIL import Image
from rag.nlp import find_codec
from rag.utils.lazy_image import LazyImage
# copied from `/openpyxl/cell/cell.py`
ILLEGAL_CHARACTERS_RE = re.compile(r"[\000-\010]|[\013-\014]|[\016-\037]")
@ -122,7 +122,7 @@ class RAGFlowExcelParser:
for img in images:
try:
img_bytes = img._data()
pil_img = Image.open(BytesIO(img_bytes)).convert("RGB")
lazy_img = LazyImage([img_bytes])
anchor = img.anchor
if hasattr(anchor, "_from") and hasattr(anchor, "_to"):
@ -139,7 +139,7 @@ class RAGFlowExcelParser:
item = {
"sheet": sheetname or ws.title,
"image": pil_img,
"image": lazy_img,
"image_description": "",
"row_from": r1,
"col_from": c1,

View File

@ -75,7 +75,7 @@ def vision_figure_parser_figure_xlsx_wrapper(images,callback=None, **kwargs):
vision_model = None
if vision_model:
figures_data = [((
img["image"], # Image.Image
img["image"], # Image.Image or LazyImage (converted by ensure_pil_image)
[img["image_description"]] # description list (must be list)
),
[

View File

@ -27,7 +27,7 @@ from rag.nlp import rag_tokenizer
from deepdoc.parser import PdfParser, HtmlParser
from deepdoc.parser.figure_parser import vision_figure_parser_docx_wrapper
from PIL import Image
from rag.utils.lazy_image import LazyDocxImage
from rag.utils.lazy_image import LazyImage
class Pdf(PdfParser):
@ -89,7 +89,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
sections = [
(item[0], item[1] if item[1] is not None else "")
for item in sections
if not isinstance(item[1], (Image.Image, LazyDocxImage))
if not isinstance(item[1], (Image.Image, LazyImage))
]
callback(0.8, "Finish parsing.")

View File

@ -115,7 +115,7 @@ class Excel(ExcelParser):
tables.append(
(
(
img["image"], # Image.Image
img["image"], # Image.Image or LazyImage
[img["image_description"]] # description list (must be list)
),
[

View File

@ -1212,18 +1212,17 @@ def docx_question_level(p, bull=-1):
def concat_img(img1, img2):
from rag.utils.lazy_image import ensure_pil_image, LazyDocxImage
from rag.utils.lazy_image import ensure_pil_image, LazyImage
# Fast path: preserve laziness when both sides are LazyImage or None.
if (img1 is None or isinstance(img1, LazyDocxImage)) and \
(img2 is None or isinstance(img2, LazyDocxImage)):
if (img1 is None or isinstance(img1, LazyImage)) and \
(img2 is None or isinstance(img2, LazyImage)):
if img1 and not img2:
return img1
if not img1 and img2:
return img2
if not img1 and not img2:
return None
return LazyDocxImage.merge(img1, img2)
return LazyImage.merge(img1, img2)
img1 = ensure_pil_image(img1) or img1
img2 = ensure_pil_image(img2) or img2

View File

@ -6,7 +6,7 @@ from PIL import Image
from rag.nlp import concat_img
class LazyDocxImage:
class LazyImage:
def __init__(self, blobs, source=None):
self._blobs = [b for b in (blobs or []) if b]
self.source = source
@ -31,7 +31,7 @@ class LazyDocxImage:
try:
image = Image.open(BytesIO(blob)).convert("RGB")
except Exception as e:
logging.info(f"LazyDocxImage: skip bad image blob: {e}")
logging.info(f"LazyImage: skip bad image blob: {e}")
continue
if res_img is None:
@ -91,33 +91,36 @@ class LazyDocxImage:
@staticmethod
def merge(a, b):
"""
Merge two LazyDocxImage instances by combining their blob lists.
Merge two LazyImage instances by combining their blob lists.
"""
a_blobs = a._blobs if isinstance(a, LazyDocxImage) else []
b_blobs = b._blobs if isinstance(b, LazyDocxImage) else []
a_blobs = a._blobs if isinstance(a, LazyImage) else []
b_blobs = b._blobs if isinstance(b, LazyImage) else []
combined = a_blobs + b_blobs
if not combined:
return None
merged = LazyDocxImage(combined)
merged = LazyImage(combined)
return merged
LazyDocxImage = LazyImage
def ensure_pil_image(img):
if isinstance(img, Image.Image):
return img
if isinstance(img, LazyDocxImage):
if isinstance(img, LazyImage):
return img.to_pil()
return None
def is_image_like(img):
return isinstance(img, Image.Image) or isinstance(img, LazyDocxImage)
return isinstance(img, Image.Image) or isinstance(img, LazyImage)
def open_image_for_processing(img, *, allow_bytes=False):
if isinstance(img, Image.Image):
return img, False
if isinstance(img, LazyDocxImage):
if isinstance(img, LazyImage):
return img.to_pil_detached(), True
if allow_bytes and isinstance(img, (bytes, bytearray)):
try: