mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-04-25 13:05:58 +08:00
refactor: let excel use lazy image loader (#13558)
### What problem does this PR solve?

Let the Excel parser use the lazy image loader.

### Type of change

- [x] Refactoring

---------

Co-authored-by: Yingfeng <yingfeng.zhang@gmail.com>
This commit is contained in:
@ -26,7 +26,7 @@ from docx.image.exceptions import (
|
||||
UnexpectedEndOfFileError,
|
||||
UnrecognizedImageError,
|
||||
)
|
||||
from rag.utils.lazy_image import LazyDocxImage
|
||||
from rag.utils.lazy_image import LazyImage
|
||||
|
||||
class RAGFlowDocxParser:
|
||||
def get_picture(self, document, paragraph):
|
||||
@ -66,7 +66,7 @@ class RAGFlowDocxParser:
|
||||
image_blobs.append(image_blob)
|
||||
if not image_blobs:
|
||||
return None
|
||||
return LazyDocxImage(image_blobs)
|
||||
return LazyImage(image_blobs)
|
||||
|
||||
|
||||
def __extract_table_content(self, tb):
|
||||
|
||||
@ -18,9 +18,9 @@ from io import BytesIO
|
||||
|
||||
import pandas as pd
|
||||
from openpyxl import Workbook, load_workbook
|
||||
from PIL import Image
|
||||
|
||||
from rag.nlp import find_codec
|
||||
from rag.utils.lazy_image import LazyImage
|
||||
|
||||
# copied from `/openpyxl/cell/cell.py`
|
||||
ILLEGAL_CHARACTERS_RE = re.compile(r"[\000-\010]|[\013-\014]|[\016-\037]")
|
||||
@ -122,7 +122,7 @@ class RAGFlowExcelParser:
|
||||
for img in images:
|
||||
try:
|
||||
img_bytes = img._data()
|
||||
pil_img = Image.open(BytesIO(img_bytes)).convert("RGB")
|
||||
lazy_img = LazyImage([img_bytes])
|
||||
|
||||
anchor = img.anchor
|
||||
if hasattr(anchor, "_from") and hasattr(anchor, "_to"):
|
||||
@ -139,7 +139,7 @@ class RAGFlowExcelParser:
|
||||
|
||||
item = {
|
||||
"sheet": sheetname or ws.title,
|
||||
"image": pil_img,
|
||||
"image": lazy_img,
|
||||
"image_description": "",
|
||||
"row_from": r1,
|
||||
"col_from": c1,
|
||||
|
||||
@ -75,7 +75,7 @@ def vision_figure_parser_figure_xlsx_wrapper(images,callback=None, **kwargs):
|
||||
vision_model = None
|
||||
if vision_model:
|
||||
figures_data = [((
|
||||
img["image"], # Image.Image
|
||||
img["image"], # Image.Image or LazyImage (converted by ensure_pil_image)
|
||||
[img["image_description"]] # description list (must be list)
|
||||
),
|
||||
[
|
||||
|
||||
@ -27,7 +27,7 @@ from rag.nlp import rag_tokenizer
|
||||
from deepdoc.parser import PdfParser, HtmlParser
|
||||
from deepdoc.parser.figure_parser import vision_figure_parser_docx_wrapper
|
||||
from PIL import Image
|
||||
from rag.utils.lazy_image import LazyDocxImage
|
||||
from rag.utils.lazy_image import LazyImage
|
||||
|
||||
|
||||
class Pdf(PdfParser):
|
||||
@ -89,7 +89,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
|
||||
sections = [
|
||||
(item[0], item[1] if item[1] is not None else "")
|
||||
for item in sections
|
||||
if not isinstance(item[1], (Image.Image, LazyDocxImage))
|
||||
if not isinstance(item[1], (Image.Image, LazyImage))
|
||||
]
|
||||
callback(0.8, "Finish parsing.")
|
||||
|
||||
|
||||
@ -115,7 +115,7 @@ class Excel(ExcelParser):
|
||||
tables.append(
|
||||
(
|
||||
(
|
||||
img["image"], # Image.Image
|
||||
img["image"], # Image.Image or LazyImage
|
||||
[img["image_description"]] # description list (must be list)
|
||||
),
|
||||
[
|
||||
|
||||
@ -1212,18 +1212,17 @@ def docx_question_level(p, bull=-1):
|
||||
|
||||
|
||||
def concat_img(img1, img2):
|
||||
from rag.utils.lazy_image import ensure_pil_image, LazyDocxImage
|
||||
from rag.utils.lazy_image import ensure_pil_image, LazyImage
|
||||
|
||||
# Fast path: preserve laziness when both sides are LazyDocxImage or None.
|
||||
if (img1 is None or isinstance(img1, LazyDocxImage)) and \
|
||||
(img2 is None or isinstance(img2, LazyDocxImage)):
|
||||
if (img1 is None or isinstance(img1, LazyImage)) and \
|
||||
(img2 is None or isinstance(img2, LazyImage)):
|
||||
if img1 and not img2:
|
||||
return img1
|
||||
if not img1 and img2:
|
||||
return img2
|
||||
if not img1 and not img2:
|
||||
return None
|
||||
return LazyDocxImage.merge(img1, img2)
|
||||
return LazyImage.merge(img1, img2)
|
||||
|
||||
img1 = ensure_pil_image(img1) or img1
|
||||
img2 = ensure_pil_image(img2) or img2
|
||||
|
||||
@ -6,7 +6,7 @@ from PIL import Image
|
||||
from rag.nlp import concat_img
|
||||
|
||||
|
||||
class LazyDocxImage:
|
||||
class LazyImage:
|
||||
def __init__(self, blobs, source=None):
|
||||
self._blobs = [b for b in (blobs or []) if b]
|
||||
self.source = source
|
||||
@ -31,7 +31,7 @@ class LazyDocxImage:
|
||||
try:
|
||||
image = Image.open(BytesIO(blob)).convert("RGB")
|
||||
except Exception as e:
|
||||
logging.info(f"LazyDocxImage: skip bad image blob: {e}")
|
||||
logging.info(f"LazyImage: skip bad image blob: {e}")
|
||||
continue
|
||||
|
||||
if res_img is None:
|
||||
@ -91,33 +91,36 @@ class LazyDocxImage:
|
||||
@staticmethod
|
||||
def merge(a, b):
|
||||
"""
|
||||
Merge two LazyDocxImage instances by combining their blob lists.
|
||||
Merge two LazyImage instances by combining their blob lists.
|
||||
"""
|
||||
a_blobs = a._blobs if isinstance(a, LazyDocxImage) else []
|
||||
b_blobs = b._blobs if isinstance(b, LazyDocxImage) else []
|
||||
a_blobs = a._blobs if isinstance(a, LazyImage) else []
|
||||
b_blobs = b._blobs if isinstance(b, LazyImage) else []
|
||||
combined = a_blobs + b_blobs
|
||||
if not combined:
|
||||
return None
|
||||
merged = LazyDocxImage(combined)
|
||||
merged = LazyImage(combined)
|
||||
return merged
|
||||
|
||||
|
||||
LazyDocxImage = LazyImage
|
||||
|
||||
|
||||
def ensure_pil_image(img):
|
||||
if isinstance(img, Image.Image):
|
||||
return img
|
||||
if isinstance(img, LazyDocxImage):
|
||||
if isinstance(img, LazyImage):
|
||||
return img.to_pil()
|
||||
return None
|
||||
|
||||
|
||||
def is_image_like(img):
|
||||
return isinstance(img, Image.Image) or isinstance(img, LazyDocxImage)
|
||||
return isinstance(img, Image.Image) or isinstance(img, LazyImage)
|
||||
|
||||
|
||||
def open_image_for_processing(img, *, allow_bytes=False):
|
||||
if isinstance(img, Image.Image):
|
||||
return img, False
|
||||
if isinstance(img, LazyDocxImage):
|
||||
if isinstance(img, LazyImage):
|
||||
return img.to_pil_detached(), True
|
||||
if allow_bytes and isinstance(img, (bytes, bytearray)):
|
||||
try:
|
||||
|
||||
Reference in New Issue
Block a user