mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-04-27 05:47:57 +08:00
Refa: implement unified lazy image loading for Docx parsers (qa/manual) (#13329)
## Summary This PR is the direct successor to the previous `docx` lazy-loading implementation. It addresses the technical debt intentionally left out in the last PR by fully migrating the `qa` and `manual` parsing strategies to the new lazy-loading model. Additionally, this PR comprehensively refactors the underlying `docx` parsing pipeline to eliminate significant code redundancy and introduces robust fallback mechanisms to handle completely corrupted image streams safely. ## What's Changed * **Centralized Abstraction (`docx_parser.py`)**: Moved the `get_picture` extraction logic up to the `RAGFlowDocxParser` base class. Previously, `naive`, `qa`, and `manual` parsers maintained separate, redundant copies of this method. All downstream strategies now natively gather raw blobs and return `LazyDocxImage` objects automatically. * **Robust Corrupted Image Fallback (`docx_parser.py`)**: Handled edge cases where `python-docx` encounters critically malformed magic headers. Implemented an explicit `try-except` structure that safely intercepts `UnrecognizedImageError` (and similar exceptions) and seamlessly falls back to retrieving the raw binary via `getattr(related_part, "blob", None)`, preventing parser crashes on damaged documents. * **Legacy Code & Redundancy Purge**: * Removed the duplicate `get_picture` methods from `naive.py`, `qa.py`, and `manual.py`. * Removed the standalone, immediate-decoding `concat_img` method in `manual.py`. It has been completely replaced by the globally unified, lazy-loading-compatible `rag.nlp.concat_img`. * Cleaned up unused legacy imports (e.g., `PIL.Image`, docx exception packages) across all updated strategy files. ## Scope To keep this PR focused, I have restricted these changes strictly to the unification of `docx` extraction logic and the lazy-load migration of `qa` and `manual`. 
## Validation & Testing I've tested this to ensure no regressions and validated the fallback logic: * **Output Consistency**: Compared identical `.docx` inputs using `qa` and `manual` strategies before and after this branch: chunk counts, extracted text, table HTML, and attached images match perfectly. * **Memory Footprint Drop**: Confirmed a noticeable drop in peak memory usage when processing image-dense documents through the `qa` and `manual` pipelines, bringing them up to parity with the `naive` strategy's performance gains. ## Breaking Changes * None.
This commit is contained in:
@ -20,9 +20,54 @@ import pandas as pd
|
||||
from collections import Counter
|
||||
from rag.nlp import rag_tokenizer
|
||||
from io import BytesIO
|
||||
|
||||
import logging
|
||||
from docx.image.exceptions import (
|
||||
InvalidImageStreamError,
|
||||
UnexpectedEndOfFileError,
|
||||
UnrecognizedImageError,
|
||||
)
|
||||
from rag.utils.lazy_image import LazyDocxImage
|
||||
|
||||
class RAGFlowDocxParser:
|
||||
def get_picture(self, document, paragraph):
    """Collect the raw blobs of all pictures embedded in *paragraph*.

    Returns a LazyDocxImage wrapping the raw bytes of every readable
    drawing, or None when the paragraph holds no usable image. Damaged
    image streams fall back to the relationship part's raw blob so a
    corrupt picture never aborts parsing.
    """
    drawings = paragraph._element.xpath(".//pic:pic")
    if not drawings:
        return None

    blobs = []
    for drawing in drawings:
        refs = drawing.xpath(".//a:blip/@r:embed")
        if not refs:
            # Linked (non-embedded) picture: no relationship to resolve.
            continue
        rel_id = refs[0]

        try:
            part = document.part.related_parts[rel_id]
        except Exception as e:
            logging.warning(f"Skipping image due to unexpected error getting related_part: {e}")
            continue

        blob = None
        try:
            parsed = part.image
            if parsed is not None:
                blob = parsed.blob
        except (
            UnrecognizedImageError,
            UnexpectedEndOfFileError,
            InvalidImageStreamError,
            UnicodeDecodeError,
        ) as e:
            logging.info(f"Damaged image encountered, attempting blob fallback: {e}")
        except Exception as e:
            logging.warning(f"Unexpected error getting image, attempting blob fallback: {e}")

        if blob is None:
            # Fallback: the raw relationship payload, when present.
            blob = getattr(part, "blob", None)
        if blob:
            blobs.append(blob)

    return LazyDocxImage(blobs) if blobs else None
|
||||
|
||||
|
||||
def __extract_table_content(self, tb):
|
||||
df = []
|
||||
|
||||
@ -20,12 +20,11 @@ import re
|
||||
|
||||
from common.constants import ParserType
|
||||
from io import BytesIO
|
||||
from rag.nlp import rag_tokenizer, tokenize, tokenize_table, bullets_category, title_frequency, tokenize_chunks, docx_question_level, attach_media_context
|
||||
from rag.nlp import rag_tokenizer, tokenize, tokenize_table, bullets_category, title_frequency, tokenize_chunks, docx_question_level, attach_media_context, concat_img
|
||||
from common.token_utils import num_tokens_from_string
|
||||
from deepdoc.parser import PdfParser, DocxParser
|
||||
from deepdoc.parser.figure_parser import vision_figure_parser_pdf_wrapper, vision_figure_parser_docx_wrapper
|
||||
from docx import Document
|
||||
from PIL import Image
|
||||
from rag.app.naive import by_plaintext, PARSERS
|
||||
from common.parser_config_utils import normalize_layout_recognizer
|
||||
|
||||
@ -71,45 +70,6 @@ class Docx(DocxParser):
|
||||
def __init__(self):
    # Stateless on construction; the docx Document is bound in __call__.
    pass
|
||||
|
||||
def get_picture(self, document, paragraph):
    """Return the first picture of *paragraph* as a PIL image, or None.

    Falls back to the relationship part's raw blob when python-docx does
    not expose a parsed Image; any extraction error is treated as "no
    picture" so a damaged drawing never aborts parsing.
    """
    pics = paragraph._element.xpath(".//pic:pic")
    if not pics:
        return None
    try:
        ref = pics[0].xpath(".//a:blip/@r:embed")[0]
        part = document.part.related_parts[ref]
        parsed = part.image
        # Prefer the parsed image's bytes; otherwise use the raw blob.
        raw = parsed.blob if parsed is not None else part.blob
        if raw is None:
            return None
        return Image.open(BytesIO(raw))
    except Exception:
        # Best effort: unreadable images are simply absent.
        return None
|
||||
|
||||
def concat_img(self, img1, img2):
    """Stack two PIL images vertically on a new RGB canvas.

    A missing side is passed through unchanged; when both sides are
    missing, None is returned.
    """
    if not img1 or not img2:
        # At most one operand is present — return it (or None for none).
        return img1 or img2 or None

    w1, h1 = img1.size
    w2, h2 = img2.size
    canvas = Image.new("RGB", (max(w1, w2), h1 + h2))
    canvas.paste(img1, (0, 0))
    canvas.paste(img2, (0, h1))
    return canvas
|
||||
|
||||
def __call__(self, filename, binary=None, from_page=0, to_page=100000, callback=None):
|
||||
self.doc = Document(filename) if not binary else Document(BytesIO(binary))
|
||||
pn = 0
|
||||
@ -125,7 +85,7 @@ class Docx(DocxParser):
|
||||
if not question_level or question_level > 6: # not a question
|
||||
last_answer = f"{last_answer}\n{p_text}"
|
||||
current_image = self.get_picture(self.doc, p)
|
||||
last_image = self.concat_img(last_image, current_image)
|
||||
last_image = concat_img(last_image, current_image)
|
||||
else: # is a question
|
||||
if last_answer or last_image:
|
||||
sum_question = "\n".join(question_stack)
|
||||
|
||||
@ -21,7 +21,6 @@ from functools import reduce
|
||||
from io import BytesIO
|
||||
from timeit import default_timer as timer
|
||||
from docx import Document
|
||||
from docx.image.exceptions import InvalidImageStreamError, UnexpectedEndOfFileError, UnrecognizedImageError
|
||||
from docx.opc.pkgreader import _SerializedRelationships, _SerializedRelationship
|
||||
from docx.table import Table as DocxTable
|
||||
from docx.text.paragraph import Paragraph
|
||||
@ -34,7 +33,6 @@ from common.constants import LLMType
|
||||
from api.db.services.llm_service import LLMBundle
|
||||
from api.db.joint_services.tenant_model_service import get_model_config_by_type_and_name, get_tenant_default_model_by_type
|
||||
from rag.utils.file_utils import extract_embed_file, extract_links_from_pdf, extract_links_from_docx, extract_html
|
||||
from rag.utils.lazy_image import LazyDocxImage
|
||||
from deepdoc.parser import DocxParser, ExcelParser, HtmlParser, JsonParser, MarkdownElementExtractor, MarkdownParser, PdfParser, TxtParser
|
||||
from deepdoc.parser.figure_parser import VisionFigureParser, vision_figure_parser_docx_wrapper_naive, vision_figure_parser_pdf_wrapper
|
||||
from deepdoc.parser.pdf_parser import PlainParser, VisionParser
|
||||
@ -265,40 +263,6 @@ class Docx(DocxParser):
|
||||
def __init__(self):
    # No initialization state is needed for this parser.
    pass
|
||||
|
||||
def get_picture(self, document, paragraph):
    """Gather the raw blob of each embedded picture in *paragraph*.

    Returns a LazyDocxImage over the collected blobs, or None when the
    paragraph carries no readable embedded image. Malformed images are
    logged and skipped rather than raised.
    """
    drawings = paragraph._element.xpath(".//pic:pic")
    if not drawings:
        return None

    blobs = []
    for drawing in drawings:
        refs = drawing.xpath(".//a:blip/@r:embed")
        if not refs:
            # Linked (non-embedded) picture: nothing to extract.
            continue
        try:
            blobs.append(document.part.related_parts[refs[0]].image.blob)
        except UnrecognizedImageError:
            logging.info("Unrecognized image format. Skipping image.")
        except UnexpectedEndOfFileError:
            logging.info("EOF was unexpectedly encountered while reading an image stream. Skipping image.")
        except (InvalidImageStreamError, UnicodeDecodeError):
            logging.info("The recognized image stream appears to be corrupted. Skipping image.")
        except Exception as e:
            logging.warning(f"The recognized image stream appears to be corrupted. Skipping image, exception: {e}")

    return LazyDocxImage(blobs) if blobs else None
|
||||
|
||||
def __clean(self, line):
    """Replace ideographic (U+3000) spaces with ASCII spaces and trim."""
    return re.sub(r"\u3000", " ", line).strip()
|
||||
|
||||
@ -27,7 +27,6 @@ from rag.nlp import is_english, random_choices, qbullets_category, add_positions
|
||||
from rag.nlp import rag_tokenizer, tokenize_table, concat_img
|
||||
from deepdoc.parser import PdfParser, ExcelParser, DocxParser
|
||||
from docx import Document
|
||||
from PIL import Image
|
||||
from markdown import markdown
|
||||
|
||||
from common.float_utils import get_float
|
||||
@ -192,17 +191,6 @@ class Docx(DocxParser):
|
||||
def __init__(self):
    # Stateless on construction; the docx Document is bound in __call__.
    pass
|
||||
|
||||
def get_picture(self, document, paragraph):
    """Extract the first picture of *paragraph* as an RGB PIL image.

    Returns None when the paragraph has no picture, or when the image
    reference is missing (e.g. a linked rather than embedded picture) or
    the stream is unreadable, instead of raising — callers only test the
    result for truthiness, and one bad drawing must not abort parsing.
    """
    img = paragraph._element.xpath('.//pic:pic')
    if not img:
        return None
    try:
        img = img[0]
        # Linked (non-embedded) pictures carry no r:embed reference;
        # indexing [0] on an empty result raises and falls through below.
        embed = img.xpath('.//a:blip/@r:embed')[0]
        related_part = document.part.related_parts[embed]
        image = related_part.image
        return Image.open(BytesIO(image.blob)).convert('RGB')
    except Exception:
        # Best effort, mirroring the manual parser: unreadable images
        # are treated as absent.
        return None
|
||||
|
||||
def __call__(self, filename, binary=None, from_page=0, to_page=100000, callback=None):
|
||||
self.doc = Document(
|
||||
filename) if not binary else Document(BytesIO(binary))
|
||||
|
||||
@ -1200,7 +1200,7 @@ def naive_merge_with_images(texts, images, chunk_token_num=128, delimiter="\n。
|
||||
|
||||
def docx_question_level(p, bull=-1):
|
||||
txt = re.sub(r"\u3000", " ", p.text).strip()
|
||||
if p.style.name.startswith('Heading'):
|
||||
if hasattr(p.style, 'name') and p.style.name and p.style.name.startswith('Heading'):
|
||||
return int(p.style.name.split(' ')[-1]), txt
|
||||
else:
|
||||
if bull < 0:
|
||||
@ -1212,7 +1212,18 @@ def docx_question_level(p, bull=-1):
|
||||
|
||||
|
||||
def concat_img(img1, img2):
|
||||
from rag.utils.lazy_image import ensure_pil_image
|
||||
from rag.utils.lazy_image import ensure_pil_image, LazyDocxImage
|
||||
|
||||
# Fast path: preserve laziness when both sides are LazyDocxImage or None.
|
||||
if (img1 is None or isinstance(img1, LazyDocxImage)) and \
|
||||
(img2 is None or isinstance(img2, LazyDocxImage)):
|
||||
if img1 and not img2:
|
||||
return img1
|
||||
if not img1 and img2:
|
||||
return img2
|
||||
if not img1 and not img2:
|
||||
return None
|
||||
return LazyDocxImage.merge(img1, img2)
|
||||
|
||||
img1 = ensure_pil_image(img1) or img1
|
||||
img2 = ensure_pil_image(img2) or img2
|
||||
|
||||
@ -88,6 +88,19 @@ class LazyDocxImage:
|
||||
self.close()
|
||||
return False
|
||||
|
||||
@staticmethod
def merge(a, b):
    """Merge two LazyDocxImage instances by concatenating their blob lists.

    Operands that are not LazyDocxImage (e.g. None) contribute nothing;
    returns None when no blobs remain.
    """
    blobs = []
    for operand in (a, b):
        if isinstance(operand, LazyDocxImage):
            blobs.extend(operand._blobs)
    return LazyDocxImage(blobs) if blobs else None
|
||||
|
||||
|
||||
def ensure_pil_image(img):
|
||||
if isinstance(img, Image.Image):
|
||||
|
||||
Reference in New Issue
Block a user