mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-03-16 12:27:49 +08:00
## Summary This PR is the direct successor to the previous `docx` lazy-loading implementation. It addresses the technical debt intentionally left out in the last PR by fully migrating the `qa` and `manual` parsing strategies to the new lazy-loading model. Additionally, this PR comprehensively refactors the underlying `docx` parsing pipeline to eliminate significant code redundancy and introduces robust fallback mechanisms to handle completely corrupted image streams safely. ## What's Changed * **Centralized Abstraction (`docx_parser.py`)**: Moved the `get_picture` extraction logic up to the `RAGFlowDocxParser` base class. Previously, `naive`, `qa`, and `manual` parsers maintained separate, redundant copies of this method. All downstream strategies now natively gather raw blobs and return `LazyDocxImage` objects automatically. * **Robust Corrupted Image Fallback (`docx_parser.py`)**: Handled edge cases where `python-docx` encounters critically malformed magic headers. Implemented an explicit `try-except` structure that safely intercepts `UnrecognizedImageError` (and similar exceptions) and seamlessly falls back to retrieving the raw binary via `getattr(related_part, "blob", None)`, preventing parser crashes on damaged documents. * **Legacy Code & Redundancy Purge**: * Removed the duplicate `get_picture` methods from `naive.py`, `qa.py`, and `manual.py`. * Removed the standalone, immediate-decoding `concat_img` method in `manual.py`. It has been completely replaced by the globally unified, lazy-loading-compatible `rag.nlp.concat_img`. * Cleaned up unused legacy imports (e.g., `PIL.Image`, docx exception packages) across all updated strategy files. ## Scope To keep this PR focused, I have restricted these changes strictly to the unification of `docx` extraction logic and the lazy-load migration of `qa` and `manual`. 
## Validation & Testing I've tested this to ensure no regressions and validated the fallback logic: * **Output Consistency**: Compared identical `.docx` inputs using `qa` and `manual` strategies before and after this branch: chunk counts, extracted text, table HTML, and attached images match perfectly. * **Memory Footprint Drop**: Confirmed a noticeable drop in peak memory usage when processing image-dense documents through the `qa` and `manual` pipelines, bringing them up to parity with the `naive` strategy's performance gains. ## Breaking Changes * None.
185 lines
6.6 KiB
Python
185 lines
6.6 KiB
Python
#
|
||
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
||
#
|
||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||
# you may not use this file except in compliance with the License.
|
||
# You may obtain a copy of the License at
|
||
#
|
||
# http://www.apache.org/licenses/LICENSE-2.0
|
||
#
|
||
# Unless required by applicable law or agreed to in writing, software
|
||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||
# See the License for the specific language governing permissions and
|
||
# limitations under the License.
|
||
#
|
||
|
||
from docx import Document
|
||
import re
|
||
import pandas as pd
|
||
from collections import Counter
|
||
from rag.nlp import rag_tokenizer
|
||
from io import BytesIO
|
||
import logging
|
||
from docx.image.exceptions import (
|
||
InvalidImageStreamError,
|
||
UnexpectedEndOfFileError,
|
||
UnrecognizedImageError,
|
||
)
|
||
from rag.utils.lazy_image import LazyDocxImage
|
||
|
||
class RAGFlowDocxParser:
    def get_picture(self, document, paragraph):
        """Gather the raw blobs of every picture embedded in *paragraph*.

        Blobs are read through python-docx's image parts when possible; if the
        image stream is damaged, fall back to the related part's raw binary.

        :param document: the python-docx ``Document`` owning the paragraph.
        :param paragraph: a python-docx ``Paragraph`` to scan for ``pic:pic`` elements.
        :return: a ``LazyDocxImage`` wrapping the collected blobs, or ``None``
                 when the paragraph carries no usable image data.
        """
        pic_elements = paragraph._element.xpath(".//pic:pic")
        if not pic_elements:
            return None

        blobs = []
        for pic in pic_elements:
            rel_ids = pic.xpath(".//a:blip/@r:embed")
            if not rel_ids:
                # Drawing without an embedded relationship (e.g. linked image).
                continue
            rel_id = rel_ids[0]

            try:
                part = document.part.related_parts[rel_id]
            except Exception as e:
                logging.warning(f"Skipping image due to unexpected error getting related_part: {e}")
                continue

            blob = None
            try:
                img = part.image
                if img is not None:
                    blob = img.blob
            except (
                UnrecognizedImageError,
                UnexpectedEndOfFileError,
                InvalidImageStreamError,
                UnicodeDecodeError,
            ) as e:
                # Known corruption modes: keep going and try the raw bytes below.
                logging.info(f"Damaged image encountered, attempting blob fallback: {e}")
            except Exception as e:
                logging.warning(f"Unexpected error getting image, attempting blob fallback: {e}")

            if blob is None:
                # Fallback: raw binary of the related part, if it exposes one.
                blob = getattr(part, "blob", None)
            if blob:
                blobs.append(blob)

        return LazyDocxImage(blobs) if blobs else None
def __extract_table_content(self, tb):
|
||
df = []
|
||
for row in tb.rows:
|
||
df.append([c.text for c in row.cells])
|
||
return self.__compose_table_content(pd.DataFrame(df))
|
||
|
||
    def __compose_table_content(self, df):
        """Flatten a table DataFrame into text lines, prefixing each data cell
        with its inferred header cell(s).

        Row 0 is always treated as a header row; when the table is dominated by
        numeric cells, additional non-numeric rows are also treated as headers.
        Returns a list of strings: one line per data row when the table has more
        than 3 columns, otherwise a single newline-joined string.
        """

        def blockType(b):
            # Classify a single cell string into a coarse content type
            # (dates, numbers, codes, English words, single chars, ...).
            pattern = [
                ("^(20|19)[0-9]{2}[年/-][0-9]{1,2}[月/-][0-9]{1,2}日*$", "Dt"),
                (r"^(20|19)[0-9]{2}年$", "Dt"),
                (r"^(20|19)[0-9]{2}[年/-][0-9]{1,2}月*$", "Dt"),
                ("^[0-9]{1,2}[月/-][0-9]{1,2}日*$", "Dt"),
                (r"^第*[一二三四1-4]季度$", "Dt"),
                (r"^(20|19)[0-9]{2}年*[一二三四1-4]季度$", "Dt"),
                (r"^(20|19)[0-9]{2}[ABCDE]$", "DT"),
                ("^[0-9.,+%/ -]+$", "Nu"),       # numeric-looking content
                (r"^[0-9A-Z/\._~-]+$", "Ca"),    # code/identifier-looking content
                (r"^[A-Z]*[a-z' -]+$", "En"),    # English words
                (r"^[0-9.,+-]+[0-9A-Za-z/$¥%<>()()' -]+$", "NE"),  # number + unit/entity
                (r"^.{1}$", "Sg")                # single character
            ]
            # First matching pattern wins — order matters.
            for p, n in pattern:
                if re.search(p, b):
                    return n
            # Fallback: decide by token count (short text vs. long text).
            tks = [t for t in rag_tokenizer.tokenize(b).split() if len(t) > 1]
            if len(tks) > 3:
                if len(tks) < 12:
                    return "Tx"
                else:
                    return "Lx"

            # Single token tagged "nr" — presumably a person name per the
            # tokenizer's POS tags; TODO confirm against rag_tokenizer.tag.
            if len(tks) == 1 and rag_tokenizer.tag(tks[0]) == "nr":
                return "Nr"

            return "Ot"

        # A table needs at least a header row and one data row.
        if len(df) < 2:
            return []
        # Most frequent cell type across all non-header rows.
        max_type = Counter([blockType(str(df.iloc[i, j])) for i in range(
            1, len(df)) for j in range(len(df.iloc[i, :]))])
        max_type = max(max_type.items(), key=lambda x: x[1])[0]

        colnm = len(df.iloc[0, :])
        hdrows = [0]  # the header does not necessarily appear only in the first row
        if max_type == "Nu":
            # For numeric-dominated tables, rows whose dominant type differs
            # from the table's are treated as extra header rows.
            for r in range(1, len(df)):
                tys = Counter([blockType(str(df.iloc[r, j]))
                               for j in range(len(df.iloc[r, :]))])
                tys = max(tys.items(), key=lambda x: x[1])[0]
                if tys != max_type:
                    hdrows.append(r)

        lines = []
        for i in range(1, len(df)):
            if i in hdrows:
                continue
            # Offsets of header rows relative to the current row; keep only
            # the contiguous run of headers immediately above this row.
            hr = [r - i for r in hdrows]
            hr = [r for r in hr if r < 0]
            t = len(hr) - 1
            while t > 0:
                if hr[t] - hr[t - 1] > 1:
                    hr = hr[t:]
                    break
                t -= 1
            headers = []
            for j in range(len(df.iloc[i, :])):
                # Join the (deduplicated) header cells above column j.
                t = []
                for h in hr:
                    x = str(df.iloc[i + h, j]).strip()
                    if x in t:
                        continue
                    t.append(x)
                t = ",".join(t)
                if t:
                    t += ": "
                headers.append(t)
            cells = []
            for j in range(len(df.iloc[i, :])):
                if not str(df.iloc[i, j]):
                    continue
                cells.append(headers[j] + str(df.iloc[i, j]))
            lines.append(";".join(cells))

        # Wide tables: one line per row. Narrow tables: a single joined block.
        if colnm > 3:
            return lines
        return ["\n".join(lines)]
def __call__(self, fnm, from_page=0, to_page=100000000):
|
||
self.doc = Document(fnm) if isinstance(
|
||
fnm, str) else Document(BytesIO(fnm))
|
||
pn = 0 # parsed page
|
||
secs = [] # parsed contents
|
||
for p in self.doc.paragraphs:
|
||
if pn > to_page:
|
||
break
|
||
|
||
runs_within_single_paragraph = [] # save runs within the range of pages
|
||
for run in p.runs:
|
||
if pn > to_page:
|
||
break
|
||
if from_page <= pn < to_page and p.text.strip():
|
||
runs_within_single_paragraph.append(run.text) # append run.text first
|
||
|
||
# wrap page break checker into a static method
|
||
if 'lastRenderedPageBreak' in run._element.xml:
|
||
pn += 1
|
||
|
||
secs.append(("".join(runs_within_single_paragraph), p.style.name if hasattr(p.style, 'name') else '')) # then concat run.text as part of the paragraph
|
||
|
||
tbls = [self.__extract_table_content(tb) for tb in self.doc.tables]
|
||
return secs, tbls
|