mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-05-01 15:57:47 +08:00
### What problem does this PR solve? Fixes #14196 ## Problem When using DeepDOC to parse large PDFs (over 1000 pages), the parser silently truncated processing at 300 pages due to a hardcoded default `page_to=299` in `RAGFlowPdfParser.__images__()`. This caused: - **Errors** on pages beyond the limit - **Poor image quality** as the parser attempted to compensate with missing page data - **Inconsistent chunk splitting** between full PDF imports and partial imports Additionally, the codebase scattered magic numbers (`299`, `600`, `10000`, `100000`, `100000000`, `10000000000`, `10**9`) across 22 files as sentinel values for "parse all pages", making future maintenance error-prone. ## Root Cause ```python # deepdoc/parser/pdf_parser.py (before) def __images__(self, fnm, zoomin=3, page_from=0, page_to=299, callback=None): # Only the first 300 pages were rendered; everything beyond was silently dropped ``` While most callers in `rag/app/*.py` correctly passed `to_page=100000`, the base class `RAGFlowPdfParser.__call__()` and `parse_into_bboxes()` invoked `__images__` **without** forwarding `page_from`/`page_to`, falling back to the restrictive default of 299. ## Solution ### 1. Define constants in `common/constants.py` ```python MAXIMUM_PAGE_NUMBER = 100000 # Used by the parsing layer MAXIMUM_TASK_PAGE_NUMBER = MAXIMUM_PAGE_NUMBER * 1000 # Used by the task/DB layer ``` ### 2. 
Replace all hardcoded sentinel values | Layer | Files Changed | Old Values | New Value | |---|---|---|---| | **Deepdoc parsers** | `pdf_parser.py`, `mineru_parser.py`, `docling_parser.py`, `opendataloader_parser.py`, `paddleocr_parser.py`, `docx_parser.py` | `299`, `600`, `10**9`, `100000000` | `MAXIMUM_PAGE_NUMBER` | | **Chunk parsers** | `naive.py`, `book.py`, `qa.py`, `one.py`, `manual.py`, `paper.py`, `presentation.py`, `laws.py`, `resume.py`, `email.py`, `table.py` | `100000`, `10000`, `10000000000` | `MAXIMUM_PAGE_NUMBER` | | **Task/DB layer** | `db_models.py`, `task_service.py`, `document_service.py`, `file_service.py` | `100000000` | `MAXIMUM_TASK_PAGE_NUMBER` | ### 3. Fix `parse_into_bboxes()` missing parameters Added `from_page`/`to_page` parameters to `parse_into_bboxes()` so that the `rag/flow/parser/parser.py` DeepDOC path no longer falls back to the restrictive default. ## Files Changed (22) - `common/constants.py` - `deepdoc/parser/pdf_parser.py` - `deepdoc/parser/mineru_parser.py` - `deepdoc/parser/docling_parser.py` - `deepdoc/parser/opendataloader_parser.py` - `deepdoc/parser/paddleocr_parser.py` - `deepdoc/parser/docx_parser.py` - `rag/app/naive.py` - `rag/app/book.py` - `rag/app/qa.py` - `rag/app/one.py` - `rag/app/manual.py` - `rag/app/paper.py` - `rag/app/presentation.py` - `rag/app/laws.py` - `rag/app/resume.py` - `rag/app/email.py` - `rag/app/table.py` - `api/db/db_models.py` - `api/db/services/task_service.py` - `api/db/services/document_service.py` - `api/db/services/file_service.py` ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue) - [x] Refactoring --------- Signed-off-by: noob <yixiao121314@outlook.com>
137 lines
4.6 KiB
Python
137 lines
4.6 KiB
Python
#
|
|
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
#
|
|
|
|
import logging
|
|
from email import policy
|
|
from email.parser import BytesParser
|
|
from rag.app.naive import chunk as naive_chunk
|
|
from common.constants import MAXIMUM_PAGE_NUMBER
|
|
import re
|
|
from rag.nlp import rag_tokenizer, naive_merge, tokenize_chunks
|
|
from deepdoc.parser import HtmlParser, TxtParser
|
|
from timeit import default_timer as timer
|
|
import io
|
|
|
|
|
|
def chunk(
    filename,
    binary=None,
    from_page=0,
    to_page=MAXIMUM_PAGE_NUMBER,
    lang="Chinese",
    callback=None,
    **kwargs,
):
    """
    Chunk an email (.eml) file into indexable pieces.

    Only eml is supported. Email headers and the text/plain and text/html
    bodies are merged into sections and split into token-bounded chunks;
    each attachment is then parsed best-effort with the generic naive
    chunker and appended to the result.

    Args:
        filename: Path or name of the .eml file (also used for title tokens).
        binary: Raw bytes of the email; when None, ``filename`` is read from disk.
        from_page: Accepted for interface parity with other chunkers; unused here.
        to_page: Accepted for interface parity with other chunkers; unused here.
        lang: "English" (case-insensitive) enables English tokenization,
            anything else is treated as Chinese.
        callback: Progress callback forwarded to the attachment chunker.
        **kwargs: Extra options (e.g. ``parser_config``) forwarded downstream.

    Returns:
        A list of chunk dicts: body chunks followed by attachment chunks.
    """
    eng = lang.lower() == "english"  # is_english(cks)
    parser_config = kwargs.get(
        "parser_config",
        {"chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC"},
    )
    doc = {
        "docnm_kwd": filename,
        "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename)),
    }
    doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
    main_res = []
    attachment_res = []

    if binary:
        with io.BytesIO(binary) as buffer:
            msg = BytesParser(policy=policy.default).parse(buffer)
    else:
        with open(filename, "rb") as buffer:
            msg = BytesParser(policy=policy.default).parse(buffer)

    text_txt, html_txt = [], []
    # get the email header info
    for header, value in msg.items():
        text_txt.append(f"{header}: {value}")

    # get the email main info
    def _add_content(msg, content_type):
        def _decode_payload(payload, charset, target_list):
            # Try the declared charset first, then common CJK/latin
            # fallbacks; as a last resort decode as UTF-8 dropping bad bytes.
            try:
                target_list.append(payload.decode(charset))
            except (UnicodeDecodeError, LookupError):
                for enc in ["utf-8", "gb2312", "gbk", "gb18030", "latin1"]:
                    try:
                        target_list.append(payload.decode(enc))
                        break
                    except UnicodeDecodeError:
                        continue
                else:
                    target_list.append(payload.decode("utf-8", errors="ignore"))

        if content_type == "text/plain":
            payload = msg.get_payload(decode=True)
            charset = msg.get_content_charset() or "utf-8"
            _decode_payload(payload, charset, text_txt)
        elif content_type == "text/html":
            payload = msg.get_payload(decode=True)
            charset = msg.get_content_charset() or "utf-8"
            _decode_payload(payload, charset, html_txt)
        elif "multipart" in content_type:
            if msg.is_multipart():
                for part in msg.iter_parts():
                    _add_content(part, part.get_content_type())

    _add_content(msg, msg.get_content_type())

    # Fix: use .get() with the same default as the fallback parser_config
    # above — a caller-supplied parser_config without "chunk_token_num"
    # previously raised KeyError here while every other read used .get().
    sections = TxtParser.parser_txt("\n".join(text_txt)) + [
        (line, "") for line in
        HtmlParser.parser_txt("\n".join(html_txt), chunk_token_num=parser_config.get("chunk_token_num", 512)) if line
    ]

    st = timer()
    chunks = naive_merge(
        sections,
        int(parser_config.get("chunk_token_num", 128)),
        parser_config.get("delimiter", "\n!?。;!?"),
    )

    main_res.extend(tokenize_chunks(chunks, doc, eng, None))
    logging.debug("naive_merge({}): {}".format(filename, timer() - st))
    # get the attachment info
    for part in msg.iter_attachments():
        content_disposition = part.get("Content-Disposition")
        if content_disposition:
            dispositions = content_disposition.strip().split(";")
            if dispositions[0].lower() == "attachment":
                # Use a distinct local name instead of shadowing the
                # `filename` parameter as the original did.
                att_name = part.get_filename()
                payload = part.get_payload(decode=True)
                try:
                    attachment_res.extend(
                        naive_chunk(att_name, payload, callback=callback, **kwargs)
                    )
                except Exception:
                    # Best-effort: a broken attachment must not fail the
                    # whole email — but log it instead of swallowing silently.
                    logging.exception(
                        "Failed to chunk email attachment %s of %s", att_name, filename
                    )

    return main_res + attachment_res
|
|
|
|
|
|
if __name__ == "__main__":
    import sys

    # Minimal CLI entry point: chunk the .eml file given as the first
    # argument, discarding progress updates.
    def _noop(prog=None, msg=""):
        """Progress callback that ignores all updates."""
        pass

    chunk(sys.argv[1], callback=_noop)
|