Fix MinerU output fallback and NameError regression (#14538)

### What problem does this PR solve?

This fixes a MinerU parsing failure where output JSON was not found in
nested v0.24.0 layouts, and also fixes a `content_names` NameError in
`_read_output()`. As a result, successful MinerU API runs no longer end
with false “MinerU not found” parsing failures.

### Type of change

- [x] Bug Fix (non-breaking change which fixes an issue)
This commit is contained in:
Idriss Sbaaoui
2026-05-06 14:03:57 +08:00
committed by GitHub
parent c0fc8b32f2
commit c502001d9e

View File

@ -514,7 +514,7 @@ class MinerUParser(RAGFlowPdfParser):
return sanitized or "unnamed"
safe_stem = _sanitize_filename(file_stem)
content_names = (f"{file_stem}_content_list.json", f"{safe_stem}_content_list.json")
content_names = tuple(dict.fromkeys((f"{file_stem}_content_list.json", f"{safe_stem}_content_list.json")))
allowed_names = set(content_names)
self.logger.info(f"[MinerU] Expected output files: {', '.join(sorted(allowed_names))}")
self.logger.info(f"[MinerU] Searching output in: {output_dir}")
@ -554,13 +554,55 @@ class MinerUParser(RAGFlowPdfParser):
for candidate in output_dir.glob(f"**/{parse_subdir}/{content_name}"):
self.logger.info(f"[MinerU] Trying parse-method path: {candidate}")
attempted.append(candidate)
if candidate.exists():
subdir = candidate.parent
json_file = candidate
break
subdir = candidate.parent
json_file = candidate
break
if json_file:
break
if not json_file:
stem_dirs = tuple(dict.fromkeys((file_stem, safe_stem)))
patterns = []
if parse_subdir:
for stem_dir in stem_dirs:
patterns.extend(
[
f"**/{stem_dir}/{parse_subdir}/content_list.json",
f"**/{stem_dir}/{parse_subdir}/*_content_list.json",
]
)
patterns.extend(
[
f"**/{parse_subdir}/content_list.json",
f"**/{parse_subdir}/*_content_list.json",
]
)
for stem_dir in stem_dirs:
patterns.extend(
[
f"**/{stem_dir}/content_list.json",
f"**/{stem_dir}/*_content_list.json",
]
)
patterns.extend(["**/content_list.json", "**/*_content_list.json"])
for pattern in patterns:
for candidate in sorted(output_dir.glob(pattern)):
self.logger.info(f"[MinerU] Trying fallback path: {candidate}")
if candidate.name.endswith("_content_list.json"):
rel_parts = candidate.relative_to(output_dir).parts
in_stem_dir = any(stem_dir in rel_parts for stem_dir in stem_dirs)
stem_match = candidate.stem.startswith(file_stem) or candidate.stem.startswith(safe_stem)
if not (stem_match or in_stem_dir):
self.logger.info(f"[MinerU] Skip unrelated fallback candidate: {candidate}")
continue
attempted.append(candidate)
subdir = candidate.parent
json_file = candidate
break
if json_file:
break
if not json_file:
raise FileNotFoundError(f"[MinerU] Missing output file, tried: {', '.join(str(p) for p in attempted)}")