Feat: VLM image descriptions in MinerU parser (#14869) (#14946)

## Summary

Closes #14869.

Adds VLM-based semantic descriptions to **image chunks produced by the
MinerU parser**, closing a long-standing parity gap with the deepdoc
parser's `VisionFigureParser`. A maintainer flagged this in #13342
("We may add the VLM enhancement to MinerU parser as well") and an
earlier proposal exists in #13824; this PR lands the change end-to-end
inside the existing parser plumbing.

## Why

Today the MinerU parser returns image chunks containing only the
native `image_caption` and `image_footnote` strings from MinerU's
JSON. When neither is present (or when both are sparse), the chunk
carries effectively no searchable content for the figure and
retrieval misses it entirely. Users who configured a local VLM
(reporter's case: Gemma-4-31B) had to post-process MinerU's
`tmp/*.json` themselves.

The deepdoc parser already solves this via
[`VisionFigureParser`](deepdoc/parser/figure_parser.py): when the
tenant has an `IMAGE2TEXT` model configured, each figure gets a
semantic description merged into its chunk. This PR brings the same
behavior to MinerU.

## What changed

### `deepdoc/parser/mineru_parser.py`

- **New method `_enhance_images_with_vlm(outputs, vision_model,
callback=None)`** —
  collects every `IMAGE` block with a readable `img_path`, runs
  `rag.app.picture.vision_llm_chunk` in a 10-worker
  `ThreadPoolExecutor` using the existing
  `vision_llm_figure_describe_prompt`, and writes the result back as
  `vlm_description`. Per-image failures are logged and skipped — they
  never abort the run.
- **`_transfer_to_sections` (IMAGE branch)** — folds
  `vlm_description` into the section text alongside caption +
  footnote, so the description becomes part of the chunk and is
  searchable / retrievable.
- **`parse_pdf`** — after `_read_output`, calls
  `_enhance_images_with_vlm(outputs, vision_model, callback=callback)`
  when a `vision_model` kwarg is supplied. Wrapped in `try / except`
  so a VLM outage cannot break parsing.

### `rag/app/naive.py` (`by_mineru`)

After successfully resolving the MinerU OCR parser, `by_mineru` also
resolves the tenant's default `LLMType.IMAGE2TEXT` model via
`get_tenant_default_model_by_type`, wraps it in an `LLMBundle`, and
injects it as `kwargs["vision_model"]` before delegating to
`parse_pdf`. The lookup is skipped when the caller has already put a
`vision_model` in `kwargs`.

## Behavior

| Tenant config | Behavior |
|---|---|
| `IMAGE2TEXT` model configured | MinerU image chunks contain `caption + footnote + VLM description`. Retrieval against figures now actually works. |
| No `IMAGE2TEXT` model configured | Exact same output as today (caption + footnote only). The failed lookup is recorded in an info log and otherwise ignored; no error, no regression. |
| VLM call fails for a single image | The failure is logged as a warning and that image falls back to caption + footnote; other images proceed. |
| Caller already passes `vision_model` in kwargs | We don't override it — `if "vision_model" not in kwargs` guards the lookup. |

## Files

- `deepdoc/parser/mineru_parser.py` (+56)
- `rag/app/naive.py` (+13)
This commit is contained in:
Rene Arredondo
2026-05-19 01:08:10 -07:00
committed by GitHub
parent 95b56e73f2
commit f58e0b3eca
2 changed files with 69 additions and 0 deletions

View File

@ -644,6 +644,12 @@ class MinerUParser(RAGFlowPdfParser):
case MinerUContentType.IMAGE:
section = "".join(output.get("image_caption", [])) + "\n" + "".join(
output.get("image_footnote", []))
# If a vision model enriched this image with a semantic
# description (see _enhance_images_with_vlm), embed it in
# the chunk so it becomes searchable / retrievable.
vlm_description = (output.get("vlm_description") or "").strip()
if vlm_description:
section = (section.strip("\n") + "\n" + vlm_description).strip("\n") if section.strip() else vlm_description
case MinerUContentType.EQUATION:
section = output.get("text", "")
case MinerUContentType.CODE:
@ -664,6 +670,49 @@ class MinerUParser(RAGFlowPdfParser):
def _transfer_to_tables(self, outputs: list[dict[str, Any]]):
    """Return the table list for the parsed MinerU blocks.

    ``outputs`` is accepted but not inspected: the result is always a new,
    empty list.
    """
    no_tables: list = []
    return no_tables
def _enhance_images_with_vlm(self, outputs: list[dict[str, Any]], vision_model, callback: Optional[Callable] = None):
    """Attach a VLM-generated description to each image block in ``outputs``.

    Every entry whose ``type`` equals ``MinerUContentType.IMAGE`` and whose
    ``img_path`` exists on disk is opened and passed to
    ``rag.app.picture.vision_llm_chunk`` together with the prompt returned by
    ``vision_llm_figure_describe_prompt()``. A non-empty, stripped result is
    stored on that entry under the ``vlm_description`` key; entries without a
    usable result are left untouched.

    Args:
        outputs: Parsed MinerU blocks. Mutated in place.
        vision_model: Model object forwarded unchanged to ``vision_llm_chunk``.
        callback: Optional progress reporter. When given and at least one
            image qualifies, it is called once as ``callback(0.78, message)``
            before any description is requested.

    Returns:
        None. Results are written into ``outputs``.

    Per-image failures are logged as warnings and skipped, so one bad image
    does not abort the remaining ones.
    """
    image_jobs = [
        (idx, item)
        for idx, item in enumerate(outputs)
        if item.get("type") == MinerUContentType.IMAGE
        and item.get("img_path")
        and os.path.exists(item["img_path"])
    ]
    if not image_jobs:
        return

    # Imported only once there is actual work, so documents without images
    # neither pay for nor depend on the vision / prompt modules.
    from concurrent.futures import ThreadPoolExecutor, as_completed
    from rag.app.picture import vision_llm_chunk
    from rag.prompts.generator import vision_llm_figure_describe_prompt

    if callback:
        callback(0.78, f"[MinerU] Generating VLM descriptions for {len(image_jobs)} images...")

    prompt = vision_llm_figure_describe_prompt()

    def worker(idx, item):
        # Always returns (idx, description); "" means "no description" so the
        # collector below can skip the entry instead of handling an exception.
        try:
            with Image.open(item["img_path"]) as img:
                # Load the image data while the file is still open.
                img.load()
                desc = vision_llm_chunk(binary=img, vision_model=vision_model, prompt=prompt)
            return idx, (desc or "").strip()
        except Exception as e:
            # Use the parser's own logger, matching parse_pdf's handling of
            # enhancement failures.
            self.logger.warning(f"[MinerU] VLM description failed for image #{idx}: {e}")
            return idx, ""

    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = [executor.submit(worker, idx, item) for idx, item in image_jobs]
        # Results are written back here, in the calling thread, as they finish.
        for fut in as_completed(futures):
            idx, desc = fut.result()
            if desc:
                outputs[idx]["vlm_description"] = desc
def parse_pdf(
self,
filepath: str | PathLike[str],
@ -744,6 +793,13 @@ class MinerUParser(RAGFlowPdfParser):
if callback:
callback(0.75, f"[MinerU] Parsed {len(outputs)} blocks from PDF.")
vision_model = kwargs.get("vision_model")
if vision_model is not None:
try:
self._enhance_images_with_vlm(outputs, vision_model, callback=callback)
except Exception as e:
self.logger.warning(f"[MinerU] VLM image enhancement failed: {e}. Continuing without descriptions.")
return self._transfer_to_sections(outputs, parse_method), self._transfer_to_tables(outputs)
finally:
if temp_pdf and temp_pdf.exists():

View File

@ -131,6 +131,19 @@ def by_mineru(
ocr_model_config = get_model_config_by_type_and_name(tenant_id, LLMType.OCR, mineru_llm_name)
ocr_model = LLMBundle(tenant_id=tenant_id, model_config=ocr_model_config, lang=lang)
pdf_parser = ocr_model.mdl
# Closes #14869: when the tenant has an IMAGE2TEXT model
# configured, let the MinerU parser enrich image chunks with
# VLM-generated semantic descriptions (parity with deepdoc's
# VisionFigureParser). Best-effort — fall back silently if
# no vision model is available.
if "vision_model" not in kwargs:
try:
vision_model_config = get_tenant_default_model_by_type(tenant_id, LLMType.IMAGE2TEXT)
kwargs["vision_model"] = LLMBundle(tenant_id=tenant_id, model_config=vision_model_config, lang=lang)
except Exception as vlm_err:
logging.info(f"[MinerU] no IMAGE2TEXT model for tenant; skipping image VLM enhancement: {vlm_err}")
sections, tables = pdf_parser.parse_pdf(
filepath=filename,
binary=binary,