From f58e0b3ecac52dcf0c12992b168bb296456beebf Mon Sep 17 00:00:00 2001
From: Rene Arredondo <120709323+Rene0422@users.noreply.github.com>
Date: Tue, 19 May 2026 01:08:10 -0700
Subject: [PATCH] Feat: VLM image descriptions in MinerU parser (#14869) (#14946)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Summary

Closes #14869. Adds VLM-based semantic descriptions to **image chunks produced by the MinerU parser**, closing a long-standing parity gap with the deepdoc parser's `VisionFigureParser`. A maintainer flagged this in #13342 ("We may add the VLM enhancement to MinerU parser as well") and an earlier proposal exists in #13824; this PR lands the change end-to-end inside the existing parser plumbing.

## Why

Today the MinerU parser returns image chunks containing only the native `image_caption` and `image_footnote` strings from MinerU's JSON. When neither is present (or when both are sparse), the chunk carries effectively no searchable content for the figure and retrieval misses it entirely. Users who configured a local VLM (reporter's case: Gemma-4-31B) had to post-process MinerU's `tmp/*.json` themselves.

The deepdoc parser already solves this via [`VisionFigureParser`](deepdoc/parser/figure_parser.py): when the tenant has an `IMAGE2TEXT` model configured, each figure gets a semantic description merged into its chunk. This PR brings the same behavior to MinerU.

## What changed

### `deepdoc/parser/mineru_parser.py`

- **New method `_enhance_images_with_vlm(outputs, vision_model, callback=None)`** — collects every `IMAGE` block with a readable `img_path`, runs `rag.app.picture.vision_llm_chunk` in a 10-worker `ThreadPoolExecutor` using the existing `vision_llm_figure_describe_prompt`, and writes the result back as `vlm_description`. Per-image failures are logged and skipped — they never abort the run.
- **`_transfer_to_sections` (IMAGE branch)** — folds `vlm_description` into the section text alongside caption + footnote, so the description becomes part of the chunk and is searchable / retrievable.
- **`parse_pdf`** — after `_read_output`, calls `_enhance_images_with_vlm(outputs, vision_model, callback=callback)` when a `vision_model` kwarg is supplied. Wrapped in `try / except` so a VLM outage cannot break parsing.

### `rag/app/naive.py` (`by_mineru`)

After successfully resolving the MinerU OCR parser, also resolves the tenant's default `LLMType.IMAGE2TEXT` model via `get_tenant_default_model_by_type`, wraps it in an `LLMBundle`, and injects it as `kwargs["vision_model"]` before delegating to `parse_pdf`.

## Behavior

| Tenant config | Behavior |
|---|---|
| `IMAGE2TEXT` model configured | MinerU image chunks contain `caption + footnote + VLM description`. Retrieval against figures now actually works. |
| No `IMAGE2TEXT` model configured | Exactly the same output as today (caption + footnote only). The failed lookup is logged at info level and the enhancement is skipped; no error, no regression. |
| VLM call fails for a single image | That image falls back to caption + footnote (the failure is logged as a warning); other images proceed. |
| Caller already passes `vision_model` in kwargs | We don't override it — `if "vision_model" not in kwargs` guards the lookup. 
| ## Files - `deepdoc/parser/mineru_parser.py` (+56) - `rag/app/naive.py` (+13) --- deepdoc/parser/mineru_parser.py | 56 +++++++++++++++++++++++++++++++++ rag/app/naive.py | 13 ++++++++ 2 files changed, 69 insertions(+) diff --git a/deepdoc/parser/mineru_parser.py b/deepdoc/parser/mineru_parser.py index b369f9122..2c35ead98 100644 --- a/deepdoc/parser/mineru_parser.py +++ b/deepdoc/parser/mineru_parser.py @@ -644,6 +644,12 @@ class MinerUParser(RAGFlowPdfParser): case MinerUContentType.IMAGE: section = "".join(output.get("image_caption", [])) + "\n" + "".join( output.get("image_footnote", [])) + # If a vision model enriched this image with a semantic + # description (see _enhance_images_with_vlm), embed it in + # the chunk so it becomes searchable / retrievable. + vlm_description = (output.get("vlm_description") or "").strip() + if vlm_description: + section = (section.strip("\n") + "\n" + vlm_description).strip("\n") if section.strip() else vlm_description case MinerUContentType.EQUATION: section = output.get("text", "") case MinerUContentType.CODE: @@ -664,6 +670,49 @@ class MinerUParser(RAGFlowPdfParser): def _transfer_to_tables(self, outputs: list[dict[str, Any]]): return [] + def _enhance_images_with_vlm(self, outputs: list[dict[str, Any]], vision_model, callback: Optional[Callable] = None): + """Generate semantic descriptions for image blocks via the tenant's + IMAGE2TEXT model, mirroring deepdoc's VisionFigureParser. Each + IMAGE block with a readable img_path gets a ``vlm_description`` + field that ``_transfer_to_sections`` then folds into the chunk + text — closing issue #14869. 
+ """ + from concurrent.futures import ThreadPoolExecutor, as_completed + from rag.app.picture import vision_llm_chunk + from rag.prompts.generator import vision_llm_figure_describe_prompt + + image_jobs = [ + (idx, item) + for idx, item in enumerate(outputs) + if item.get("type") == MinerUContentType.IMAGE + and item.get("img_path") + and os.path.exists(item["img_path"]) + ] + if not image_jobs: + return + + if callback: + callback(0.78, f"[MinerU] Generating VLM descriptions for {len(image_jobs)} images...") + + prompt = vision_llm_figure_describe_prompt() + + def worker(idx, item): + try: + with Image.open(item["img_path"]) as img: + img.load() + desc = vision_llm_chunk(binary=img, vision_model=vision_model, prompt=prompt) + return idx, (desc or "").strip() + except Exception as e: + logging.warning(f"[MinerU] VLM description failed for image #{idx}: {e}") + return idx, "" + + with ThreadPoolExecutor(max_workers=10) as executor: + futures = [executor.submit(worker, idx, item) for idx, item in image_jobs] + for fut in as_completed(futures): + idx, desc = fut.result() + if desc: + outputs[idx]["vlm_description"] = desc + def parse_pdf( self, filepath: str | PathLike[str], @@ -744,6 +793,13 @@ class MinerUParser(RAGFlowPdfParser): if callback: callback(0.75, f"[MinerU] Parsed {len(outputs)} blocks from PDF.") + vision_model = kwargs.get("vision_model") + if vision_model is not None: + try: + self._enhance_images_with_vlm(outputs, vision_model, callback=callback) + except Exception as e: + self.logger.warning(f"[MinerU] VLM image enhancement failed: {e}. 
Continuing without descriptions.") + return self._transfer_to_sections(outputs, parse_method), self._transfer_to_tables(outputs) finally: if temp_pdf and temp_pdf.exists(): diff --git a/rag/app/naive.py b/rag/app/naive.py index f91e2a8f9..7bf4743e7 100644 --- a/rag/app/naive.py +++ b/rag/app/naive.py @@ -131,6 +131,19 @@ def by_mineru( ocr_model_config = get_model_config_by_type_and_name(tenant_id, LLMType.OCR, mineru_llm_name) ocr_model = LLMBundle(tenant_id=tenant_id, model_config=ocr_model_config, lang=lang) pdf_parser = ocr_model.mdl + + # Closes #14869: when the tenant has an IMAGE2TEXT model + # configured, let the MinerU parser enrich image chunks with + # VLM-generated semantic descriptions (parity with deepdoc's + # VisionFigureParser). Best-effort — fall back silently if + # no vision model is available. + if "vision_model" not in kwargs: + try: + vision_model_config = get_tenant_default_model_by_type(tenant_id, LLMType.IMAGE2TEXT) + kwargs["vision_model"] = LLMBundle(tenant_id=tenant_id, model_config=vision_model_config, lang=lang) + except Exception as vlm_err: + logging.info(f"[MinerU] no IMAGE2TEXT model for tenant; skipping image VLM enhancement: {vlm_err}") + sections, tables = pdf_parser.parse_pdf( filepath=filename, binary=binary,