Feat: PDF vision figure parser supports reading context (#12416)

### What problem does this PR solve?

The PDF vision figure parser now reads the text surrounding each figure and passes it to the vision model as additional context. When `image_context_size` is set to a positive value in the parser config, the text above and below a figure is collected from the parsed sections and rendered into a new context-aware describe prompt; otherwise the existing prompt is used unchanged.
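
A minimal usage sketch of how the feature is driven from a call site, following the wrapper signature changed in this PR (argument values are illustrative; any kwargs beyond those shown depend on the caller):

```python
# Illustrative only: enable context-aware figure descriptions for a PDF parse.
# "image_context_size" controls how much surrounding text is attached to each
# figure; 0 (the default) disables the feature and keeps the original prompt.
parser_config = {
    "image_context_size": 512,
    "table_context_size": 512,
}

tbls = vision_figure_parser_pdf_wrapper(
    tbls=tbls,            # (image, description/rows) items with positions from the PDF parser
    sections=sections,    # parsed text sections used to locate context above/below each figure
    callback=callback,
    tenant_id=tenant_id,  # used to resolve the IMAGE2TEXT model via LLMBundle
    parser_config=parser_config,
)
```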

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
Author: Yongteng Lei
Committed: 2026-01-05 09:55:43 +08:00 (via GitHub)
Commit: 4cd4526492 (parent: cc8a10376a)
8 changed files with 263 additions and 41 deletions


@@ -14,6 +14,7 @@
# limitations under the License.
#
from concurrent.futures import ThreadPoolExecutor, as_completed
import logging
from PIL import Image
@@ -21,7 +22,8 @@ from common.constants import LLMType
from api.db.services.llm_service import LLMBundle
from common.connection_utils import timeout
from rag.app.picture import vision_llm_chunk as picture_vision_llm_chunk
from rag.prompts.generator import vision_llm_figure_describe_prompt
from rag.prompts.generator import vision_llm_figure_describe_prompt, vision_llm_figure_describe_prompt_with_context
from rag.nlp import append_context2table_image4pdf
def vision_figure_parser_figure_data_wrapper(figures_data_without_positions):
@@ -84,20 +86,36 @@ def vision_figure_parser_figure_xlsx_wrapper(images,callback=None, **kwargs):
def vision_figure_parser_pdf_wrapper(tbls, callback=None, **kwargs):
if not tbls:
return []
sections = kwargs.get("sections")
parser_config = kwargs.get("parser_config", {})
context_size = max(0, int(parser_config.get("image_context_size", 0) or 0))
try:
vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT)
callback(0.7, "Visual model detected. Attempting to enhance figure extraction...")
except Exception:
vision_model = None
if vision_model:
def is_figure_item(item):
return (
isinstance(item[0][0], Image.Image) and
isinstance(item[0][1], list)
)
return isinstance(item[0][0], Image.Image) and isinstance(item[0][1], list)
figures_data = [item for item in tbls if is_figure_item(item)]
figure_contexts = []
if sections and figures_data and context_size > 0:
figure_contexts = append_context2table_image4pdf(
sections,
figures_data,
context_size,
return_context=True,
)
try:
docx_vision_parser = VisionFigureParser(vision_model=vision_model, figures_data=figures_data, **kwargs)
docx_vision_parser = VisionFigureParser(
vision_model=vision_model,
figures_data=figures_data,
figure_contexts=figure_contexts,
context_size=context_size,
**kwargs,
)
boosted_figures = docx_vision_parser(callback=callback)
tbls = [item for item in tbls if not is_figure_item(item)]
tbls.extend(boosted_figures)
@@ -112,6 +130,8 @@ shared_executor = ThreadPoolExecutor(max_workers=10)
class VisionFigureParser:
def __init__(self, vision_model, figures_data, *args, **kwargs):
self.vision_model = vision_model
self.figure_contexts = kwargs.get("figure_contexts") or []
self.context_size = max(0, int(kwargs.get("context_size", 0) or 0))
self._extract_figures_info(figures_data)
assert len(self.figures) == len(self.descriptions)
assert not self.positions or (len(self.figures) == len(self.positions))
@@ -156,10 +176,25 @@ class VisionFigureParser:
@timeout(30, 3)
def process(figure_idx, figure_binary):
context_above = ""
context_below = ""
if figure_idx < len(self.figure_contexts):
context_above, context_below = self.figure_contexts[figure_idx]
if context_above or context_below:
prompt = vision_llm_figure_describe_prompt_with_context(
context_above=context_above,
context_below=context_below,
)
logging.info(f"[VisionFigureParser] figure={figure_idx} context_size={self.context_size} context_above_len={len(context_above)} context_below_len={len(context_below)} prompt=with_context")
logging.info(f"[VisionFigureParser] figure={figure_idx} context_above_snippet={context_above[:512]}")
logging.info(f"[VisionFigureParser] figure={figure_idx} context_below_snippet={context_below[:512]}")
else:
prompt = vision_llm_figure_describe_prompt()
logging.info(f"[VisionFigureParser] figure={figure_idx} context_size={self.context_size} context_len=0 prompt=default")
description_text = picture_vision_llm_chunk(
binary=figure_binary,
vision_model=self.vision_model,
prompt=vision_llm_figure_describe_prompt(),
prompt=prompt,
callback=callback,
)
return figure_idx, description_text


@@ -314,7 +314,12 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
tk_cnt = num_tokens_from_string(txt)
if sec_id > -1:
last_sid = sec_id
tbls = vision_figure_parser_pdf_wrapper(tbls=tbls, callback=callback, **kwargs)
tbls = vision_figure_parser_pdf_wrapper(
tbls=tbls,
sections=sections,
callback=callback,
**kwargs,
)
res = tokenize_table(tbls, doc, eng)
res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
table_ctx = max(0, int(parser_config.get("table_context_size", 0) or 0))


@ -55,9 +55,12 @@ def by_deepdoc(filename, binary=None, from_page=0, to_page=100000, lang="Chinese
callback=callback
)
tables = vision_figure_parser_pdf_wrapper(tbls=tables,
callback=callback,
**kwargs)
tables = vision_figure_parser_pdf_wrapper(
tbls=tables,
sections=sections,
callback=callback,
**kwargs,
)
return sections, tables, pdf_parser


@@ -166,6 +166,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
pdf_parser = Pdf()
paper = pdf_parser(filename if not binary else binary,
from_page=from_page, to_page=to_page, callback=callback)
sections = paper.get("sections", [])
else:
kwargs.pop("parse_method", None)
kwargs.pop("mineru_llm_name", None)
@@ -192,7 +193,12 @@
}
tbls = paper["tables"]
tbls = vision_figure_parser_pdf_wrapper(tbls=tbls, callback=callback, **kwargs)
tbls = vision_figure_parser_pdf_wrapper(
tbls=tbls,
sections=sections,
callback=callback,
**kwargs,
)
paper["tables"] = tbls
else:
raise NotImplementedError("file type not supported yet(pdf supported)")


@@ -667,17 +667,42 @@ def attach_media_context(chunks, table_context_size=0, image_context_size=0):
return chunks
def append_context2table_image4pdf(sections: list, tabls: list, table_context_size=0):
def append_context2table_image4pdf(sections: list, tabls: list, table_context_size=0, return_context=False):
from deepdoc.parser import PdfParser
if table_context_size <=0:
return tabls
return [] if return_context else tabls
page_bucket = defaultdict(list)
for i, (txt, poss) in enumerate(sections):
poss = PdfParser.extract_positions(poss)
for i, item in enumerate(sections):
if isinstance(item, (tuple, list)):
if len(item) > 2:
txt, _sec_id, poss = item[0], item[1], item[2]
else:
txt = item[0] if item else ""
poss = item[1] if len(item) > 1 else ""
else:
txt = item
poss = ""
# Normal: (text, "@@...##") from naive parser -> poss is a position tag string.
# Manual: (text, sec_id, poss_list) -> poss is a list of (page, left, right, top, bottom).
# Paper: (text_with_@@tag, layoutno) -> poss is layoutno; parse from txt when it contains @@ tags.
if isinstance(poss, list):
poss = poss
elif isinstance(poss, str):
if "@@" not in poss and isinstance(txt, str) and "@@" in txt:
poss = txt
poss = PdfParser.extract_positions(poss)
else:
if isinstance(txt, str) and "@@" in txt:
poss = PdfParser.extract_positions(txt)
else:
poss = []
if isinstance(txt, str) and "@@" in txt:
txt = re.sub(r"@@[0-9-]+\t[0-9.\t]+##", "", txt).strip()
for page, left, right, top, bottom in poss:
page = page[0]
page_bucket[page].append(((left, top, right, bottom), txt))
if isinstance(page, list):
page = page[0] if page else 0
page_bucket[page].append(((left, right, top, bottom), txt))
def upper_context(page, i):
txt = ""
@ -720,9 +745,10 @@ def append_context2table_image4pdf(sections: list, tabls: list, table_context_si
return txt
res = []
contexts = []
for (img, tb), poss in tabls:
page, left, top, right, bott = poss[0]
_page, _left, _top, _right, _bott = poss[-1]
page, left, right, top, bott = poss[0]
_page, _left, _right, _top, _bott = poss[-1]
if isinstance(tb, list):
tb = "\n".join(tb)
@@ -736,23 +762,34 @@
i = 0
blks = page_bucket.get(page, [])
continue
tb = upper_context(page, i) + tb + lower_context(page+1, 0)
upper = upper_context(page, i)
lower = lower_context(page + 1, 0)
tb = upper + tb + lower
contexts.append((upper.strip(), lower.strip()))
break
(_, t, r, b), txt = blks[i]
(_, _, t, b), txt = blks[i]
if b > top:
break
(_, _t, _r, _b), _txt = blks[i+1]
(_, _, _t, _b), _txt = blks[i+1]
if _t < _bott:
i += 1
continue
tb = upper_context(page, i) + tb + lower_context(page, i)
upper = upper_context(page, i)
lower = lower_context(page, i)
tb = upper + tb + lower
contexts.append((upper.strip(), lower.strip()))
break
if _tb == tb:
tb = upper_context(page, -1) + tb + lower_context(page+1, 0)
upper = upper_context(page, -1)
lower = lower_context(page + 1, 0)
tb = upper + tb + lower
contexts.append((upper.strip(), lower.strip()))
if len(contexts) < len(res) + 1:
contexts.append(("", ""))
res.append(((img, tb), poss))
return res
return contexts if return_context else res
def add_positions(d, poss):


@ -158,6 +158,7 @@ KEYWORD_PROMPT_TEMPLATE = load_prompt("keyword_prompt")
QUESTION_PROMPT_TEMPLATE = load_prompt("question_prompt")
VISION_LLM_DESCRIBE_PROMPT = load_prompt("vision_llm_describe_prompt")
VISION_LLM_FIGURE_DESCRIBE_PROMPT = load_prompt("vision_llm_figure_describe_prompt")
VISION_LLM_FIGURE_DESCRIBE_PROMPT_WITH_CONTEXT = load_prompt("vision_llm_figure_describe_prompt_with_context")
STRUCTURED_OUTPUT_PROMPT = load_prompt("structured_output_prompt")
ANALYZE_TASK_SYSTEM = load_prompt("analyze_task_system")
@@ -321,6 +322,11 @@ def vision_llm_figure_describe_prompt() -> str:
return template.render()
def vision_llm_figure_describe_prompt_with_context(context_above: str, context_below: str) -> str:
template = PROMPT_JINJA_ENV.from_string(VISION_LLM_FIGURE_DESCRIBE_PROMPT_WITH_CONTEXT)
return template.render(context_above=context_above, context_below=context_below)
def tool_schema(tools_description: list[dict], complete_task=False):
if not tools_description:
return ""


@@ -1,24 +1,72 @@
## ROLE
You are an expert visual data analyst.
## GOAL
Analyze the image and provide a comprehensive description of its content. Focus on identifying the type of visual data representation (e.g., bar chart, pie chart, line graph, table, flowchart), its structure, and any text captions or labels included in the image.
Analyze the image and produce a textual representation strictly based on what is visible in the image.
## DECISION RULE (CRITICAL)
First, determine whether the image contains an explicit visual data representation with enumerable data units forming a coherent dataset.
Enumerable data units are clearly separable, repeatable elements intended for comparison, measurement, or aggregation, such as:
- rows or columns in a table
- individual bars in a bar chart
- identifiable data points or series in a line graph
- labeled segments in a pie chart
The mere presence of numbers, icons, UI elements, or labels does NOT qualify unless they together form such a dataset.
## TASKS
1. Describe the overall structure of the visual representation. Specify if it is a chart, graph, table, or diagram.
2. Identify and extract any axes, legends, titles, or labels present in the image. Provide the exact text where available.
3. Extract the data points from the visual elements (e.g., bar heights, line graph coordinates, pie chart segments, table rows and columns).
4. Analyze and explain any trends, comparisons, or patterns shown in the data.
5. Capture any annotations, captions, or footnotes, and explain their relevance to the image.
6. Only include details that are explicitly present in the image. If an element (e.g., axis, legend, or caption) does not exist or is not visible, do not mention it.
## OUTPUT FORMAT (Include only sections relevant to the image content)
- Visual Type: [Type]
- Title: [Title text, if available]
- Axes / Legends / Labels: [Details, if available]
- Data Points: [Extracted data]
- Trends / Insights: [Analysis and interpretation]
- Captions / Annotations: [Text and relevance, if available]
1. Inspect the image and determine which output mode applies based on the decision rule.
2. Follow the output rules strictly.
3. Include only content that is explicitly visible in the image.
4. Do not infer intent, functionality, process logic, or meaning beyond what is visually or textually shown.
> Ensure high accuracy, clarity, and completeness in your analysis, and include only the information present in the image. Avoid unnecessary statements about missing elements.
## OUTPUT RULES (STRICT)
- Produce output in **exactly one** of the two modes defined below.
- Do NOT mention, label, or reference the modes in the output.
- Do NOT combine content from both modes.
- Do NOT explain or justify the choice of mode.
- Do NOT add any headings, titles, or commentary beyond what the mode requires.
---
## MODE 1: STRUCTURED VISUAL DATA OUTPUT
(Use only if the image contains enumerable data units forming a coherent dataset.)
Output **only** the following fields, in list form.
Do NOT add free-form paragraphs or additional sections.
- Visual Type:
- Title:
- Axes / Legends / Labels:
- Data Points:
- Captions / Annotations:
---
## MODE 2: GENERAL FIGURE CONTENT
(Use only if the image does NOT contain enumerable data units.)
Write the content directly, starting from the first sentence.
Do NOT add any introductory labels, titles, headings, or prefixes.
Requirements:
- Describe visible regions and components in a stable order (e.g., top-to-bottom, left-to-right).
- Explicitly name interface elements or visual objects exactly as they appear (e.g., tabs, panels, buttons, icons, input fields).
- Transcribe all visible text verbatim; do not paraphrase, summarize, or reinterpret labels.
- Describe spatial grouping, containment, and alignment of elements.
- Do NOT interpret intent, behavior, workflows, gameplay rules, or processes.
- Do NOT describe the figure as a chart, diagram, process, phase, or sequence unless such words explicitly appear in the image text.
- Avoid narrative or stylistic language unless it is a dominant and functional visual element.
Use concise, information-dense sentences.
Do not use bullet lists or structured fields in this mode.


@@ -0,0 +1,82 @@
## ROLE
You are an expert visual data analyst.
## GOAL
Analyze the image and produce a textual representation strictly based on what is visible in the image.
Surrounding context may be used only for minimal clarification or disambiguation of terms that appear in the image, not as a source of new information.
## CONTEXT (ABOVE)
{{ context_above }}
## CONTEXT (BELOW)
{{ context_below }}
## DECISION RULE (CRITICAL)
First, determine whether the image contains an explicit visual data representation with enumerable data units forming a coherent dataset.
Enumerable data units are clearly separable, repeatable elements intended for comparison, measurement, or aggregation, such as:
- rows or columns in a table
- individual bars in a bar chart
- identifiable data points or series in a line graph
- labeled segments in a pie chart
The mere presence of numbers, icons, UI elements, or labels does NOT qualify unless they together form such a dataset.
## TASKS
1. Inspect the image and determine which output mode applies based on the decision rule.
2. Use surrounding context only to disambiguate terms that appear in the image.
3. Follow the output rules strictly.
4. Include only content that is explicitly visible in the image.
5. Do not infer intent, functionality, process logic, or meaning beyond what is visually or textually shown.
## OUTPUT RULES (STRICT)
- Produce output in **exactly one** of the two modes defined below.
- Do NOT mention, label, or reference the modes in the output.
- Do NOT combine content from both modes.
- Do NOT explain or justify the choice of mode.
- Do NOT add any headings, titles, or commentary beyond what the mode requires.
---
## MODE 1: STRUCTURED VISUAL DATA OUTPUT
(Use only if the image contains enumerable data units forming a coherent dataset.)
Output **only** the following fields, in list form.
Do NOT add free-form paragraphs or additional sections.
- Visual Type:
- Title:
- Axes / Legends / Labels:
- Data Points:
- Captions / Annotations:
---
## MODE 2: GENERAL FIGURE CONTENT
(Use only if the image does NOT contain enumerable data units.)
Write the content directly, starting from the first sentence.
Do NOT add any introductory labels, titles, headings, or prefixes.
Requirements:
- Describe visible regions and components in a stable order (e.g., top-to-bottom, left-to-right).
- Explicitly name interface elements or visual objects exactly as they appear (e.g., tabs, panels, buttons, icons, input fields).
- Transcribe all visible text verbatim; do not paraphrase, summarize, or reinterpret labels.
- Describe spatial grouping, containment, and alignment of elements.
- Do NOT interpret intent, behavior, workflows, gameplay rules, or processes.
- Do NOT describe the figure as a chart, diagram, process, phase, or sequence unless such words explicitly appear in the image text.
- Avoid narrative or stylistic language unless it is a dominant and functional visual element.
Use concise, information-dense sentences.
Do not use bullet lists or structured fields in this mode.
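
For reference, the prompt selection this template plugs into, written as a self-contained sketch distilled from the `process` closure in the figure parser diff above (the helper name `build_figure_prompt` is introduced here only for illustration):

```python
from rag.prompts.generator import (
    vision_llm_figure_describe_prompt,
    vision_llm_figure_describe_prompt_with_context,
)

def build_figure_prompt(context_above: str, context_below: str) -> str:
    # Use the context-aware template only when some surrounding text was
    # collected for the figure; otherwise fall back to the original prompt.
    if context_above or context_below:
        return vision_llm_figure_describe_prompt_with_context(
            context_above=context_above,
            context_below=context_below,
        )
    return vision_llm_figure_describe_prompt()
```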