Feat: PDF vision figure parser supports reading context (#12416)

### What problem does this PR solve?

The PDF vision figure parser now reads the text surrounding each figure and passes it to the vision model as additional context. When `image_context_size` is set to a positive value in the parser config, the text above and below a figure is collected from the parsed sections and rendered into a new context-aware describe prompt; otherwise the existing prompt is used unchanged.
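
A minimal usage sketch of how the feature is driven from a call site, following the wrapper signature changed in this PR (argument values are illustrative; any kwargs beyond those shown depend on the caller):

```python
# Illustrative only: enable context-aware figure descriptions for a PDF parse.
# "image_context_size" controls how much surrounding text is attached to each
# figure; 0 (the default) disables the feature and keeps the original prompt.
parser_config = {
    "image_context_size": 512,
    "table_context_size": 512,
}

tbls = vision_figure_parser_pdf_wrapper(
    tbls=tbls,            # (image, description/rows) items with positions from the PDF parser
    sections=sections,    # parsed text sections used to locate context above/below each figure
    callback=callback,
    tenant_id=tenant_id,  # used to resolve the IMAGE2TEXT model via LLMBundle
    parser_config=parser_config,
)
```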

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
Author: Yongteng Lei
Committed: 2026-01-05 09:55:43 +08:00 (via GitHub)
Commit: 4cd4526492 (parent: cc8a10376a)
8 changed files with 263 additions and 41 deletions


@@ -14,6 +14,7 @@
# limitations under the License.
#
from concurrent.futures import ThreadPoolExecutor, as_completed
import logging
from PIL import Image
@@ -21,7 +22,8 @@ from common.constants import LLMType
from api.db.services.llm_service import LLMBundle
from common.connection_utils import timeout
from rag.app.picture import vision_llm_chunk as picture_vision_llm_chunk
from rag.prompts.generator import vision_llm_figure_describe_prompt
from rag.prompts.generator import vision_llm_figure_describe_prompt, vision_llm_figure_describe_prompt_with_context
from rag.nlp import append_context2table_image4pdf
def vision_figure_parser_figure_data_wrapper(figures_data_without_positions):
@@ -84,20 +86,36 @@ def vision_figure_parser_figure_xlsx_wrapper(images,callback=None, **kwargs):
def vision_figure_parser_pdf_wrapper(tbls, callback=None, **kwargs):
if not tbls:
return []
sections = kwargs.get("sections")
parser_config = kwargs.get("parser_config", {})
context_size = max(0, int(parser_config.get("image_context_size", 0) or 0))
try:
vision_model = LLMBundle(kwargs["tenant_id"], LLMType.IMAGE2TEXT)
callback(0.7, "Visual model detected. Attempting to enhance figure extraction...")
except Exception:
vision_model = None
if vision_model:
def is_figure_item(item):
return (
isinstance(item[0][0], Image.Image) and
isinstance(item[0][1], list)
)
return isinstance(item[0][0], Image.Image) and isinstance(item[0][1], list)
figures_data = [item for item in tbls if is_figure_item(item)]
figure_contexts = []
if sections and figures_data and context_size > 0:
figure_contexts = append_context2table_image4pdf(
sections,
figures_data,
context_size,
return_context=True,
)
try:
docx_vision_parser = VisionFigureParser(vision_model=vision_model, figures_data=figures_data, **kwargs)
docx_vision_parser = VisionFigureParser(
vision_model=vision_model,
figures_data=figures_data,
figure_contexts=figure_contexts,
context_size=context_size,
**kwargs,
)
boosted_figures = docx_vision_parser(callback=callback)
tbls = [item for item in tbls if not is_figure_item(item)]
tbls.extend(boosted_figures)
@@ -112,6 +130,8 @@ shared_executor = ThreadPoolExecutor(max_workers=10)
class VisionFigureParser:
def __init__(self, vision_model, figures_data, *args, **kwargs):
self.vision_model = vision_model
self.figure_contexts = kwargs.get("figure_contexts") or []
self.context_size = max(0, int(kwargs.get("context_size", 0) or 0))
self._extract_figures_info(figures_data)
assert len(self.figures) == len(self.descriptions)
assert not self.positions or (len(self.figures) == len(self.positions))
@@ -156,10 +176,25 @@ class VisionFigureParser:
@timeout(30, 3)
def process(figure_idx, figure_binary):
context_above = ""
context_below = ""
if figure_idx < len(self.figure_contexts):
context_above, context_below = self.figure_contexts[figure_idx]
if context_above or context_below:
prompt = vision_llm_figure_describe_prompt_with_context(
context_above=context_above,
context_below=context_below,
)
logging.info(f"[VisionFigureParser] figure={figure_idx} context_size={self.context_size} context_above_len={len(context_above)} context_below_len={len(context_below)} prompt=with_context")
logging.info(f"[VisionFigureParser] figure={figure_idx} context_above_snippet={context_above[:512]}")
logging.info(f"[VisionFigureParser] figure={figure_idx} context_below_snippet={context_below[:512]}")
else:
prompt = vision_llm_figure_describe_prompt()
logging.info(f"[VisionFigureParser] figure={figure_idx} context_size={self.context_size} context_len=0 prompt=default")
description_text = picture_vision_llm_chunk(
binary=figure_binary,
vision_model=self.vision_model,
prompt=vision_llm_figure_describe_prompt(),
prompt=prompt,
callback=callback,
)
return figure_idx, description_text


@@ -314,7 +314,12 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
tk_cnt = num_tokens_from_string(txt)
if sec_id > -1:
last_sid = sec_id
tbls = vision_figure_parser_pdf_wrapper(tbls=tbls, callback=callback, **kwargs)
tbls = vision_figure_parser_pdf_wrapper(
tbls=tbls,
sections=sections,
callback=callback,
**kwargs,
)
res = tokenize_table(tbls, doc, eng)
res.extend(tokenize_chunks(chunks, doc, eng, pdf_parser))
table_ctx = max(0, int(parser_config.get("table_context_size", 0) or 0))


@ -55,9 +55,12 @@ def by_deepdoc(filename, binary=None, from_page=0, to_page=100000, lang="Chinese
callback=callback
)
tables = vision_figure_parser_pdf_wrapper(tbls=tables,
callback=callback,
**kwargs)
tables = vision_figure_parser_pdf_wrapper(
tbls=tables,
sections=sections,
callback=callback,
**kwargs,
)
return sections, tables, pdf_parser


@@ -166,6 +166,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
pdf_parser = Pdf()
paper = pdf_parser(filename if not binary else binary,
from_page=from_page, to_page=to_page, callback=callback)
sections = paper.get("sections", [])
else:
kwargs.pop("parse_method", None)
kwargs.pop("mineru_llm_name", None)
@@ -192,7 +193,12 @@
}
tbls = paper["tables"]
tbls = vision_figure_parser_pdf_wrapper(tbls=tbls, callback=callback, **kwargs)
tbls = vision_figure_parser_pdf_wrapper(
tbls=tbls,
sections=sections,
callback=callback,
**kwargs,
)
paper["tables"] = tbls
else:
raise NotImplementedError("file type not supported yet(pdf supported)")


@@ -667,17 +667,42 @@ def attach_media_context(chunks, table_context_size=0, image_context_size=0):
return chunks
def append_context2table_image4pdf(sections: list, tabls: list, table_context_size=0):
def append_context2table_image4pdf(sections: list, tabls: list, table_context_size=0, return_context=False):
from deepdoc.parser import PdfParser
if table_context_size <=0:
return tabls
return [] if return_context else tabls
page_bucket = defaultdict(list)
for i, (txt, poss) in enumerate(sections):
poss = PdfParser.extract_positions(poss)
for i, item in enumerate(sections):
if isinstance(item, (tuple, list)):
if len(item) > 2:
txt, _sec_id, poss = item[0], item[1], item[2]
else:
txt = item[0] if item else ""
poss = item[1] if len(item) > 1 else ""
else:
txt = item
poss = ""
# Normal: (text, "@@...##") from naive parser -> poss is a position tag string.
# Manual: (text, sec_id, poss_list) -> poss is a list of (page, left, right, top, bottom).
# Paper: (text_with_@@tag, layoutno) -> poss is layoutno; parse from txt when it contains @@ tags.
if isinstance(poss, list):
poss = poss
elif isinstance(poss, str):
if "@@" not in poss and isinstance(txt, str) and "@@" in txt:
poss = txt
poss = PdfParser.extract_positions(poss)
else:
if isinstance(txt, str) and "@@" in txt:
poss = PdfParser.extract_positions(txt)
else:
poss = []
if isinstance(txt, str) and "@@" in txt:
txt = re.sub(r"@@[0-9-]+\t[0-9.\t]+##", "", txt).strip()
for page, left, right, top, bottom in poss:
page = page[0]
page_bucket[page].append(((left, top, right, bottom), txt))
if isinstance(page, list):
page = page[0] if page else 0
page_bucket[page].append(((left, right, top, bottom), txt))
def upper_context(page, i):
txt = ""
@ -720,9 +745,10 @@ def append_context2table_image4pdf(sections: list, tabls: list, table_context_si
return txt
res = []
contexts = []
for (img, tb), poss in tabls:
page, left, top, right, bott = poss[0]
_page, _left, _top, _right, _bott = poss[-1]
page, left, right, top, bott = poss[0]
_page, _left, _right, _top, _bott = poss[-1]
if isinstance(tb, list):
tb = "\n".join(tb)
@@ -736,23 +762,34 @@
i = 0
blks = page_bucket.get(page, [])
continue
tb = upper_context(page, i) + tb + lower_context(page+1, 0)
upper = upper_context(page, i)
lower = lower_context(page + 1, 0)
tb = upper + tb + lower
contexts.append((upper.strip(), lower.strip()))
break
(_, t, r, b), txt = blks[i]
(_, _, t, b), txt = blks[i]
if b > top:
break
(_, _t, _r, _b), _txt = blks[i+1]
(_, _, _t, _b), _txt = blks[i+1]
if _t < _bott:
i += 1
continue
tb = upper_context(page, i) + tb + lower_context(page, i)
upper = upper_context(page, i)
lower = lower_context(page, i)
tb = upper + tb + lower
contexts.append((upper.strip(), lower.strip()))
break
if _tb == tb:
tb = upper_context(page, -1) + tb + lower_context(page+1, 0)
upper = upper_context(page, -1)
lower = lower_context(page + 1, 0)
tb = upper + tb + lower
contexts.append((upper.strip(), lower.strip()))
if len(contexts) < len(res) + 1:
contexts.append(("", ""))
res.append(((img, tb), poss))
return res
return contexts if return_context else res
def add_positions(d, poss):


@ -158,6 +158,7 @@ KEYWORD_PROMPT_TEMPLATE = load_prompt("keyword_prompt")
QUESTION_PROMPT_TEMPLATE = load_prompt("question_prompt")
VISION_LLM_DESCRIBE_PROMPT = load_prompt("vision_llm_describe_prompt")
VISION_LLM_FIGURE_DESCRIBE_PROMPT = load_prompt("vision_llm_figure_describe_prompt")
VISION_LLM_FIGURE_DESCRIBE_PROMPT_WITH_CONTEXT = load_prompt("vision_llm_figure_describe_prompt_with_context")
STRUCTURED_OUTPUT_PROMPT = load_prompt("structured_output_prompt")
ANALYZE_TASK_SYSTEM = load_prompt("analyze_task_system")
@@ -321,6 +322,11 @@ def vision_llm_figure_describe_prompt() -> str:
return template.render()
def vision_llm_figure_describe_prompt_with_context(context_above: str, context_below: str) -> str:
template = PROMPT_JINJA_ENV.from_string(VISION_LLM_FIGURE_DESCRIBE_PROMPT_WITH_CONTEXT)
return template.render(context_above=context_above, context_below=context_below)
def tool_schema(tools_description: list[dict], complete_task=False):
if not tools_description:
return ""


@@ -1,24 +1,72 @@
## ROLE
You are an expert visual data analyst.
## GOAL
Analyze the image and provide a comprehensive description of its content. Focus on identifying the type of visual data representation (e.g., bar chart, pie chart, line graph, table, flowchart), its structure, and any text captions or labels included in the image.
Analyze the image and produce a textual representation strictly based on what is visible in the image.
## DECISION RULE (CRITICAL)
First, determine whether the image contains an explicit visual data representation with enumerable data units forming a coherent dataset.
Enumerable data units are clearly separable, repeatable elements intended for comparison, measurement, or aggregation, such as:
- rows or columns in a table
- individual bars in a bar chart
- identifiable data points or series in a line graph
- labeled segments in a pie chart
The mere presence of numbers, icons, UI elements, or labels does NOT qualify unless they together form such a dataset.
## TASKS
1. Describe the overall structure of the visual representation. Specify if it is a chart, graph, table, or diagram.
2. Identify and extract any axes, legends, titles, or labels present in the image. Provide the exact text where available.
3. Extract the data points from the visual elements (e.g., bar heights, line graph coordinates, pie chart segments, table rows and columns).
4. Analyze and explain any trends, comparisons, or patterns shown in the data.
5. Capture any annotations, captions, or footnotes, and explain their relevance to the image.
6. Only include details that are explicitly present in the image. If an element (e.g., axis, legend, or caption) does not exist or is not visible, do not mention it.
## OUTPUT FORMAT (Include only sections relevant to the image content)
- Visual Type: [Type]
- Title: [Title text, if available]
- Axes / Legends / Labels: [Details, if available]
- Data Points: [Extracted data]
- Trends / Insights: [Analysis and interpretation]
- Captions / Annotations: [Text and relevance, if available]
1. Inspect the image and determine which output mode applies based on the decision rule.
2. Follow the output rules strictly.
3. Include only content that is explicitly visible in the image.
4. Do not infer intent, functionality, process logic, or meaning beyond what is visually or textually shown.
> Ensure high accuracy, clarity, and completeness in your analysis, and include only the information present in the image. Avoid unnecessary statements about missing elements.
## OUTPUT RULES (STRICT)
- Produce output in **exactly one** of the two modes defined below.
- Do NOT mention, label, or reference the modes in the output.
- Do NOT combine content from both modes.
- Do NOT explain or justify the choice of mode.
- Do NOT add any headings, titles, or commentary beyond what the mode requires.
---
## MODE 1: STRUCTURED VISUAL DATA OUTPUT
(Use only if the image contains enumerable data units forming a coherent dataset.)
Output **only** the following fields, in list form.
Do NOT add free-form paragraphs or additional sections.
- Visual Type:
- Title:
- Axes / Legends / Labels:
- Data Points:
- Captions / Annotations:
---
## MODE 2: GENERAL FIGURE CONTENT
(Use only if the image does NOT contain enumerable data units.)
Write the content directly, starting from the first sentence.
Do NOT add any introductory labels, titles, headings, or prefixes.
Requirements:
- Describe visible regions and components in a stable order (e.g., top-to-bottom, left-to-right).
- Explicitly name interface elements or visual objects exactly as they appear (e.g., tabs, panels, buttons, icons, input fields).
- Transcribe all visible text verbatim; do not paraphrase, summarize, or reinterpret labels.
- Describe spatial grouping, containment, and alignment of elements.
- Do NOT interpret intent, behavior, workflows, gameplay rules, or processes.
- Do NOT describe the figure as a chart, diagram, process, phase, or sequence unless such words explicitly appear in the image text.
- Avoid narrative or stylistic language unless it is a dominant and functional visual element.
Use concise, information-dense sentences.
Do not use bullet lists or structured fields in this mode.


@@ -0,0 +1,82 @@
## ROLE
You are an expert visual data analyst.
## GOAL
Analyze the image and produce a textual representation strictly based on what is visible in the image.
Surrounding context may be used only for minimal clarification or disambiguation of terms that appear in the image, not as a source of new information.
## CONTEXT (ABOVE)
{{ context_above }}
## CONTEXT (BELOW)
{{ context_below }}
## DECISION RULE (CRITICAL)
First, determine whether the image contains an explicit visual data representation with enumerable data units forming a coherent dataset.
Enumerable data units are clearly separable, repeatable elements intended for comparison, measurement, or aggregation, such as:
- rows or columns in a table
- individual bars in a bar chart
- identifiable data points or series in a line graph
- labeled segments in a pie chart
The mere presence of numbers, icons, UI elements, or labels does NOT qualify unless they together form such a dataset.
## TASKS
1. Inspect the image and determine which output mode applies based on the decision rule.
2. Use surrounding context only to disambiguate terms that appear in the image.
3. Follow the output rules strictly.
4. Include only content that is explicitly visible in the image.
5. Do not infer intent, functionality, process logic, or meaning beyond what is visually or textually shown.
## OUTPUT RULES (STRICT)
- Produce output in **exactly one** of the two modes defined below.
- Do NOT mention, label, or reference the modes in the output.
- Do NOT combine content from both modes.
- Do NOT explain or justify the choice of mode.
- Do NOT add any headings, titles, or commentary beyond what the mode requires.
---
## MODE 1: STRUCTURED VISUAL DATA OUTPUT
(Use only if the image contains enumerable data units forming a coherent dataset.)
Output **only** the following fields, in list form.
Do NOT add free-form paragraphs or additional sections.
- Visual Type:
- Title:
- Axes / Legends / Labels:
- Data Points:
- Captions / Annotations:
---
## MODE 2: GENERAL FIGURE CONTENT
(Use only if the image does NOT contain enumerable data units.)
Write the content directly, starting from the first sentence.
Do NOT add any introductory labels, titles, headings, or prefixes.
Requirements:
- Describe visible regions and components in a stable order (e.g., top-to-bottom, left-to-right).
- Explicitly name interface elements or visual objects exactly as they appear (e.g., tabs, panels, buttons, icons, input fields).
- Transcribe all visible text verbatim; do not paraphrase, summarize, or reinterpret labels.
- Describe spatial grouping, containment, and alignment of elements.
- Do NOT interpret intent, behavior, workflows, gameplay rules, or processes.
- Do NOT describe the figure as a chart, diagram, process, phase, or sequence unless such words explicitly appear in the image text.
- Avoid narrative or stylistic language unless it is a dominant and functional visual element.
Use concise, information-dense sentences.
Do not use bullet lists or structured fields in this mode.
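
For reference, the prompt selection this template plugs into, written as a self-contained sketch distilled from the `process` closure in the figure parser diff above (the helper name `build_figure_prompt` is introduced here only for illustration):

```python
from rag.prompts.generator import (
    vision_llm_figure_describe_prompt,
    vision_llm_figure_describe_prompt_with_context,
)

def build_figure_prompt(context_above: str, context_below: str) -> str:
    # Use the context-aware template only when some surrounding text was
    # collected for the figure; otherwise fall back to the original prompt.
    if context_above or context_below:
        return vision_llm_figure_describe_prompt_with_context(
            context_above=context_above,
            context_below=context_below,
        )
    return vision_llm_figure_describe_prompt()
```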