mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-05-22 00:50:10 +08:00
### What problem does this PR solve? Fix: support vlm fall back in pipeline for img/table parsing ### Type of change - [x] Bug Fix (non-breaking change which fixes an issue)
174 lines
5.8 KiB
Python
174 lines
5.8 KiB
Python
#
|
|
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
#
|
|
import re
|
|
from io import BytesIO
|
|
|
|
from docx import Document
|
|
from api.db.services.llm_service import LLMBundle
|
|
from api.db.joint_services.tenant_model_service import (
|
|
get_model_config_by_type_and_name,
|
|
get_tenant_default_model_by_type,
|
|
)
|
|
from common.constants import LLMType
|
|
from deepdoc.parser.figure_parser import VisionFigureParser
|
|
from rag.nlp import is_english, random_choices, remove_contents_table
|
|
|
|
|
|
def remove_toc(items):
    """Filter table-of-contents entries out of ``items``.

    Every item is paired with its original position, the pairs are handed to
    ``remove_contents_table`` (its return value is not used; the list is read
    back afterwards, so it is presumably pruned in place), and the positions
    still present are used to rebuild the item list.

    Args:
        items: Sequence of items accepted by ``_item_text``.

    Returns:
        A ``(kept_items, kept_indices)`` tuple, where ``kept_indices`` are the
        positions in the original ``items`` of the entries that were kept.
    """
    text_position_pairs = []
    for position, entry in enumerate(items):
        text_position_pairs.append((_item_text(entry), position))

    language_is_english = _is_english(text_position_pairs)
    remove_contents_table(text_position_pairs, eng=language_is_english)

    surviving_positions = [position for _, position in text_position_pairs]
    surviving_items = [items[position] for position in surviving_positions]
    return surviving_items, surviving_positions
|
|
|
|
|
|
def extract_word_outlines(filename, binary=None):
    """Collect the heading paragraphs of a Word document as outline entries.

    Args:
        filename: Source passed to ``Document`` when ``binary`` is None.
        binary: Optional raw document bytes; when given they are wrapped in a
            ``BytesIO`` and used instead of ``filename``.

    Returns:
        A list of ``(text, level, None)`` tuples in document order. ``text``
        is the stripped paragraph text, ``level`` is the number found in the
        paragraph's "Heading N" style name minus one, and the third slot is
        always None (no page number is recorded here).
    """
    source = filename if binary is None else BytesIO(binary)
    doc = Document(source)

    outlines = []
    for paragraph in doc.paragraphs:
        text = paragraph.text.strip()
        if not text:
            continue

        # A paragraph may have no style, and a style may have no name
        # (reported as None); re.search() would raise TypeError on None, so
        # both cases are coerced to an empty string.
        style = paragraph.style
        style_name = (style.name if style else "") or ""

        match = re.search(r"Heading\s*(\d+)", style_name, re.I)
        if not match:
            continue
        outlines.append((text, int(match.group(1)) - 1, None))
    return outlines
|
|
|
|
|
|
def remove_toc_pdf(items, outlines):
    """Drop items located on the table-of-contents pages of a PDF.

    The first outline entry whose title looks like a TOC heading marks the
    start page. The first later entry at the same outline level that is not
    itself such a heading marks where real content begins. Items whose
    ``"page_number"`` lies in ``[toc_start_page, content_start_page)`` are
    removed.

    Args:
        items: Sequence of dicts, each with a ``"page_number"`` key.
        outlines: Sequence of ``(title, level, page_no)`` tuples.

    Returns:
        ``items`` itself when no page range can be determined, otherwise a
        new list without the items on the TOC pages.
    """
    if not outlines:
        return items

    toc_title_pattern = re.compile(r"(contents|目录|目次|table of contents|致谢|acknowledge)$")

    def is_toc_title(title):
        # Only the part before the first "@@" is compared; a missing title
        # is treated as "not a TOC heading" instead of raising.
        return toc_title_pattern.match((title or "").split("@@")[0].strip().lower()) is not None

    toc_start_page = None
    content_start_page = None
    for i, (title, level, page_no) in enumerate(outlines):
        if not is_toc_title(title):
            continue
        toc_start_page = page_no
        for next_title, next_level, next_page_no in outlines[i + 1:]:
            if next_level == level and not is_toc_title(next_title):
                content_start_page = next_page_no
                break
        # Only the first TOC-like heading is considered.
        break

    # Without a known start page the range comparison below would raise
    # TypeError (None <= int), so the input is returned unchanged instead.
    if toc_start_page is None or not content_start_page:
        return items
    return [item for item in items if not (toc_start_page <= item["page_number"] < content_start_page)]
|
|
|
|
|
|
def remove_toc_word(items, outlines):
    """Remove a table of contents from Word-document items.

    Without outlines the work is delegated entirely to ``remove_toc``. With
    outlines, the first item whose text looks like a TOC heading is dropped
    together with the run of items directly after it that look like TOC
    entries. An item counts as a TOC entry when it is empty, shares a prefix
    with an outline title, or ends in a leader (dots, ellipses, middle dots
    or spaces) followed by a number. ``remove_toc`` is then applied to what
    remains.

    Args:
        items: Sequence of items accepted by ``_item_text``.
        outlines: Sequence of ``(title, level, page_no)`` tuples.

    Returns:
        The list of items kept by ``remove_toc``.
    """
    if not outlines:
        filtered_items, _ = remove_toc(items)
        return filtered_items

    def normalize(text):
        # Only the part before the first "@@" takes part in comparisons.
        return text.split("@@")[0].strip().lower()

    # Titles that normalise to "" are discarded: the empty string is a prefix
    # of every string, so keeping one would make the prefix test below match
    # (and remove) every item that follows the TOC heading.
    outline_titles = []
    for title, _, _ in outlines:
        if not title:
            continue
        normalized_title = normalize(title)
        if normalized_title:
            outline_titles.append(normalized_title)

    if outline_titles:
        toc_heading = re.compile(r"(contents|目录|目次|table of contents|致谢|acknowledge)$")
        toc_entry_line = re.compile(r"(\.{2,}|…{2,}|·{2,}|[ ]{2,})\s*\d+\s*$")

        indexed = [(_item_text(item), idx) for idx, item in enumerate(items)]
        cursor = 0
        while cursor < len(indexed):
            if not toc_heading.match(normalize(indexed[cursor][0])):
                cursor += 1
                continue

            # Drop the TOC heading, then every directly following TOC entry.
            # After each pop the next candidate slides into ``cursor``.
            indexed.pop(cursor)
            while cursor < len(indexed):
                text = indexed[cursor][0]
                normalized = normalize(text)
                looks_like_toc_entry = (
                    not normalized
                    or any(normalized.startswith(title) or title.startswith(normalized) for title in outline_titles)
                    or toc_entry_line.search(text) is not None
                )
                if not looks_like_toc_entry:
                    break
                indexed.pop(cursor)
            # Only the first TOC heading is processed.
            break
        items = [items[idx] for _, idx in indexed]

    filtered_items, _ = remove_toc(items)
    return filtered_items
|
|
|
|
|
|
def _item_text(item):
|
|
if isinstance(item, str):
|
|
return item
|
|
if isinstance(item, dict):
|
|
return item["text"]
|
|
return item[0]
|
|
|
|
|
|
def _is_english(indexed):
|
|
texts = [text for text, _ in indexed if text]
|
|
if not texts:
|
|
return False
|
|
return is_english(random_choices(texts, k=200))
|
|
|
|
|
|
def _vision_result_text(parsed):
    """Extract the description text from a ``VisionFigureParser`` result.

    The result is read as ``[((image, text_or_text_list), positions), ...]``
    and only the first entry is used. A list or tuple of fragments is joined
    with newlines. Calling ``str()`` on it would leak the list repr into the
    text: ``str([""])`` is the non-empty string ``"['']"``.

    Returns:
        The stripped description, or ``""`` when there is none or the result
        does not have the expected shape.
    """
    try:
        description = parsed[0][0][1]
    except (IndexError, KeyError, TypeError):
        return ""
    if isinstance(description, (list, tuple)):
        fragments = (str(part).strip() for part in description if part)
        return "\n".join(fragment for fragment in fragments if fragment)
    return str(description or "").strip()


def enhance_media_sections_with_vision(
    sections,
    tenant_id,
    vlm_conf=None,
    callback=None,
):
    """Append vision-model descriptions to image and table sections.

    This is best-effort: any failure to obtain a model or to parse an
    individual image is logged and leaves the affected text unchanged.

    Args:
        sections: List of section dicts. Only those whose ``"doc_type_kwd"``
            is ``"image"`` or ``"table"`` and whose ``"image"`` is not None
            are processed; their ``"text"`` is updated in place.
        tenant_id: Tenant whose IMAGE2TEXT model is used.
        vlm_conf: Optional mapping with an ``"llm_id"`` key naming the model.
            When it is missing or the lookup fails, the tenant's default
            IMAGE2TEXT model is used instead.
        callback: Forwarded to ``VisionFigureParser`` as ``callback``.

    Returns:
        The same ``sections`` object that was passed in.
    """
    # Imported locally so this block does not depend on the module's
    # top-of-file import list.
    import logging

    if not sections or not tenant_id:
        return sections

    try:
        use_default_model = True
        if vlm_conf:
            try:
                vision_model_config = get_model_config_by_type_and_name(
                    tenant_id, LLMType.IMAGE2TEXT, vlm_conf["llm_id"]
                )
                use_default_model = False
            except Exception:
                logging.warning(
                    "Configured vision model lookup failed; falling back to the tenant default.",
                    exc_info=True,
                )
        if use_default_model:
            vision_model_config = get_tenant_default_model_by_type(
                tenant_id, LLMType.IMAGE2TEXT
            )
        vision_model = LLMBundle(tenant_id, vision_model_config)
    except Exception:
        logging.warning(
            "No usable IMAGE2TEXT model; media sections are left unchanged.",
            exc_info=True,
        )
        return sections

    for item in sections:
        if item.get("doc_type_kwd") not in {"image", "table"}:
            continue
        if item.get("image") is None:
            continue

        text = item.get("text") or ""
        try:
            parsed = VisionFigureParser(
                vision_model=vision_model,
                figures_data=[((item["image"], [""]), [(0, 0, 0, 0, 0)])],
                context_size=0,
            )(callback=callback)
        except Exception:
            logging.warning(
                "Vision parsing failed for a %s section; keeping its original text.",
                item.get("doc_type_kwd"),
                exc_info=True,
            )
            continue

        parsed_text = _vision_result_text(parsed) if parsed else ""
        if parsed_text:
            item["text"] = f"{text}\n{parsed_text}" if text else parsed_text

    return sections
|