ragflow/rag/flow/parser/utils.py

#
#  Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#
import re
from io import BytesIO

from bs4 import BeautifulSoup
from docx import Document
from api.db.services.llm_service import LLMBundle
from api.db.joint_services.tenant_model_service import (
    get_model_config_by_type_and_name,
    get_tenant_default_model_by_type,
)
from common.constants import LLMType
from deepdoc.parser.figure_parser import VisionFigureParser
from rag.nlp import is_english, random_choices, remove_contents_table


def remove_toc(items):
    indexed = [(_item_text(item), i) for i, item in enumerate(items)]
    remove_contents_table(indexed, eng=_is_english(indexed))
    kept_indices = [i for _, i in indexed]
    return [items[i] for i in kept_indices], kept_indices


def extract_docx_header_footer_texts(filename=None, binary=None):
    doc = Document(filename) if binary is None else Document(BytesIO(binary))
    texts = set()
    for section in doc.sections:
        for container in (section.header, section.footer):
            for paragraph in container.paragraphs:
                normalized = re.sub(r"\s+", " ", paragraph.text).strip()
                if normalized:
                    texts.add(normalized)
            for table in container.tables:
                for row in table.rows:
                    for cell in row.cells:
                        normalized = re.sub(r"\s+", " ", cell.text).strip()
                        if normalized:
                            texts.add(normalized)
    return texts


def remove_header_footer_docx_sections(items, header_footer_texts):
    if not header_footer_texts:
        return items

    filtered = []
    for item in items:
        text = _item_text(item)
        normalized = re.sub(r"\s+", " ", text).strip() if isinstance(text, str) else ""
        if normalized and normalized in header_footer_texts:
            continue
        filtered.append(item)
    return filtered


def remove_header_footer_html_blob(blob):
    soup = BeautifulSoup(blob, "html.parser")
    for element in soup.find_all(
        lambda tag: tag.name in {"header", "footer"}
        or tag.get("role") in {"banner", "contentinfo"}
    ):
        element.decompose()
    return str(soup).encode("utf-8")


def extract_word_outlines(filename, binary=None):
    doc = Document(filename) if binary is None else Document(BytesIO(binary))
    outlines = []
    for paragraph in doc.paragraphs:
        text = paragraph.text.strip()
        if not text:
            continue
        style_name = paragraph.style.name if paragraph.style else ""
        match = re.search(r"Heading\s*(\d+)", style_name, re.I)
        if not match:
            continue
        outlines.append((text, int(match.group(1)) - 1, None))
    return outlines


def remove_toc_pdf(items, outlines):
    if not outlines:
        return items

    toc_start_page = None
    content_start_page = None
    for i, (title, level, page_no) in enumerate(outlines):
        if re.match(r"(contents|目录|目次|table of contents|致谢|acknowledge)$", title.split("@@")[0].strip().lower()):
            toc_start_page = page_no
            for next_title, next_level, next_page_no in outlines[i + 1:]:
                if next_level != level:
                    continue
                if re.match(r"(contents|目录|目次|table of contents|致谢|acknowledge)$", next_title.split("@@")[0].strip().lower()):
                    continue
                content_start_page = next_page_no
                break
            break

    if content_start_page:
        return [item for item in items if not (toc_start_page <= item["page_number"] < content_start_page)]
    return items


def remove_toc_word(items, outlines):
    if not outlines:
        filtered_items, _ = remove_toc(items)
        return filtered_items
    outline_titles = [title.split("@@")[0].strip().lower() for title, _, _ in outlines if title]
    if outline_titles:
        indexed = [(_item_text(item), i) for i, item in enumerate(items)]
        i = 0
        while i < len(indexed):
            if not re.match(r"(contents|目录|目次|table of contents|致谢|acknowledge)$", indexed[i][0].split("@@")[0].strip().lower()):
                i += 1
                continue
            indexed.pop(i)
            while i < len(indexed):
                text = indexed[i][0]
                normalized = text.split("@@")[0].strip().lower()
                if not normalized:
                    indexed.pop(i)
                    continue
                if any(normalized.startswith(title) or title.startswith(normalized) for title in outline_titles):
                    indexed.pop(i)
                    continue
                if re.search(r"(\.{2,}|…{2,}|·{2,}|[ ]{2,})\s*\d+\s*$", text):
                    indexed.pop(i)
                    continue
                break
            break
        items = [items[i] for _, i in indexed]
    filtered_items, _ = remove_toc(items)
    return filtered_items


def _item_text(item):
    if isinstance(item, str):
        return item
    if isinstance(item, dict):
        return item["text"]
    return item[0]


def _is_english(indexed):
    texts = [text for text, _ in indexed if text]
    if not texts:
        return False
    return is_english(random_choices(texts, k=200))


def enhance_media_sections_with_vision(
    sections,
    tenant_id,
    vlm_conf=None,
    callback=None,
):
    if not sections or not tenant_id:
        return sections

    try:
        try:
            vision_model_config = get_model_config_by_type_and_name(
                tenant_id, LLMType.IMAGE2TEXT, vlm_conf["llm_id"]
            )
        except Exception:
            vision_model_config = get_tenant_default_model_by_type(
                tenant_id, LLMType.IMAGE2TEXT
            )
        vision_model = LLMBundle(tenant_id, vision_model_config)
    except Exception:
        return sections

    for item in sections:
        if item.get("doc_type_kwd") not in {"image", "table"}:
            continue
        if item.get("image") is None:
            continue

        text = item.get("text") or ""
        try:
            parsed = VisionFigureParser(
                vision_model=vision_model,
                figures_data=[((item["image"], [""]), [(0, 0, 0, 0, 0)])],
                context_size=0,
            )(callback=callback)
        except Exception:
            continue

        if not parsed:
            continue

        # VisionFigureParser returns [((image, text_or_text_list), positions), ...].
        first_result = parsed[0]
        # first_result[0] is the (image, parsed_text) tuple.
        image_and_text = first_result[0]
        # image_and_text[1] is the parsed text content.
        parsed_text = str(image_and_text[1] or "").strip()

        if parsed_text:
            item["text"] = f"{text}\n{parsed_text}" if text else parsed_text

    return sections