Files
ragflow/rag/flow/parser/utils.py
Magicbook1108 bb3b99f0a5 Feat: add button for remove header & footer in pipeline (#14486)
### What problem does this PR solve?

Feat: add button for remove header & footer in pipeline

### Type of change


- [x] New Feature (non-breaking change which adds functionality)
2026-04-30 12:30:41 +08:00

217 lines
7.3 KiB
Python

#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import re
from io import BytesIO
from bs4 import BeautifulSoup
from docx import Document
from api.db.services.llm_service import LLMBundle
from api.db.joint_services.tenant_model_service import (
get_model_config_by_type_and_name,
get_tenant_default_model_by_type,
)
from common.constants import LLMType
from deepdoc.parser.figure_parser import VisionFigureParser
from rag.nlp import is_english, random_choices, remove_contents_table
def remove_toc(items):
    """Strip table-of-contents entries from ``items``.

    Every item is paired with its original position and the resulting list
    is handed to ``remove_contents_table``. That helper's return value is
    ignored, so it is relied upon to prune the list in place; the positions
    that remain afterwards decide which items are kept.

    Args:
        items: Indexable sequence of items accepted by ``_item_text``.

    Returns:
        A ``(kept_items, kept_indices)`` tuple, where ``kept_indices`` are
        positions into the original ``items``.
    """
    text_position_pairs = []
    for position, entry in enumerate(items):
        text_position_pairs.append((_item_text(entry), position))

    # Mutates ``text_position_pairs``; only surviving pairs remain afterwards.
    remove_contents_table(text_position_pairs, eng=_is_english(text_position_pairs))

    kept_indices = [position for _, position in text_position_pairs]
    kept_items = [items[position] for position in kept_indices]
    return kept_items, kept_indices
def extract_docx_header_footer_texts(filename=None, binary=None):
    """Collect the distinct header and footer texts of a DOCX document.

    The document is opened from ``binary`` when it is given, otherwise from
    ``filename``. For each section, the text of every paragraph and of every
    table cell found in the section's header and footer is whitespace-
    normalised (runs of whitespace collapsed to one space, ends trimmed);
    non-empty results are gathered into a set.

    Args:
        filename: Path (or whatever ``Document`` accepts) used when
            ``binary`` is ``None``.
        binary: Raw DOCX bytes; takes precedence over ``filename``.

    Returns:
        A ``set`` of normalised, non-empty header/footer strings.
    """
    document = Document(filename) if binary is None else Document(BytesIO(binary))

    def _collapse(raw):
        # Same normalisation for paragraphs and table cells.
        return re.sub(r"\s+", " ", raw).strip()

    collected = set()
    for section in document.sections:
        for region in (section.header, section.footer):
            raw_texts = [paragraph.text for paragraph in region.paragraphs]
            raw_texts.extend(
                cell.text
                for table in region.tables
                for row in table.rows
                for cell in row.cells
            )
            # filter(None, ...) discards strings that normalised to "".
            collected.update(filter(None, map(_collapse, raw_texts)))
    return collected
def remove_header_footer_docx_sections(items, header_footer_texts):
    """Filter out items whose text equals a known header/footer string.

    An item's text (as returned by ``_item_text``) is whitespace-normalised
    the same way the header/footer texts are expected to be: whitespace runs
    collapsed to a single space and both ends trimmed. Items whose text is
    not a string, or normalises to the empty string, are always kept.

    Args:
        items: Iterable of items accepted by ``_item_text``.
        header_footer_texts: Container of normalised header/footer strings.

    Returns:
        ``items`` itself when ``header_footer_texts`` is empty/falsy,
        otherwise a new list without the matching items.
    """
    if not header_footer_texts:
        return items

    def _is_header_or_footer(entry):
        raw = _item_text(entry)
        if not isinstance(raw, str):
            return False
        collapsed = re.sub(r"\s+", " ", raw).strip()
        return bool(collapsed) and collapsed in header_footer_texts

    return [entry for entry in items if not _is_header_or_footer(entry)]
def remove_header_footer_html_blob(blob):
    """Return ``blob`` with header/footer elements removed, as UTF-8 bytes.

    The markup is parsed with BeautifulSoup's ``html.parser``. Every tag
    named ``header`` or ``footer``, and every tag whose ``role`` attribute is
    ``banner`` or ``contentinfo``, is decomposed. The remaining tree is
    serialised with ``str()`` and encoded as UTF-8.

    Args:
        blob: HTML markup to clean.

    Returns:
        The cleaned document as ``bytes``.
    """

    def _is_page_chrome(tag):
        if tag.name in {"header", "footer"}:
            return True
        return tag.get("role") in {"banner", "contentinfo"}

    soup = BeautifulSoup(blob, "html.parser")
    for node in soup.find_all(_is_page_chrome):
        node.decompose()
    return str(soup).encode("utf-8")
def extract_word_outlines(filename, binary=None):
    """Build an outline from the heading paragraphs of a DOCX document.

    The document is opened from ``binary`` when it is given, otherwise from
    ``filename``. A paragraph contributes an outline entry when its stripped
    text is non-empty and its style name contains ``Heading <n>``
    (case-insensitive).

    Args:
        filename: Path (or whatever ``Document`` accepts) used when
            ``binary`` is ``None``.
        binary: Raw DOCX bytes; takes precedence over ``filename``.

    Returns:
        A list of ``(title, level, None)`` tuples in document order, where
        ``level`` is the heading number minus one and the third element (the
        page-number slot) is always ``None``.
    """
    doc = Document(filename) if binary is None else Document(BytesIO(binary))
    heading_style = re.compile(r"Heading\s*(\d+)", re.I)

    outlines = []
    for paragraph in doc.paragraphs:
        text = paragraph.text.strip()
        if not text:
            continue
        # A style object can exist without a name (``name`` is then None);
        # coerce to "" so the regex search never receives a non-string.
        style_name = (paragraph.style.name if paragraph.style else "") or ""
        match = heading_style.search(style_name)
        if match:
            outlines.append((text, int(match.group(1)) - 1, None))
    return outlines
def remove_toc_pdf(items, outlines):
    """Drop items that sit on the table-of-contents pages of a PDF.

    ``outlines`` is a sequence of ``(title, level, page_no)`` tuples. The
    first entry whose title (the part before ``"@@"``, stripped and
    lower-cased) is a TOC-style heading marks the first TOC page. The next
    entry at the same level that is not itself a TOC-style heading marks the
    page where real content resumes. Items whose ``item["page_number"]``
    falls in ``[toc_start_page, content_start_page)`` are removed.

    Args:
        items: Sequence of dicts carrying a ``"page_number"`` key.
        outlines: Outline entries as described above; may be empty.

    Returns:
        ``items`` itself when nothing can be removed (no outlines, no TOC
        heading, no following same-level heading, or an unknown page number
        on either boundary); otherwise a new filtered list.
    """
    if not outlines:
        return items

    toc_heading = re.compile(r"(contents|目录|目次|table of contents|致谢|acknowledge)$")

    def _is_toc_heading(title):
        return bool(toc_heading.match(title.split("@@")[0].strip().lower()))

    toc_start_page = None
    content_start_page = None
    for i, (title, level, page_no) in enumerate(outlines):
        if not _is_toc_heading(title):
            continue
        toc_start_page = page_no
        for next_title, next_level, next_page_no in outlines[i + 1:]:
            if next_level != level or _is_toc_heading(next_title):
                continue
            content_start_page = next_page_no
            break
        # Only the first TOC-style heading is considered.
        break

    # Outline entries may carry ``None`` as page number; comparing None with
    # an int would raise TypeError, so treat an unknown boundary as
    # "nothing to remove".
    if toc_start_page is None or not content_start_page:
        return items
    return [item for item in items if not (toc_start_page <= item["page_number"] < content_start_page)]
def remove_toc_word(items, outlines):
    """Remove a table-of-contents block from Word items using the outline.

    Without outlines the work is delegated entirely to ``remove_toc``.
    Otherwise the first item whose text (the part before ``"@@"``, stripped
    and lower-cased) is a TOC-style heading is located, and that item plus
    the run of items directly after it that look like TOC entries are
    dropped. An item looks like a TOC entry when its normalised text is
    empty, when it is a prefix of (or is prefixed by) one of the outline
    titles, or when its raw text ends in a dot/ellipsis/space leader followed
    by a number. The remaining items are finally passed through
    ``remove_toc``.

    Args:
        items: Indexable sequence of items accepted by ``_item_text``.
        outlines: Sequence of ``(title, level, page_no)`` tuples; may be
            empty.

    Returns:
        The filtered list of items returned by ``remove_toc``.
    """
    if not outlines:
        filtered_items, _ = remove_toc(items)
        return filtered_items

    def _normalize(text):
        return text.split("@@")[0].strip().lower()

    toc_heading = re.compile(r"(contents|目录|目次|table of contents|致谢|acknowledge)$")
    leader_and_page = re.compile(r"(\.{2,}|…{2,}|·{2,}|[ ]{2,})\s*\d+\s*$")

    # Titles that normalise to "" must be discarded: ``str.startswith("")``
    # is always true, so a blank title would make every item after the TOC
    # heading look like a TOC entry and wipe the rest of the document.
    outline_titles = [
        normalized
        for normalized in (_normalize(title) for title, _, _ in outlines if title)
        if normalized
    ]

    def _looks_like_toc_entry(text):
        normalized = _normalize(text)
        if not normalized:
            return True
        if any(normalized.startswith(title) or title.startswith(normalized) for title in outline_titles):
            return True
        return bool(leader_and_page.search(text))

    if outline_titles:
        texts = [_item_text(item) for item in items]
        # Only the first TOC-style heading is handled here.
        toc_start = next(
            (pos for pos, text in enumerate(texts) if toc_heading.match(_normalize(text))),
            None,
        )
        if toc_start is not None:
            toc_end = toc_start + 1
            while toc_end < len(texts) and _looks_like_toc_entry(texts[toc_end]):
                toc_end += 1
            items = [item for pos, item in enumerate(items) if not (toc_start <= pos < toc_end)]

    filtered_items, _ = remove_toc(items)
    return filtered_items
def _item_text(item):
if isinstance(item, str):
return item
if isinstance(item, dict):
return item["text"]
return item[0]
def _is_english(indexed):
texts = [text for text, _ in indexed if text]
if not texts:
return False
return is_english(random_choices(texts, k=200))
def enhance_media_sections_with_vision(
    sections,
    tenant_id,
    vlm_conf=None,
    callback=None,
):
    """Append vision-model descriptions to image/table sections, in place.

    The image-to-text model is looked up by ``vlm_conf["llm_id"]``; if that
    lookup raises for any reason (including ``vlm_conf`` being ``None``), the
    tenant's default image-to-text model is used instead. If no model can be
    built, ``sections`` is returned untouched.

    Each section whose ``doc_type_kwd`` is ``"image"`` or ``"table"`` and
    whose ``image`` is not ``None`` is run through ``VisionFigureParser``.
    Non-empty parsed text is appended to the section's existing ``text``
    after a newline, or becomes the ``text`` when there was none. A section
    whose parsing raises or yields nothing is left as it was.

    Args:
        sections: List of section dicts; mutated in place.
        tenant_id: Tenant whose model configuration is used.
        vlm_conf: Optional mapping providing ``"llm_id"``.
        callback: Forwarded to the ``VisionFigureParser`` call.

    Returns:
        The same ``sections`` object that was passed in.
    """
    if not (sections and tenant_id):
        return sections

    # Best effort: any failure to resolve or build the model disables the
    # enhancement rather than failing the caller.
    try:
        try:
            model_config = get_model_config_by_type_and_name(
                tenant_id, LLMType.IMAGE2TEXT, vlm_conf["llm_id"]
            )
        except Exception:
            model_config = get_tenant_default_model_by_type(
                tenant_id, LLMType.IMAGE2TEXT
            )
        vision_model = LLMBundle(tenant_id, model_config)
    except Exception:
        return sections

    media_kinds = {"image", "table"}
    for section in sections:
        if section.get("doc_type_kwd") not in media_kinds or section.get("image") is None:
            continue

        existing_text = section.get("text") or ""
        try:
            figure_parser = VisionFigureParser(
                vision_model=vision_model,
                figures_data=[((section["image"], [""]), [(0, 0, 0, 0, 0)])],
                context_size=0,
            )
            results = figure_parser(callback=callback)
        except Exception:
            continue
        if not results:
            continue

        # Results are shaped [((image, text_or_text_list), positions), ...];
        # take the text half of the first result's (image, text) pair.
        described = str(results[0][0][1] or "").strip()
        if not described:
            continue
        section["text"] = f"{existing_text}\n{described}" if existing_text else described
    return sections