mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-05-21 16:40:07 +08:00
### What problem does this PR solve? Feat: add button for remove header & footer in pipeline ### Type of change - [x] New Feature (non-breaking change which adds functionality)
217 lines
7.3 KiB
Python
217 lines
7.3 KiB
Python
#
|
|
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
#
|
|
import re
|
|
from io import BytesIO
|
|
|
|
from bs4 import BeautifulSoup
|
|
from docx import Document
|
|
from api.db.services.llm_service import LLMBundle
|
|
from api.db.joint_services.tenant_model_service import (
|
|
get_model_config_by_type_and_name,
|
|
get_tenant_default_model_by_type,
|
|
)
|
|
from common.constants import LLMType
|
|
from deepdoc.parser.figure_parser import VisionFigureParser
|
|
from rag.nlp import is_english, random_choices, remove_contents_table
|
|
|
|
|
|
def remove_toc(items):
|
|
indexed = [(_item_text(item), i) for i, item in enumerate(items)]
|
|
remove_contents_table(indexed, eng=_is_english(indexed))
|
|
kept_indices = [i for _, i in indexed]
|
|
return [items[i] for i in kept_indices], kept_indices
|
|
|
|
|
|
def extract_docx_header_footer_texts(filename=None, binary=None):
|
|
doc = Document(filename) if binary is None else Document(BytesIO(binary))
|
|
texts = set()
|
|
for section in doc.sections:
|
|
for container in (section.header, section.footer):
|
|
for paragraph in container.paragraphs:
|
|
normalized = re.sub(r"\s+", " ", paragraph.text).strip()
|
|
if normalized:
|
|
texts.add(normalized)
|
|
for table in container.tables:
|
|
for row in table.rows:
|
|
for cell in row.cells:
|
|
normalized = re.sub(r"\s+", " ", cell.text).strip()
|
|
if normalized:
|
|
texts.add(normalized)
|
|
return texts
|
|
|
|
|
|
def remove_header_footer_docx_sections(items, header_footer_texts):
|
|
if not header_footer_texts:
|
|
return items
|
|
|
|
filtered = []
|
|
for item in items:
|
|
text = _item_text(item)
|
|
normalized = re.sub(r"\s+", " ", text).strip() if isinstance(text, str) else ""
|
|
if normalized and normalized in header_footer_texts:
|
|
continue
|
|
filtered.append(item)
|
|
return filtered
|
|
|
|
|
|
def remove_header_footer_html_blob(blob):
|
|
soup = BeautifulSoup(blob, "html.parser")
|
|
for element in soup.find_all(
|
|
lambda tag: tag.name in {"header", "footer"}
|
|
or tag.get("role") in {"banner", "contentinfo"}
|
|
):
|
|
element.decompose()
|
|
return str(soup).encode("utf-8")
|
|
|
|
|
|
def extract_word_outlines(filename, binary=None):
|
|
doc = Document(filename) if binary is None else Document(BytesIO(binary))
|
|
outlines = []
|
|
for paragraph in doc.paragraphs:
|
|
text = paragraph.text.strip()
|
|
if not text:
|
|
continue
|
|
style_name = paragraph.style.name if paragraph.style else ""
|
|
match = re.search(r"Heading\s*(\d+)", style_name, re.I)
|
|
if not match:
|
|
continue
|
|
outlines.append((text, int(match.group(1)) - 1, None))
|
|
return outlines
|
|
|
|
|
|
def remove_toc_pdf(items, outlines):
|
|
if not outlines:
|
|
return items
|
|
|
|
toc_start_page = None
|
|
content_start_page = None
|
|
for i, (title, level, page_no) in enumerate(outlines):
|
|
if re.match(r"(contents|目录|目次|table of contents|致谢|acknowledge)$", title.split("@@")[0].strip().lower()):
|
|
toc_start_page = page_no
|
|
for next_title, next_level, next_page_no in outlines[i + 1:]:
|
|
if next_level != level:
|
|
continue
|
|
if re.match(r"(contents|目录|目次|table of contents|致谢|acknowledge)$", next_title.split("@@")[0].strip().lower()):
|
|
continue
|
|
content_start_page = next_page_no
|
|
break
|
|
break
|
|
|
|
if content_start_page:
|
|
return [item for item in items if not (toc_start_page <= item["page_number"] < content_start_page)]
|
|
return items
|
|
|
|
|
|
def remove_toc_word(items, outlines):
|
|
if not outlines:
|
|
filtered_items, _ = remove_toc(items)
|
|
return filtered_items
|
|
outline_titles = [title.split("@@")[0].strip().lower() for title, _, _ in outlines if title]
|
|
if outline_titles:
|
|
indexed = [(_item_text(item), i) for i, item in enumerate(items)]
|
|
i = 0
|
|
while i < len(indexed):
|
|
if not re.match(r"(contents|目录|目次|table of contents|致谢|acknowledge)$", indexed[i][0].split("@@")[0].strip().lower()):
|
|
i += 1
|
|
continue
|
|
indexed.pop(i)
|
|
while i < len(indexed):
|
|
text = indexed[i][0]
|
|
normalized = text.split("@@")[0].strip().lower()
|
|
if not normalized:
|
|
indexed.pop(i)
|
|
continue
|
|
if any(normalized.startswith(title) or title.startswith(normalized) for title in outline_titles):
|
|
indexed.pop(i)
|
|
continue
|
|
if re.search(r"(\.{2,}|…{2,}|·{2,}|[ ]{2,})\s*\d+\s*$", text):
|
|
indexed.pop(i)
|
|
continue
|
|
break
|
|
break
|
|
items = [items[i] for _, i in indexed]
|
|
filtered_items, _ = remove_toc(items)
|
|
return filtered_items
|
|
|
|
|
|
def _item_text(item):
|
|
if isinstance(item, str):
|
|
return item
|
|
if isinstance(item, dict):
|
|
return item["text"]
|
|
return item[0]
|
|
|
|
|
|
def _is_english(indexed):
|
|
texts = [text for text, _ in indexed if text]
|
|
if not texts:
|
|
return False
|
|
return is_english(random_choices(texts, k=200))
|
|
|
|
|
|
def enhance_media_sections_with_vision(
|
|
sections,
|
|
tenant_id,
|
|
vlm_conf=None,
|
|
callback=None,
|
|
):
|
|
if not sections or not tenant_id:
|
|
return sections
|
|
|
|
try:
|
|
try:
|
|
vision_model_config = get_model_config_by_type_and_name(
|
|
tenant_id, LLMType.IMAGE2TEXT, vlm_conf["llm_id"]
|
|
)
|
|
except Exception:
|
|
vision_model_config = get_tenant_default_model_by_type(
|
|
tenant_id, LLMType.IMAGE2TEXT
|
|
)
|
|
vision_model = LLMBundle(tenant_id, vision_model_config)
|
|
except Exception:
|
|
return sections
|
|
|
|
for item in sections:
|
|
if item.get("doc_type_kwd") not in {"image", "table"}:
|
|
continue
|
|
if item.get("image") is None:
|
|
continue
|
|
|
|
text = item.get("text") or ""
|
|
try:
|
|
parsed = VisionFigureParser(
|
|
vision_model=vision_model,
|
|
figures_data=[((item["image"], [""]), [(0, 0, 0, 0, 0)])],
|
|
context_size=0,
|
|
)(callback=callback)
|
|
except Exception:
|
|
continue
|
|
|
|
if not parsed:
|
|
continue
|
|
|
|
# VisionFigureParser returns [((image, text_or_text_list), positions), ...].
|
|
first_result = parsed[0]
|
|
# first_result[0] is the (image, parsed_text) tuple.
|
|
image_and_text = first_result[0]
|
|
# image_and_text[1] is the parsed text content.
|
|
parsed_text = str(image_and_text[1] or "").strip()
|
|
|
|
if parsed_text:
|
|
item["text"] = f"{text}\n{parsed_text}" if text else parsed_text
|
|
|
|
return sections
|