Files
ragflow/rag/flow/parser/pdf_chunk_metadata.py
Magicbook1108 bb3b99f0a5 Feat: add button for remove header & footer in pipeline (#14486)
### What problem does this PR solve?

Feat: add button for remove header & footer in pipeline

### Type of change


- [x] New Feature (non-breaking change which adds functionality)
2026-04-30 12:30:41 +08:00

349 lines
11 KiB
Python

#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import io
import logging
import sys
from copy import deepcopy
from functools import partial
import numpy as np
import pdfplumber
from PIL import Image
from api.db.services.file2document_service import File2DocumentService
from api.db.services.file_service import FileService
from common import settings
from common.misc_utils import get_uuid
from deepdoc.parser.pdf_parser import LOCK_KEY_pdfplumber, RAGFlowPdfParser
from rag.utils.base64_image import image2id
# Distance kept between a chunk's body and the dimmed context strips in
# _crop_pdf_preview; also reused there as the spacing between stacked crops.
PDF_PREVIEW_GAP = 6
# How far above the first / below the last position the context strips reach.
PDF_PREVIEW_CONTEXT = 120
# Default zoom for preview rendering (pages are rasterized at 72 * zoom).
PDF_PREVIEW_ZOOM = 3
# Item key under which canonical PDF positions are stored.
PDF_POSITIONS_KEY = "_pdf_positions"
# Default zoom assumed for pdf_parser.page_images in reorder_multi_column_bboxes.
PDF_MULTI_COLUMN_ZOOM = 3
def _extract_raw_positions(item):
    """Return the first usable raw position list found on ``item``.

    Sources are probed in priority order:

    1. ``item[PDF_POSITIONS_KEY]`` and then ``item["positions"]`` -- returned
       as a deep copy as soon as the value is a list (even an empty one).
    2. ``item["position_tag"]`` -- a non-empty string decoded with
       ``RAGFlowPdfParser.extract_positions``; only the last element of each
       decoded page entry is kept.
    3. ``item["position_int"]`` -- list/tuple entries holding at least five
       values, each copied into a new list.
    4. The flat ``page_number`` / ``x0`` / ``x1`` / ``top`` / ``bottom`` keys,
       used only when none of them is ``None``.

    Returns an empty list when no source applies. Values are neither
    validated nor converted here.
    """
    for list_key in (PDF_POSITIONS_KEY, "positions"):
        stored = item.get(list_key)
        if isinstance(stored, list):
            return deepcopy(stored)

    tag = item.get("position_tag")
    if isinstance(tag, str) and tag:
        decoded = RAGFlowPdfParser.extract_positions(tag)
        return [[entry[0][-1], *entry[1:]] for entry in decoded]

    packed = item.get("position_int")
    if isinstance(packed, list):
        rows = []
        for entry in packed:
            if isinstance(entry, (list, tuple)) and len(entry) >= 5:
                rows.append(list(entry))
        return rows

    box_keys = ("page_number", "x0", "x1", "top", "bottom")
    if any(item.get(key) is None for key in box_keys):
        return []
    return [[item[key] for key in box_keys]]
def extract_pdf_positions(item):
    """Return the positions of ``item`` in the canonical PDF coordinate shape.

    Parser-owned canonical PDF coordinate shape::

        [[page_number, left, right, top, bottom], ...]

    with ``page_number`` as ``int`` and the four coordinates as ``float``.

    Args:
        item: A chunk/box dict. Anything that is not a dict yields ``[]``.

    Returns:
        A new list of five-element lists. Entries that are not a list/tuple
        of at least five values, or whose values cannot be converted to
        numbers, are skipped.

    Page numbers are adjusted as follows: a non-positive ``item["page_number"]``
    is incremented by one before being used as the reference page; a position
    whose page equals ``reference - 1`` is moved onto the reference page;
    any other non-positive page is incremented by one.
    NOTE(review): this presumably reconciles 0-based page indices with
    1-based page numbers -- confirm against the producers of these fields.
    """
    if not isinstance(item, dict):
        return []

    raw_positions = _extract_raw_positions(item)

    ref_page_number = item.get("page_number")
    if isinstance(ref_page_number, (int, float)):
        try:
            ref_page_number = int(ref_page_number)
        except (ValueError, OverflowError):
            # NaN / infinity cannot serve as a reference page; ignore it
            # instead of failing the whole extraction.
            ref_page_number = None
    else:
        ref_page_number = None
    if ref_page_number is not None and ref_page_number <= 0:
        ref_page_number += 1

    normalized_positions = []
    for pos in raw_positions:
        if not isinstance(pos, (list, tuple)) or len(pos) < 5:
            continue
        try:
            # The page slot may itself be a list of pages; the last one wins.
            # An empty list raises IndexError and is skipped like any other
            # malformed entry.
            page_number = pos[0][-1] if isinstance(pos[0], list) else pos[0]
            page_number = int(page_number)
            coords = [float(value) for value in pos[1:5]]
        except (TypeError, ValueError, IndexError, OverflowError):
            continue
        if ref_page_number is not None and page_number == ref_page_number - 1:
            page_number = ref_page_number
        elif page_number <= 0:
            page_number += 1
        normalized_positions.append([page_number, *coords])
    return normalized_positions
def normalize_pdf_item_metadata(item):
    """Store the canonical positions of ``item`` under ``PDF_POSITIONS_KEY``.

    ``item`` is modified in place and returned. When no positions can be
    extracted, any existing ``PDF_POSITIONS_KEY`` entry is removed instead.
    Values that are not dicts are returned untouched.
    """
    if isinstance(item, dict):
        canonical = extract_pdf_positions(item)
        if not canonical:
            item.pop(PDF_POSITIONS_KEY, None)
        else:
            item[PDF_POSITIONS_KEY] = canonical
    return item
def normalize_pdf_items_metadata(items):
    """Apply ``normalize_pdf_item_metadata`` to every element of ``items``.

    The list and its elements are modified in place; the same ``items``
    object is returned. Anything that is not a list is returned as is.
    """
    if isinstance(items, list):
        for entry in items:
            normalize_pdf_item_metadata(entry)
    return items
def reorder_multi_column_bboxes(pdf_parser, bboxes, zoom=PDF_MULTI_COLUMN_ZOOM):
    """Re-sort ``bboxes`` by column when the text boxes look multi-column.

    The median width of the boxes whose ``layout_type`` is ``"text"`` (and
    that carry ``x0``, ``x1`` and ``page_number``) is compared with half the
    width of the first page image divided by ``zoom``. Narrower boxes are
    treated as columns and handed to ``pdf_parser.sort_X_by_page`` with half
    the median width as threshold; otherwise ``bboxes`` is returned unchanged.

    ``bboxes`` is also returned unchanged when there are no qualifying text
    boxes or ``pdf_parser.page_images`` is empty.
    """
    wanted_keys = ("x0", "x1", "page_number")
    candidates = []
    for box in bboxes:
        if box.get("layout_type") != "text":
            continue
        if any(box.get(key) is None for key in wanted_keys):
            continue
        candidates.append(box)

    if not (candidates and pdf_parser.page_images):
        return bboxes

    widths = [box["x1"] - box["x0"] for box in candidates]
    column_width = np.median(widths)
    half_page = (pdf_parser.page_images[0].size[0] / zoom) / 2
    if column_width >= half_page:
        # Boxes span at least half the page: single-column layout.
        return bboxes
    return pdf_parser.sort_X_by_page(bboxes, column_width / 2)
def merge_pdf_positions(sources):
    """Merge positions from several sources into one de-duplicated list.

    Each source is either a dict (positions are obtained through
    ``extract_pdf_positions``) or a list of positions used as given; other
    source types are ignored. Only list/tuple positions with at least five
    values are kept, truncated to their first five values. Duplicates keep
    their first occurrence.

    Returns a new list of five-element lists ordered by
    ``(page, top, left)``, i.e. indexes ``(0, 3, 1)``.
    """
    unique = {}
    for source in sources or []:
        if isinstance(source, dict):
            candidates = extract_pdf_positions(source)
        elif isinstance(source, list):
            candidates = source
        else:
            continue
        for entry in candidates:
            if isinstance(entry, (list, tuple)) and len(entry) >= 5:
                head = entry[:5]
                # dicts preserve insertion order, so the first occurrence
                # of each position wins and keeps its place.
                unique.setdefault(tuple(head), list(head))
    return sorted(unique.values(), key=lambda row: (row[0], row[3], row[1]))
def build_pdf_position_fields(positions):
    """Build the integer position fields stored on a chunk.

    Args:
        positions: Iterable of ``[page, left, right, top, bottom]`` entries
            (``None`` is treated as empty). Entries that are not a
            list/tuple of at least five values are skipped.

    Returns:
        A dict with three freshly built, index-aligned lists:

        * ``position_int`` -- ``(page, left, right, top, bottom)`` tuples of
          ``int`` (values are truncated by ``int()``),
        * ``page_num_int`` -- the page of each kept entry,
        * ``top_int`` -- the top coordinate of each kept entry.

    Entries whose values cannot be converted with ``int()`` are skipped.
    This includes infinite floats, for which ``int()`` raises
    ``OverflowError`` rather than ``ValueError``.
    """
    position_int = []
    for pos in positions or []:
        if not isinstance(pos, (list, tuple)) or len(pos) < 5:
            continue
        try:
            position_int.append(tuple(int(value) for value in pos[:5]))
        except (TypeError, ValueError, OverflowError):
            continue
    # The lists are local and contain only immutable tuples/ints, so they can
    # be handed out directly without copying.
    return {
        "position_int": position_int,
        "page_num_int": [entry[0] for entry in position_int],
        "top_int": [entry[3] for entry in position_int],
    }
def finalize_pdf_chunk(chunk):
    """Replace the working position entry of ``chunk`` with its final fields.

    When canonical positions can be extracted, the fields produced by
    ``build_pdf_position_fields`` are written onto ``chunk``. The
    ``PDF_POSITIONS_KEY`` entry is then removed. ``chunk`` is modified in
    place and returned; non-dict values are returned untouched.
    """
    if isinstance(chunk, dict):
        canonical = extract_pdf_positions(chunk)
        if canonical:
            derived = build_pdf_position_fields(canonical)
            chunk.update(derived)
        chunk.pop(PDF_POSITIONS_KEY, None)
    return chunk
def _fetch_source_blob(from_upstream, canvas):
    """Fetch the bytes of the source document for the current run.

    When ``canvas._doc_id`` is set, the storage address is resolved through
    ``File2DocumentService`` and read from ``settings.STORAGE_IMPL``.
    Otherwise, when ``from_upstream.file`` is set, the blob is read through
    ``FileService`` using the file's ``created_by`` and ``id``. Returns
    ``None`` when neither is available.
    """
    doc_id = canvas._doc_id
    if doc_id:
        bucket, name = File2DocumentService.get_storage_address(doc_id=doc_id)
        return settings.STORAGE_IMPL.get(bucket, name)

    upstream_file = from_upstream.file
    if not upstream_file:
        return None
    return FileService.get_blob(upstream_file["created_by"], upstream_file["id"])
def _load_pdf_page_images(blob, zoom=PDF_PREVIEW_ZOOM):
    """Render every page of the PDF held in ``blob`` to an image.

    Pages are rendered with ``resolution=72 * zoom`` and antialiasing, and
    the ``annotated`` image of each rendering is returned, in page order.

    The work runs inside the context manager stored at
    ``sys.modules[LOCK_KEY_pdfplumber]``.
    NOTE(review): presumably a lock registered by ``deepdoc.parser.pdf_parser``
    to serialize pdfplumber access -- confirm.
    """
    stream = io.BytesIO(blob)
    dpi = 72 * zoom
    with sys.modules[LOCK_KEY_pdfplumber], pdfplumber.open(stream) as pdf:
        rendered = []
        for page in pdf.pages:
            rendered.append(page.to_image(resolution=dpi, antialias=True).annotated)
        return rendered
def _crop_pdf_preview(page_images, positions, zoom=PDF_PREVIEW_ZOOM):
    """Compose a preview image for the page regions in ``positions``.

    Args:
        page_images: Page images indexed by ``page_number - 1``; each must
            offer ``size`` and ``crop`` (assumed to be PIL images rendered at
            ``zoom`` times the coordinate scale of ``positions`` -- confirm
            against the caller).
        positions: ``[page_number, left, right, top, bottom]`` entries.
        zoom: Factor converting position coordinates to image pixels.

    Returns:
        An RGB ``PIL.Image`` with the crops stacked vertically and separated
        by ``PDF_PREVIEW_GAP`` pixels: a dimmed context strip above the first
        region, every region, then a dimmed context strip below the last
        region. ``None`` when there are no page images or no usable
        positions.

    Entries are validated *before* sorting, so short, non-sequence or
    non-numeric entries, pages outside ``page_images`` and empty or inverted
    rectangles are skipped instead of raising.
    """
    if not page_images or not positions:
        return None

    regions = []
    for pos in positions:
        if not isinstance(pos, (list, tuple)) or len(pos) < 5:
            continue
        try:
            page_idx = int(pos[0]) - 1
            left, right, top, bottom = map(float, pos[1:5])
        except (TypeError, ValueError, OverflowError):
            continue
        if not (0 <= page_idx < len(page_images)):
            continue
        # Written positively so NaN coordinates are rejected as well.
        if not (right > left and bottom > top):
            continue
        regions.append((page_idx, left, right, top, bottom))
    if not regions:
        return None
    # Reading order: page, then top, then left.
    regions.sort(key=lambda region: (region[0], region[3], region[1]))

    max_width = max(right - left for _, left, right, _, _ in regions)
    first_page, first_left, _, first_top, _ = regions[0]
    last_page, last_left, _, _, last_bottom = regions[-1]
    last_page_height = page_images[last_page].size[1] / zoom

    leading_context = (
        first_page,
        first_left,
        first_left + max_width,
        max(0, first_top - PDF_PREVIEW_CONTEXT),
        max(first_top - PDF_PREVIEW_GAP, 0),
    )
    trailing_context = (
        last_page,
        last_left,
        last_left + max_width,
        min(last_page_height, last_bottom + PDF_PREVIEW_GAP),
        min(last_page_height, last_bottom + PDF_PREVIEW_CONTEXT),
    )
    strips = [leading_context, *regions, trailing_context]
    context_indexes = {0, len(strips) - 1}

    imgs = []
    for idx, (page_idx, left, right, top, bottom) in enumerate(strips):
        page_image = page_images[page_idx]
        page_px_height = page_image.size[1]
        if idx not in context_indexes:
            # Body crops are at least 10 units wide.
            right = max(left + 10, right)
        # Clamp both vertical edges to the rendered page so the box can never
        # be inverted (recent Pillow releases reject such boxes in crop()).
        upper = min(top * zoom, page_px_height)
        lower = min(bottom * zoom, page_px_height)
        imgs.append(page_image.crop((left * zoom, upper, right * zoom, lower)))

    canvas_height = int(sum(img.size[1] for img in imgs) + PDF_PREVIEW_GAP * len(imgs))
    canvas_width = int(max(img.size[0] for img in imgs))
    preview = Image.new("RGB", (canvas_width, canvas_height), (245, 245, 245))

    offset = 0
    for idx, img in enumerate(imgs):
        if idx in context_indexes:
            # Dim the extra context so the highlighted body stays visually distinct.
            img = img.convert("RGBA")
            overlay = Image.new("RGBA", img.size, (0, 0, 0, 0))
            overlay.putalpha(128)
            img = Image.alpha_composite(img, overlay).convert("RGB")
        preview.paste(img, (0, offset))
        offset += img.size[1] + PDF_PREVIEW_GAP
    return preview
async def restore_pdf_text_previews(chunks, from_upstream, canvas):
    """Attach a cropped page preview to every positioned text chunk of a PDF.

    Does nothing unless ``chunks`` is non-empty and ``from_upstream.name``
    ends with ``.pdf`` (case-insensitive). Only chunks whose ``doc_type_kwd``
    is ``"text"`` (the default when the key is missing) and that carry
    extractable PDF positions are processed.

    For each such chunk a preview is built with ``_crop_pdf_preview``, stored
    in ``chunk["image"]`` and passed to ``image2id`` together with a storage
    writer bound to ``canvas._tenant_id``. Chunks with identical positions
    reuse the ``img_id`` of the first one instead of uploading again.
    NOTE(review): ``image2id`` is expected to upload the image and set
    ``chunk["img_id"]`` -- confirm against ``rag.utils.base64_image``.

    Args:
        chunks: List of chunk dicts; modified in place.
        from_upstream: Upstream output exposing ``name`` and ``file``.
        canvas: Pipeline canvas exposing ``_doc_id`` and ``_tenant_id``.

    Returns:
        ``None``. Failing to render the PDF pages is logged as a warning and
        leaves the chunks untouched.
    """
    if not chunks or not str(from_upstream.name).lower().endswith(".pdf"):
        return

    # Resolve the positions once per chunk and carry them along; extraction
    # deep-copies and re-normalizes, so repeating it per chunk is wasted work.
    text_chunks = []
    for chunk in chunks:
        if chunk.get("doc_type_kwd", "text") != "text":
            continue
        positions = extract_pdf_positions(chunk)
        if positions:
            text_chunks.append((chunk, positions))
    if not text_chunks:
        return

    blob = _fetch_source_blob(from_upstream, canvas)
    if not blob:
        return

    try:
        page_images = _load_pdf_page_images(blob)
    except Exception as e:
        # Previews are best-effort: a PDF that cannot be rendered must not
        # fail the pipeline.
        logging.warning(f"Failed to load PDF page images for chunk preview restore: {e}")
        return

    preview_cache = {}
    storage_put = partial(settings.STORAGE_IMPL.put, tenant_id=canvas._tenant_id)
    for chunk, preview_positions in text_chunks:
        positions_key = tuple(tuple(pos[:5]) for pos in preview_positions)
        if positions_key in preview_cache:
            chunk["img_id"] = preview_cache[positions_key]
            continue
        preview = _crop_pdf_preview(page_images, preview_positions)
        if not preview:
            continue
        chunk["image"] = preview
        await image2id(chunk, storage_put, get_uuid())
        if chunk.get("img_id"):
            preview_cache[positions_key] = chunk["img_id"]