Files
ragflow/rag/flow/parser/parser.py
Magicbook1108 bb3b99f0a5 Feat: add button for remove header & footer in pipeline (#14486)
### What problem does this PR solve?

Feat: add button for remove header & footer in pipeline

### Type of change


- [x] New Feature (non-breaking change which adds functionality)
2026-04-30 12:30:41 +08:00

1339 lines
55 KiB
Python

#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import asyncio
import io
import json
import os
import random
import re
from functools import partial
from litellm import logging
import numpy as np
from PIL import Image
from api.db.services.file2document_service import File2DocumentService
from api.db.services.file_service import FileService
from api.db.services.llm_service import LLMBundle
from api.db.joint_services.tenant_model_service import get_model_config_by_type_and_name, get_tenant_default_model_by_type
from common import settings
from common.constants import LLMType
from common.misc_utils import get_uuid, thread_pool_exec
from deepdoc.parser import ExcelParser, HtmlParser, TxtParser
from deepdoc.parser.docling_parser import DoclingParser
from deepdoc.parser.pdf_parser import PlainParser, RAGFlowPdfParser, VisionParser
from deepdoc.parser.tcadp_parser import TCADPParser
from rag.app.naive import Docx
from rag.flow.base import ProcessBase, ProcessParamBase
from rag.flow.parser.pdf_chunk_metadata import (
extract_pdf_positions,
normalize_pdf_items_metadata,
reorder_multi_column_bboxes,
)
from rag.flow.parser.schema import ParserFromUpstream
from rag.flow.parser.utils import (
enhance_media_sections_with_vision,
extract_word_outlines,
extract_docx_header_footer_texts,
remove_header_footer_docx_sections,
remove_header_footer_html_blob,
remove_toc,
remove_toc_pdf,
remove_toc_word,
)
from rag.llm.cv_model import Base as VLM
from rag.utils.base64_image import image2id
class ParserParam(ProcessParamBase):
    """Parameters of the ``Parser`` component.

    ``setups`` holds one configuration dict per file family (``"pdf"``,
    ``"docx"``, ...), and ``allowed_output_format`` lists the output formats
    that ``check`` accepts for each family.
    """

    def __init__(self):
        super().__init__()
        # Output formats accepted by ``check`` for each file family.
        self.allowed_output_format = {
            "pdf": ["json", "markdown"],
            "spreadsheet": ["json", "markdown", "html"],
            "doc": ["json", "markdown"],
            "docx": ["json", "markdown"],
            "slides": ["json"],
            "image": ["json"],
            "email": ["text", "json"],
            "markdown": ["text", "json"],
            "text&code": ["text", "json"],
            "html": ["text", "json"],
            "audio": ["json"],
            "video": [],
            "epub": ["text", "json"],
        }
        # Default per-family configuration; ``suffix`` lists the file
        # extensions associated with the family.
        self.setups = {
            "pdf": {
                "parse_method": "deepdoc",  # deepdoc/plain_text/tcadp_parser/vlm
                "lang": "Chinese",
                "flatten_media_to_text": False,
                "remove_toc": False,
                "remove_header_footer": False,
                "suffix": ["pdf"],
                "output_format": "json",
            },
            "spreadsheet": {
                "parse_method": "deepdoc",  # deepdoc/tcadp_parser
                "flatten_media_to_text": False,
                "output_format": "html",
                "suffix": ["xls", "xlsx", "csv"],
            },
            "doc": {
                "remove_toc": False,
                "remove_header_footer": False,
                "suffix": ["doc"],
                "output_format": "json",
            },
            "docx": {
                "flatten_media_to_text": False,
                "remove_toc": False,
                "remove_header_footer": False,
                "suffix": ["docx"],
                "output_format": "json",
            },
            "markdown": {
                "flatten_media_to_text": False,
                "suffix": ["md", "markdown", "mdx"],
                "remove_toc": False,
                "output_format": "json",
            },
            "text&code": {
                "suffix": ["txt", "py", "js", "java", "c", "cpp", "h", "php", "go", "ts", "sh", "cs", "kt", "sql"],
                "output_format": "json",
            },
            "html": {
                "suffix": ["htm", "html"],
                "remove_toc": False,
                "remove_header_footer": False,
                "output_format": "json",
            },
            "slides": {
                "parse_method": "deepdoc",  # deepdoc/tcadp_parser
                "suffix": ["pptx", "ppt"],
                "output_format": "json",
            },
            "image": {
                "parse_method": "ocr",
                "llm_id": "",
                "lang": "Chinese",
                "system_prompt": "",
                "suffix": ["jpg", "jpeg", "png", "gif"],
                "output_format": "json",
            },
            "email": {
                "suffix": ["eml", "msg"],
                "fields": ["from", "to", "cc", "bcc", "date", "subject", "body", "attachments", "metadata"],
                "output_format": "json",
            },
            "audio": {
                "suffix": [
                    "da",
                    "wave",
                    "wav",
                    "mp3",
                    "aac",
                    "flac",
                    "ogg",
                    "aiff",
                    "au",
                    "midi",
                    "wma",
                    "realaudio",
                    "vqf",
                    "oggvorbis",
                    "ape",
                ],
                "output_format": "text",
            },
            "video": {
                "suffix": ["mp4", "avi", "mkv"],
                "output_format": "text",
                "prompt": "",
            },
            "epub": {
                "suffix": ["epub"],
                "output_format": "json",
            },
        }

    def check(self):
        """Validate the configured setups, one file family after another.

        Families whose setup is missing or empty are skipped. The checks run
        in a fixed order, so the first invalid family is the one reported.
        """

        def validate_output_format(family, message):
            # Only configured (truthy) setups are validated.
            setup = self.setups.get(family, "")
            if setup:
                self.check_valid_value(setup.get("output_format", ""), message, self.allowed_output_format[family])

        pdf_setup = self.setups.get("pdf", {})
        if pdf_setup:
            method = pdf_setup.get("parse_method", "")
            self.check_empty(method, "Parse method abnormal.")
            builtin_methods = ("deepdoc", "plain_text", "mineru", "docling", "opendataloader", "tcadp parser", "paddleocr")
            # Any other parse method requires a language to be configured.
            if method.lower() not in builtin_methods:
                self.check_empty(pdf_setup.get("lang", ""), "PDF VLM language")
            self.check_valid_value(pdf_setup.get("output_format", ""), "PDF output format abnormal.", self.allowed_output_format["pdf"])

        validate_output_format("spreadsheet", "Spreadsheet output format abnormal.")
        validate_output_format("doc", "DOC output format abnormal.")
        validate_output_format("docx", "DOCX output format abnormal.")
        validate_output_format("slides", "Slides output format abnormal.")

        image_setup = self.setups.get("image", "")
        if image_setup and image_setup.get("parse_method", "") not in ["ocr"]:
            self.check_empty(image_setup.get("lang", ""), "Image VLM language")

        validate_output_format("markdown", "Markdown output format abnormal.")
        validate_output_format("text&code", "Text&Code output format abnormal.")
        validate_output_format("html", "HTML output format abnormal.")

        # Audio and video setups must name a model under ``vlm.llm_id``.
        for family, label in (("audio", "Audio VLM"), ("video", "Video VLM")):
            media_setup = self.setups.get(family, "")
            if media_setup:
                self.check_empty((media_setup.get("vlm") or {}).get("llm_id"), label)

        validate_output_format("email", "Email output format abnormal.")
        validate_output_format("epub", "EPUB output format abnormal.")

    def get_input_form(self) -> dict[str, dict]:
        """Return the input form description; this component exposes none."""
        return {}
class Parser(ProcessBase):
component_name = "Parser"
def _pdf(self, name, blob, **kwargs):
    """Parse a PDF with the configured backend and publish json or markdown output.

    The backend is chosen from ``setups["pdf"]["parse_method"]``: ``deepdoc``,
    ``plain_text``, ``mineru`` (or ``<model>@mineru``), ``docling``,
    ``opendataloader``, ``tcadp parser``, ``paddleocr`` (or
    ``<model>@paddleocr``). Any other value is used as the name of an
    image-to-text model for the vision parser.

    Every backend's result is converted to a list of box dicts carrying at
    least ``text`` and ``layout_type``. The boxes then go through optional TOC
    removal, optional header/footer removal, ``doc_type_kwd`` tagging and
    vision enhancement before being written to the ``json`` or ``markdown``
    output.

    Args:
        name: File name, forwarded to the backends as ``filepath``.
        blob: Binary content of the PDF.
        **kwargs: An optional ``file`` dict is merged into the ``file`` output
            together with the parser's outlines.

    Raises:
        RuntimeError: If no MinerU, OpenDataLoader or PaddleOCR model name can
            be resolved for the corresponding parse method.
    """
    self.callback(random.randint(1, 5) / 100.0, "Start to work on a PDF.")
    conf = self._param.setups["pdf"]
    self.set_output("output_format", conf["output_format"])
    flatten_media_to_text = conf.get("flatten_media_to_text")
    pdf_parser = None

    # Normalize parser selection and optional provider-specific model name.
    raw_parse_method = conf.get("parse_method", "")
    parser_model_name = None
    parse_method = raw_parse_method
    parse_method = parse_method or ""
    if isinstance(raw_parse_method, str):
        lowered = raw_parse_method.lower()
        # "<model>@mineru" / "<model>@paddleocr" select the backend and pin the model name.
        if lowered.endswith("@mineru"):
            parser_model_name = raw_parse_method.rsplit("@", 1)[0]
            parse_method = "MinerU"
        elif lowered.endswith("@paddleocr"):
            parser_model_name = raw_parse_method.rsplit("@", 1)[0]
            parse_method = "PaddleOCR"

    # DeepDOC returns structured page boxes directly.
    if parse_method.lower() == "deepdoc":
        pdf_parser = RAGFlowPdfParser()
        bboxes = pdf_parser.parse_into_bboxes(blob, callback=self.callback)
        if conf.get("enable_multi_column"):
            bboxes = reorder_multi_column_bboxes(pdf_parser, bboxes)
    # Plain text only keeps extracted text lines.
    elif parse_method.lower() == "plain_text":
        pdf_parser = PlainParser()
        lines, _ = pdf_parser(blob)
        bboxes = [{"text": t, "layout_type": "text"} for t, _ in lines]
    # MinerU/PaddleOCR/Docling/TCADP all return line-like sections that need
    # to be converted into the shared bbox-like structure used below.
    elif parse_method.lower() == "mineru":

        def resolve_mineru_llm_name():
            # Explicit configuration wins; otherwise fall back to the tenant's
            # registered MinerU models, then to the env-derived name.
            configured = parser_model_name or conf.get("mineru_llm_name")
            if configured:
                return configured
            tenant_id = self._canvas._tenant_id
            if not tenant_id:
                return None
            from api.db.services.tenant_llm_service import TenantLLMService

            env_name = TenantLLMService.ensure_mineru_from_env(tenant_id)
            candidates = TenantLLMService.query(tenant_id=tenant_id, llm_factory="MinerU", model_type=LLMType.OCR.value)
            if candidates:
                return candidates[0].llm_name
            return env_name

        parser_model_name = resolve_mineru_llm_name()
        if not parser_model_name:
            raise RuntimeError("MinerU model not configured. Please add MinerU in Model Providers or set MINERU_* env.")
        tenant_id = self._canvas._tenant_id
        ocr_model_config = get_model_config_by_type_and_name(tenant_id, LLMType.OCR, parser_model_name)
        ocr_model = LLMBundle(tenant_id, ocr_model_config, lang=conf.get("lang", "Chinese"))
        pdf_parser = ocr_model.mdl
        lines, _ = pdf_parser.parse_pdf(
            filepath=name,
            binary=blob,
            callback=self.callback,
            parse_method="pipeline",
            lang=conf.get("lang", "Chinese"),
        )
        bboxes = []
        for line in lines or []:
            # Skip anything that is not a (text, layout_type, position_tag, ...) tuple.
            if not isinstance(line, tuple) or len(line) < 3:
                continue
            t, layout_type, poss = line[0], line[1], line[2]
            box = {
                "text": t,
                "layout_type": layout_type or "text",
            }
            # The page index is shifted by one (presumably 0-based -> 1-based; confirm against extract_positions).
            positions = [[pos[0][-1] + 1, *pos[1:]] for pos in pdf_parser.extract_positions(poss)]
            if positions:
                box["positions"] = positions
            image = pdf_parser.crop(poss, 1)
            if image is not None:
                box["image"] = image
            bboxes.append(box)
    elif parse_method.lower() == "docling":
        pdf_parser = DoclingParser(docling_server_url=os.environ.get("DOCLING_SERVER_URL", ""))
        lines, _ = pdf_parser.parse_pdf(
            filepath=name,
            binary=blob,
            callback=self.callback,
            parse_method="pipeline",
            docling_server_url=os.environ.get("DOCLING_SERVER_URL", ""),
        )
        bboxes = []
        for item in lines or []:
            if not isinstance(item, tuple) or len(item) < 3:
                continue
            text, layout_type, poss = item[0], item[1], item[2]
            box = {
                "text": text,
                "layout_type": layout_type or "text",
            }
            # Positions and crops are only derived from a non-empty position tag string.
            if isinstance(poss, str) and poss:
                positions = [[pos[0][-1] + 1, *pos[1:]] for pos in pdf_parser.extract_positions(poss)]
                if positions:
                    box["positions"] = positions
                image = pdf_parser.crop(poss, 1)
                if image is not None:
                    box["image"] = image
            bboxes.append(box)
    elif parse_method.lower() == "opendataloader":

        def resolve_opendataloader_llm_name():
            # Same resolution order as for MinerU: explicit config, tenant models, env.
            configured = parser_model_name or conf.get("opendataloader_llm_name")
            if configured:
                return configured
            tenant_id = self._canvas._tenant_id
            if not tenant_id:
                return None
            from api.db.services.tenant_llm_service import TenantLLMService

            env_name = TenantLLMService.ensure_opendataloader_from_env(tenant_id)
            candidates = TenantLLMService.query(tenant_id=tenant_id, llm_factory="OpenDataLoader", model_type=LLMType.OCR.value)
            if candidates:
                return candidates[0].llm_name
            return env_name

        parser_model_name = resolve_opendataloader_llm_name()
        if not parser_model_name:
            raise RuntimeError("OpenDataLoader model not configured. Please add OpenDataLoader in Model Providers.")
        tenant_id = self._canvas._tenant_id
        ocr_model_config = get_model_config_by_type_and_name(tenant_id, LLMType.OCR, parser_model_name)
        ocr_model = LLMBundle(tenant_id, ocr_model_config)
        pdf_parser = ocr_model.mdl
        lines, odl_tables = pdf_parser.parse_pdf(
            filepath=name,
            binary=blob,
            callback=self.callback,
            parse_method="pipeline",
        )
        bboxes = []
        for item in lines or []:
            if not isinstance(item, tuple) or len(item) < 3:
                continue
            text, layout_type, poss = item[0], item[1], item[2]
            box = {
                "text": text,
                "layout_type": layout_type or "text",
            }
            if isinstance(poss, str) and poss:
                positions = [[pos[0][-1] + 1, *pos[1:]] for pos in pdf_parser.extract_positions(poss)]
                if positions:
                    box["positions"] = positions
                image = pdf_parser.crop(poss, 1)
                if image is not None:
                    box["image"] = image
            bboxes.append(box)
        # Merge tables and images from the second return value.
        for (img, html_or_caption), positions in odl_tables or []:
            # A list payload marks a figure (first entry used as text); anything else is a table.
            box = {"layout_type": "table" if not isinstance(html_or_caption, list) else "figure"}
            if isinstance(html_or_caption, str):
                box["text"] = html_or_caption
            elif isinstance(html_or_caption, list):
                box["text"] = html_or_caption[0] if html_or_caption else ""
            if img is not None:
                box["image"] = img
            if positions:
                # Best effort: a malformed position entry leaves the box without positions.
                try:
                    box["positions"] = [[p[0] + 1, p[1], p[2], p[3], p[4]] for p in positions]
                except Exception:
                    pass
            bboxes.append(box)
    elif parse_method.lower() == "tcadp parser":
        # ADP is a document parsing tool using Tencent Cloud API
        table_result_type = conf.get("table_result_type", "1")
        markdown_image_response_type = conf.get("markdown_image_response_type", "1")
        pdf_parser = TCADPParser(
            table_result_type=table_result_type,
            markdown_image_response_type=markdown_image_response_type,
        )
        sections, _ = pdf_parser.parse_pdf(
            filepath=name,
            binary=blob,
            callback=self.callback,
            file_type="PDF",
            file_start_page=1,
            file_end_page=1000,
        )
        bboxes = []
        for section, position_tag in sections:
            if position_tag:
                # Position tags look like "@@<pages>\t<x0>\t<x1>\t<top>\t<bottom>##".
                match = re.match(r"@@([0-9-]+)\t([0-9.]+)\t([0-9.]+)\t([0-9.]+)\t([0-9.]+)##", position_tag)
                if match:
                    pn, x0, x1, top, bott = match.groups()
                    bboxes.append(
                        {
                            "page_number": int(pn.split("-")[0]),
                            "x0": float(x0),
                            "x1": float(x1),
                            "top": float(top),
                            "bottom": float(bott),
                            "text": section,
                            "layout_type": "text",
                        }
                    )
                else:
                    bboxes.append({"text": section, "layout_type": "text"})
            else:
                bboxes.append({"text": section, "layout_type": "text"})
    elif parse_method.lower() == "paddleocr":

        def resolve_paddleocr_llm_name():
            # Same resolution order as for MinerU: explicit config, tenant models, env.
            configured = parser_model_name or conf.get("paddleocr_llm_name")
            if configured:
                return configured
            tenant_id = self._canvas._tenant_id
            if not tenant_id:
                return None
            from api.db.services.tenant_llm_service import TenantLLMService

            env_name = TenantLLMService.ensure_paddleocr_from_env(tenant_id)
            candidates = TenantLLMService.query(tenant_id=tenant_id, llm_factory="PaddleOCR", model_type=LLMType.OCR.value)
            if candidates:
                return candidates[0].llm_name
            return env_name

        parser_model_name = resolve_paddleocr_llm_name()
        if not parser_model_name:
            raise RuntimeError("PaddleOCR model not configured. Please add PaddleOCR in Model Providers or set PADDLEOCR_* env.")
        tenant_id = self._canvas._tenant_id
        ocr_model_config = get_model_config_by_type_and_name(tenant_id, LLMType.OCR, parser_model_name)
        ocr_model = LLMBundle(tenant_id, ocr_model_config)
        pdf_parser = ocr_model.mdl
        lines, _ = pdf_parser.parse_pdf(
            filepath=name,
            binary=blob,
            callback=self.callback,
            parse_method="pipeline",
        )
        bboxes = []
        for line in lines or []:
            if not isinstance(line, tuple) or len(line) < 3:
                continue
            t, layout_type, poss = line[0], line[1], line[2]
            box = {
                "text": t,
                "layout_type": layout_type or "text",
            }
            positions = [[pos[0][-1] + 1, *pos[1:]] for pos in pdf_parser.extract_positions(poss)]
            if positions:
                box["positions"] = positions
            image = pdf_parser.crop(poss)
            if image is not None:
                box["image"] = image
            bboxes.append(box)
    # Vision parser treats each page as a large image block.
    else:
        # A non-empty parse method names the image-to-text model; otherwise use the tenant default.
        if conf.get("parse_method"):
            vision_model_config = get_model_config_by_type_and_name(self._canvas._tenant_id, LLMType.IMAGE2TEXT, conf["parse_method"])
        else:
            vision_model_config = get_tenant_default_model_by_type(self._canvas._tenant_id, LLMType.IMAGE2TEXT)
        vision_model = LLMBundle(self._canvas._tenant_id, vision_model_config, lang=self._param.setups["pdf"].get("lang"))
        pdf_parser = VisionParser(vision_model=vision_model)
        lines, _ = pdf_parser(blob, callback=self.callback)
        bboxes = []
        for t, poss in lines:
            for pn, x0, x1, top, bott in RAGFlowPdfParser.extract_positions(poss):
                bboxes.append(
                    {
                        "page_number": int(pn[0]) + 1,
                        "x0": float(x0),
                        "x1": float(x1),
                        "top": float(top),
                        "bottom": float(bott),
                        "text": t,
                        "layout_type": "text",
                    }
                )

    # Persist outlines and optionally remove TOC before normalizing metadata.
    self.set_output("file", {**kwargs.get("file", {}), "outlines": pdf_parser.outlines})
    if conf.get("remove_toc"):
        if not pdf_parser.outlines:
            bboxes, _ = remove_toc(bboxes)
        elif pdf_parser.outlines[0][2] == 1:
            bboxes = remove_toc_pdf(bboxes, pdf_parser.outlines)
        else:
            # The first outline entry points past page 1: only the boxes before
            # that page are searched for a TOC, the rest is kept untouched.
            first_outline_page = pdf_parser.outlines[0][2]
            split_at = len(bboxes)
            for i, item in enumerate(bboxes):
                page_number = item.get("page_number")
                if page_number is None:
                    positions = extract_pdf_positions(item)
                    if positions:
                        page_number = positions[0][0]
                if page_number is not None and page_number >= first_outline_page:
                    split_at = i
                    break
            toc_bboxes, _ = remove_toc(bboxes[:split_at])
            bboxes = toc_bboxes + bboxes[split_at:]

    normalize_bboxes = []
    # Normalize shared bbox fields for downstream consumers.
    for b in bboxes:
        raw_layout = str(b.get("layout_type") or "").strip()
        has_layout = bool(raw_layout)
        layout = re.sub(r"\s+", " ", raw_layout) if has_layout else "text"
        b["layout_type"] = layout
        # Drop boxes whose layout mentions header/footer/number when requested.
        if conf.get("remove_header_footer") and re.search(r"(header|footer|number)", raw_layout, re.I):
            continue
        if flatten_media_to_text:
            b["doc_type_kwd"] = "text"
        elif layout == "table":
            b["doc_type_kwd"] = "table"
        elif layout == "figure":
            b["doc_type_kwd"] = "image"
        elif not has_layout and b.get("image") is not None:
            b["doc_type_kwd"] = "image"
        else:
            b["doc_type_kwd"] = "text"
        normalize_bboxes.append(b)
    bboxes = normalize_bboxes

    enhance_media_sections_with_vision(
        bboxes,
        self._canvas._tenant_id,
        conf.get("vlm"),
        callback=self.callback,
    )

    # Emit the requested final PDF output format.
    if conf.get("output_format") == "json":
        normalize_pdf_items_metadata(bboxes)
        self.set_output("json", bboxes)
    if conf.get("output_format") == "markdown":
        parts = []
        for b in bboxes:
            layout = b.get("layout_type", "")
            if layout == "title":
                parts.append("\n## ")
            # Figure boxes may lack an image (e.g. OpenDataLoader figures whose
            # image is None); those fall through and contribute their text.
            if layout == "figure" and b.get("image") is not None:
                parts.append("\n![Image]({})".format(VLM.image2base64(b["image"])))
                continue
            parts.append(b.get("text", "") + "\n")
        self.set_output("markdown", "".join(parts))
def _spreadsheet(self, name, blob, **kwargs):
    """Parse a spreadsheet and publish it as html, json or markdown.

    With parse method ``tcadp parser`` the file is sent to the TCADP parser and
    its sections and tables are combined; any other parse method uses the
    DeepDOC ``ExcelParser``.

    Args:
        name: File name; its extension decides the TCADP file type.
        blob: Binary content of the spreadsheet.

    Raises:
        RuntimeError: If the TCADP parser is selected but not available.
    """
    self.callback(random.randint(1, 5) / 100.0, "Start to work on a Spreadsheet.")
    conf = self._param.setups["spreadsheet"]
    self.set_output("output_format", conf["output_format"])
    flatten_media_to_text = conf.get("flatten_media_to_text")
    parse_method = conf.get("parse_method", "deepdoc")

    if parse_method.lower() != "tcadp parser":
        # Default DeepDOC parser
        excel = ExcelParser()
        requested = conf.get("output_format")
        if requested == "html":
            self.set_output("html", excel.html(blob, 1000000000)[0])
        elif requested == "json":
            self.set_output("json", [{"text": txt, "doc_type_kwd": "text"} for txt in excel(blob) if txt])
        elif requested == "markdown":
            self.set_output("markdown", excel.markdown(blob))
        return

    # Handle TCADP parser
    tcadp = TCADPParser(
        table_result_type=conf.get("table_result_type", "1"),
        markdown_image_response_type=conf.get("markdown_image_response_type", "1"),
    )
    if not tcadp.check_installation():
        raise RuntimeError("TCADP parser not available. Please check Tencent Cloud API configuration.")

    # Determine file type based on extension
    file_type = "XLSX" if re.search(r"\.xlsx?$", name, re.IGNORECASE) else "CSV"
    self.callback(0.2, f"Using TCADP parser for {file_type} file.")

    sections, tables = tcadp.parse_pdf(
        filepath=name,
        binary=blob,
        callback=self.callback,
        file_type=file_type,
        file_start_page=1,
        file_end_page=1000,
    )

    # Process TCADP parser output based on configured output_format
    output_format = conf.get("output_format", "html")
    if output_format == "html":
        # Sections first, then tables, each followed by a newline.
        pieces = [section + "\n" for section, _ in sections if section]
        pieces.extend(table + "\n" for table in tables if table)
        self.set_output("html", "".join(pieces))
    elif output_format == "json":
        # Tables keep the "table" type unless media is flattened to text.
        table_kind = "text" if flatten_media_to_text else "table"
        result = [{"text": section, "doc_type_kwd": "text"} for section, _ in sections if section]
        result.extend({"text": table, "doc_type_kwd": table_kind} for table in tables if table)
        self.set_output("json", result)
    elif output_format == "markdown":
        # Sections first, then tables, each followed by a blank line.
        pieces = [section + "\n\n" for section, _ in sections if section]
        pieces.extend(table + "\n\n" for table in tables if table)
        self.set_output("markdown", "".join(pieces))
def _doc(self, name, blob, **kwargs):
    """Parse a legacy DOC file with Tika into json sections or markdown.

    The extracted text is split on newlines and empty lines are dropped. With
    output format ``json`` every remaining line becomes a text item; otherwise
    the lines are joined with newlines and written to the ``markdown`` output.

    Args:
        name: File name, used only in the warning about empty content.
        blob: Binary content of the document.
    """
    self.callback(random.randint(1, 5) / 100.0, "Start to work on a DOC document")
    conf = self._param.setups["doc"]
    self.set_output("output_format", conf["output_format"])
    from tika import parser as tika_parser

    parsed = tika_parser.from_buffer(io.BytesIO(blob))
    # Tika may report no content at all; treat that as an empty document
    # instead of failing on ``None.split``.
    content = (parsed or {}).get("content")
    if content is None:
        logging.warning(f"tika.parser got empty content from {name}.")
        content = ""
    sections = [line for line in content.split("\n") if line]
    if conf.get("output_format") == "json":
        self.set_output("json", [{"text": section, "doc_type_kwd": "text"} for section in sections])
        return
    self.set_output("markdown", "\n".join(sections))
def _docx(self, name, blob, **kwargs):
    """Parse a Word document into json sections or markdown.

    Files whose name ends in ``.doc`` are read with Tika as plain text lines.
    Everything else goes through the ``Docx`` parser, with optional
    header/footer and table-of-contents removal.

    Args:
        name: File name; a ``.doc`` suffix selects the Tika path.
        blob: Binary content of the document.
        **kwargs: An optional ``file`` dict is merged into the ``file`` output
            together with the outlines.
    """
    self.callback(random.randint(1, 5) / 100.0, "Start to work on a DOCX document")
    conf = self._param.setups["docx"]
    self.set_output("output_format", conf["output_format"])
    flatten_media_to_text = conf.get("flatten_media_to_text")

    if re.search(r"\.doc$", name, re.IGNORECASE):
        # Legacy .doc: no outlines are extracted, and only text is available.
        self.set_output("file", {**kwargs.get("file", {}), "outlines": []})
        try:
            from tika import parser as tika_parser
        except Exception as exc:
            warning = f"tika not available: {exc}. Unsupported .doc parsing."
            self.callback(0.8, warning)
            logging.warning(f"{warning} for {name}.")
            return

        content = tika_parser.from_buffer(io.BytesIO(blob)).get("content")
        if content is None:
            warning = f"tika.parser got empty content from {name}."
            self.callback(0.8, warning)
            logging.warning(warning)
            return

        stripped_lines = (raw.strip() for raw in content.splitlines())
        sections = [text for text in stripped_lines if text]
        if conf.get("remove_toc"):
            sections = remove_toc_word(sections, [])

        requested = conf.get("output_format")
        if requested == "json":
            items = [{"text": text, "image": None, "doc_type_kwd": "text"} for text in sections]
            self.set_output("json", items)
        elif requested == "markdown":
            # Tika gives us plain text lines, so join with blank lines to preserve paragraph boundaries in markdown.
            self.set_output("markdown", "\n\n".join(sections))
        self.callback(0.8, "Finish parsing.")
        return

    docx_parser = Docx()
    # Extract heading-based outlines for metadata and TOC removal.
    outlines = extract_word_outlines(name, blob)
    self.set_output("file", {**kwargs.get("file", {}), "outlines": outlines})

    requested = conf.get("output_format")
    if requested == "json":
        # JSON output keeps text/image blocks and appends table HTML as table items.
        main_sections = docx_parser(name, binary=blob)
        if conf.get("remove_header_footer"):
            main_sections = remove_header_footer_docx_sections(
                main_sections,
                extract_docx_header_footer_texts(binary=blob),
            )
        if conf.get("remove_toc"):
            main_sections = remove_toc_word(main_sections, outlines)

        table_kind = "text" if flatten_media_to_text else "table"
        sections = []
        for text, image, html in main_sections:
            treat_as_text = flatten_media_to_text or image is None
            sections.append(
                {
                    "text": text,
                    "image": image,
                    "doc_type_kwd": "text" if treat_as_text else "image",
                }
            )
            if html:
                sections.append({"text": html, "image": None, "doc_type_kwd": table_kind})

        enhance_media_sections_with_vision(
            sections,
            self._canvas._tenant_id,
            conf.get("vlm"),
            callback=self.callback,
        )
        self.set_output("json", sections)
    elif requested == "markdown":
        # Markdown output filters header/footer and TOC on plain markdown lines before writing back.
        markdown_text = docx_parser.to_markdown(name, binary=blob)
        if conf.get("remove_header_footer"):
            header_footer_texts = extract_docx_header_footer_texts(binary=blob)
            kept_lines = remove_header_footer_docx_sections(markdown_text.split("\n"), header_footer_texts)
            markdown_text = "\n".join(kept_lines)
        if conf.get("remove_toc"):
            kept_lines = remove_toc_word(markdown_text.split("\n"), outlines)
            markdown_text = "\n".join(kept_lines)
        self.set_output("markdown", markdown_text)
def _slides(self, name, blob, **kwargs):
    """Parse a presentation file into json sections.

    With parse method ``tcadp parser`` the file is sent to the TCADP parser
    and its sections and tables become text/table items. Any other parse
    method uses the DeepDOC PowerPoint parser. Only the ``json`` output format
    is produced.

    Args:
        name: File name; its extension decides the TCADP file type
            (``.pptx`` -> ``PPTX``, anything else -> ``PPT``).
        blob: Binary content of the presentation.

    Raises:
        RuntimeError: If the TCADP parser is selected but not available.
    """
    self.callback(random.randint(1, 5) / 100.0, "Start to work on a PowerPoint Document")
    conf = self._param.setups["slides"]
    self.set_output("output_format", conf["output_format"])
    parse_method = conf.get("parse_method", "deepdoc")

    # Handle TCADP parser
    if parse_method.lower() == "tcadp parser":
        table_result_type = conf.get("table_result_type", "1")
        markdown_image_response_type = conf.get("markdown_image_response_type", "1")
        tcadp_parser = TCADPParser(
            table_result_type=table_result_type,
            markdown_image_response_type=markdown_image_response_type,
        )
        if not tcadp_parser.check_installation():
            raise RuntimeError("TCADP parser not available. Please check Tencent Cloud API configuration.")

        # Determine file type based on extension. The pattern must match
        # ".pptx" only: with r"\.pptx?$" a legacy ".ppt" file also matched and
        # was reported as PPTX, leaving the PPT branch unreachable.
        if re.search(r"\.pptx$", name, re.IGNORECASE):
            file_type = "PPTX"
        else:
            file_type = "PPT"
        self.callback(0.2, f"Using TCADP parser for {file_type} file.")

        sections, tables = tcadp_parser.parse_pdf(
            filepath=name,
            binary=blob,
            callback=self.callback,
            file_type=file_type,
            file_start_page=1,
            file_end_page=1000,
        )

        # Process TCADP parser output - PPT only supports json format
        output_format = conf.get("output_format", "json")
        if output_format == "json":
            # Sections become text items, followed by tables as table items.
            result = [{"text": section, "doc_type_kwd": "text"} for section, position_tag in sections if section]
            result.extend({"text": table, "doc_type_kwd": "table"} for table in tables if table)
            self.set_output("json", result)
    else:
        # Default DeepDOC parser (supports .pptx format)
        from deepdoc.parser.ppt_parser import RAGFlowPptParser

        slide_parser = RAGFlowPptParser()
        txts = slide_parser(blob, 0, 100000, None)
        sections = [{"text": section, "doc_type_kwd": "text"} for section in txts if section.strip()]

        # json
        assert conf.get("output_format") == "json", "have to be json for ppt"
        if conf.get("output_format") == "json":
            self.set_output("json", sections)
def _markdown(self, name, blob, **kwargs):
    """Parse a markdown file into json items or plain text.

    For ``json`` output every section becomes an item (with its image when one
    is available), followed by one item per table. Any other output format
    joins the section texts and table texts with newlines into the ``text``
    output.

    Args:
        name: File name handed to the markdown parser.
        blob: Binary content of the markdown file.
    """
    from functools import reduce

    from rag.app.naive import Markdown as naive_markdown_parser
    from rag.nlp import concat_img

    self.callback(random.randint(1, 5) / 100.0, "Start to work on a markdown.")
    conf = self._param.setups["markdown"]
    self.set_output("output_format", conf["output_format"])
    flatten_media_to_text = conf.get("flatten_media_to_text")

    parser = naive_markdown_parser()
    sections, tables, section_images = parser(
        name,
        blob,
        separate_tables=False,
        delimiter=conf.get("delimiter"),
        return_section_images=True,
    )

    if conf.get("output_format") != "json":
        texts = [body for body, _ in sections if body]
        texts.extend(table[0][1] for table in tables if table and table[0] and table[0][1])
        self.set_output("text", "\n".join(texts))
        return

    table_kind = "text" if flatten_media_to_text else "table"
    json_results = []
    for idx, (body, _) in enumerate(sections):
        entry = {"text": body}

        # Collect the image aligned with this section, if there is one.
        has_image = section_images and len(section_images) > idx and section_images[idx] is not None
        images = [section_images[idx]] if has_image else []
        if images:
            # If multiple images found, combine them using concat_img
            entry["image"] = images[0] if len(images) <= 1 else reduce(concat_img, images)

        treat_as_text = flatten_media_to_text or entry.get("image") is None
        entry["doc_type_kwd"] = "text" if treat_as_text else "image"
        json_results.append(entry)

    for table in tables:
        table_text = table[0][1] if table and table[0] else ""
        if not table_text:
            continue
        json_results.append({"text": table_text, "doc_type_kwd": table_kind})

    enhance_media_sections_with_vision(
        json_results,
        self._canvas._tenant_id,
        conf.get("vlm"),
        callback=self.callback,
    )
    self.set_output("json", json_results)
def _code(self, name, blob, **kwargs):
    """Parse a text or source-code file into plain text sections.

    Non-empty section texts are published as a list of text items for ``json``
    output, or joined with newlines into the ``text`` output otherwise.

    Args:
        name: File name handed to ``TxtParser``.
        blob: Binary content of the file.
    """
    self.callback(random.randint(1, 5) / 100.0, "Start to work on a text or code file.")
    conf = self._param.setups["text&code"]
    self.set_output("output_format", conf["output_format"])

    chunk_token_num = conf.get("chunk_token_num", 128)
    delimiter = conf.get("delimiter", "\n!?;。;!?")
    parsed = TxtParser()(name, blob, chunk_token_num, delimiter)

    # Each parsed section carries its text in the first slot; drop empty ones.
    texts = [entry[0] for entry in parsed if entry[0]]
    if conf.get("output_format") == "json":
        self.set_output("json", [{"text": text, "doc_type_kwd": "text"} for text in texts])
    else:
        self.set_output("text", "\n".join(texts))
def _html(self, name, blob, **kwargs):
    """Parse an HTML file into json items or plain text.

    Header/footer removal is applied to the raw blob before parsing, and
    table-of-contents removal to the parsed sections. Empty sections are
    dropped from the output.

    Args:
        name: File name handed to ``HtmlParser``.
        blob: Binary content of the HTML document.
    """
    self.callback(random.randint(1, 5) / 100.0, "Start to work on an HTML document.")
    conf = self._param.setups["html"]
    self.set_output("output_format", conf["output_format"])

    source = remove_header_footer_html_blob(blob) if conf.get("remove_header_footer") else blob
    chunk_token_num = int(conf.get("chunk_token_num", 512))
    sections = HtmlParser()(name, source, chunk_token_num)
    if conf.get("remove_toc"):
        sections, _ = remove_toc(sections)

    kept = [section for section in sections if section]
    if conf.get("output_format") == "json":
        self.set_output("json", [{"text": section, "doc_type_kwd": "text"} for section in kept])
    else:
        self.set_output("text", "\n".join(kept))
def _image(self, name, blob, **kwargs):
    """Turn an image into a single json item using OCR or an image-to-text model.

    With parse method ``ocr`` the recognized text lines are joined with
    newlines. Any other parse method is used as the name of an image-to-text
    model that describes the picture, optionally guided by ``system_prompt``.

    Args:
        name: File name (not used by the parsing itself).
        blob: Binary content of the image.
    """
    from deepdoc.vision import OCR

    self.callback(random.randint(1, 5) / 100.0, "Start to work on an image.")
    conf = self._param.setups["image"]
    self.set_output("output_format", conf["output_format"])

    img = Image.open(io.BytesIO(blob)).convert("RGB")

    if conf["parse_method"] != "ocr":
        # use VLM to describe the picture
        lang = conf["lang"]
        cv_model_config = get_model_config_by_type_and_name(self._canvas.get_tenant_id(), LLMType.IMAGE2TEXT, conf["parse_method"])
        cv_model = LLMBundle(self._canvas.get_tenant_id(), cv_model_config, lang=lang)

        # The model receives the picture re-encoded as JPEG bytes.
        jpeg_buffer = io.BytesIO()
        img.save(jpeg_buffer, format="JPEG")
        jpeg_bytes = jpeg_buffer.getvalue()

        system_prompt = conf.get("system_prompt")
        if system_prompt:
            txt = cv_model.describe_with_prompt(jpeg_bytes, system_prompt)
        else:
            txt = cv_model.describe(jpeg_bytes)
    else:
        # use ocr, recognize chars only
        recognized = OCR()(np.array(img))  # boxes paired with recognition results
        txt = "\n".join(result[0] for _, result in recognized if result[0])

    self.set_output("json", [{"text": txt, "image": img, "doc_type_kwd": "image"}])
def _audio(self, name, blob, **kwargs):
    """Transcribe an audio file with the configured speech-to-text model.

    The blob is written to a named temporary file that keeps the original
    extension, and the file's absolute path is passed to the model's
    ``transcription`` call. The model is resolved from ``vlm["llm_id"]``
    in the ``audio`` setup.

    Args:
        name: Original file name; only its extension is used.
        blob: Raw audio bytes.
        **kwargs: Accepted for dispatcher compatibility; unused here.

    Outputs:
        ``output_format`` and ``text`` (the transcription result).
    """
    import tempfile

    self.callback(random.randint(1, 5) / 100.0, "Start to work on an audio.")

    audio_conf = self._param.setups["audio"]
    model_ref = audio_conf.get("vlm")
    self.set_output("output_format", audio_conf["output_format"])

    extension = os.path.splitext(name)[1]
    with tempfile.NamedTemporaryFile(suffix=extension) as audio_file:
        audio_file.write(blob)
        # Flush so the model reads the full content from the path.
        audio_file.flush()
        audio_path = os.path.abspath(audio_file.name)

        model_config = get_model_config_by_type_and_name(self._canvas.get_tenant_id(), LLMType.SPEECH2TEXT, model_ref["llm_id"])
        speech_model = LLMBundle(self._canvas.get_tenant_id(), model_config)
        transcript = speech_model.transcription(audio_path)
        self.set_output("text", transcript)
def _video(self, name, blob, **kwargs):
    """Describe a video with the configured image-to-text model.

    The model is resolved from ``vlm["llm_id"]`` in the ``video`` setup and
    invoked through its async chat API, driven to completion with
    ``asyncio.run``. The optional ``prompt`` setting is coerced to a string
    (``""`` when missing or falsy).

    Args:
        name: File name forwarded to the model as ``filename``.
        blob: Raw video bytes forwarded as ``video_bytes``.
        **kwargs: Accepted for dispatcher compatibility; unused here.

    Outputs:
        ``output_format`` and ``text`` (the model's reply).
    """
    self.callback(random.randint(1, 5) / 100.0, "Start to work on an video.")

    video_conf = self._param.setups["video"]
    model_ref = video_conf.get("vlm")
    self.set_output("output_format", video_conf["output_format"])

    model_config = get_model_config_by_type_and_name(self._canvas.get_tenant_id(), LLMType.IMAGE2TEXT, model_ref["llm_id"])
    vision_model = LLMBundle(self._canvas.get_tenant_id(), model_config)

    prompt_text = str(video_conf.get("prompt", "") or "")
    chat_call = vision_model.async_chat(
        system="",
        history=[],
        gen_conf={},
        video_bytes=blob,
        filename=name,
        video_prompt=prompt_text,
    )
    self.set_output("text", asyncio.run(chat_call))
def _email(self, name, blob, **kwargs):
    """Parse eml/msg files into structured email content.

    Files whose extension is ``.eml`` (case-insensitive) are parsed with the
    standard-library ``email`` package; everything else is handed to
    ``extract_msg``. Only the fields listed in the ``email`` setup's
    ``fields`` list are extracted (``from``/``to``/``cc``/``bcc``/``date``/
    ``subject``, ``body``, ``attachments``); a ``metadata`` dict is always
    produced.

    Args:
        name: File name; its extension selects the parser.
        blob: Raw bytes of the email file.
        **kwargs: Accepted for dispatcher compatibility; unused here.

    Outputs:
        ``output_format`` always. ``json`` (a one-element list with the
        collected dict plus ``doc_type_kwd``) when the configured format is
        ``"json"``; otherwise ``text``, a flat ``key:value`` rendering.
    """
    self.callback(random.randint(1, 5) / 100.0, "Start to work on an email.")
    email_content = {}
    conf = self._param.setups["email"]
    self.set_output("output_format", conf["output_format"])
    target_fields = conf["fields"]
    basic_headers = ("from", "to", "cc", "bcc", "date", "subject")

    def _decode_bytes(payload, charset):
        """Decode *payload* leniently: declared charset first, then common fallbacks."""
        if payload is None:
            return ""
        try:
            return payload.decode(charset or "utf-8")
        except (UnicodeDecodeError, LookupError):
            for enc in ("utf-8", "gb2312", "gbk", "gb18030", "latin1"):
                try:
                    return payload.decode(enc)
                except UnicodeDecodeError:
                    continue
        return payload.decode("utf-8", errors="ignore")

    _, ext = os.path.splitext(name)
    # The dispatcher matches suffixes case-insensitively, so do the same here;
    # otherwise "MAIL.EML" would be sent to the .msg parser.
    if ext.lower() == ".eml":
        # handle eml file
        from email import policy
        from email.parser import BytesParser

        msg = BytesParser(policy=policy.default).parse(io.BytesIO(blob))
        email_content["metadata"] = {}
        # handle header info
        for header, value in msg.items():
            key = header.lower()
            if key in target_fields:
                # requested fields like from, to, cc, bcc, date, subject
                email_content[key] = value
            elif key not in basic_headers:
                # every other non-basic header is kept as metadata
                email_content["metadata"][key] = value

        # get body
        if "body" in target_fields:
            body_text, body_html = [], []

            def _add_content(m, content_type):
                """Collect text/plain and text/html payloads from *m*, recursing into multiparts."""
                if content_type in ("text/plain", "text/html"):
                    # Read from the part itself: the root of a multipart
                    # message has no decodable payload (get_payload returns None).
                    payload = m.get_payload(decode=True)
                    if payload is None:
                        return
                    target = body_text if content_type == "text/plain" else body_html
                    target.append(_decode_bytes(payload, m.get_content_charset()))
                elif "multipart" in content_type and m.is_multipart():
                    for part in m.iter_parts():
                        _add_content(part, part.get_content_type())

            _add_content(msg, msg.get_content_type())
            email_content["text"] = "\n".join(body_text)
            email_content["text_html"] = "\n".join(body_html)

        # get attachment
        if "attachments" in target_fields:
            attachments = []
            for part in msg.iter_attachments():
                content_disposition = part.get("Content-Disposition")
                if not content_disposition:
                    continue
                dispositions = content_disposition.strip().split(";")
                if dispositions[0].lower() != "attachment":
                    continue
                # Binary attachments declare no charset; decode leniently
                # instead of failing on ``decode(None)``.
                attachments.append(
                    {
                        "filename": part.get_filename(),
                        "payload": _decode_bytes(part.get_payload(decode=True), part.get_content_charset()),
                    }
                )
            email_content["attachments"] = attachments
    else:
        # handle msg file
        import extract_msg

        msg = extract_msg.Message(blob)
        # handle header info
        basic_content = {
            "from": msg.sender,
            "to": msg.to,
            "cc": msg.cc,
            "bcc": msg.bcc,
            "date": msg.date,
            "subject": msg.subject,
        }
        email_content.update({k: v for k, v in basic_content.items() if k in target_fields})
        # get metadata
        email_content["metadata"] = {
            "message_id": msg.messageId,
            "in_reply_to": msg.inReplyTo,
        }
        # get body
        if "body" in target_fields:
            email_content["text"] = msg.body[0] if isinstance(msg.body, list) and msg.body else msg.body
            if not email_content["text"] and msg.htmlBody:
                email_content["text"] = msg.htmlBody[0] if isinstance(msg.htmlBody, list) and msg.htmlBody else msg.htmlBody
        # get attachments
        if "attachments" in target_fields:
            attachments = []
            for t in msg.attachments:
                data = t.data
                if isinstance(data, (bytes, bytearray)):
                    payload = _decode_bytes(bytes(data), "utf-8")
                else:
                    # Best effort for attachments without raw bytes
                    # (NOTE(review): e.g. missing data or embedded messages — confirm).
                    payload = "" if data is None else str(data)
                attachments.append(
                    {
                        "filename": t.name,
                        "payload": payload,
                    }
                )
            email_content["attachments"] = attachments

    if conf["output_format"] == "json":
        email_content["doc_type_kwd"] = "text"
        self.set_output("json", [email_content])
        return

    pieces = []
    for k, v in email_content.items():
        if isinstance(v, str):
            # basic info
            pieces.append(f"{k}:{v}" + "\n")
        elif isinstance(v, dict):
            # metadata
            pieces.append(f"{k}:{json.dumps(v)}" + "\n")
        elif isinstance(v, list):
            # attachments or others
            for fb in v:
                if isinstance(fb, dict):
                    # attachments
                    pieces.append(f"{fb['filename']}:{fb['payload']}" + "\n")
                else:
                    # str, usually plain text
                    pieces.append(fb)
    self.set_output("text", "".join(pieces))
def _epub(self, name, blob, **kwargs):
    """Convert an EPUB file into sections and publish them.

    Args:
        name: File name handed to ``EpubParser``.
        blob: Raw EPUB bytes, passed as ``binary``.
        **kwargs: Accepted for dispatcher compatibility; unused here.

    Outputs:
        ``output_format`` always; ``json`` (a list of
        ``{"text", "doc_type_kwd"}`` dicts) when the configured format is
        ``"json"``, otherwise ``text`` (non-empty sections joined by
        newlines).
    """
    from deepdoc.parser import EpubParser

    self.callback(random.randint(1, 5) / 100.0, "Start to work on an EPUB.")

    epub_conf = self._param.setups["epub"]
    self.set_output("output_format", epub_conf["output_format"])

    sections = EpubParser()(name, binary=blob)

    if epub_conf.get("output_format") != "json":
        self.set_output("text", "\n".join(part for part in sections if part))
        return

    records = [{"text": part, "doc_type_kwd": "text"} for part in sections if part]
    self.set_output("json", records)
async def _invoke(self, **kwargs):
    """Route the incoming file to the parser branch configured for its suffix.

    Steps:
      1. Validate ``kwargs`` as ``ParserFromUpstream``; on failure record an
         ``_ERROR`` output and return.
      2. Load the file bytes: from storage when the canvas carries a document
         id, otherwise through ``FileService``.
      3. Run the first setup whose ``suffix`` list contains the lower-cased
         file extension, in the thread pool.
      4. Pass every ``json`` output item through ``image2id`` concurrently.

    Raises:
        Exception: when no setup matches the file extension, or when any
            ``image2id`` task fails (remaining tasks are cancelled first).
    """
    handlers = {
        "pdf": self._pdf,
        "markdown": self._markdown,
        "text&code": self._code,
        "html": self._html,
        "spreadsheet": self._spreadsheet,
        "slides": self._slides,
        "doc": self._doc,
        "docx": self._docx,
        "image": self._image,
        "audio": self._audio,
        "video": self._video,
        "email": self._email,
        "epub": self._epub,
    }

    try:
        from_upstream = ParserFromUpstream.model_validate(kwargs)
    except Exception as e:
        self.set_output("_ERROR", f"Input error: {str(e)}")
        return

    name = from_upstream.name
    if self._canvas._doc_id:
        bucket, key = File2DocumentService.get_storage_address(doc_id=self._canvas._doc_id)
        blob = settings.STORAGE_IMPL.get(bucket, key)
    else:
        blob = FileService.get_blob(from_upstream.file["created_by"], from_upstream.file["id"])

    suffix = from_upstream.name.split(".")[-1].lower()
    for p_type, conf in self._param.setups.items():
        if suffix in conf.get("suffix", []):
            # name/blob are passed positionally, so strip them from the forwarded kwargs.
            forwarded = {k: v for k, v in kwargs.items() if k not in ("name", "blob")}
            await thread_pool_exec(handlers[p_type], name, blob, **forwarded)
            break
    else:
        raise Exception("No suitable for file extension: `.%s`" % suffix)

    outs = self.output()
    tasks = [
        asyncio.create_task(image2id(item, partial(settings.STORAGE_IMPL.put, tenant_id=self._canvas._tenant_id), get_uuid()))
        for item in outs.get("json", [])
    ]
    try:
        await asyncio.gather(*tasks, return_exceptions=False)
    except Exception as e:
        logging.error("Error while parsing: %s" % e)
        # Cancel whatever is still running and wait for it to settle before re-raising.
        for pending in tasks:
            pending.cancel()
        await asyncio.gather(*tasks, return_exceptions=True)
        raise