Closes #1398

### What problem does this PR solve?

Adds native support for EPUB files. EPUB content is extracted in spine (reading) order and parsed using the existing HTML parser. No new dependencies required.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)

To check this parser manually:

```python
uv run --python 3.12 python -c "
from deepdoc.parser import EpubParser
with open('$HOME/some_epub_book.epub', 'rb') as f:
    data = f.read()
sections = EpubParser()(None, binary=data, chunk_token_num=512)
print(f'Got {len(sections)} sections')
for i, s in enumerate(sections[:5]):
    print(f'\n--- Section {i} ---')
    print(s[:200])
"
```
1088 lines · 44 KiB · Python
#
# Copyright 2025 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import logging
import re
import os
from functools import reduce
from io import BytesIO
from timeit import default_timer as timer

from docx import Document
from docx.opc.pkgreader import _SerializedRelationships, _SerializedRelationship
from docx.table import Table as DocxTable
from docx.text.paragraph import Paragraph
from docx.opc.oxml import parse_xml
from markdown import markdown
from PIL import Image
from common.token_utils import num_tokens_from_string

from common.constants import LLMType
from api.db.services.llm_service import LLMBundle
from api.db.joint_services.tenant_model_service import get_model_config_by_type_and_name, get_tenant_default_model_by_type
from rag.utils.file_utils import extract_embed_file, extract_links_from_pdf, extract_links_from_docx, extract_html
from deepdoc.parser import DocxParser, EpubParser, ExcelParser, HtmlParser, JsonParser, MarkdownElementExtractor, MarkdownParser, PdfParser, TxtParser
from deepdoc.parser.figure_parser import VisionFigureParser, vision_figure_parser_docx_wrapper_naive, vision_figure_parser_pdf_wrapper
from deepdoc.parser.pdf_parser import PlainParser, VisionParser
from deepdoc.parser.docling_parser import DoclingParser
from deepdoc.parser.tcadp_parser import TCADPParser
from common.float_utils import normalize_overlapped_percent
from common.parser_config_utils import normalize_layout_recognizer
from common.text_utils import normalize_arabic_presentation_forms
from rag.nlp import (
    concat_img,
    find_codec,
    naive_merge,
    naive_merge_with_images,
    naive_merge_docx,
    rag_tokenizer,
    tokenize_chunks,
    doc_tokenize_chunks_with_images,
    tokenize_table,
    append_context2table_image4pdf,
    tokenize_chunks_with_images,
)  # noqa: F401


def _normalize_section_text_for_rtl_presentation_forms(sections):
    if not sections:
        return sections

    normalized_sections = []
    for section in sections:
        if isinstance(section, tuple):
            if not section:
                normalized_sections.append(section)
                continue
            text = section[0]
            normalized_text = normalize_arabic_presentation_forms(text)
            normalized_sections.append((normalized_text, *section[1:]))
            continue
        if isinstance(section, list):
            if not section:
                normalized_sections.append(section)
                continue
            text = section[0]
            normalized_text = normalize_arabic_presentation_forms(text)
            normalized_sections.append([normalized_text, *section[1:]])
            continue
        normalized_sections.append(normalize_arabic_presentation_forms(section))

    return normalized_sections
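# Behaviour note (descriptive only, no new code path): sections may arrive as plain strings,
# as tuples like (text, position_tag) or as lists like [text, image]; only the text element is
# rewritten by normalize_arabic_presentation_forms(), and everything after it is passed through
# unchanged, so downstream consumers keep their expected shapes.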


def by_deepdoc(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls=None, **kwargs):
    callback = callback
    binary = binary
    pdf_parser = pdf_cls() if pdf_cls else Pdf()
    sections, tables = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page, callback=callback)

    tables = vision_figure_parser_pdf_wrapper(
        tbls=tables,
        sections=sections,
        callback=callback,
        **kwargs,
    )
    return sections, tables, pdf_parser


def by_mineru(
    filename,
    binary=None,
    from_page=0,
    to_page=100000,
    lang="Chinese",
    callback=None,
    pdf_cls=None,
    parse_method: str = "raw",
    mineru_llm_name: str | None = None,
    tenant_id: str | None = None,
    **kwargs,
):
    pdf_parser = None
    if tenant_id:
        if not mineru_llm_name:
            try:
                from api.db.services.tenant_llm_service import TenantLLMService

                env_name = TenantLLMService.ensure_mineru_from_env(tenant_id)
                candidates = TenantLLMService.query(tenant_id=tenant_id, llm_factory="MinerU", model_type=LLMType.OCR)
                if candidates:
                    mineru_llm_name = candidates[0].llm_name
                elif env_name:
                    mineru_llm_name = env_name
            except Exception as e:  # best-effort fallback
                logging.warning(f"fallback to env mineru: {e}")

        if mineru_llm_name:
            try:
                ocr_model_config = get_model_config_by_type_and_name(tenant_id, LLMType.OCR, mineru_llm_name)
                ocr_model = LLMBundle(tenant_id=tenant_id, model_config=ocr_model_config, lang=lang)
                pdf_parser = ocr_model.mdl
                sections, tables = pdf_parser.parse_pdf(
                    filepath=filename,
                    binary=binary,
                    callback=callback,
                    parse_method=parse_method,
                    lang=lang,
                    **kwargs,
                )
                return sections, tables, pdf_parser
            except Exception as e:
                logging.error(f"Failed to parse pdf via LLMBundle MinerU ({mineru_llm_name}): {e}")

    if callback:
        callback(-1, "MinerU not found.")
    return None, None, None


def by_docling(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls=None, **kwargs):
    pdf_parser = DoclingParser()
    parse_method = kwargs.get("parse_method", "raw")

    if not pdf_parser.check_installation():
        if callback:
            callback(-1, "Docling not found.")
        return None, None, pdf_parser

    sections, tables = pdf_parser.parse_pdf(
        filepath=filename,
        binary=binary,
        callback=callback,
        output_dir=os.environ.get("DOCLING_OUTPUT_DIR", ""),
        delete_output=bool(int(os.environ.get("DOCLING_DELETE_OUTPUT", 1))),
        docling_server_url=os.environ.get("DOCLING_SERVER_URL", ""),
        parse_method=parse_method,
    )
    return sections, tables, pdf_parser


def by_tcadp(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls=None, **kwargs):
    tcadp_parser = TCADPParser()

    if not tcadp_parser.check_installation():
        callback(-1, "TCADP parser not available. Please check Tencent Cloud API configuration.")
        return None, None, tcadp_parser

    sections, tables = tcadp_parser.parse_pdf(filepath=filename, binary=binary, callback=callback, output_dir=os.environ.get("TCADP_OUTPUT_DIR", ""), file_type="PDF")
    return sections, tables, tcadp_parser


def by_paddleocr(
    filename,
    binary=None,
    from_page=0,
    to_page=100000,
    lang="Chinese",
    callback=None,
    pdf_cls=None,
    parse_method: str = "raw",
    paddleocr_llm_name: str | None = None,
    tenant_id: str | None = None,
    **kwargs,
):
    pdf_parser = None
    if tenant_id:
        if not paddleocr_llm_name:
            try:
                from api.db.services.tenant_llm_service import TenantLLMService

                env_name = TenantLLMService.ensure_paddleocr_from_env(tenant_id)
                candidates = TenantLLMService.query(tenant_id=tenant_id, llm_factory="PaddleOCR", model_type=LLMType.OCR)
                if candidates:
                    paddleocr_llm_name = candidates[0].llm_name
                elif env_name:
                    paddleocr_llm_name = env_name
            except Exception as e:  # best-effort fallback
                logging.warning(f"fallback to env paddleocr: {e}")

        if paddleocr_llm_name:
            try:
                ocr_model_config = get_model_config_by_type_and_name(tenant_id, LLMType.OCR, paddleocr_llm_name)
                ocr_model = LLMBundle(tenant_id=tenant_id, model_config=ocr_model_config, lang=lang)
                pdf_parser = ocr_model.mdl
                sections, tables = pdf_parser.parse_pdf(
                    filepath=filename,
                    binary=binary,
                    callback=callback,
                    parse_method=parse_method,
                    **kwargs,
                )
                return sections, tables, pdf_parser
            except Exception as e:
                logging.error(f"Failed to parse pdf via LLMBundle PaddleOCR ({paddleocr_llm_name}): {e}")

            return None, None, None

    if callback:
        callback(-1, "PaddleOCR not found.")
    return None, None, None


def by_plaintext(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
    layout_recognizer = (kwargs.get("layout_recognizer") or "").strip()
    if (not layout_recognizer) or (layout_recognizer == "Plain Text"):
        pdf_parser = PlainParser()
    else:
        tenant_id = kwargs.get("tenant_id")
        if not tenant_id:
            raise ValueError("tenant_id is required when using vision layout recognizer")
        vision_model_config = get_model_config_by_type_and_name(tenant_id, LLMType.IMAGE2TEXT, layout_recognizer)
        vision_model = LLMBundle(
            tenant_id,
            model_config=vision_model_config,
            lang=kwargs.get("lang", "Chinese"),
        )
        pdf_parser = VisionParser(vision_model=vision_model, **kwargs)

    sections, tables = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page, callback=callback)
    return sections, tables, pdf_parser


PARSERS = {
    "deepdoc": by_deepdoc,
    "mineru": by_mineru,
    "docling": by_docling,
    "tcadp parser": by_tcadp,
    "paddleocr": by_paddleocr,
    "plaintext": by_plaintext,  # default
}
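# Illustrative dispatch sketch (not executed here; "scanned.pdf" and pdf_bytes are made-up names):
# chunk() lower-cases the configured layout recognizer and falls back to plain-text parsing when
# the name is unknown, e.g.
#
#     parser = PARSERS.get("mineru", by_plaintext)
#     sections, tables, pdf_parser = parser(filename="scanned.pdf", binary=pdf_bytes, callback=print)
#
# Every by_* backend returns the same (sections, tables, pdf_parser) triple, so the caller does
# not need to know which engine actually ran.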


class Docx(DocxParser):
    def __init__(self):
        pass

    def __clean(self, line):
        line = re.sub(r"\u3000", " ", line).strip()
        return line

    def __get_nearest_title(self, table_index, filename):
        """Get the hierarchical title structure before the table"""
        import re

        from docx.text.paragraph import Paragraph

        titles = []
        blocks = []

        # Get document name from filename parameter
        doc_name = re.sub(r"\.[a-zA-Z]+$", "", filename)
        if not doc_name:
            doc_name = "Untitled Document"

        # Collect all document blocks while maintaining document order
        try:
            # Iterate through all paragraphs and tables in document order
            for i, block in enumerate(self.doc._element.body):
                if block.tag.endswith("p"):  # Paragraph
                    p = Paragraph(block, self.doc)
                    blocks.append(("p", i, p))
                elif block.tag.endswith("tbl"):  # Table
                    blocks.append(("t", i, None))  # Table object will be retrieved later
        except Exception as e:
            logging.error(f"Error collecting blocks: {e}")
            return ""

        # Find the target table position
        target_table_pos = -1
        table_count = 0
        for i, (block_type, pos, _) in enumerate(blocks):
            if block_type == "t":
                if table_count == table_index:
                    target_table_pos = pos
                    break
                table_count += 1

        if target_table_pos == -1:
            return ""  # Target table not found

        # Find the nearest heading paragraph in reverse order
        nearest_title = None
        for i in range(len(blocks) - 1, -1, -1):
            block_type, pos, block = blocks[i]
            if pos >= target_table_pos:  # Skip blocks after the table
                continue

            if block_type != "p":
                continue

            if block.style and block.style.name and re.search(r"Heading\s*(\d+)", block.style.name, re.I):
                try:
                    level_match = re.search(r"(\d+)", block.style.name)
                    if level_match:
                        level = int(level_match.group(1))
                        if level <= 7:  # Support up to 7 heading levels
                            title_text = block.text.strip()
                            if title_text:  # Avoid empty titles
                                nearest_title = (level, title_text)
                                break
                except Exception as e:
                    logging.error(f"Error parsing heading level: {e}")

        if nearest_title:
            # Add current title
            titles.append(nearest_title)
            current_level = nearest_title[0]

            # Find all parent headings, allowing cross-level search
            while current_level > 1:
                found = False
                for i in range(len(blocks) - 1, -1, -1):
                    block_type, pos, block = blocks[i]
                    if pos >= target_table_pos:  # Skip blocks after the table
                        continue

                    if block_type != "p":
                        continue

                    if block.style and re.search(r"Heading\s*(\d+)", block.style.name, re.I):
                        try:
                            level_match = re.search(r"(\d+)", block.style.name)
                            if level_match:
                                level = int(level_match.group(1))
                                # Find any heading with a higher level
                                if level < current_level:
                                    title_text = block.text.strip()
                                    if title_text:  # Avoid empty titles
                                        titles.append((level, title_text))
                                        current_level = level
                                        found = True
                                        break
                        except Exception as e:
                            logging.error(f"Error parsing parent heading: {e}")

                if not found:  # Break if no parent heading is found
                    break

            # Sort by level (ascending, from highest to lowest)
            titles.sort(key=lambda x: x[0])
            # Organize titles (from highest to lowest)
            hierarchy = [doc_name] + [t[1] for t in titles]
            return " > ".join(hierarchy)

        return ""

    def __call__(self, filename, binary=None, from_page=0, to_page=100000):
        self.doc = Document(filename) if not binary else Document(BytesIO(binary))
        pn = 0
        lines = []
        last_image = None
        table_idx = 0

        def flush_last_image():
            nonlocal last_image, lines
            if last_image is not None:
                lines.append({"text": "", "image": last_image, "table": None, "style": "Image"})
                last_image = None

        for block in self.doc._element.body:
            if pn > to_page:
                break

            if block.tag.endswith("p"):
                p = Paragraph(block, self.doc)

                if from_page <= pn < to_page:
                    text = p.text.strip()
                    style_name = p.style.name if p.style else ""

                    if text:
                        if style_name == "Caption":
                            former_image = None

                            if lines and lines[-1].get("image") and lines[-1].get("style") != "Caption":
                                former_image = lines[-1].get("image")
                                lines.pop()

                            elif last_image is not None:
                                former_image = last_image
                                last_image = None

                            lines.append(
                                {
                                    "text": self.__clean(text),
                                    "image": former_image if former_image else None,
                                    "table": None,
                                }
                            )

                        else:
                            flush_last_image()
                            lines.append(
                                {
                                    "text": self.__clean(text),
                                    "image": None,
                                    "table": None,
                                }
                            )

                            current_image = self.get_picture(self.doc, p)
                            if current_image is not None:
                                lines.append(
                                    {
                                        "text": "",
                                        "image": current_image,
                                        "table": None,
                                    }
                                )

                    else:
                        current_image = self.get_picture(self.doc, p)
                        if current_image is not None:
                            last_image = current_image

                for run in p.runs:
                    xml = run._element.xml
                    if "lastRenderedPageBreak" in xml:
                        pn += 1
                        continue
                    if "w:br" in xml and 'type="page"' in xml:
                        pn += 1

            elif block.tag.endswith("tbl"):
                if pn < from_page or pn > to_page:
                    table_idx += 1
                    continue

                flush_last_image()
                tb = DocxTable(block, self.doc)
                title = self.__get_nearest_title(table_idx, filename)
                html = "<table>"
                if title:
                    html += f"<caption>Table Location: {title}</caption>"
                for r in tb.rows:
                    html += "<tr>"
                    col_idx = 0
                    try:
                        while col_idx < len(r.cells):
                            span = 1
                            c = r.cells[col_idx]
                            for j in range(col_idx + 1, len(r.cells)):
                                if c.text == r.cells[j].text:
                                    span += 1
                                    col_idx = j
                                else:
                                    break
                            col_idx += 1
                            html += f"<td>{c.text}</td>" if span == 1 else f"<td colspan='{span}'>{c.text}</td>"
                    except Exception as e:
                        logging.warning(f"Error parsing table, ignore: {e}")
                    html += "</tr>"
                html += "</table>"
                lines.append({"text": "", "image": None, "table": html})
                table_idx += 1

        flush_last_image()
        new_line = [(line.get("text"), line.get("image"), line.get("table")) for line in lines]

        return new_line
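    # Note: __call__ returns a list of (text, image, table_html) triples; these are the
    # `sections` that naive_merge_docx() later merges into chunks inside chunk() below.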

    def to_markdown(self, filename=None, binary=None, inline_images: bool = True):
        """
        This function uses mammoth, licensed under the BSD 2-Clause License.
        """

        import base64
        import uuid

        import mammoth
        from markdownify import markdownify

        docx_file = BytesIO(binary) if binary else open(filename, "rb")

        def _convert_image_to_base64(image):
            try:
                with image.open() as image_file:
                    image_bytes = image_file.read()
                encoded = base64.b64encode(image_bytes).decode("utf-8")
                base64_url = f"data:{image.content_type};base64,{encoded}"

                alt_name = "image"
                alt_name = f"img_{uuid.uuid4().hex[:8]}"

                return {"src": base64_url, "alt": alt_name}
            except Exception as e:
                logging.warning(f"Failed to convert image to base64: {e}")
                return {"src": "", "alt": "image"}

        try:
            if inline_images:
                result = mammoth.convert_to_html(docx_file, convert_image=mammoth.images.img_element(_convert_image_to_base64))
            else:
                result = mammoth.convert_to_html(docx_file)

            html = result.value

            markdown_text = markdownify(html)
            return markdown_text

        finally:
            if not binary:
                docx_file.close()


class Pdf(PdfParser):
    def __init__(self):
        super().__init__()

    def __call__(self, filename, binary=None, from_page=0, to_page=100000, zoomin=3, callback=None, separate_tables_figures=False):
        start = timer()
        first_start = start
        callback(msg="OCR started")
        self.__images__(filename if not binary else binary, zoomin, from_page, to_page, callback)
        callback(msg="OCR finished ({:.2f}s)".format(timer() - start))
        logging.info("OCR({}~{}): {:.2f}s".format(from_page, to_page, timer() - start))

        start = timer()
        self._layouts_rec(zoomin)
        callback(0.63, "Layout analysis ({:.2f}s)".format(timer() - start))

        start = timer()
        self._table_transformer_job(zoomin)
        callback(0.65, "Table analysis ({:.2f}s)".format(timer() - start))

        start = timer()
        self._text_merge(zoomin=zoomin)
        callback(0.67, "Text merged ({:.2f}s)".format(timer() - start))

        if separate_tables_figures:
            tbls, figures = self._extract_table_figure(True, zoomin, True, True, True)
            self._concat_downward()
            logging.info("layouts cost: {}s".format(timer() - first_start))
            return [(b["text"], self._line_tag(b, zoomin)) for b in self.boxes], tbls, figures
        else:
            tbls = self._extract_table_figure(True, zoomin, True, True)
            self._naive_vertical_merge()
            self._concat_downward()
            # self._final_reading_order_merge()
            # self._filter_forpages()
            logging.info("layouts cost: {}s".format(timer() - first_start))
            return [(b["text"], self._line_tag(b, zoomin)) for b in self.boxes], tbls
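# Note: Pdf.__call__ returns (sections, tables) where each section is a (text, position_tag)
# pair produced by _line_tag(); with separate_tables_figures=True a third element containing
# the extracted figures is returned as well.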


class Markdown(MarkdownParser):
    def md_to_html(self, sections):
        if not sections:
            return []
        if isinstance(sections, type("")):
            text = sections
        elif isinstance(sections[0], type("")):
            text = sections[0]
        else:
            return []

        from bs4 import BeautifulSoup

        html_content = markdown(text)
        soup = BeautifulSoup(html_content, "html.parser")
        return soup

    def get_hyperlink_urls(self, soup):
        if soup:
            return set([a.get("href") for a in soup.find_all("a") if a.get("href")])
        return []

    def extract_image_urls_with_lines(self, text):
        md_img_re = re.compile(r"!\[[^\]]*\]\(([^)\s]+)")
        html_img_re = re.compile(r'src=["\']([^"\'>\s]+)', re.IGNORECASE)
        urls = []
        seen = set()
        lines = text.splitlines()
        for idx, line in enumerate(lines):
            for url in md_img_re.findall(line):
                if (url, idx) not in seen:
                    urls.append({"url": url, "line": idx})
                    seen.add((url, idx))
            for url in html_img_re.findall(line):
                if (url, idx) not in seen:
                    urls.append({"url": url, "line": idx})
                    seen.add((url, idx))

        # cross-line
        try:
            from bs4 import BeautifulSoup

            soup = BeautifulSoup(text, "html.parser")
            newline_offsets = [m.start() for m in re.finditer(r"\n", text)] + [len(text)]
            for img_tag in soup.find_all("img"):
                src = img_tag.get("src")
                if not src:
                    continue

                tag_str = str(img_tag)
                pos = text.find(tag_str)
                if pos == -1:
                    # fallback
                    pos = max(text.find(src), 0)
                line_no = 0
                for i, off in enumerate(newline_offsets):
                    if pos <= off:
                        line_no = i
                        break
                if (src, line_no) not in seen:
                    urls.append({"url": src, "line": line_no})
                    seen.add((src, line_no))
        except Exception as e:
            logging.error("Failed to extract image urls: {}".format(e))
            pass

        return urls

    def load_images_from_urls(self, urls, cache=None):
        import requests
        from pathlib import Path

        cache = cache or {}
        images = []
        for url in urls:
            if url in cache:
                if cache[url]:
                    images.append(cache[url])
                continue
            img_obj = None
            try:
                if url.startswith(("http://", "https://")):
                    response = requests.get(url, stream=True, timeout=30)
                    if response.status_code == 200 and response.headers.get("Content-Type", "").startswith("image/"):
                        img_obj = Image.open(BytesIO(response.content)).convert("RGB")
                else:
                    local_path = Path(url)
                    if local_path.exists():
                        img_obj = Image.open(url).convert("RGB")
                    else:
                        logging.warning(f"Local image file not found: {url}")
            except Exception as e:
                logging.error(f"Failed to download/open image from {url}: {e}")
            cache[url] = img_obj
            if img_obj:
                images.append(img_obj)
        return images, cache

    def __call__(self, filename, binary=None, separate_tables=True, delimiter=None, return_section_images=False):
        if binary:
            encoding = find_codec(binary)
            txt = binary.decode(encoding, errors="ignore")
        else:
            with open(filename, "r") as f:
                txt = f.read()

        remainder, tables = self.extract_tables_and_remainder(f"{txt}\n", separate_tables=separate_tables)
        # To eliminate duplicate tables in chunking result, uncomment code below and set separate_tables to True in line 410.
        # extractor = MarkdownElementExtractor(remainder)
        extractor = MarkdownElementExtractor(txt)
        image_refs = self.extract_image_urls_with_lines(txt)
        element_sections = extractor.extract_elements(delimiter, include_meta=True)

        sections = []
        section_images = []
        image_cache = {}
        for element in element_sections:
            content = element["content"]
            start_line = element["start_line"]
            end_line = element["end_line"]
            urls_in_section = [ref["url"] for ref in image_refs if start_line <= ref["line"] <= end_line]
            imgs = []
            if urls_in_section:
                imgs, image_cache = self.load_images_from_urls(urls_in_section, image_cache)
            combined_image = None
            if imgs:
                combined_image = reduce(concat_img, imgs) if len(imgs) > 1 else imgs[0]
            sections.append((content, ""))
            section_images.append(combined_image)

        tbls = []
        for table in tables:
            tbls.append(((None, markdown(table, extensions=["markdown.extensions.tables"])), ""))
        if return_section_images:
            return sections, tbls, section_images
        return sections, tbls
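# Note: Markdown.__call__ returns sections as (content, "") pairs plus tables rendered to HTML;
# with return_section_images=True it also returns one (possibly None) PIL image per section,
# already combined via concat_img when a section references several images.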


def load_from_xml_v2(baseURI, rels_item_xml):
    """
    Return |_SerializedRelationships| instance loaded with the
    relationships contained in *rels_item_xml*. Returns an empty
    collection if *rels_item_xml* is |None|.
    """
    srels = _SerializedRelationships()
    if rels_item_xml is not None:
        rels_elm = parse_xml(rels_item_xml)
        for rel_elm in rels_elm.Relationship_lst:
            if rel_elm.target_ref in ("../NULL", "NULL") or rel_elm.target_ref.startswith("#"):
                continue
            srels._srels.append(_SerializedRelationship(baseURI, rel_elm))
    return srels
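# chunk() installs this as a monkey patch before opening .docx files:
#     _SerializedRelationships.load_from_xml = load_from_xml_v2
# so that relationships pointing at "NULL" targets are skipped instead of raising
# "There is no item named 'word/NULL' in the archive".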


def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
    """
    Supported file formats are docx, pdf, csv/excel, txt (plus common code files), markdown, html, epub, json and doc.
    This method applies the naive way to chunk files:
    successive text is sliced into pieces using 'delimiter', and these pieces are then
    merged into chunks whose token count does not exceed 'Max token number'.
    """
    urls = set()
    url_res = []

    is_english = lang.lower() == "english"  # is_english(cks)
    parser_config = kwargs.get("parser_config", {"chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC", "analyze_hyperlink": True})

    child_deli = (parser_config.get("children_delimiter") or "").encode("utf-8").decode("unicode_escape").encode("latin1").decode("utf-8")
    cust_child_deli = re.findall(r"`([^`]+)`", child_deli)
    child_deli = "|".join(re.sub(r"`([^`]+)`", "", child_deli))
    if cust_child_deli:
        cust_child_deli = sorted(set(cust_child_deli), key=lambda x: -len(x))
        cust_child_deli = "|".join(re.escape(t) for t in cust_child_deli if t)
        child_deli += cust_child_deli

    is_markdown = False
    table_context_size = max(0, int(parser_config.get("table_context_size", 0) or 0))
    image_context_size = max(0, int(parser_config.get("image_context_size", 0) or 0))

    doc = {"docnm_kwd": filename, "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))}
    doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
    res = []
    pdf_parser = None
    section_images = None

    is_root = kwargs.get("is_root", True)
    embed_res = []
    if is_root:
        # Only extract embedded files at the root call
        embeds = []
        if binary is not None:
            embeds = extract_embed_file(binary)
        else:
            raise Exception("Embedding extraction from file path is not supported.")

        # Recursively chunk each embedded file and collect results
        for embed_filename, embed_bytes in embeds:
            try:
                sub_res = chunk(embed_filename, binary=embed_bytes, lang=lang, callback=callback, is_root=False, **kwargs) or []
                embed_res.extend(sub_res)
            except Exception as e:
                error_msg = f"Failed to chunk embed {embed_filename}: {e}"
                logging.error(error_msg)
                if callback:
                    callback(0.05, error_msg)
                continue

    if re.search(r"\.docx$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        if parser_config.get("analyze_hyperlink", False) and is_root:
            urls = extract_links_from_docx(binary)
            for index, url in enumerate(urls):
                html_bytes, metadata = extract_html(url)
                if not html_bytes:
                    continue
                try:
                    sub_url_res = chunk(url, html_bytes, callback=callback, lang=lang, is_root=False, **kwargs)
                except Exception as e:
                    logging.info(f"Failed to chunk url in registered file type {url}: {e}")
                    sub_url_res = chunk(f"{index}.html", html_bytes, callback=callback, lang=lang, is_root=False, **kwargs)
                url_res.extend(sub_url_res)

        # fix "There is no item named 'word/NULL' in the archive", referring to https://github.com/python-openxml/python-docx/issues/1105#issuecomment-1298075246
        _SerializedRelationships.load_from_xml = load_from_xml_v2

        # sections = (text, image, tables)
        sections = Docx()(filename, binary)
        sections = _normalize_section_text_for_rtl_presentation_forms(sections)

        # chunks list[dict]
        # images list - index of image chunk in chunks
        chunks, images = naive_merge_docx(sections, int(parser_config.get("chunk_token_num", 128)), parser_config.get("delimiter", "\n!?。;!?"), table_context_size, image_context_size)

        vision_figure_parser_docx_wrapper_naive(chunks=chunks, idx_lst=images, callback=callback, **kwargs)

        callback(0.8, "Finish parsing.")
        st = timer()

        res.extend(doc_tokenize_chunks_with_images(chunks, doc, is_english, child_delimiters_pattern=child_deli))
        logging.info("naive_merge({}): {}".format(filename, timer() - st))
        res.extend(embed_res)
        res.extend(url_res)
        return res

    elif re.search(r"\.pdf$", filename, re.IGNORECASE):
        layout_recognizer, parser_model_name = normalize_layout_recognizer(parser_config.get("layout_recognize", "DeepDOC"))

        if parser_config.get("analyze_hyperlink", False) and is_root:
            urls = extract_links_from_pdf(binary)

        if isinstance(layout_recognizer, bool):
            layout_recognizer = "DeepDOC" if layout_recognizer else "PlainText"

        name = layout_recognizer.strip().lower()
        parser = PARSERS.get(name, by_plaintext)
        callback(0.1, "Start to parse.")

        sections, tables, pdf_parser = parser(
            filename=filename,
            binary=binary,
            from_page=from_page,
            to_page=to_page,
            lang=lang,
            callback=callback,
            layout_recognizer=layout_recognizer,
            mineru_llm_name=parser_model_name,
            paddleocr_llm_name=parser_model_name,
            **kwargs,
        )
        sections = _normalize_section_text_for_rtl_presentation_forms(sections)

        if not sections and not tables:
            return []

        if table_context_size or image_context_size:
            tables = append_context2table_image4pdf(sections, tables, image_context_size)

        if name in ["tcadp", "docling", "mineru", "paddleocr"]:
            if int(parser_config.get("chunk_token_num", 0)) <= 0:
                parser_config["chunk_token_num"] = 0

        res = tokenize_table(tables, doc, is_english)
        callback(0.8, "Finish parsing.")

    elif re.search(r"\.(csv|xlsx?)$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")

        # Check if tcadp_parser is selected for spreadsheet files
        layout_recognizer = parser_config.get("layout_recognize", "DeepDOC")
        if layout_recognizer == "TCADP Parser":
            table_result_type = parser_config.get("table_result_type", "1")
            markdown_image_response_type = parser_config.get("markdown_image_response_type", "1")
            tcadp_parser = TCADPParser(table_result_type=table_result_type, markdown_image_response_type=markdown_image_response_type)
            if not tcadp_parser.check_installation():
                callback(-1, "TCADP parser not available. Please check Tencent Cloud API configuration.")
                return res

            # Determine file type based on extension
            file_type = "XLSX" if re.search(r"\.xlsx?$", filename, re.IGNORECASE) else "CSV"

            sections, tables = tcadp_parser.parse_pdf(filepath=filename, binary=binary, callback=callback, output_dir=os.environ.get("TCADP_OUTPUT_DIR", ""), file_type=file_type)
            sections = _normalize_section_text_for_rtl_presentation_forms(sections)
            parser_config["chunk_token_num"] = 0
            res = tokenize_table(tables, doc, is_english)
            callback(0.8, "Finish parsing.")
        else:
            # Default DeepDOC parser
            excel_parser = ExcelParser()
            if parser_config.get("html4excel"):
                sections = [(_, "") for _ in excel_parser.html(binary, 12) if _]
                parser_config["chunk_token_num"] = 0
            else:
                sections = [(_, "") for _ in excel_parser(binary) if _]
            sections = _normalize_section_text_for_rtl_presentation_forms(sections)

    elif re.search(r"\.(txt|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|sql)$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        sections = TxtParser()(filename, binary, parser_config.get("chunk_token_num", 128), parser_config.get("delimiter", "\n!?;。;!?"))
        sections = _normalize_section_text_for_rtl_presentation_forms(sections)
        callback(0.8, "Finish parsing.")

    elif re.search(r"\.(md|markdown|mdx)$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        markdown_parser = Markdown(int(parser_config.get("chunk_token_num", 128)))
        sections, tables, section_images = markdown_parser(
            filename,
            binary,
            separate_tables=False,
            delimiter=parser_config.get("delimiter", "\n!?;。;!?"),
            return_section_images=True,
        )
        sections = _normalize_section_text_for_rtl_presentation_forms(sections)

        is_markdown = True

        try:
            vision_model_config = get_tenant_default_model_by_type(kwargs["tenant_id"], LLMType.IMAGE2TEXT)
            vision_model = LLMBundle(kwargs["tenant_id"], vision_model_config)
            callback(0.2, "Visual model detected. Attempting to enhance figure extraction...")
        except Exception as e:
            logging.warning(f"Failed to detect figure extraction: {e}")
            vision_model = None

        if vision_model:
            # Process images for each section
            for idx, (section_text, _) in enumerate(sections):
                images = []
                if section_images and len(section_images) > idx and section_images[idx] is not None:
                    images.append(section_images[idx])

                if images and len(images) > 0:
                    # If multiple images found, combine them using concat_img
                    combined_image = reduce(concat_img, images) if len(images) > 1 else images[0]
                    if section_images:
                        section_images[idx] = combined_image
                    else:
                        section_images = [None] * len(sections)
                        section_images[idx] = combined_image
                    markdown_vision_parser = VisionFigureParser(vision_model=vision_model, figures_data=[((combined_image, ["markdown image"]), [(0, 0, 0, 0, 0)])], **kwargs)
                    boosted_figures = markdown_vision_parser(callback=callback)
                    sections[idx] = (section_text + "\n\n" + "\n\n".join([fig[0][1] for fig in boosted_figures]), sections[idx][1])

        else:
            logging.warning("No visual model detected. Skipping figure parsing enhancement.")

        if parser_config.get("hyperlink_urls", False) and is_root:
            for idx, (section_text, _) in enumerate(sections):
                soup = markdown_parser.md_to_html(section_text)
                hyperlink_urls = markdown_parser.get_hyperlink_urls(soup)
                urls.update(hyperlink_urls)
        res = tokenize_table(tables, doc, is_english)
        callback(0.8, "Finish parsing.")

    elif re.search(r"\.(htm|html)$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        chunk_token_num = int(parser_config.get("chunk_token_num", 128))
        sections = HtmlParser()(filename, binary, chunk_token_num)
        sections = [(_, "") for _ in sections if _]
        sections = _normalize_section_text_for_rtl_presentation_forms(sections)
        callback(0.8, "Finish parsing.")

    elif re.search(r"\.epub$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        chunk_token_num = int(parser_config.get("chunk_token_num", 128))
        sections = EpubParser()(filename, binary, chunk_token_num)
        sections = [(_, "") for _ in sections if _]
        sections = _normalize_section_text_for_rtl_presentation_forms(sections)
        callback(0.8, "Finish parsing.")
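        # EPUB support (added in this PR): EpubParser walks the book in spine (reading) order and
        # runs each document through the existing HTML parser; the resulting sections are merged
        # by the shared naive_merge path further below - no EPUB-specific chunking logic.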

    elif re.search(r"\.(json|jsonl|ldjson)$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")
        chunk_token_num = int(parser_config.get("chunk_token_num", 128))
        sections = JsonParser(chunk_token_num)(binary)
        sections = [(_, "") for _ in sections if _]
        sections = _normalize_section_text_for_rtl_presentation_forms(sections)
        callback(0.8, "Finish parsing.")

    elif re.search(r"\.doc$", filename, re.IGNORECASE):
        callback(0.1, "Start to parse.")

        try:
            from tika import parser as tika_parser
        except Exception as e:
            callback(0.8, f"tika not available: {e}. Unsupported .doc parsing.")
            logging.warning(f"tika not available: {e}. Unsupported .doc parsing for {filename}.")
            return []

        binary = BytesIO(binary)
        doc_parsed = tika_parser.from_buffer(binary)
        if doc_parsed.get("content", None) is not None:
            sections = doc_parsed["content"].split("\n")
            sections = [(_, "") for _ in sections if _]
            sections = _normalize_section_text_for_rtl_presentation_forms(sections)
            callback(0.8, "Finish parsing.")
        else:
            error_msg = f"tika.parser got empty content from {filename}."
            callback(0.8, error_msg)
            logging.warning(error_msg)
            return []
    else:
        raise NotImplementedError("file type not supported yet(pdf, xlsx, doc, docx, txt supported)")

    st = timer()
    overlapped_percent = normalize_overlapped_percent(parser_config.get("overlapped_percent", 0))
    if is_markdown:
        merged_chunks = []
        merged_images = []
        chunk_limit = max(0, int(parser_config.get("chunk_token_num", 128)))

        current_text = ""
        current_tokens = 0
        current_image = None

        for idx, sec in enumerate(sections):
            text = sec[0] if isinstance(sec, tuple) else sec
            sec_tokens = num_tokens_from_string(text)
            sec_image = section_images[idx] if section_images and idx < len(section_images) else None

            if current_text and current_tokens + sec_tokens > chunk_limit:
                merged_chunks.append(current_text)
                merged_images.append(current_image)
                overlap_part = ""
                if overlapped_percent > 0:
                    overlap_len = int(len(current_text) * overlapped_percent / 100)
                    if overlap_len > 0:
                        overlap_part = current_text[-overlap_len:]
                current_text = overlap_part
                current_tokens = num_tokens_from_string(current_text)
                current_image = current_image if overlap_part else None

            if current_text:
                current_text += "\n" + text
            else:
                current_text = text
            current_tokens += sec_tokens

            if sec_image:
                current_image = concat_img(current_image, sec_image) if current_image else sec_image

        if current_text:
            merged_chunks.append(current_text)
            merged_images.append(current_image)

        chunks = merged_chunks
        has_images = merged_images and any(img is not None for img in merged_images)

        if has_images:
            res.extend(tokenize_chunks_with_images(chunks, doc, is_english, merged_images, child_delimiters_pattern=child_deli))
        else:
            res.extend(tokenize_chunks(chunks, doc, is_english, pdf_parser, child_delimiters_pattern=child_deli))
    else:
        if section_images:
            if all(image is None for image in section_images):
                section_images = None

        if section_images:
            chunks, images = naive_merge_with_images(sections, section_images, int(parser_config.get("chunk_token_num", 128)), parser_config.get("delimiter", "\n!?。;!?"), overlapped_percent)
            res.extend(tokenize_chunks_with_images(chunks, doc, is_english, images, child_delimiters_pattern=child_deli))
        else:
            chunks = naive_merge(sections, int(parser_config.get("chunk_token_num", 128)), parser_config.get("delimiter", "\n!?。;!?"), overlapped_percent)

            res.extend(tokenize_chunks(chunks, doc, is_english, pdf_parser, child_delimiters_pattern=child_deli))

    if urls and parser_config.get("analyze_hyperlink", False) and is_root:
        for index, url in enumerate(urls):
            html_bytes, metadata = extract_html(url)
            if not html_bytes:
                continue
            try:
                sub_url_res = chunk(url, html_bytes, callback=callback, lang=lang, is_root=False, **kwargs)
            except Exception as e:
                logging.info(f"Failed to chunk url in registered file type {url}: {e}")
                sub_url_res = chunk(f"{index}.html", html_bytes, callback=callback, lang=lang, is_root=False, **kwargs)
            url_res.extend(sub_url_res)

    logging.info("naive_merge({}): {}".format(filename, timer() - st))

    if embed_res:
        res.extend(embed_res)
    if url_res:
        res.extend(url_res)
    # if table_context_size or image_context_size:
    #     attach_media_context(res, table_context_size, image_context_size)
    return res


if __name__ == "__main__":
    import sys

    def dummy(prog=None, msg=""):
        pass

    chunk(sys.argv[1], from_page=0, to_page=10, callback=dummy)
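# Quick manual check (illustrative; the on-disk module path is assumed, not stated here):
#     python naive.py /path/to/some_file.pdf
# The dummy callback swallows progress messages; pass a real callable to watch parsing progress.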