diff --git a/api/utils/file_utils.py b/api/utils/file_utils.py index e4fbe5e03..857cf1738 100644 --- a/api/utils/file_utils.py +++ b/api/utils/file_utils.py @@ -35,8 +35,8 @@ from api.db import FileType # Robustness and resource limits: reject oversized inputs to avoid DoS and OOM. MAX_BLOB_SIZE_THUMBNAIL = 50 * 1024 * 1024 # 50 MiB for thumbnail generation -MAX_BLOB_SIZE_PDF = 100 * 1024 * 1024 # 100 MiB for PDF repair / read -GHOSTSCRIPT_TIMEOUT_SEC = 120 # Timeout for Ghostscript subprocess +MAX_BLOB_SIZE_PDF = 100 * 1024 * 1024 # 100 MiB for PDF repair / read +GHOSTSCRIPT_TIMEOUT_SEC = 120 # Timeout for Ghostscript subprocess LOCK_KEY_pdfplumber = "global_shared_lock_pdfplumber" if LOCK_KEY_pdfplumber not in sys.modules: @@ -64,13 +64,17 @@ def filename_type(filename): if re.match(r".*\.pdf$", filename): return FileType.PDF.value - if re.match(r".*\.(msg|eml|doc|docx|ppt|pptx|yml|xml|htm|json|jsonl|ldjson|csv|txt|ini|xls|xlsx|wps|rtf|hlp|pages|numbers|key|md|mdx|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|html|sql)$", filename): + if re.match( + r".*\.(msg|eml|doc|docx|ppt|pptx|yml|xml|htm|json|jsonl|ldjson|csv|txt|ini|xls|xlsx|wps|rtf|hlp|pages|numbers|key|md|mdx|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|html|sql|epub)$", filename + ): return FileType.DOC.value if re.match(r".*\.(wav|flac|ape|alac|wavpack|wv|mp3|aac|ogg|vorbis|opus)$", filename): return FileType.AURAL.value - if re.match(r".*\.(jpg|jpeg|png|tif|gif|pcx|tga|exif|fpx|svg|psd|cdr|pcd|dxf|ufo|eps|ai|raw|WMF|webp|avif|apng|icon|ico|mpg|mpeg|avi|rm|rmvb|mov|wmv|asf|dat|asx|wvx|mpe|mpa|mp4|avi|mkv)$", filename): + if re.match( + r".*\.(jpg|jpeg|png|tif|gif|pcx|tga|exif|fpx|svg|psd|cdr|pcd|dxf|ufo|eps|ai|raw|WMF|webp|avif|apng|icon|ico|mpg|mpeg|avi|rm|rmvb|mov|wmv|asf|dat|asx|wvx|mpe|mpa|mp4|avi|mkv)$", filename + ): return FileType.VISUAL.value return FileType.OTHER.value diff --git a/deepdoc/parser/__init__.py b/deepdoc/parser/__init__.py index 809a56edf..a34b1de0f 100644 --- a/deepdoc/parser/__init__.py +++ 
b/deepdoc/parser/__init__.py @@ -15,6 +15,7 @@ # from .docx_parser import RAGFlowDocxParser as DocxParser +from .epub_parser import RAGFlowEpubParser as EpubParser from .excel_parser import RAGFlowExcelParser as ExcelParser from .html_parser import RAGFlowHtmlParser as HtmlParser from .json_parser import RAGFlowJsonParser as JsonParser @@ -29,6 +30,7 @@ __all__ = [ "PdfParser", "PlainParser", "DocxParser", + "EpubParser", "ExcelParser", "PptParser", "HtmlParser", @@ -37,4 +39,3 @@ __all__ = [ "TxtParser", "MarkdownElementExtractor", ] - diff --git a/deepdoc/parser/epub_parser.py b/deepdoc/parser/epub_parser.py new file mode 100644 index 000000000..5badd7c33 --- /dev/null +++ b/deepdoc/parser/epub_parser.py @@ -0,0 +1,145 @@ +# +# Copyright 2025 The InfiniFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
#

import logging
import posixpath
import warnings
import zipfile
from io import BytesIO
from urllib.parse import unquote
from xml.etree import ElementTree

# OPF XML namespaces
_OPF_NS = "http://www.idpf.org/2007/opf"
_CONTAINER_NS = "urn:oasis:names:tc:opendocument:xmlns:container"

# Media types that contain readable XHTML content. Compared case-insensitively:
# MIME types are case-insensitive per RFC 2045, and real-world EPUBs do ship
# values like "Application/XHTML+XML".
_XHTML_MEDIA_TYPES = {"application/xhtml+xml", "text/html", "text/xml"}

logger = logging.getLogger(__name__)


class RAGFlowEpubParser:
    """Parse EPUB files by extracting XHTML content in spine (reading) order
    and delegating to RAGFlowHtmlParser for chunking."""

    def __call__(self, fnm, binary=None, chunk_token_num=512):
        """Parse an EPUB and return its text sections in reading order.

        Args:
            fnm: Path to the EPUB file; only opened when ``binary`` is None.
                Also used as a label in log messages.
            binary: Raw EPUB bytes. Takes precedence over ``fnm``.
            chunk_token_num: Token budget forwarded to RAGFlowHtmlParser.

        Returns:
            list: Chunked text sections, ordered by the OPF spine.

        Raises:
            ValueError: If ``binary`` is an empty payload.
        """
        # Validate before doing any heavier work so callers get a clear error
        # for empty payloads instead of a confusing zipfile failure.
        if binary is not None:
            if not binary:
                logger.warning(
                    "RAGFlowEpubParser received an empty EPUB binary payload for %r",
                    fnm,
                )
                raise ValueError("Empty EPUB binary payload")
            source = BytesIO(binary)
        else:
            source = fnm

        # Imported lazily so this module stays importable without the HTML
        # parser's heavier dependency chain (mirrors the lazy-import style used
        # by the pipeline's EpubParser call sites).
        from .html_parser import RAGFlowHtmlParser

        html_parser = RAGFlowHtmlParser()
        all_sections = []
        with zipfile.ZipFile(source) as zf:
            for item_path in self._get_spine_items(zf):
                try:
                    html_bytes = zf.read(item_path)
                except KeyError:
                    # The spine references a file missing from the archive.
                    continue
                if not html_bytes:
                    logger.debug("Skipping empty EPUB content item: %s", item_path)
                    continue
                with warnings.catch_warnings():
                    warnings.filterwarnings("ignore", category=UserWarning)
                    sections = html_parser(
                        item_path, binary=html_bytes, chunk_token_num=chunk_token_num
                    )
                all_sections.extend(sections)
        return all_sections

    @staticmethod
    def _get_spine_items(zf):
        """Return content file paths in spine (reading) order.

        Falls back to an alphabetical listing of XHTML files whenever the
        container or OPF metadata is missing or malformed.
        """
        # 1. Find the OPF file path from META-INF/container.xml.
        try:
            container_xml = zf.read("META-INF/container.xml")
        except KeyError:
            return RAGFlowEpubParser._fallback_xhtml_order(zf)

        try:
            container_root = ElementTree.fromstring(container_xml)
        except ElementTree.ParseError:
            logger.warning("Failed to parse META-INF/container.xml; falling back to XHTML order.")
            return RAGFlowEpubParser._fallback_xhtml_order(zf)

        rootfile_el = container_root.find(f".//{{{_CONTAINER_NS}}}rootfile")
        if rootfile_el is None:
            return RAGFlowEpubParser._fallback_xhtml_order(zf)

        opf_path = rootfile_el.get("full-path", "")
        if not opf_path:
            return RAGFlowEpubParser._fallback_xhtml_order(zf)

        # Base directory of the OPF file (manifest hrefs are relative to it).
        opf_dir = opf_path.rsplit("/", 1)[0] + "/" if "/" in opf_path else ""

        # 2. Parse the OPF package document.
        try:
            opf_xml = zf.read(opf_path)
        except KeyError:
            return RAGFlowEpubParser._fallback_xhtml_order(zf)

        try:
            opf_root = ElementTree.fromstring(opf_xml)
        except ElementTree.ParseError:
            logger.warning("Failed to parse OPF file '%s'; falling back to XHTML order.", opf_path)
            return RAGFlowEpubParser._fallback_xhtml_order(zf)

        # 3. Build an id -> (href, media-type) map from the <manifest> items.
        manifest = {}
        for item in opf_root.findall(f".//{{{_OPF_NS}}}item"):
            item_id = item.get("id", "")
            href = item.get("href", "")
            media_type = item.get("media-type", "")
            if item_id and href:
                manifest[item_id] = (href, media_type)

        # 4. Walk the <spine> itemrefs to get reading order.
        spine_items = []
        for itemref in opf_root.findall(f".//{{{_OPF_NS}}}itemref"):
            idref = itemref.get("idref", "")
            if idref not in manifest:
                continue
            href, media_type = manifest[idref]
            if media_type.lower() not in _XHTML_MEDIA_TYPES:
                continue
            # Manifest hrefs are percent-encoded IRIs and may carry a fragment
            # or "./"/"../" segments; decode and normalize them into a plain
            # ZIP entry path so zf.read() can find the member.
            href = unquote(href.split("#", 1)[0])
            spine_items.append(posixpath.normpath(opf_dir + href))

        return spine_items if spine_items else RAGFlowEpubParser._fallback_xhtml_order(zf)

    @staticmethod
    def _fallback_xhtml_order(zf):
        """Fallback: return all .xhtml/.html files sorted alphabetically."""
        return sorted(
            n
            for n in zf.namelist()
            if n.lower().endswith((".xhtml", ".html", ".htm"))
            and not n.startswith("META-INF/")
        )
= int(parser_config.get("chunk_token_num", 128)) + sections = EpubParser()(filename, binary, chunk_token_num) + sections = [(_, "") for _ in sections if _] + sections = _normalize_section_text_for_rtl_presentation_forms(sections) + callback(0.8, "Finish parsing.") + elif re.search(r"\.(json|jsonl|ldjson)$", filename, re.IGNORECASE): callback(0.1, "Start to parse.") chunk_token_num = int(parser_config.get("chunk_token_num", 128)) diff --git a/rag/flow/parser/parser.py b/rag/flow/parser/parser.py index 3f779e252..0803ddef7 100644 --- a/rag/flow/parser/parser.py +++ b/rag/flow/parser/parser.py @@ -43,10 +43,9 @@ from rag.nlp import BULLET_PATTERN, bullets_category, docx_question_level, not_b from rag.utils.base64_image import image2id - - from common.misc_utils import thread_pool_exec + class ParserParam(ProcessParamBase): def __init__(self): super().__init__() @@ -82,6 +81,10 @@ class ParserParam(ProcessParamBase): "json", ], "video": [], + "epub": [ + "text", + "json", + ], } self.setups = { @@ -166,6 +169,12 @@ class ParserParam(ProcessParamBase): "output_format": "text", "prompt": "", }, + "epub": { + "suffix": [ + "epub", + ], + "output_format": "json", + }, } def check(self): @@ -219,6 +228,11 @@ class ParserParam(ProcessParamBase): email_output_format = email_config.get("output_format", "") self.check_valid_value(email_output_format, "Email output format abnormal.", self.allowed_output_format["email"]) + epub_config = self.setups.get("epub", "") + if epub_config: + epub_output_format = epub_config.get("output_format", "") + self.check_valid_value(epub_output_format, "EPUB output format abnormal.", self.allowed_output_format["epub"]) + def get_input_form(self) -> dict[str, dict]: return {} @@ -390,9 +404,7 @@ class Parser(ProcessBase): box = { "text": text, "image": pdf_parser.crop(poss, 1) if isinstance(poss, str) and poss else None, - "positions": [[pos[0][-1], *pos[1:]] for pos in pdf_parser.extract_positions(poss)] - if isinstance(poss, str) and poss - else 
[], + "positions": [[pos[0][-1], *pos[1:]] for pos in pdf_parser.extract_positions(poss)] if isinstance(poss, str) and poss else [], } bboxes.append(box) elif parse_method.lower() == "tcadp parser": @@ -698,7 +710,6 @@ class Parser(ProcessBase): markdown_text = docx_parser.to_markdown(name, binary=blob) self.set_output("markdown", markdown_text) - def _slides(self, name, blob, **kwargs): self.callback(random.randint(1, 5) / 100.0, "Start to work on a PowerPoint Document") @@ -839,11 +850,13 @@ class Parser(ProcessBase): else: txt = cv_model.describe(img_binary.read()) - json_result = [{ - "text": txt, - "image": img, - "doc_type_kwd": "image", - }] + json_result = [ + { + "text": txt, + "image": img, + "doc_type_kwd": "image", + } + ] self.set_output("json", json_result) def _audio(self, name, blob, **kwargs): @@ -1013,6 +1026,22 @@ class Parser(ProcessBase): content_txt += fb self.set_output("text", content_txt) + def _epub(self, name, blob, **kwargs): + from deepdoc.parser import EpubParser + + self.callback(random.randint(1, 5) / 100.0, "Start to work on an EPUB.") + conf = self._param.setups["epub"] + self.set_output("output_format", conf["output_format"]) + + epub_parser = EpubParser() + sections = epub_parser(name, binary=blob) + + if conf.get("output_format") == "json": + json_results = [{"text": s} for s in sections if s] + self.set_output("json", json_results) + else: + self.set_output("text", "\n".join(s for s in sections if s)) + async def _invoke(self, **kwargs): function_map = { "pdf": self._pdf, @@ -1024,6 +1053,7 @@ class Parser(ProcessBase): "audio": self._audio, "video": self._video, "email": self._email, + "epub": self._epub, } try: diff --git a/test/unit_test/api/utils/test_api_file_utils.py b/test/unit_test/api/utils/test_api_file_utils.py index 65e1ce14c..b47aea383 100644 --- a/test/unit_test/api/utils/test_api_file_utils.py +++ b/test/unit_test/api/utils/test_api_file_utils.py @@ -34,24 +34,33 @@ from api.utils.file_utils import ( class 
TestFilenameType: """Edge cases and robustness for filename_type.""" - @pytest.mark.parametrize("filename,expected", [ - ("doc.pdf", FileType.PDF.value), - ("a.PDF", FileType.PDF.value), - ("x.png", FileType.VISUAL.value), - ("file.docx", FileType.DOC.value), - ("a/b/c.pdf", FileType.PDF.value), - ("path/to/file.txt", FileType.DOC.value), - ]) + @pytest.mark.parametrize( + "filename,expected", + [ + ("doc.pdf", FileType.PDF.value), + ("a.PDF", FileType.PDF.value), + ("x.png", FileType.VISUAL.value), + ("file.docx", FileType.DOC.value), + ("a/b/c.pdf", FileType.PDF.value), + ("path/to/file.txt", FileType.DOC.value), + ("book.epub", FileType.DOC.value), + ("BOOK.EPUB", FileType.DOC.value), + ("path/to/book.epub", FileType.DOC.value), + ], + ) def test_valid_filenames(self, filename, expected): assert filename_type(filename) == expected - @pytest.mark.parametrize("filename", [ - None, - "", - " ", - 123, - [], - ]) + @pytest.mark.parametrize( + "filename", + [ + None, + "", + " ", + 123, + [], + ], + ) def test_invalid_or_empty_returns_other(self, filename): assert filename_type(filename) == FileType.OTHER.value @@ -62,16 +71,19 @@ class TestFilenameType: class TestSanitizePath: """Edge cases for sanitize_path.""" - @pytest.mark.parametrize("raw,expected", [ - (None, ""), - ("", ""), - (" ", ""), - (42, ""), - ("a/b", "a/b"), - ("a/../b", "a/b"), - ("/leading/", "leading"), - ("\\mixed\\path", "mixed/path"), - ]) + @pytest.mark.parametrize( + "raw,expected", + [ + (None, ""), + ("", ""), + (" ", ""), + (42, ""), + ("a/b", "a/b"), + ("a/../b", "a/b"), + ("/leading/", "leading"), + ("\\mixed\\path", "mixed/path"), + ], + ) def test_sanitize_cases(self, raw, expected): assert sanitize_path(raw) == expected @@ -88,6 +100,7 @@ class TestReadPotentialBrokenPdf: def test_non_len_raises_or_returns_empty(self): class NoLen: pass + result = read_potential_broken_pdf(NoLen()) assert result == b"" @@ -120,7 +133,11 @@ class TestThumbnail: def 
test_valid_img_returns_base64_prefix(self): from api.constants import IMG_BASE64_PREFIX - result = thumbnail("x.png", b"\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x01\x00\x00\x00\x01\x08\x02\x00\x00\x00\x90wS\xde\x00\x00\x00\x0cIDATx\x9cc\xf8\x0f\x00\x00\x01\x01\x00\x05\x18\xd8N\x00\x00\x00\x00IEND\xaeB`\x82") + + result = thumbnail( + "x.png", + b"\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x01\x00\x00\x00\x01\x08\x02\x00\x00\x00\x90wS\xde\x00\x00\x00\x0cIDATx\x9cc\xf8\x0f\x00\x00\x01\x01\x00\x05\x18\xd8N\x00\x00\x00\x00IEND\xaeB`\x82", + ) assert result.startswith(IMG_BASE64_PREFIX) or result == "" diff --git a/test/unit_test/deepdoc/parser/test_epub_parser.py b/test/unit_test/deepdoc/parser/test_epub_parser.py new file mode 100644 index 000000000..6b75126ca --- /dev/null +++ b/test/unit_test/deepdoc/parser/test_epub_parser.py @@ -0,0 +1,350 @@ +# +# Copyright 2025 The InfiniFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +"""Unit tests for the EPUB parser. 
+ +Tests cover: +- Parsing a well-formed EPUB with OPF spine ordering +- Fallback parsing when META-INF/container.xml is missing +- Handling of empty or content-less EPUB files +- Spine ordering respects the OPF itemref sequence +- Malformed XML graceful fallback +- Empty binary input handling +""" + +import importlib.util +import os +import sys +import zipfile +from io import BytesIO +from unittest import mock + +# Import RAGFlowEpubParser directly by file path to avoid triggering +# deepdoc/parser/__init__.py which pulls in heavy dependencies +# (pdfplumber, xgboost, etc.) that may not be available in test environments. +_MOCK_MODULES = [ + "xgboost", + "xgb", + "pdfplumber", + "huggingface_hub", + "PIL", + "PIL.Image", + "pypdf", + "sklearn", + "sklearn.cluster", + "sklearn.metrics", + "deepdoc.vision", + "infinity", + "infinity.rag_tokenizer", +] +for _m in _MOCK_MODULES: + if _m not in sys.modules: + sys.modules[_m] = mock.MagicMock() + + +def _find_project_root(marker="pyproject.toml"): + d = os.path.dirname(os.path.abspath(__file__)) + while d != os.path.dirname(d): + if os.path.exists(os.path.join(d, marker)): + return d + d = os.path.dirname(d) + return None + + +_PROJECT_ROOT = _find_project_root() + +# Load html_parser first (epub_parser depends on it via relative import) +_html_spec = importlib.util.spec_from_file_location( + "deepdoc.parser.html_parser", + os.path.join(_PROJECT_ROOT, "deepdoc", "parser", "html_parser.py"), +) +_html_mod = importlib.util.module_from_spec(_html_spec) +sys.modules["deepdoc.parser.html_parser"] = _html_mod +_html_spec.loader.exec_module(_html_mod) + +_epub_spec = importlib.util.spec_from_file_location( + "deepdoc.parser.epub_parser", + os.path.join(_PROJECT_ROOT, "deepdoc", "parser", "epub_parser.py"), +) +_epub_mod = importlib.util.module_from_spec(_epub_spec) +sys.modules["deepdoc.parser.epub_parser"] = _epub_mod +_epub_spec.loader.exec_module(_epub_mod) + +RAGFlowEpubParser = _epub_mod.RAGFlowEpubParser + + +def 
_make_epub(chapters, include_container=True, spine_order=None): + """Build a minimal EPUB ZIP in memory. + + Args: + chapters: list of (filename, html_content) tuples. + include_container: whether to include META-INF/container.xml. + spine_order: optional list of filenames for spine ordering. + Defaults to the order of `chapters`. + """ + buf = BytesIO() + with zipfile.ZipFile(buf, "w", zipfile.ZIP_DEFLATED) as zf: + zf.writestr("mimetype", "application/epub+zip") + + if include_container: + container_xml = ( + '' + '' + " " + ' ' + " " + "" + ) + zf.writestr("META-INF/container.xml", container_xml) + + if spine_order is None: + spine_order = [fn for fn, _ in chapters] + + manifest_items = "" + for i, (fn, _) in enumerate(chapters): + manifest_items += f'' + + spine_refs = "" + fn_to_id = {fn: f"ch{i}" for i, (fn, _) in enumerate(chapters)} + for fn in spine_order: + spine_refs += f'' + + opf_xml = ( + f' {manifest_items} {spine_refs}' + ) + zf.writestr("OEBPS/content.opf", opf_xml) + + for fn, content in chapters: + path = f"OEBPS/{fn}" if include_container else fn + zf.writestr(path, content) + + return buf.getvalue() + + +def _simple_html(body_text): + return f"Test

{body_text}

" + + +class TestEpubParserBasic: + def test_parse_single_chapter(self): + epub_bytes = _make_epub([("ch1.xhtml", _simple_html("Hello World"))]) + parser = RAGFlowEpubParser() + sections = parser(None, binary=epub_bytes, chunk_token_num=512) + assert len(sections) >= 1 + combined = " ".join(sections) + assert "Hello World" in combined + + def test_parse_multiple_chapters(self): + chapters = [ + ("ch1.xhtml", _simple_html("Chapter One")), + ("ch2.xhtml", _simple_html("Chapter Two")), + ("ch3.xhtml", _simple_html("Chapter Three")), + ] + epub_bytes = _make_epub(chapters) + parser = RAGFlowEpubParser() + sections = parser(None, binary=epub_bytes, chunk_token_num=512) + combined = " ".join(sections) + assert "Chapter One" in combined + assert "Chapter Two" in combined + assert "Chapter Three" in combined + + def test_spine_ordering(self): + """Chapters should be returned in spine order, not filename order.""" + chapters = [ + ("ch1.xhtml", _simple_html("First")), + ("ch2.xhtml", _simple_html("Second")), + ("ch3.xhtml", _simple_html("Third")), + ] + epub_bytes = _make_epub(chapters, spine_order=["ch3.xhtml", "ch1.xhtml", "ch2.xhtml"]) + parser = RAGFlowEpubParser() + sections = parser(None, binary=epub_bytes, chunk_token_num=512) + combined = " ".join(sections) + assert combined.index("Third") < combined.index("First") + assert combined.index("First") < combined.index("Second") + + def test_empty_epub(self): + epub_bytes = _make_epub([]) + parser = RAGFlowEpubParser() + sections = parser(None, binary=epub_bytes, chunk_token_num=512) + assert sections == [] + + def test_empty_binary(self): + """Empty bytes should raise ValueError, not trigger file open.""" + parser = RAGFlowEpubParser() + try: + parser(None, binary=b"", chunk_token_num=512) + assert False, "Expected ValueError for empty binary" + except ValueError: + pass + + +class TestEpubParserFallback: + def test_fallback_without_container(self): + """When META-INF/container.xml is missing, should fall back to 
finding .xhtml files.""" + chapters = [ + ("chapter1.xhtml", _simple_html("Fallback Content")), + ] + epub_bytes = _make_epub(chapters, include_container=False) + parser = RAGFlowEpubParser() + sections = parser(None, binary=epub_bytes, chunk_token_num=512) + combined = " ".join(sections) + assert "Fallback Content" in combined + + def test_fallback_on_malformed_container_xml(self): + """Malformed container.xml should fall back, not raise.""" + buf = BytesIO() + with zipfile.ZipFile(buf, "w") as zf: + zf.writestr("mimetype", "application/epub+zip") + zf.writestr("META-INF/container.xml", "THIS IS NOT XML <><><>") + zf.writestr("chapter.xhtml", _simple_html("Recovered Content")) + + parser = RAGFlowEpubParser() + sections = parser(None, binary=buf.getvalue(), chunk_token_num=512) + combined = " ".join(sections) + assert "Recovered Content" in combined + + def test_fallback_on_malformed_opf_xml(self): + """Malformed OPF file should fall back, not raise.""" + buf = BytesIO() + with zipfile.ZipFile(buf, "w") as zf: + zf.writestr("mimetype", "application/epub+zip") + container_xml = ( + '' + '' + " " + ' ' + " " + "" + ) + zf.writestr("META-INF/container.xml", container_xml) + zf.writestr("content.opf", "BROKEN OPF {{{") + zf.writestr("chapter.xhtml", _simple_html("OPF Fallback")) + + parser = RAGFlowEpubParser() + sections = parser(None, binary=buf.getvalue(), chunk_token_num=512) + combined = " ".join(sections) + assert "OPF Fallback" in combined + + +class TestEpubParserEdgeCases: + def test_non_xhtml_spine_items_skipped(self): + """Non-XHTML items in the spine should be skipped.""" + buf = BytesIO() + with zipfile.ZipFile(buf, "w") as zf: + zf.writestr("mimetype", "application/epub+zip") + container_xml = ( + '' + '' + " " + ' ' + " " + "" + ) + zf.writestr("META-INF/container.xml", container_xml) + opf_xml = ( + '' + '' + " " + ' ' + ' ' + " " + " " + ' ' + ' ' + " " + "" + ) + zf.writestr("content.opf", opf_xml) + zf.writestr("ch1.xhtml", _simple_html("Real 
Content")) + zf.writestr("cover.png", b"\x89PNG fake image data") + + epub_bytes = buf.getvalue() + parser = RAGFlowEpubParser() + sections = parser(None, binary=epub_bytes, chunk_token_num=512) + combined = " ".join(sections) + assert "Real Content" in combined + + def test_missing_spine_file(self): + """If a spine item references a file not in the ZIP, it should be skipped.""" + buf = BytesIO() + with zipfile.ZipFile(buf, "w") as zf: + zf.writestr("mimetype", "application/epub+zip") + container_xml = ( + '' + '' + " " + ' ' + " " + "" + ) + zf.writestr("META-INF/container.xml", container_xml) + opf_xml = ( + '' + '' + " " + ' ' + ' ' + " " + " " + ' ' + ' ' + " " + "" + ) + zf.writestr("content.opf", opf_xml) + zf.writestr("ch1.xhtml", _simple_html("Existing Chapter")) + + epub_bytes = buf.getvalue() + parser = RAGFlowEpubParser() + sections = parser(None, binary=epub_bytes, chunk_token_num=512) + combined = " ".join(sections) + assert "Existing Chapter" in combined + + def test_empty_xhtml_file_skipped(self): + """Empty XHTML files in the EPUB should be skipped without error.""" + buf = BytesIO() + with zipfile.ZipFile(buf, "w") as zf: + zf.writestr("mimetype", "application/epub+zip") + container_xml = ( + '' + '' + " " + ' ' + " " + "" + ) + zf.writestr("META-INF/container.xml", container_xml) + opf_xml = ( + '' + '' + " " + ' ' + ' ' + " " + " " + ' ' + ' ' + " " + "" + ) + zf.writestr("content.opf", opf_xml) + zf.writestr("empty.xhtml", b"") + zf.writestr("real.xhtml", _simple_html("Has Content")) + + parser = RAGFlowEpubParser() + sections = parser(None, binary=buf.getvalue(), chunk_token_num=512) + combined = " ".join(sections) + assert "Has Content" in combined