""" Unit tests for deepdoc/parser/opendataloader_parser.py Tests cover the HTTP-client refactoring: check_installation(), parse_pdf(), and the crop() bounds guard — without requiring a live OpenDataLoader service, opendataloader_pdf package, or Java runtime. """ from __future__ import annotations import importlib.util import io import sys from pathlib import Path from unittest import mock import pytest import requests # --------------------------------------------------------------------------- # Bootstrap: stub out heavy imports the module pulls in so tests run anywhere # --------------------------------------------------------------------------- import types as _types # PIL — used only at runtime for image ops, mock the whole package for _m in ("pdfplumber", "PIL", "PIL.Image"): if _m not in sys.modules: sys.modules[_m] = mock.MagicMock() # deepdoc.parser.pdf_parser — provide a real base class so OpenDataLoaderParser # inherits a proper Python class, not a MagicMock (which breaks __init__). 
_pdf_parser_mod = _types.ModuleType("deepdoc.parser.pdf_parser")


class _RAGFlowPdfParserStub:
    """Plain stand-in for RAGFlowPdfParser used as the parser's base class."""


_pdf_parser_mod.RAGFlowPdfParser = _RAGFlowPdfParserStub
sys.modules.setdefault("deepdoc.parser.pdf_parser", _pdf_parser_mod)
sys.modules.setdefault("deepdoc", mock.MagicMock())
sys.modules.setdefault("deepdoc.parser", mock.MagicMock())

# deepdoc.parser.utils — extract_pdf_outlines must be a real callable
_utils_mod = _types.ModuleType("deepdoc.parser.utils")
_utils_mod.extract_pdf_outlines = mock.MagicMock(return_value=[])
sys.modules.setdefault("deepdoc.parser.utils", _utils_mod)

# NOTE(review): the stubs registered in sys.modules above are never removed,
# so they stay visible to any test module imported later in the same session.

# Load the module under test
_REPO = Path(__file__).parents[4]
_spec = importlib.util.spec_from_file_location(
    "opendataloader_parser",
    _REPO / "deepdoc" / "parser" / "opendataloader_parser.py",
)
_mod = importlib.util.module_from_spec(_spec)
# Register before exec so @dataclass can resolve __module__
sys.modules["opendataloader_parser"] = _mod
_spec.loader.exec_module(_mod)
OpenDataLoaderParser = _mod.OpenDataLoaderParser


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

def _make_parser(api_url: str = "http://odl:9383") -> OpenDataLoaderParser:
    """Return a parser instance whose ``api_url`` is set to *api_url*."""
    parser = OpenDataLoaderParser()
    parser.api_url = api_url
    return parser


def _fake_page_image(width: int = 600, height: int = 800):
    """Return a mock page image of the given size.

    ``crop()`` and ``convert()`` both return the same mock, so chained image
    operations keep working on a single object.
    """
    img = mock.MagicMock()
    img.size = (width, height)
    img.crop.return_value = img
    img.convert.return_value = img
    return img


# ---------------------------------------------------------------------------
# check_installation()
# ---------------------------------------------------------------------------

class TestCheckInstallation:
    """check_installation() with ``requests.get`` mocked out."""

    def test_no_api_url_returns_false(self):
        """An empty api_url yields False."""
        parser = OpenDataLoaderParser()
        parser.api_url = ""
        assert parser.check_installation() is False

    def test_health_200_returns_true(self):
        """A 200 response from the mocked GET yields True."""
        parser = _make_parser()
        ok = mock.MagicMock(status_code=200)
        with mock.patch("requests.get", return_value=ok):
            assert parser.check_installation() is True

    def test_health_503_returns_false(self):
        """A 503 response from the mocked GET yields False."""
        parser = _make_parser()
        unavailable = mock.MagicMock(status_code=503, text="unavailable")
        with mock.patch("requests.get", return_value=unavailable):
            assert parser.check_installation() is False

    def test_connection_error_returns_false(self):
        """A ConnectionError raised by the mocked GET yields False."""
        parser = _make_parser()
        refused = requests.ConnectionError("refused")
        with mock.patch("requests.get", side_effect=refused):
            assert parser.check_installation() is False


# ---------------------------------------------------------------------------
# parse_pdf()
# ---------------------------------------------------------------------------

class TestParsePdf:
    """parse_pdf() with ``requests.post`` and page rendering mocked out."""

    def _mock_response(self, json_doc=None, md_text=None) -> mock.MagicMock:
        """Build a mock HTTP response whose ``json()`` returns the payload.

        ``raise_for_status`` is an auto-created MagicMock attribute and is
        therefore a no-op unless a test gives it a side effect.
        """
        resp = mock.MagicMock()
        resp.json.return_value = {"json_doc": json_doc, "md_text": md_text}
        return resp

    @staticmethod
    def _run(parser, *, response=None, error=None, **parse_kwargs):
        """Call ``parser.parse_pdf(**parse_kwargs)`` with the service mocked.

        ``__images__`` is patched on the parser instance and ``requests.post``
        is replaced by a mock that returns *response*, or raises *error* when
        one is given.

        Returns:
            ``(result, mock_post)`` — the value returned by ``parse_pdf`` and
            the mock that stood in for ``requests.post``.  The mock keeps its
            call records after the patch is undone, so callers can inspect
            ``mock_post.call_args`` afterwards.
        """
        if error is not None:
            post_kwargs = {"side_effect": error}
        else:
            post_kwargs = {"return_value": response}
        with mock.patch.object(parser, "__images__"), \
                mock.patch("requests.post", **post_kwargs) as mock_post:
            result = parser.parse_pdf(**parse_kwargs)
        return result, mock_post

    def test_raises_when_api_url_not_set(self, tmp_path):
        """Without an api_url, parse_pdf raises naming the env variable."""
        parser = OpenDataLoaderParser()
        parser.api_url = ""
        pdf = tmp_path / "doc.pdf"
        pdf.write_bytes(b"%PDF-dummy")
        with pytest.raises(RuntimeError, match="OPENDATALOADER_APISERVER"):
            parser.parse_pdf(filepath=str(pdf))

    def test_posts_to_file_parse_endpoint(self, tmp_path):
        """Exactly one POST is made and its URL contains ``/file_parse``."""
        parser = _make_parser()
        pdf = tmp_path / "doc.pdf"
        pdf.write_bytes(b"%PDF-dummy")
        resp = self._mock_response(md_text="hello world")

        _, mock_post = self._run(parser, response=resp, filepath=str(pdf))

        mock_post.assert_called_once()
        call = mock_post.call_args
        # The URL may be passed by keyword or as the first positional argument.
        url = call.kwargs.get("url", call.args[0] if call.args else "")
        assert "/file_parse" in url

    def test_binary_bytes_sent_as_multipart(self):
        """Raw bytes are uploaded under ``files["file"]`` as application/pdf."""
        parser = _make_parser()
        pdf_bytes = b"%PDF-binary"
        resp = self._mock_response(md_text="section text")

        _, mock_post = self._run(
            parser, response=resp, filepath="file.pdf", binary=pdf_bytes
        )

        files_arg = mock_post.call_args.kwargs.get("files", {})
        assert "file" in files_arg
        _, sent_bytes, mime = files_arg["file"]
        assert sent_bytes == pdf_bytes
        assert mime == "application/pdf"

    def test_bytesio_binary_sent_correctly(self):
        """A BytesIO ``binary`` is uploaded as its underlying bytes."""
        parser = _make_parser()
        pdf_bytes = b"%PDF-bytesio"
        resp = self._mock_response(md_text="text from bytesio")

        _, mock_post = self._run(
            parser, response=resp, filepath="file.pdf", binary=io.BytesIO(pdf_bytes)
        )

        files_arg = mock_post.call_args.kwargs.get("files", {})
        _, sent_bytes, _ = files_arg["file"]
        assert sent_bytes == pdf_bytes

    def test_json_doc_response_returns_sections(self):
        """Content from a ``json_doc`` payload shows up in the sections."""
        parser = _make_parser()
        json_doc = {
            "type": "paragraph",
            "content": "Hello from JSON",
            "page_number": 1,
            "bounding_box": [0, 0, 100, 20],
        }
        resp = self._mock_response(json_doc=json_doc)

        (sections, _tables), _ = self._run(
            parser,
            response=resp,
            filepath="doc.pdf",
            binary=b"%PDF",
            parse_method="pipeline",
        )

        assert any("Hello from JSON" in s[0] for s in sections)

    def test_md_text_fallback_when_no_json(self):
        """With no ``json_doc``, ``md_text`` still yields sections, no tables."""
        parser = _make_parser()
        resp = self._mock_response(
            json_doc=None, md_text="# Markdown heading\n\nBody text."
        )

        (sections, tables), _ = self._run(
            parser,
            response=resp,
            filepath="doc.pdf",
            binary=b"%PDF",
            parse_method="pipeline",
        )

        assert len(sections) > 0
        assert tables == []

    def test_sanitize_true_sends_string_true(self):
        """``sanitize=True`` is sent as the form string ``"true"``."""
        parser = _make_parser()
        resp = self._mock_response(md_text="ok")

        _, mock_post = self._run(
            parser, response=resp, filepath="doc.pdf", binary=b"%PDF", sanitize=True
        )

        data_arg = mock_post.call_args.kwargs.get("data", {})
        assert data_arg.get("sanitize") == "true"

    def test_sanitize_false_sends_string_false(self):
        """``sanitize=False`` is sent as the form string ``"false"``."""
        parser = _make_parser()
        resp = self._mock_response(md_text="ok")

        _, mock_post = self._run(
            parser, response=resp, filepath="doc.pdf", binary=b"%PDF", sanitize=False
        )

        data_arg = mock_post.call_args.kwargs.get("data", {})
        assert data_arg.get("sanitize") == "false"

    def test_hybrid_and_image_output_forwarded(self):
        """``hybrid`` and ``image_output`` are forwarded verbatim as form data."""
        parser = _make_parser()
        resp = self._mock_response(md_text="ok")

        _, mock_post = self._run(
            parser,
            response=resp,
            filepath="doc.pdf",
            binary=b"%PDF",
            hybrid="docling-fast",
            image_output="embedded",
        )

        data_arg = mock_post.call_args.kwargs.get("data", {})
        assert data_arg.get("hybrid") == "docling-fast"
        assert data_arg.get("image_output") == "embedded"

    def test_optional_params_omitted_when_none(self):
        """Optional parameters left unset are absent from the form data."""
        parser = _make_parser()
        resp = self._mock_response(md_text="ok")

        _, mock_post = self._run(
            parser, response=resp, filepath="doc.pdf", binary=b"%PDF"
        )

        data_arg = mock_post.call_args.kwargs.get("data", {})
        assert "hybrid" not in data_arg
        assert "image_output" not in data_arg
        assert "sanitize" not in data_arg

    def test_callback_called_at_progress_points(self):
        """The progress callback receives both 0.1 and 1.0."""
        parser = _make_parser()
        resp = self._mock_response(md_text="text")
        cb = mock.MagicMock()

        self._run(
            parser, response=resp, filepath="doc.pdf", binary=b"%PDF", callback=cb
        )

        progress_values = [call.args[0] for call in cb.call_args_list]
        assert 0.1 in progress_values
        assert 1.0 in progress_values

    def test_http_error_raises_runtime_error(self):
        """A ConnectionError from the POST surfaces as RuntimeError."""
        parser = _make_parser()
        with pytest.raises(RuntimeError, match="service call failed"):
            self._run(
                parser,
                error=requests.ConnectionError("down"),
                filepath="doc.pdf",
                binary=b"%PDF",
            )

    def test_non_200_status_raises_runtime_error(self):
        """An HTTPError from ``raise_for_status`` surfaces as RuntimeError."""
        parser = _make_parser()
        resp = mock.MagicMock()
        resp.raise_for_status.side_effect = requests.HTTPError("500 Server Error")
        with pytest.raises(RuntimeError, match="service call failed"):
            self._run(parser, response=resp, filepath="doc.pdf", binary=b"%PDF")


# ---------------------------------------------------------------------------
# crop() — bounds guard
# ---------------------------------------------------------------------------

class TestCrop:
    """crop() behaviour for missing images, missing tags and page bounds."""

    def test_returns_none_when_no_page_images(self):
        """With no rendered pages, crop() returns None."""
        parser = _make_parser()
        parser.page_images = []
        assert parser.crop("@@1\t10.0\t100.0\t20.0\t80.0##") is None

    def test_returns_none_when_no_position_tags(self):
        """Text without position tags yields None."""
        parser = _make_parser()
        parser.page_images = [_fake_page_image()]
        assert parser.crop("no tags here") is None

    def test_out_of_range_page_index_filtered_returns_none(self):
        """A tag pointing past the rendered pages is filtered out."""
        parser = _make_parser()
        # Only 1 page rendered (index 0), but tag references page 5 (index 4)
        parser.page_images = [_fake_page_image()]
        # Tag: page 5 → extract_positions returns pn=[4]
        tag = "@@5\t10.0\t100.0\t20.0\t80.0##"
        assert parser.crop(tag) is None

    def test_valid_page_index_does_not_raise(self):
        """A tag within the rendered range must not raise IndexError."""
        parser = _make_parser()
        img = _fake_page_image(width=200, height=300)
        parser.page_images = [img, img, img]
        # Tag references page 2 (index 1) — within rendered range.
        # Patch Image.new and alpha_composite at the module level to avoid
        # real ImagingCore requirements from mocked PIL images.
        tag = "@@2\t10.0\t100.0\t20.0\t80.0##"
        canvas = mock.MagicMock()
        try:
            with mock.patch.object(_mod.Image, "new", return_value=canvas), \
                    mock.patch.object(_mod.Image, "alpha_composite", return_value=img):
                parser.crop(tag)
        except IndexError:
            pytest.fail("crop() raised IndexError for a valid page index")

    def test_need_position_false_returns_image_or_none(self):
        """``need_position=False`` with no pages returns a bare None."""
        parser = _make_parser()
        parser.page_images = []
        result = parser.crop("@@1\t10.0\t100.0\t20.0\t80.0##", need_position=False)
        assert result is None

    def test_need_position_true_returns_tuple_when_no_images(self):
        """``need_position=True`` with no pages returns ``(None, None)``."""
        parser = _make_parser()
        parser.page_images = []
        result = parser.crop("@@1\t10.0\t100.0\t20.0\t80.0##", need_position=True)
        assert result == (None, None)