mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-05-21 00:36:43 +08:00
### What problem does this PR solve?

Closes #14058.

RAGFlow supports multiple PDF parsing backends (DeepDOC, MinerU, Docling, TCADP, PaddleOCR). This PR adds **OpenDataLoader** ([opendataloader-project/opendataloader-pdf](https://github.com/opendataloader-project/opendataloader-pdf)) as a new optional backend, giving users a deterministic, local-first alternative with competitive table extraction accuracy.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
- [x] Documentation Update

---

### Changes

#### Backend

- `deepdoc/parser/opendataloader_parser.py` — new `OpenDataLoaderParser` class inheriting `RAGFlowPdfParser`. Implements `check_installation()` (guards Python package + Java 11+ runtime), `parse_pdf()` with JSON-first extraction (heading/paragraph/table/list/image/formula) and Markdown fallback, position-tag generation compatible with the shared `@@page\tx0\tx1\ty0\ty1##` format, and temp-dir lifecycle with cleanup.
- `rag/app/naive.py` — new `by_opendataloader()` wrapper, registered in `PARSERS` dict, added to `chunk_token_num=0` override list.
- `rag/flow/parser/parser.py` — `"opendataloader"` branch in the pipeline PDF handler + check validation list.

#### Infrastructure

- `docker/entrypoint.sh` — `ensure_opendataloader()` function: opt-in via `USE_OPENDATALOADER=true`, skips gracefully if Java is not on PATH.

#### Frontend

- `web/src/components/layout-recognize-form-field.tsx` — `OpenDataLoader` added to `ParseDocumentType` enum and parser dropdown. Cascades automatically to the pipeline editor's Parser component.

#### Docs

- `docs/guides/dataset/select_pdf_parser.md` — added OpenDataLoader entry and full env-var reference.

---

### Environment variables

| Variable | Default | Description |
|---|---|---|
| `USE_OPENDATALOADER` | `false` | Set `true` to install `opendataloader-pdf` on container startup |
| `OPENDATALOADER_VERSION` | latest | Pin the PyPI release (e.g. `==2.2.1`) |
| `OPENDATALOADER_HYBRID` | _(unset)_ | Enable hybrid AI mode (e.g. `docling-fast`) |
| `OPENDATALOADER_IMAGE_OUTPUT` | _(unset)_ | `off` / `embedded` / `external` |
| `OPENDATALOADER_OUTPUT_DIR` | _(tmp)_ | Persistent output dir; temp dir used + cleaned if unset |
| `OPENDATALOADER_DELETE_OUTPUT` | `1` | `0` to retain intermediate files for debugging |
| `OPENDATALOADER_SANITIZE` | _(unset)_ | `1` to filter prompt-injection patterns from output |

---

### Dependencies

- **Runtime**: `opendataloader-pdf` (PyPI, Apache 2.0) — opt-in, not added to `pyproject.toml` core deps. Installed by `ensure_opendataloader()` at container startup when `USE_OPENDATALOADER=true`.
- **System**: Java 11+ on PATH (JVM is the underlying engine). The installer skips with a warning if `java` is not found.

---

### How to test

**Standalone parser:**

```bash
source .venv/bin/activate
uv pip install opendataloader-pdf
python3 -c "
import sys; sys.path.insert(0, '.')
from deepdoc.parser.opendataloader_parser import OpenDataLoaderParser
p = OpenDataLoaderParser()
print('available:', p.check_installation())
s, t = p.parse_pdf('path/to/test.pdf', parse_method='pipeline')
print(f'sections={len(s)} tables={len(t)}')
"
```

### Benchmark vs Docling

```
file              parser           secs  sections  tables
----------------------------------------------------------------------
text-heavy.pdf    docling         45.29       148      10
text-heavy.pdf    opendataloader   3.14       559       0
table-heavy.pdf   docling          7.05        76       3
table-heavy.pdf   opendataloader   3.71        90       0
complex.pdf       docling         42.67       114       8
complex.pdf       opendataloader   3.51       180       0
```
327 lines
12 KiB
Python
327 lines
12 KiB
Python
"""
|
|
Unit tests for deepdoc/parser/opendataloader_parser.py
|
|
|
|
Tests cover the HTTP-client refactoring: check_installation(), parse_pdf(),
|
|
and the crop() bounds guard — without requiring a live OpenDataLoader service,
|
|
opendataloader_pdf package, or Java runtime.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import importlib.util
|
|
import io
|
|
import sys
|
|
from pathlib import Path
|
|
from unittest import mock
|
|
|
|
import pytest
|
|
import requests
|
|
|
|
# ---------------------------------------------------------------------------
# Bootstrap: stub out heavy imports the module pulls in so tests run anywhere
# ---------------------------------------------------------------------------
import types as _types

# PIL — used only at runtime for image ops, mock the whole package
# (pdfplumber is stubbed the same way). A name that is already present in
# sys.modules is left untouched, so a real installation takes precedence.
for _m in ("pdfplumber", "PIL", "PIL.Image"):
    if _m not in sys.modules:
        sys.modules[_m] = mock.MagicMock()

# deepdoc.parser.pdf_parser — provide a real base class so OpenDataLoaderParser
# inherits a proper Python class, not a MagicMock (which breaks __init__).
_pdf_parser_mod = _types.ModuleType("deepdoc.parser.pdf_parser")


class _RAGFlowPdfParserStub:  # noqa: E302
    """Empty stand-in published as ``RAGFlowPdfParser`` on the stub module."""

    pass


_pdf_parser_mod.RAGFlowPdfParser = _RAGFlowPdfParserStub
# setdefault() only installs a stub when no module of that name is registered
# yet; an already-imported deepdoc package is therefore not replaced.
sys.modules.setdefault("deepdoc.parser.pdf_parser", _pdf_parser_mod)
sys.modules.setdefault("deepdoc", mock.MagicMock())
sys.modules.setdefault("deepdoc.parser", mock.MagicMock())

# deepdoc.parser.utils — extract_pdf_outlines must be a real callable
# (here: a MagicMock that always returns an empty list).
_utils_mod = _types.ModuleType("deepdoc.parser.utils")
_utils_mod.extract_pdf_outlines = mock.MagicMock(return_value=[])
sys.modules.setdefault("deepdoc.parser.utils", _utils_mod)

# Load the module under test
# parents[4] is the fifth ancestor directory of this file.
# NOTE(review): presumably the repository root — confirm if this test file
# is ever moved to a different depth.
_REPO = Path(__file__).parents[4]
# Load straight from the file path under the top-level name
# "opendataloader_parser" rather than through the (stubbed) deepdoc package.
_spec = importlib.util.spec_from_file_location(
    "opendataloader_parser",
    _REPO / "deepdoc" / "parser" / "opendataloader_parser.py",
)
_mod = importlib.util.module_from_spec(_spec)
# Register before exec so @dataclass can resolve __module__
sys.modules["opendataloader_parser"] = _mod
_spec.loader.exec_module(_mod)

# Convenience alias used by the helpers and tests below.
OpenDataLoaderParser = _mod.OpenDataLoaderParser
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Helpers
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def _make_parser(api_url: str = "http://odl:9383") -> OpenDataLoaderParser:
    """Return a fresh ``OpenDataLoaderParser`` whose ``api_url`` is set to *api_url*."""
    parser = OpenDataLoaderParser()
    # Set the attribute after construction, overriding whatever the
    # constructor assigned.
    parser.api_url = api_url
    return parser
|
|
|
|
|
|
def _fake_page_image(width: int = 600, height: int = 800):
    """Return a MagicMock standing in for a rendered page image.

    The mock reports ``size == (width, height)``, and both ``crop()`` and
    ``convert()`` hand back the very same mock object.
    """
    page = mock.MagicMock()
    page.size = (width, height)
    # Each image operation gets its own mock that returns the page itself.
    for method_name in ("crop", "convert"):
        setattr(page, method_name, mock.MagicMock(return_value=page))
    return page
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# check_installation()
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestCheckInstallation:
    """check_installation() results for an empty URL and for patched ``requests.get`` outcomes."""

    def test_no_api_url_returns_false(self):
        # An empty api_url must be reported as "not installed".
        parser = OpenDataLoaderParser()
        parser.api_url = ""
        installed = parser.check_installation()
        assert installed is False

    def test_health_200_returns_true(self):
        parser = _make_parser()
        healthy_reply = mock.MagicMock(status_code=200)
        with mock.patch("requests.get", return_value=healthy_reply):
            installed = parser.check_installation()
        assert installed is True

    def test_health_503_returns_false(self):
        parser = _make_parser()
        unhealthy_reply = mock.MagicMock(status_code=503, text="unavailable")
        with mock.patch("requests.get", return_value=unhealthy_reply):
            installed = parser.check_installation()
        assert installed is False

    def test_connection_error_returns_false(self):
        parser = _make_parser()
        refused = requests.ConnectionError("refused")
        # requests.get raising must be turned into a False result.
        with mock.patch("requests.get", side_effect=refused):
            installed = parser.check_installation()
        assert installed is False
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# parse_pdf()
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestParsePdf:
    """parse_pdf() request construction, response handling and error wrapping.

    Apart from the missing-URL test, every test patches ``requests.post`` and
    the parser's ``__images__`` attribute, so no HTTP request is sent.
    """

    def _mock_response(self, json_doc=None, md_text=None) -> mock.MagicMock:
        """Build a fake response whose ``json()`` returns the given payload."""
        payload = {"json_doc": json_doc, "md_text": md_text}
        fake = mock.MagicMock()
        fake.raise_for_status = mock.MagicMock()
        fake.json.return_value = payload
        return fake

    @staticmethod
    def _parse_with_fake_post(parser, response, **parse_kwargs):
        """Call ``parser.parse_pdf(**parse_kwargs)`` with ``requests.post`` returning *response*.

        Returns a ``(parse_pdf result, requests.post mock)`` pair.
        """
        with mock.patch.object(parser, "__images__"):
            with mock.patch("requests.post", return_value=response) as fake_post:
                outcome = parser.parse_pdf(**parse_kwargs)
        return outcome, fake_post

    def test_raises_when_api_url_not_set(self, tmp_path):
        parser = OpenDataLoaderParser()
        parser.api_url = ""
        pdf_file = tmp_path / "doc.pdf"
        pdf_file.write_bytes(b"%PDF-dummy")
        with pytest.raises(RuntimeError, match="OPENDATALOADER_APISERVER"):
            parser.parse_pdf(filepath=str(pdf_file))

    def test_posts_to_file_parse_endpoint(self, tmp_path):
        pdf_file = tmp_path / "doc.pdf"
        pdf_file.write_bytes(b"%PDF-dummy")
        _, fake_post = self._parse_with_fake_post(
            _make_parser(),
            self._mock_response(md_text="hello world"),
            filepath=str(pdf_file),
        )

        fake_post.assert_called_once()
        invocation = fake_post.call_args
        # The URL may have been passed by keyword or as the first positional.
        positional_url = invocation.args[0] if invocation.args else ""
        target_url = invocation.kwargs.get("url", positional_url)
        assert "/file_parse" in target_url

    def test_binary_bytes_sent_as_multipart(self, tmp_path):
        payload = b"%PDF-binary"
        _, fake_post = self._parse_with_fake_post(
            _make_parser(),
            self._mock_response(md_text="section text"),
            filepath="file.pdf",
            binary=payload,
        )

        uploaded = fake_post.call_args.kwargs.get("files", {})
        assert "file" in uploaded
        _, sent_bytes, mime = uploaded["file"]
        assert sent_bytes == payload
        assert mime == "application/pdf"

    def test_bytesio_binary_sent_correctly(self, tmp_path):
        payload = b"%PDF-bytesio"
        _, fake_post = self._parse_with_fake_post(
            _make_parser(),
            self._mock_response(md_text="text from bytesio"),
            filepath="file.pdf",
            binary=io.BytesIO(payload),
        )

        uploaded = fake_post.call_args.kwargs.get("files", {})
        _, sent_bytes, _ = uploaded["file"]
        assert sent_bytes == payload

    def test_json_doc_response_returns_sections(self, tmp_path):
        document = {
            "type": "paragraph",
            "content": "Hello from JSON",
            "page_number": 1,
            "bounding_box": [0, 0, 100, 20],
        }
        (sections, _tables), _ = self._parse_with_fake_post(
            _make_parser(),
            self._mock_response(json_doc=document),
            filepath="doc.pdf",
            binary=b"%PDF",
            parse_method="pipeline",
        )

        assert any("Hello from JSON" in section[0] for section in sections)

    def test_md_text_fallback_when_no_json(self, tmp_path):
        markdown = "# Markdown heading\n\nBody text."
        (sections, tables), _ = self._parse_with_fake_post(
            _make_parser(),
            self._mock_response(json_doc=None, md_text=markdown),
            filepath="doc.pdf",
            binary=b"%PDF",
            parse_method="pipeline",
        )

        assert len(sections) > 0
        assert tables == []

    def test_sanitize_true_sends_string_true(self):
        _, fake_post = self._parse_with_fake_post(
            _make_parser(),
            self._mock_response(md_text="ok"),
            filepath="doc.pdf",
            binary=b"%PDF",
            sanitize=True,
        )

        form = fake_post.call_args.kwargs.get("data", {})
        assert form.get("sanitize") == "true"

    def test_sanitize_false_sends_string_false(self):
        _, fake_post = self._parse_with_fake_post(
            _make_parser(),
            self._mock_response(md_text="ok"),
            filepath="doc.pdf",
            binary=b"%PDF",
            sanitize=False,
        )

        form = fake_post.call_args.kwargs.get("data", {})
        assert form.get("sanitize") == "false"

    def test_hybrid_and_image_output_forwarded(self):
        _, fake_post = self._parse_with_fake_post(
            _make_parser(),
            self._mock_response(md_text="ok"),
            filepath="doc.pdf",
            binary=b"%PDF",
            hybrid="docling-fast",
            image_output="embedded",
        )

        form = fake_post.call_args.kwargs.get("data", {})
        assert form.get("hybrid") == "docling-fast"
        assert form.get("image_output") == "embedded"

    def test_optional_params_omitted_when_none(self):
        _, fake_post = self._parse_with_fake_post(
            _make_parser(),
            self._mock_response(md_text="ok"),
            filepath="doc.pdf",
            binary=b"%PDF",
        )

        form = fake_post.call_args.kwargs.get("data", {})
        for optional_field in ("hybrid", "image_output", "sanitize"):
            assert optional_field not in form

    def test_callback_called_at_progress_points(self):
        progress_cb = mock.MagicMock()
        self._parse_with_fake_post(
            _make_parser(),
            self._mock_response(md_text="text"),
            filepath="doc.pdf",
            binary=b"%PDF",
            callback=progress_cb,
        )

        # First positional argument of every callback invocation.
        reported = tuple(entry.args[0] for entry in progress_cb.call_args_list)
        assert 0.1 in reported
        assert 1.0 in reported

    def test_http_error_raises_runtime_error(self):
        parser = _make_parser()
        failure = requests.ConnectionError("down")

        with mock.patch.object(parser, "__images__"):
            with mock.patch("requests.post", side_effect=failure):
                with pytest.raises(RuntimeError, match="service call failed"):
                    parser.parse_pdf(filepath="doc.pdf", binary=b"%PDF")

    def test_non_200_status_raises_runtime_error(self):
        parser = _make_parser()
        failing_reply = mock.MagicMock()
        failing_reply.raise_for_status.side_effect = requests.HTTPError("500 Server Error")

        with pytest.raises(RuntimeError, match="service call failed"):
            self._parse_with_fake_post(
                parser,
                failing_reply,
                filepath="doc.pdf",
                binary=b"%PDF",
            )
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# crop() — bounds guard
|
|
# ---------------------------------------------------------------------------
|
|
|
|
class TestCrop:
    """crop() behaviour for missing page images, missing tags and page-index bounds."""

    # Position tag referencing page 1, reused by several tests below.
    _PAGE_ONE_TAG = "@@1\t10.0\t100.0\t20.0\t80.0##"

    def test_returns_none_when_no_page_images(self):
        parser = _make_parser()
        parser.page_images = []
        cropped = parser.crop(self._PAGE_ONE_TAG)
        assert cropped is None

    def test_returns_none_when_no_position_tags(self):
        parser = _make_parser()
        parser.page_images = [_fake_page_image()]
        cropped = parser.crop("no tags here")
        assert cropped is None

    def test_out_of_range_page_index_filtered_returns_none(self):
        parser = _make_parser()
        # A single rendered page (index 0) while the tag points at page 5
        # (index 4), which lies outside the rendered range.
        parser.page_images = [_fake_page_image()]
        out_of_range_tag = "@@5\t10.0\t100.0\t20.0\t80.0##"
        cropped = parser.crop(out_of_range_tag)
        assert cropped is None

    def test_valid_page_index_does_not_raise(self):
        parser = _make_parser()
        page = _fake_page_image(width=200, height=300)
        parser.page_images = [page] * 3
        # The tag points at page 2 (index 1), inside the rendered range.
        in_range_tag = "@@2\t10.0\t100.0\t20.0\t80.0##"
        canvas = mock.MagicMock()
        canvas.paste = mock.MagicMock()
        # Image.new and alpha_composite are patched on the loaded module so
        # the mocked PIL images never need a real ImagingCore.
        try:
            with mock.patch.object(_mod.Image, "new", return_value=canvas):
                with mock.patch.object(_mod.Image, "alpha_composite", return_value=page):
                    parser.crop(in_range_tag)
        except IndexError:
            pytest.fail("crop() raised IndexError for a valid page index")

    def test_need_position_false_returns_image_or_none(self):
        parser = _make_parser()
        parser.page_images = []
        cropped = parser.crop(self._PAGE_ONE_TAG, need_position=False)
        assert cropped is None

    def test_need_position_true_returns_tuple_when_no_images(self):
        parser = _make_parser()
        parser.page_images = []
        cropped = parser.crop(self._PAGE_ONE_TAG, need_position=True)
        assert cropped == (None, None)
|