feat(parser): support external Docling server via DOCLING_SERVER_URL (#13527)

### What problem does this PR solve?

This PR adds support for parsing PDFs through an external Docling
server, so RAGFlow can connect to remote `docling serve` deployments
instead of relying only on local in-process Docling.

It addresses the feature request in
[#13426](https://github.com/infiniflow/ragflow/issues/13426) and aligns
with the external-server usage pattern already used by MinerU.

### Type of change

- [ ] Bug Fix (non-breaking change which fixes an issue)
- [x] New Feature (non-breaking change which adds functionality)
- [x] Documentation Update
- [ ] Refactoring
- [ ] Performance Improvement
- [ ] Other (please describe):

### What is changed?

- Add external Docling server support in `DoclingParser`:
  - Use `DOCLING_SERVER_URL` to enable remote parsing mode.
- Try `POST /v1/convert/source` first, and fall back to
`/v1alpha/convert/source`.
- Keep existing local Docling behavior when `DOCLING_SERVER_URL` is not
set.
- Wire Docling env settings into parser invocation paths:
  - `rag/app/naive.py`
  - `rag/flow/parser/parser.py`
- Add Docling env hints in constants and update docs:
  - `docs/guides/dataset/select_pdf_parser.md`
  - `docs/guides/agent/agent_component_reference/parser.md`
  - `docs/faq.mdx`

### Why this approach?

This keeps the change focused on one issue and one capability (external
Docling connectivity), without introducing unrelated provider-model
plumbing.

### Validation

- Static checks:
  - `python -m py_compile` on changed Python files
  - `python -m ruff check` on changed Python files
- Functional checks:
  - Remote v1 endpoint path works
  - v1alpha fallback works
  - Local Docling path remains available when server URL is unset

### Related links

- Feature request: [Support external Docling server (issue
#13426)](https://github.com/infiniflow/ragflow/issues/13426)
- Compare view for this branch:
[main...feat/docling-server](https://github.com/infiniflow/ragflow/compare/main...spider-yamet:ragflow:feat/docling-server?expand=1)

##### Fixes [#13426](https://github.com/infiniflow/ragflow/issues/13426)
This commit is contained in:
NeedmeFordev
2026-03-12 18:09:03 +09:00
committed by GitHub
parent a353c7bdd7
commit 387b0b27c4
7 changed files with 246 additions and 10 deletions

View File

@ -17,6 +17,8 @@ from __future__ import annotations
import logging
import re
import base64
import os
from dataclasses import dataclass
from enum import Enum
from io import BytesIO
@ -25,6 +27,7 @@ from pathlib import Path
from typing import Any, Callable, Iterable, Optional
import pdfplumber
import requests
from PIL import Image
try:
@ -74,15 +77,41 @@ def _extract_bbox_from_prov(item, prov_attr: str = "prov") -> Optional[_BBox]:
class DoclingParser(RAGFlowPdfParser):
def __init__(self, docling_server_url: str = "", request_timeout: int = 600):
    """Set up parser state; a non-empty *docling_server_url* enables remote mode.

    Args:
        docling_server_url: Base URL of an external ``docling serve`` instance.
            An empty string keeps the local in-process Docling path.
        request_timeout: Seconds to wait on remote convert requests.
    """
    self.logger = logging.getLogger(self.__class__.__name__)
    # Remote-mode configuration; trailing slashes are stripped so endpoint
    # paths can be appended verbatim later.
    self.docling_server_url = (docling_server_url or "").rstrip("/")
    self.request_timeout = request_timeout
    # Per-document parse state, reset/filled during parsing.
    self.page_images: list[Image.Image] = []
    self.page_from = 0
    self.page_to = 10_000
    self.outlines = []
def _effective_server_url(self, docling_server_url: Optional[str] = None) -> str:
return (docling_server_url or self.docling_server_url or "").rstrip("/") or (
os.environ.get("DOCLING_SERVER_URL", "").rstrip("/")
)
@staticmethod
def _is_http_endpoint_valid(url: str, timeout: int = 5) -> bool:
    """Probe *url* and report whether it looks reachable.

    Tries HEAD first; only if HEAD raises does it retry with GET (some
    servers reject HEAD). A 200 or common redirect status counts as valid.
    """
    reachable_codes = (200, 301, 302, 307, 308)
    for probe in (requests.head, requests.get):
        try:
            response = probe(url, timeout=timeout, allow_redirects=True)
            # A completed request decides the answer; no further probing.
            return response.status_code in reachable_codes
        except Exception:
            continue
    return False
def check_installation(self, docling_server_url: Optional[str] = None) -> bool:
server_url = self._effective_server_url(docling_server_url)
if server_url:
for path in ("/openapi.json", "/docs", "/v1/convert/source"):
if self._is_http_endpoint_valid(f"{server_url}{path}", timeout=5):
return True
self.logger.warning(f"[Docling] external server not reachable: {server_url}")
return False
if DocumentConverter is None:
self.logger.warning("[Docling] 'docling' is not importable, please: pip install docling")
return False
@ -277,6 +306,141 @@ class DoclingParser(RAGFlowPdfParser):
tables.append(((img, [captions]), positions if positions else ""))
return tables
@staticmethod
def _sections_from_remote_text(text: str, parse_method: str) -> list[tuple[str, ...]]:
txt = (text or "").strip()
if not txt:
return []
if parse_method == "manual":
return [(txt, DoclingContentType.TEXT.value, "")]
if parse_method == "paper":
return [(txt, DoclingContentType.TEXT.value)]
return [(txt, "")]
@staticmethod
def _extract_remote_document_entries(payload: Any) -> list[dict[str, Any]]:
if not isinstance(payload, dict):
return []
if isinstance(payload.get("document"), dict):
return [payload["document"]]
if isinstance(payload.get("documents"), list):
return [d for d in payload["documents"] if isinstance(d, dict)]
if isinstance(payload.get("results"), list):
docs = []
for it in payload["results"]:
if isinstance(it, dict):
if isinstance(it.get("document"), dict):
docs.append(it["document"])
elif isinstance(it.get("result"), dict):
docs.append(it["result"])
else:
docs.append(it)
return docs
return []
def _parse_pdf_remote(
    self,
    filepath: str | PathLike[str],
    binary: BytesIO | bytes | None = None,
    callback: Optional[Callable] = None,
    *,
    parse_method: str = "raw",
    docling_server_url: Optional[str] = None,
    request_timeout: Optional[int] = None,
):
    """Parse a PDF by POSTing it (base64-encoded) to an external docling-serve
    instance.

    Tries ``POST /v1/convert/source`` first, then falls back to
    ``/v1alpha/convert/source`` (presumably for older docling-serve releases —
    TODO confirm which versions expose which path). The two endpoints take
    differently shaped payloads, built below as ``v1_payload`` and
    ``v1alpha_payload``.

    Args:
        filepath: Path of the PDF; also used to derive the upload filename.
        binary: Optional in-memory PDF content (bytes or BytesIO). When set,
            *filepath* is only used for the filename, not read from disk.
        callback: Optional progress callback ``(fraction, message)``.
        parse_method: Section tuple layout selector, forwarded to
            ``_sections_from_remote_text``.
        docling_server_url: Overrides the instance/env server URL.
        request_timeout: Overrides ``self.request_timeout`` (seconds).

    Returns:
        ``(sections, tables)`` — *tables* is always an empty list here; the
        remote path only yields text/markdown sections.

    Raises:
        RuntimeError: No server URL configured, both endpoints failed, or the
            response contained no parseable documents.
        FileNotFoundError: *filepath* does not exist and no *binary* given.
    """
    server_url = self._effective_server_url(docling_server_url)
    if not server_url:
        raise RuntimeError("[Docling] DOCLING_SERVER_URL is not configured.")
    timeout = request_timeout or self.request_timeout
    # Obtain the raw PDF bytes: prefer the in-memory payload, else read the file.
    if binary is not None:
        if isinstance(binary, (bytes, bytearray)):
            pdf_bytes = bytes(binary)
        else:
            # BytesIO (or any buffer-exposing object): copy out its contents.
            pdf_bytes = bytes(binary.getbuffer())
    else:
        src_path = Path(filepath)
        if not src_path.exists():
            raise FileNotFoundError(f"PDF not found: {src_path}")
        with open(src_path, "rb") as f:
            pdf_bytes = f.read()
    if callback:
        callback(0.2, f"[Docling] Requesting external server: {server_url}")
    filename = Path(filepath).name or "input.pdf"
    b64 = base64.b64encode(pdf_bytes).decode("ascii")
    # v1 schema: sources entries carry an explicit "kind" discriminator.
    v1_payload = {
        "options": {
            "from_formats": ["pdf"],
            "to_formats": ["json", "md", "text"],
        },
        "sources": [
            {
                "kind": "file",
                "filename": filename,
                "base64_string": b64,
            }
        ],
    }
    # v1alpha schema: same options, but files go under "file_sources" without "kind".
    v1alpha_payload = {
        "options": {
            "from_formats": ["pdf"],
            "to_formats": ["json", "md", "text"],
        },
        "file_sources": [
            {
                "filename": filename,
                "base64_string": b64,
            }
        ],
    }
    errors = []
    response_json = None
    # Try endpoints in order; the first sub-300 response wins. Failures
    # (HTTP errors and exceptions alike) are accumulated for the final message.
    for endpoint, payload in (
        ("/v1/convert/source", v1_payload),
        ("/v1alpha/convert/source", v1alpha_payload),
    ):
        try:
            resp = requests.post(
                f"{server_url}{endpoint}",
                json=payload,
                timeout=timeout,
            )
            if resp.status_code < 300:
                response_json = resp.json()
                break
            # Truncate the body so the aggregated error stays readable.
            errors.append(f"{endpoint}: HTTP {resp.status_code} {resp.text[:300]}")
        except Exception as exc:
            errors.append(f"{endpoint}: {exc}")
    if response_json is None:
        raise RuntimeError("[Docling] remote convert failed: " + " | ".join(errors))
    docs = self._extract_remote_document_entries(response_json)
    if not docs:
        raise RuntimeError("[Docling] remote response does not contain parsed documents.")
    sections: list[tuple[str, ...]] = []
    tables = []  # remote conversion yields no table structures; kept for interface parity
    for doc in docs:
        md = doc.get("md_content")
        txt = doc.get("text_content")
        # Prefer markdown over plain text when both are present and non-blank.
        if isinstance(md, str) and md.strip():
            sections.extend(self._sections_from_remote_text(md, parse_method=parse_method))
        elif isinstance(txt, str) and txt.strip():
            sections.extend(self._sections_from_remote_text(txt, parse_method=parse_method))
        # Last resort: some responses nest md_content inside json_content;
        # only used when nothing has produced sections yet.
        json_content = doc.get("json_content")
        if isinstance(json_content, dict):
            md_fallback = json_content.get("md_content")
            if isinstance(md_fallback, str) and md_fallback.strip() and not sections:
                sections.extend(self._sections_from_remote_text(md_fallback, parse_method=parse_method))
    if callback:
        callback(0.95, f"[Docling] Remote sections: {len(sections)}")
    return sections, tables
def parse_pdf(
self,
filepath: str | PathLike[str],
@ -287,12 +451,25 @@ class DoclingParser(RAGFlowPdfParser):
lang: Optional[str] = None,
method: str = "auto",
delete_output: bool = True,
parse_method: str = "raw"
parse_method: str = "raw",
docling_server_url: Optional[str] = None,
request_timeout: Optional[int] = None,
):
if not self.check_installation():
if not self.check_installation(docling_server_url=docling_server_url):
raise RuntimeError("Docling not available, please install `docling`")
server_url = self._effective_server_url(docling_server_url)
if server_url:
return self._parse_pdf_remote(
filepath=filepath,
binary=binary,
callback=callback,
parse_method=parse_method,
docling_server_url=server_url,
request_timeout=request_timeout,
)
if binary is not None:
tmpdir = Path(output_dir) if output_dir else Path.cwd() / ".docling_tmp"
tmpdir.mkdir(parents=True, exist_ok=True)