mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-05-03 08:47:48 +08:00
feat(parser): support external Docling server via DOCLING_SERVER_URL (#13527)
### What problem does this PR solve? This PR adds support for parsing PDFs through an external Docling server, so RAGFlow can connect to remote `docling serve` deployments instead of relying only on local in-process Docling. It addresses the feature request in [#13426](https://github.com/infiniflow/ragflow/issues/13426) and aligns with the external-server usage pattern already used by MinerU. ### Type of change - [ ] Bug Fix (non-breaking change which fixes an issue) - [x] New Feature (non-breaking change which adds functionality) - [x] Documentation Update - [ ] Refactoring - [ ] Performance Improvement - [ ] Other (please describe): ### What is changed? - Add external Docling server support in `DoclingParser`: - Use `DOCLING_SERVER_URL` to enable remote parsing mode. - Try `POST /v1/convert/source` first, and fallback to `/v1alpha/convert/source`. - Keep existing local Docling behavior when `DOCLING_SERVER_URL` is not set. - Wire Docling env settings into parser invocation paths: - `rag/app/naive.py` - `rag/flow/parser/parser.py` - Add Docling env hints in constants and update docs: - `docs/guides/dataset/select_pdf_parser.md` - `docs/guides/agent/agent_component_reference/parser.md` - `docs/faq.mdx` ### Why this approach? This keeps the change focused on one issue and one capability (external Docling connectivity), without introducing unrelated provider-model plumbing. 
### Validation - Static checks: - `python -m py_compile` on changed Python files - `python -m ruff check` on changed Python files - Functional checks: - Remote v1 endpoint path works - v1alpha fallback works - Local Docling path remains available when server URL is unset ### Related links - Feature request: [Support external Docling server (issue #13426)](https://github.com/infiniflow/ragflow/issues/13426) - Compare view for this branch: [main...feat/docling-server](https://github.com/infiniflow/ragflow/compare/main...spider-yamet:ragflow:feat/docling-server?expand=1) ##### Fixes [#13426](https://github.com/infiniflow/ragflow/issues/13426)
This commit is contained in:
@ -17,6 +17,8 @@ from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import re
|
||||
import base64
|
||||
import os
|
||||
from dataclasses import dataclass
|
||||
from enum import Enum
|
||||
from io import BytesIO
|
||||
@ -25,6 +27,7 @@ from pathlib import Path
|
||||
from typing import Any, Callable, Iterable, Optional
|
||||
|
||||
import pdfplumber
|
||||
import requests
|
||||
from PIL import Image
|
||||
|
||||
try:
|
||||
@ -74,15 +77,41 @@ def _extract_bbox_from_prov(item, prov_attr: str = "prov") -> Optional[_BBox]:
|
||||
|
||||
|
||||
class DoclingParser(RAGFlowPdfParser):
|
||||
def __init__(self):
|
||||
def __init__(self, docling_server_url: str = "", request_timeout: int = 600):
|
||||
self.logger = logging.getLogger(self.__class__.__name__)
|
||||
self.page_images: list[Image.Image] = []
|
||||
self.page_from = 0
|
||||
self.page_to = 10_000
|
||||
self.outlines = []
|
||||
|
||||
|
||||
def check_installation(self) -> bool:
|
||||
self.docling_server_url = (docling_server_url or "").rstrip("/")
|
||||
self.request_timeout = request_timeout
|
||||
|
||||
def _effective_server_url(self, docling_server_url: Optional[str] = None) -> str:
|
||||
return (docling_server_url or self.docling_server_url or "").rstrip("/") or (
|
||||
os.environ.get("DOCLING_SERVER_URL", "").rstrip("/")
|
||||
)
|
||||
|
||||
@staticmethod
def _is_http_endpoint_valid(url: str, timeout: int = 5) -> bool:
    """Probe *url* and report whether it answers with an OK/redirect status.

    A HEAD request is tried first (cheap); if it raises (e.g. connection
    error, or a server that rejects HEAD at the transport level), the probe
    retries with GET. Any final failure counts as "not valid".
    """
    accepted = (200, 301, 302, 307, 308)
    for probe in (requests.head, requests.get):
        try:
            response = probe(url, timeout=timeout, allow_redirects=True)
        except Exception:
            continue
        # A response that came back decides the outcome; only a raised
        # exception moves on to the GET fallback (matches original flow).
        return response.status_code in accepted
    return False
|
||||
|
||||
def check_installation(self, docling_server_url: Optional[str] = None) -> bool:
|
||||
server_url = self._effective_server_url(docling_server_url)
|
||||
if server_url:
|
||||
for path in ("/openapi.json", "/docs", "/v1/convert/source"):
|
||||
if self._is_http_endpoint_valid(f"{server_url}{path}", timeout=5):
|
||||
return True
|
||||
self.logger.warning(f"[Docling] external server not reachable: {server_url}")
|
||||
return False
|
||||
|
||||
if DocumentConverter is None:
|
||||
self.logger.warning("[Docling] 'docling' is not importable, please: pip install docling")
|
||||
return False
|
||||
@ -277,6 +306,141 @@ class DoclingParser(RAGFlowPdfParser):
|
||||
tables.append(((img, [captions]), positions if positions else ""))
|
||||
return tables
|
||||
|
||||
@staticmethod
|
||||
def _sections_from_remote_text(text: str, parse_method: str) -> list[tuple[str, ...]]:
|
||||
txt = (text or "").strip()
|
||||
if not txt:
|
||||
return []
|
||||
if parse_method == "manual":
|
||||
return [(txt, DoclingContentType.TEXT.value, "")]
|
||||
if parse_method == "paper":
|
||||
return [(txt, DoclingContentType.TEXT.value)]
|
||||
return [(txt, "")]
|
||||
|
||||
@staticmethod
|
||||
def _extract_remote_document_entries(payload: Any) -> list[dict[str, Any]]:
|
||||
if not isinstance(payload, dict):
|
||||
return []
|
||||
if isinstance(payload.get("document"), dict):
|
||||
return [payload["document"]]
|
||||
if isinstance(payload.get("documents"), list):
|
||||
return [d for d in payload["documents"] if isinstance(d, dict)]
|
||||
if isinstance(payload.get("results"), list):
|
||||
docs = []
|
||||
for it in payload["results"]:
|
||||
if isinstance(it, dict):
|
||||
if isinstance(it.get("document"), dict):
|
||||
docs.append(it["document"])
|
||||
elif isinstance(it.get("result"), dict):
|
||||
docs.append(it["result"])
|
||||
else:
|
||||
docs.append(it)
|
||||
return docs
|
||||
return []
|
||||
|
||||
def _parse_pdf_remote(
    self,
    filepath: str | PathLike[str],
    binary: BytesIO | bytes | None = None,
    callback: Optional[Callable] = None,
    *,
    parse_method: str = "raw",
    docling_server_url: Optional[str] = None,
    request_timeout: Optional[int] = None,
):
    """Convert a PDF through an external ``docling serve`` deployment.

    The PDF is sent base64-encoded to ``POST /v1/convert/source`` first and,
    if that fails, to the legacy ``/v1alpha/convert/source`` endpoint.
    Returns the same ``(sections, tables)`` pair as the local path; tables
    are not extracted remotely, so the table list is always empty.

    Raises:
        RuntimeError: if no server URL is configured, both endpoints fail,
            or the response carries no parsed documents.
        FileNotFoundError: if ``binary`` is absent and ``filepath`` is missing.
    """
    server_url = self._effective_server_url(docling_server_url)
    if not server_url:
        raise RuntimeError("[Docling] DOCLING_SERVER_URL is not configured.")

    timeout = request_timeout or self.request_timeout

    # Materialise the PDF bytes from whichever source was supplied.
    if binary is None:
        src_path = Path(filepath)
        if not src_path.exists():
            raise FileNotFoundError(f"PDF not found: {src_path}")
        pdf_bytes = src_path.read_bytes()
    elif isinstance(binary, (bytes, bytearray)):
        pdf_bytes = bytes(binary)
    else:
        pdf_bytes = bytes(binary.getbuffer())

    if callback:
        callback(0.2, f"[Docling] Requesting external server: {server_url}")

    filename = Path(filepath).name or "input.pdf"
    encoded = base64.b64encode(pdf_bytes).decode("ascii")
    options = {
        "from_formats": ["pdf"],
        "to_formats": ["json", "md", "text"],
    }
    source = {"filename": filename, "base64_string": encoded}
    # v1 wraps sources with an explicit kind; v1alpha uses "file_sources".
    attempts = (
        ("/v1/convert/source", {"options": options, "sources": [{"kind": "file", **source}]}),
        ("/v1alpha/convert/source", {"options": options, "file_sources": [source]}),
    )

    errors = []
    response_json = None
    for endpoint, payload in attempts:
        try:
            resp = requests.post(f"{server_url}{endpoint}", json=payload, timeout=timeout)
            if resp.status_code < 300:
                response_json = resp.json()
                break
            errors.append(f"{endpoint}: HTTP {resp.status_code} {resp.text[:300]}")
        except Exception as exc:
            errors.append(f"{endpoint}: {exc}")

    if response_json is None:
        raise RuntimeError("[Docling] remote convert failed: " + " | ".join(errors))

    docs = self._extract_remote_document_entries(response_json)
    if not docs:
        raise RuntimeError("[Docling] remote response does not contain parsed documents.")

    sections: list[tuple[str, ...]] = []
    tables = []
    for doc in docs:
        md = doc.get("md_content")
        txt = doc.get("text_content")
        # Prefer markdown; fall back to plain text from the same document.
        if isinstance(md, str) and md.strip():
            sections.extend(self._sections_from_remote_text(md, parse_method=parse_method))
        elif isinstance(txt, str) and txt.strip():
            sections.extend(self._sections_from_remote_text(txt, parse_method=parse_method))

        json_content = doc.get("json_content")
        if isinstance(json_content, dict):
            md_fallback = json_content.get("md_content")
            # Last resort: some server builds only embed markdown inside
            # json_content; use it only when nothing has been collected yet.
            if isinstance(md_fallback, str) and md_fallback.strip() and not sections:
                sections.extend(self._sections_from_remote_text(md_fallback, parse_method=parse_method))

    if callback:
        callback(0.95, f"[Docling] Remote sections: {len(sections)}")
    return sections, tables
|
||||
|
||||
def parse_pdf(
|
||||
self,
|
||||
filepath: str | PathLike[str],
|
||||
@ -287,12 +451,25 @@ class DoclingParser(RAGFlowPdfParser):
|
||||
lang: Optional[str] = None,
|
||||
method: str = "auto",
|
||||
delete_output: bool = True,
|
||||
parse_method: str = "raw"
|
||||
parse_method: str = "raw",
|
||||
docling_server_url: Optional[str] = None,
|
||||
request_timeout: Optional[int] = None,
|
||||
):
|
||||
|
||||
if not self.check_installation():
|
||||
if not self.check_installation(docling_server_url=docling_server_url):
|
||||
raise RuntimeError("Docling not available, please install `docling`")
|
||||
|
||||
server_url = self._effective_server_url(docling_server_url)
|
||||
if server_url:
|
||||
return self._parse_pdf_remote(
|
||||
filepath=filepath,
|
||||
binary=binary,
|
||||
callback=callback,
|
||||
parse_method=parse_method,
|
||||
docling_server_url=server_url,
|
||||
request_timeout=request_timeout,
|
||||
)
|
||||
|
||||
if binary is not None:
|
||||
tmpdir = Path(output_dir) if output_dir else Path.cwd() / ".docling_tmp"
|
||||
tmpdir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
Reference in New Issue
Block a user