mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-04-27 22:07:58 +08:00
feat(parser): support external Docling server via DOCLING_SERVER_URL (#13527)
### What problem does this PR solve? This PR adds support for parsing PDFs through an external Docling server, so RAGFlow can connect to remote `docling serve` deployments instead of relying only on local in-process Docling. It addresses the feature request in [#13426](https://github.com/infiniflow/ragflow/issues/13426) and aligns with the external-server usage pattern already used by MinerU. ### Type of change - [ ] Bug Fix (non-breaking change which fixes an issue) - [x] New Feature (non-breaking change which adds functionality) - [x] Documentation Update - [ ] Refactoring - [ ] Performance Improvement - [ ] Other (please describe): ### What is changed? - Add external Docling server support in `DoclingParser`: - Use `DOCLING_SERVER_URL` to enable remote parsing mode. - Try `POST /v1/convert/source` first, and fallback to `/v1alpha/convert/source`. - Keep existing local Docling behavior when `DOCLING_SERVER_URL` is not set. - Wire Docling env settings into parser invocation paths: - `rag/app/naive.py` - `rag/flow/parser/parser.py` - Add Docling env hints in constants and update docs: - `docs/guides/dataset/select_pdf_parser.md` - `docs/guides/agent/agent_component_reference/parser.md` - `docs/faq.mdx` ### Why this approach? This keeps the change focused on one issue and one capability (external Docling connectivity), without introducing unrelated provider-model plumbing. 
### Validation - Static checks: - `python -m py_compile` on changed Python files - `python -m ruff check` on changed Python files - Functional checks: - Remote v1 endpoint path works - v1alpha fallback works - Local Docling path remains available when server URL is unset ### Related links - Feature request: [Support external Docling server (issue #13426)](https://github.com/infiniflow/ragflow/issues/13426) - Compare view for this branch: [main...feat/docling-server](https://github.com/infiniflow/ragflow/compare/main...spider-yamet:ragflow:feat/docling-server?expand=1) ##### Fixes [#13426](https://github.com/infiniflow/ragflow/issues/13426)
This commit is contained in:
@ -219,6 +219,9 @@ class ForgettingPolicy(StrEnum):
|
||||
# ENV_MINERU_OUTPUT_DIR = "MINERU_OUTPUT_DIR"
|
||||
# ENV_MINERU_BACKEND = "MINERU_BACKEND"
|
||||
# ENV_MINERU_DELETE_OUTPUT = "MINERU_DELETE_OUTPUT"
|
||||
# ENV_DOCLING_SERVER_URL = "DOCLING_SERVER_URL"
|
||||
# ENV_DOCLING_OUTPUT_DIR = "DOCLING_OUTPUT_DIR"
|
||||
# ENV_DOCLING_DELETE_OUTPUT = "DOCLING_DELETE_OUTPUT"
|
||||
# ENV_TCADP_OUTPUT_DIR = "TCADP_OUTPUT_DIR"
|
||||
# ENV_LM_TIMEOUT_SECONDS = "LM_TIMEOUT_SECONDS"
|
||||
# ENV_LLM_MAX_RETRIES = "LLM_MAX_RETRIES"
|
||||
|
||||
@ -17,6 +17,8 @@ from __future__ import annotations
|
||||
|
||||
import logging
|
||||
import re
|
||||
import base64
|
||||
import os
|
||||
from dataclasses import dataclass
|
||||
from enum import Enum
|
||||
from io import BytesIO
|
||||
@ -25,6 +27,7 @@ from pathlib import Path
|
||||
from typing import Any, Callable, Iterable, Optional
|
||||
|
||||
import pdfplumber
|
||||
import requests
|
||||
from PIL import Image
|
||||
|
||||
try:
|
||||
@ -74,15 +77,41 @@ def _extract_bbox_from_prov(item, prov_attr: str = "prov") -> Optional[_BBox]:
|
||||
|
||||
|
||||
class DoclingParser(RAGFlowPdfParser):
|
||||
def __init__(self):
|
||||
def __init__(self, docling_server_url: str = "", request_timeout: int = 600):
|
||||
self.logger = logging.getLogger(self.__class__.__name__)
|
||||
self.page_images: list[Image.Image] = []
|
||||
self.page_from = 0
|
||||
self.page_to = 10_000
|
||||
self.outlines = []
|
||||
|
||||
|
||||
def check_installation(self) -> bool:
|
||||
self.docling_server_url = (docling_server_url or "").rstrip("/")
|
||||
self.request_timeout = request_timeout
|
||||
|
||||
def _effective_server_url(self, docling_server_url: Optional[str] = None) -> str:
    """Resolve the Docling Serve base URL to use for a request.

    Candidates are tried in this order: the explicit ``docling_server_url``
    argument, the URL stored on the instance, and finally the
    ``DOCLING_SERVER_URL`` environment variable. Every candidate is stripped
    of surrounding whitespace and trailing slashes *before* it is tested, so
    a value that is blank after normalisation (for example ``"/"`` or
    ``"  "``) falls through to the next candidate instead of being returned
    as-is or skipping the instance-level URL.

    Args:
        docling_server_url: Optional per-call override of the server URL.

    Returns:
        The normalised base URL without a trailing slash, or ``""`` when no
        candidate is configured.
    """

    def _clean(value: Optional[str]) -> str:
        # Normalise one candidate; None and "" both become "".
        return (value or "").strip().rstrip("/")

    # Candidates are evaluated lazily, in priority order.
    url = _clean(docling_server_url)
    if url:
        return url
    url = _clean(self.docling_server_url)
    if url:
        return url
    return _clean(os.environ.get("DOCLING_SERVER_URL", ""))
|
||||
|
||||
@staticmethod
def _is_http_endpoint_valid(url: str, timeout: int = 5) -> bool:
    """Best-effort reachability probe for an HTTP endpoint.

    A ``HEAD`` request is tried first, then a ``GET``. The ``GET`` is
    attempted both when ``HEAD`` raises and when ``HEAD`` answers with a
    status that is not accepted (for example ``405`` from a route that has
    no ``HEAD`` handler), so an endpoint that only serves ``GET`` is still
    detected.

    Args:
        url: Absolute URL to probe.
        timeout: Per-request timeout in seconds, passed to ``requests``.

    Returns:
        ``True`` if either request completes with status 200, 301, 302, 307
        or 308; ``False`` otherwise. This function never raises.
    """
    accepted_statuses = (200, 301, 302, 307, 308)
    for send in (requests.head, requests.get):
        try:
            response = send(url, timeout=timeout, allow_redirects=True)
        except Exception:
            # Deliberately broad: this is only a probe, so any failure
            # (DNS, refused connection, timeout, malformed URL, ...) just
            # means "try the next method" rather than an error for callers.
            continue
        if response.status_code in accepted_statuses:
            return True
    return False
|
||||
|
||||
def check_installation(self, docling_server_url: Optional[str] = None) -> bool:
|
||||
server_url = self._effective_server_url(docling_server_url)
|
||||
if server_url:
|
||||
for path in ("/openapi.json", "/docs", "/v1/convert/source"):
|
||||
if self._is_http_endpoint_valid(f"{server_url}{path}", timeout=5):
|
||||
return True
|
||||
self.logger.warning(f"[Docling] external server not reachable: {server_url}")
|
||||
return False
|
||||
|
||||
if DocumentConverter is None:
|
||||
self.logger.warning("[Docling] 'docling' is not importable, please: pip install docling")
|
||||
return False
|
||||
@ -277,6 +306,141 @@ class DoclingParser(RAGFlowPdfParser):
|
||||
tables.append(((img, [captions]), positions if positions else ""))
|
||||
return tables
|
||||
|
||||
@staticmethod
def _sections_from_remote_text(text: str, parse_method: str) -> list[tuple[str, ...]]:
    """Wrap remote text into the section tuple shape used for ``parse_method``.

    The text is stripped first; empty or missing text yields no sections.
    Otherwise exactly one section is returned:

    * ``"manual"`` -> ``(text, DoclingContentType.TEXT.value, "")``
    * ``"paper"``  -> ``(text, DoclingContentType.TEXT.value)``
    * anything else -> ``(text, "")``
    """
    body = text.strip() if text else ""
    if body == "":
        return []

    if parse_method == "manual":
        section = (body, DoclingContentType.TEXT.value, "")
    elif parse_method == "paper":
        section = (body, DoclingContentType.TEXT.value)
    else:
        section = (body, "")
    return [section]
|
||||
|
||||
@staticmethod
def _extract_remote_document_entries(payload: Any) -> list[dict[str, Any]]:
    """Pull the per-document dicts out of a remote convert response.

    The first matching response shape wins:

    1. ``{"document": {...}}`` -> that single dict.
    2. ``{"documents": [...]}`` -> every dict element of the list.
    3. ``{"results": [...]}`` -> for every dict element, its ``"document"``
       dict if present, else its ``"result"`` dict if present, else the
       element itself. Non-dict elements are skipped.

    Any other payload (including non-dict payloads) yields an empty list.
    """
    if not isinstance(payload, dict):
        return []

    single = payload.get("document")
    if isinstance(single, dict):
        return [single]

    many = payload.get("documents")
    if isinstance(many, list):
        return [entry for entry in many if isinstance(entry, dict)]

    results = payload.get("results")
    if not isinstance(results, list):
        return []

    collected = []
    for entry in results:
        if not isinstance(entry, dict):
            continue
        # Prefer a nested "document", then a nested "result", then the entry itself.
        chosen = entry
        for key in ("document", "result"):
            nested = entry.get(key)
            if isinstance(nested, dict):
                chosen = nested
                break
        collected.append(chosen)
    return collected
|
||||
|
||||
def _parse_pdf_remote(
    self,
    filepath: str | PathLike[str],
    binary: BytesIO | bytes | None = None,
    callback: Optional[Callable] = None,
    *,
    parse_method: str = "raw",
    docling_server_url: Optional[str] = None,
    request_timeout: Optional[int] = None,
):
    """Parse a PDF through an external Docling Serve instance.

    The PDF is base64-encoded and POSTed to ``/v1/convert/source``; if that
    attempt does not return a status below 300 with a JSON body, the
    same content is POSTed to ``/v1alpha/convert/source`` using the
    ``file_sources`` payload layout.

    Args:
        filepath: Path of the PDF. Read from disk only when ``binary`` is
            ``None``; otherwise used only to derive the uploaded filename.
        binary: Optional in-memory PDF (bytes-like, ``BytesIO`` or another
            readable binary file object).
        callback: Optional progress callback, called as ``callback(progress, message)``.
        parse_method: Forwarded to ``_sections_from_remote_text`` to select
            the section tuple shape.
        docling_server_url: Optional per-call override of the server URL.
        request_timeout: Optional per-call timeout in seconds; falls back to
            ``self.request_timeout`` when falsy.

    Returns:
        ``(sections, tables)``. ``tables`` is always an empty list in remote
        mode; this method never populates it.

    Raises:
        RuntimeError: No server URL is configured, both endpoints failed, or
            the response contains no document entries.
        FileNotFoundError: ``binary`` is ``None`` and ``filepath`` does not exist.
    """
    server_url = self._effective_server_url(docling_server_url)
    if not server_url:
        raise RuntimeError("[Docling] DOCLING_SERVER_URL is not configured.")

    timeout = request_timeout or self.request_timeout

    # Obtain the raw PDF bytes from memory when provided, otherwise from disk.
    if binary is None:
        src_path = Path(filepath)
        if not src_path.exists():
            raise FileNotFoundError(f"PDF not found: {src_path}")
        pdf_bytes = src_path.read_bytes()
    elif isinstance(binary, (bytes, bytearray, memoryview)):
        pdf_bytes = bytes(binary)
    elif isinstance(binary, BytesIO):
        # Whole buffer regardless of the current stream position.
        pdf_bytes = binary.getvalue()
    else:
        # Any other readable binary file object.
        pdf_bytes = binary.read()

    if callback:
        callback(0.2, f"[Docling] Requesting external server: {server_url}")

    # filepath may be empty when only `binary` was supplied.
    filename = (Path(filepath).name if filepath else "") or "input.pdf"
    b64 = base64.b64encode(pdf_bytes).decode("ascii")

    options = {
        "from_formats": ["pdf"],
        "to_formats": ["json", "md", "text"],
    }
    file_source = {"filename": filename, "base64_string": b64}
    attempts = (
        # Current API: tagged source objects under "sources".
        ("/v1/convert/source", {"options": options, "sources": [{"kind": "file", **file_source}]}),
        # Older API: untagged entries under "file_sources".
        ("/v1alpha/convert/source", {"options": options, "file_sources": [file_source]}),
    )

    errors = []
    response_json = None
    for endpoint, payload in attempts:
        try:
            resp = requests.post(
                f"{server_url}{endpoint}",
                json=payload,
                timeout=timeout,
            )
            if resp.status_code < 300:
                response_json = resp.json()
                break
            errors.append(f"{endpoint}: HTTP {resp.status_code} {resp.text[:300]}")
        except Exception as exc:
            # Broad on purpose: any transport or JSON-decoding failure is
            # recorded and reported through the RuntimeError below, which
            # keeps the exception type callers see unchanged.
            errors.append(f"{endpoint}: {exc}")

    if response_json is None:
        raise RuntimeError("[Docling] remote convert failed: " + " | ".join(errors))

    docs = self._extract_remote_document_entries(response_json)
    if not docs:
        raise RuntimeError("[Docling] remote response does not contain parsed documents.")

    sections: list[tuple[str, ...]] = []
    tables = []
    for doc in docs:
        # Preference order per document: markdown, plain text, then markdown
        # nested inside json_content. The choice is made per document so that
        # a later document having only json_content is not dropped just
        # because an earlier document already produced sections.
        candidates = [doc.get("md_content"), doc.get("text_content")]
        json_content = doc.get("json_content")
        if isinstance(json_content, dict):
            candidates.append(json_content.get("md_content"))

        for content in candidates:
            if isinstance(content, str) and content.strip():
                sections.extend(self._sections_from_remote_text(content, parse_method=parse_method))
                break

    if callback:
        callback(0.95, f"[Docling] Remote sections: {len(sections)}")
    return sections, tables
|
||||
|
||||
def parse_pdf(
|
||||
self,
|
||||
filepath: str | PathLike[str],
|
||||
@ -287,12 +451,25 @@ class DoclingParser(RAGFlowPdfParser):
|
||||
lang: Optional[str] = None,
|
||||
method: str = "auto",
|
||||
delete_output: bool = True,
|
||||
parse_method: str = "raw"
|
||||
parse_method: str = "raw",
|
||||
docling_server_url: Optional[str] = None,
|
||||
request_timeout: Optional[int] = None,
|
||||
):
|
||||
|
||||
if not self.check_installation():
|
||||
if not self.check_installation(docling_server_url=docling_server_url):
|
||||
raise RuntimeError("Docling not available, please install `docling`")
|
||||
|
||||
server_url = self._effective_server_url(docling_server_url)
|
||||
if server_url:
|
||||
return self._parse_pdf_remote(
|
||||
filepath=filepath,
|
||||
binary=binary,
|
||||
callback=callback,
|
||||
parse_method=parse_method,
|
||||
docling_server_url=server_url,
|
||||
request_timeout=request_timeout,
|
||||
)
|
||||
|
||||
if binary is not None:
|
||||
tmpdir = Path(output_dir) if output_dir else Path.cwd() / ".docling_tmp"
|
||||
tmpdir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
18
docs/faq.mdx
18
docs/faq.mdx
@ -567,6 +567,24 @@ RAGFlow supports MinerU's `vlm-http-client` backend, enabling you to delegate do
|
||||
When using the `vlm-http-client` backend, the RAGFlow server requires no GPU, only network connectivity. This enables cost-effective distributed deployment with multiple RAGFlow instances sharing one remote vLLM server.
|
||||
:::
|
||||
|
||||
### How to use an external Docling Serve server for document parsing?
|
||||
|
||||
RAGFlow supports Docling in two modes:
|
||||
|
||||
1. **Local Docling** (existing mode): install Docling in the RAGFlow runtime (`USE_DOCLING=true`) and parse in-process.
|
||||
2. **External Docling Serve** (remote mode): point RAGFlow to a Docling Serve endpoint.
|
||||
|
||||
To enable remote mode, set:
|
||||
|
||||
```bash
|
||||
DOCLING_SERVER_URL=http://your-docling-serve-host:5001
|
||||
```
|
||||
|
||||
Behavior:
|
||||
|
||||
- When `DOCLING_SERVER_URL` is set, RAGFlow sends PDFs to Docling Serve using `/v1/convert/source` (and falls back to `/v1alpha/convert/source` for older servers).
|
||||
- When `DOCLING_SERVER_URL` is not set, RAGFlow uses local in-process Docling.
|
||||
|
||||
### How to use PaddleOCR for document parsing?
|
||||
|
||||
From v0.24.0 onwards, RAGFlow includes PaddleOCR as an optional PDF parser. Please note that RAGFlow acts only as a *remote client* for PaddleOCR, calling the PaddleOCR API to parse PDFs and reading the returned files.
|
||||
|
||||
@ -65,6 +65,12 @@ Starting from v0.22.0, RAGFlow includes MinerU (≥ 2.6.3) as an optional PDF p
|
||||
- If you decide to use a chunking method from the **Built-in** dropdown, ensure it supports PDF parsing, then select **MinerU** from the **PDF parser** dropdown.
|
||||
- If you use a custom ingestion pipeline instead, select **MinerU** in the **PDF parser** section of the **Parser** component.
|
||||
|
||||
To use an external Docling Serve instance (instead of local in-process Docling), set:
|
||||
|
||||
- `DOCLING_SERVER_URL`: The Docling Serve API endpoint (for example, `http://docling-host:5001`).
|
||||
|
||||
When `DOCLING_SERVER_URL` is set, RAGFlow sends PDF content to Docling Serve (`/v1/convert/source`, with fallback to `/v1alpha/convert/source`) and ingests the returned markdown or text. If the variable is not set, RAGFlow continues to use local in-process Docling, which requires `USE_DOCLING=true` and the Docling package to be installed.
|
||||
|
||||
:::note
|
||||
All MinerU environment variables are optional. When set, these values are used to auto-provision a MinerU OCR model for the tenant on first use. To avoid auto-provisioning, skip the environment variable settings and only configure MinerU from the **Model providers** page in the UI.
|
||||
:::
|
||||
|
||||
@ -65,6 +65,12 @@ Starting from v0.22.0, RAGFlow includes MinerU (≥ 2.6.3) as an optional PDF p
|
||||
- If you decide to use a chunking method from the **Built-in** dropdown, ensure it supports PDF parsing, then select **MinerU** from the **PDF parser** dropdown.
|
||||
- If you use a custom ingestion pipeline instead, select **MinerU** in the **PDF parser** section of the **Parser** component.
|
||||
|
||||
To use an external Docling Serve instance (instead of local in-process Docling), set:
|
||||
|
||||
- `DOCLING_SERVER_URL`: The Docling Serve API endpoint (for example, `http://docling-host:5001`).
|
||||
|
||||
When `DOCLING_SERVER_URL` is set, RAGFlow sends PDF content to Docling Serve (`/v1/convert/source`, with fallback to `/v1alpha/convert/source`) and ingests the returned markdown or text. If the variable is not set, RAGFlow continues to use local in-process Docling, which requires `USE_DOCLING=true` and the Docling package to be installed.
|
||||
|
||||
:::note
|
||||
All MinerU environment variables are optional. When set, these values are used to auto-provision a MinerU OCR model for the tenant on first use. To avoid auto-provisioning, skip the environment variable settings and only configure MinerU from the **Model providers** page in the UI.
|
||||
:::
|
||||
|
||||
@ -153,15 +153,17 @@ def by_docling(filename, binary=None, from_page=0, to_page=100000, lang="Chinese
|
||||
parse_method = kwargs.get("parse_method", "raw")
|
||||
|
||||
if not pdf_parser.check_installation():
|
||||
callback(-1, "Docling not found.")
|
||||
if callback:
|
||||
callback(-1, "Docling not found.")
|
||||
return None, None, pdf_parser
|
||||
|
||||
sections, tables = pdf_parser.parse_pdf(
|
||||
filepath=filename,
|
||||
binary=binary,
|
||||
callback=callback,
|
||||
output_dir=os.environ.get("MINERU_OUTPUT_DIR", ""),
|
||||
delete_output=bool(int(os.environ.get("MINERU_DELETE_OUTPUT", 1))),
|
||||
output_dir=os.environ.get("DOCLING_OUTPUT_DIR", ""),
|
||||
delete_output=bool(int(os.environ.get("DOCLING_DELETE_OUTPUT", 1))),
|
||||
docling_server_url=os.environ.get("DOCLING_SERVER_URL", ""),
|
||||
parse_method=parse_method,
|
||||
)
|
||||
return sections, tables, pdf_parser
|
||||
|
||||
@ -32,6 +32,7 @@ from common import settings
|
||||
from common.constants import LLMType
|
||||
from common.misc_utils import get_uuid
|
||||
from deepdoc.parser import ExcelParser
|
||||
from deepdoc.parser.docling_parser import DoclingParser
|
||||
from deepdoc.parser.pdf_parser import PlainParser, RAGFlowPdfParser, VisionParser
|
||||
from deepdoc.parser.tcadp_parser import TCADPParser
|
||||
from rag.app.naive import Docx
|
||||
@ -173,7 +174,7 @@ class ParserParam(ProcessParamBase):
|
||||
pdf_parse_method = pdf_config.get("parse_method", "")
|
||||
self.check_empty(pdf_parse_method, "Parse method abnormal.")
|
||||
|
||||
if pdf_parse_method.lower() not in ["deepdoc", "plain_text", "mineru", "tcadp parser", "paddleocr"]:
|
||||
if pdf_parse_method.lower() not in ["deepdoc", "plain_text", "mineru", "docling", "tcadp parser", "paddleocr"]:
|
||||
self.check_empty(pdf_config.get("lang", ""), "PDF VLM language")
|
||||
|
||||
pdf_output_format = pdf_config.get("output_format", "")
|
||||
@ -371,6 +372,29 @@ class Parser(ProcessBase):
|
||||
"text": t,
|
||||
}
|
||||
bboxes.append(box)
|
||||
elif parse_method.lower() == "docling":
|
||||
pdf_parser = DoclingParser(docling_server_url=os.environ.get("DOCLING_SERVER_URL", ""))
|
||||
lines, _ = pdf_parser.parse_pdf(
|
||||
filepath=name,
|
||||
binary=blob,
|
||||
callback=self.callback,
|
||||
parse_method=conf.get("docling_parse_method", "raw"),
|
||||
docling_server_url=os.environ.get("DOCLING_SERVER_URL", ""),
|
||||
)
|
||||
bboxes = []
|
||||
for item in lines:
|
||||
if not isinstance(item, tuple) or not item:
|
||||
continue
|
||||
text = item[0]
|
||||
poss = item[-1] if len(item) >= 2 else ""
|
||||
box = {
|
||||
"text": text,
|
||||
"image": pdf_parser.crop(poss, 1) if isinstance(poss, str) and poss else None,
|
||||
"positions": [[pos[0][-1], *pos[1:]] for pos in pdf_parser.extract_positions(poss)]
|
||||
if isinstance(poss, str) and poss
|
||||
else [],
|
||||
}
|
||||
bboxes.append(box)
|
||||
elif parse_method.lower() == "tcadp parser":
|
||||
# ADP is a document parsing tool using Tencent Cloud API
|
||||
table_result_type = conf.get("table_result_type", "1")
|
||||
|
||||
Reference in New Issue
Block a user