feat: PaddleOCR PDF parser supports thumbnails and positions (#12565)

### What problem does this PR solve? 1. PaddleOCR PDF parser supports thumbnails and positions. 2. Add FAQ documentation for PaddleOCR PDF parser. ### Type of change - [x] New Feature (non-breaking change which adds functionality)
2026-01-19 11:45:10 +08:00 · 2026-01-13 09:51:08 +08:00
parent 44bada64c9
commit 4fe3c24198
4 changed files with 259 additions and 60 deletions
--- a/deepdoc/parser/paddleocr_parser.py
+++ b/deepdoc/parser/paddleocr_parser.py
@ -24,7 +24,10 @@ from os import PathLike
 from pathlib import Path
 from typing import Any, Callable, ClassVar, Literal, Optional, Union, Tuple, List

+import numpy as np
+import pdfplumber
 import requests
+from PIL import Image

 try:
    from deepdoc.parser.pdf_parser import RAGFlowPdfParser
@ -60,8 +63,8 @@ def _remove_images_from_markdown(markdown: str) -> str:
 class PaddleOCRVLConfig:
    """Configuration for PaddleOCR-VL algorithm."""

-    use_doc_orientation_classify: Optional[bool] = None
-    use_doc_unwarping: Optional[bool] = None
+    use_doc_orientation_classify: Optional[bool] = False
+    use_doc_unwarping: Optional[bool] = False
    use_layout_detection: Optional[bool] = None
    use_polygon_points: Optional[bool] = None
    use_chart_recognition: Optional[bool] = None
@ -79,7 +82,7 @@ class PaddleOCRVLConfig:
    min_pixels: Optional[int] = None
    max_pixels: Optional[int] = None
    max_new_tokens: Optional[int] = None
-    merge_layout_blocks: Optional[bool] = None
+    merge_layout_blocks: Optional[bool] = False
    markdown_ignore_labels: Optional[List[str]] = None
    vlm_extra_args: Optional[dict] = None

@ -116,14 +119,12 @@ class PaddleOCRConfig:
        if algorithm == "PaddleOCR-VL":
            # Create default PaddleOCRVLConfig object and convert to dict
            algorithm_config = asdict(PaddleOCRVLConfig())
-
-            # Apply user-provided VL config
-            vl_config = cfg.get("vl")
-            if isinstance(vl_config, dict):
-                algorithm_config.update({k: v for k, v in vl_config.items() if v is not None})
+        algorithm_config_user = cfg.get("algorithm_config")
+        if isinstance(algorithm_config_user, dict):
+            algorithm_config.update({k: v for k, v in algorithm_config_user.items() if v is not None})

        # Remove processed keys
-        cfg.pop("vl", None)
+        cfg.pop("algorithm_config", None)

        # Prepare initialization arguments
        field_names = {field.name for field in fields(cls)}
@ -146,6 +147,8 @@ class PaddleOCRConfig:
 class PaddleOCRParser(RAGFlowPdfParser):
    """Parser for PDF documents using PaddleOCR API."""

+    _ZOOMIN = 2
+
    _COMMON_FIELD_MAPPING: ClassVar[dict[str, str]] = {
        "prettify_markdown": "prettifyMarkdown",
        "show_formula_number": "showFormulaNumber",
@ -188,6 +191,8 @@ class PaddleOCRParser(RAGFlowPdfParser):
        request_timeout: int = 600,
    ):
        """Initialize PaddleOCR parser."""
+        super().__init__()
+
        self.api_url = api_url.rstrip("/") if api_url else os.getenv("PADDLEOCR_API_URL", "")
        self.access_token = access_token or os.getenv("PADDLEOCR_ACCESS_TOKEN")
        self.algorithm = algorithm
@ -197,6 +202,10 @@ class PaddleOCRParser(RAGFlowPdfParser):
        # Force PDF file type
        self.file_type = 0

+        # Initialize page images for cropping
+        self.page_images: list[Image.Image] = []
+        self.page_from = 0
+
    # Public methods
    def check_installation(self) -> tuple[bool, str]:
        """Check if the parser is properly installed and configured."""
@ -222,7 +231,7 @@ class PaddleOCRParser(RAGFlowPdfParser):
        show_formula_number: Optional[bool] = None,
        visualize: Optional[bool] = None,
        additional_params: Optional[dict[str, Any]] = None,
-        vl_config: Optional[dict[str, Any]] = None,
+        algorithm_config: Optional[dict[str, Any]] = None,
        **kwargs: Any,
    ) -> ParseResult:
        """Parse PDF document using PaddleOCR API."""
@ -241,22 +250,24 @@ class PaddleOCRParser(RAGFlowPdfParser):
            config_dict["visualize"] = visualize
        if additional_params is not None:
            config_dict["additional_params"] = additional_params
-        if vl_config is not None:
-            config_dict["vl"] = vl_config
-
-        # Add any VL config parameters from kwargs
-        for key, value in kwargs.items():
-            if key in {field.name for field in fields(PaddleOCRVLConfig)}:
-                config_dict[key] = value
+        if algorithm_config is not None:
+            config_dict["algorithm_config"] = algorithm_config

        cfg = PaddleOCRConfig.from_dict(config_dict)

        if not cfg.api_url:
            raise RuntimeError("[PaddleOCR] API URL missing")

-        # Prepare file data
+        # Prepare file data and generate page images for cropping
        data_bytes = self._prepare_file_data(filepath, binary)

+        # Generate page images for cropping functionality
+        input_source = filepath if binary is None else binary
+        try:
+            self.__images__(input_source, callback=callback)
+        except Exception as e:
+            self.logger.warning(f"[PaddleOCR] Failed to generate page images for cropping: {e}")
+
        # Build and send request
        result = self._send_request(data_bytes, cfg, callback)

@ -377,7 +388,7 @@ class PaddleOCRParser(RAGFlowPdfParser):
                    label = block.get("block_label", "")
                    block_bbox = block.get("block_bbox", [0, 0, 0, 0])

-                    tag = f"@@{page_idx + 1}\t{block_bbox[0]}\t{block_bbox[2]}\t{block_bbox[1]}\t{block_bbox[3]}##"
+                    tag = f"@@{page_idx + 1}\t{block_bbox[0] // self._ZOOMIN}\t{block_bbox[2] // self._ZOOMIN}\t{block_bbox[1] // self._ZOOMIN}\t{block_bbox[3] // self._ZOOMIN}##"

                    if parse_method == "manual":
                        sections.append((block_content, label, tag))
@ -392,6 +403,149 @@ class PaddleOCRParser(RAGFlowPdfParser):
        """Convert API response to table tuples."""
        return []

+    def __images__(self, fnm, page_from=0, page_to=100, callback=None):
+        """Render the requested PDF pages into images kept for later cropping.
+
+        Args:
+            fnm: Path to the PDF (``str`` or ``PathLike``) or its raw content,
+                which is wrapped in a ``BytesIO`` before being opened.
+            page_from: Index of the first page to render (inclusive).
+            page_to: Index at which rendering stops (exclusive).
+            callback: Accepted for signature compatibility; not used here.
+
+        On any failure ``self.page_images`` is set to ``None`` and the
+        exception is logged instead of being propagated.
+        """
+        self.page_from, self.page_to = page_from, page_to
+        try:
+            source = fnm if isinstance(fnm, (str, PathLike)) else BytesIO(fnm)
+            with pdfplumber.open(source) as pdf:
+                self.pdf = pdf
+                rendered = []
+                for page in pdf.pages[page_from:page_to]:
+                    # 72 is the resolution handed to pdfplumber for each page.
+                    rendered.append(page.to_image(resolution=72, antialias=True).original)
+                self.page_images = rendered
+        except Exception as exc:
+            self.page_images = None
+            self.logger.exception(exc)
+
+    @staticmethod
+    def extract_positions(txt: str):
+        """Parse every position tag embedded in ``txt``.
+
+        Returns:
+            A list with one ``(page_indices, left, right, top, bottom)`` tuple
+            per tag, in order of appearance. ``page_indices`` is a list of
+            zero-based page indices (the tag stores one-based numbers joined
+            by ``-``); the four coordinates are floats.
+        """
+        found = []
+        for match in re.finditer(r"@@[0-9-]+\t[0-9.\t]+##", txt):
+            # Drop the "@@" / "##" delimiters, then split the tab-separated fields.
+            page_field, x_lo, x_hi, y_lo, y_hi = match.group(0).strip("#").strip("@").split("\t")
+            box = tuple(float(value) for value in (x_lo, x_hi, y_lo, y_hi))
+            page_indices = [int(number) - 1 for number in page_field.split("-")]
+            found.append((page_indices, *box))
+        return found
+
+    def crop(self, text: str, need_position: bool = False):
+        """Crop images from PDF based on position tags in text.
+
+        Each position tag found in ``text`` is cut out of the cached page
+        images (``self.page_images``) and the pieces are stacked vertically
+        into a single picture. One extra strip above the first region and one
+        below the last region (up to 120 units tall, separated by ``GAP``) are
+        added as surrounding context and blended with an overlay.
+
+        Args:
+            text: Text containing position tags understood by
+                ``extract_positions``.
+            need_position: When true, also return the list of cropped
+                positions.
+
+        Returns:
+            ``None`` (or ``(None, None)`` when ``need_position`` is true) if
+            there is nothing to crop: no tags, no page images, or no tag
+            referring to an available page. Otherwise the composed image, or
+            ``(image, positions)`` when ``need_position`` is true, where each
+            position is ``(page_index + self.page_from, x0, x1, y0, y1)``.
+        """
+        imgs = []
+        poss = self.extract_positions(text)
+
+        if not poss:
+            if need_position:
+                return None, None
+            return
+
+        # Page images may be missing or None if rendering failed earlier.
+        if not getattr(self, "page_images", None):
+            self.logger.warning("[PaddleOCR] crop called without page images; skipping image generation.")
+            if need_position:
+                return None, None
+            return
+
+        page_count = len(self.page_images)
+
+        # Keep only page indices that refer to a rendered page image.
+        filtered_poss = []
+        for pns, left, right, top, bottom in poss:
+            if not pns:
+                self.logger.warning("[PaddleOCR] Empty page index list in crop; skipping this position.")
+                continue
+            valid_pns = [p for p in pns if 0 <= p < page_count]
+            if not valid_pns:
+                self.logger.warning(f"[PaddleOCR] All page indices {pns} out of range for {page_count} pages; skipping.")
+                continue
+            filtered_poss.append((valid_pns, left, right, top, bottom))
+
+        poss = filtered_poss
+        if not poss:
+            self.logger.warning("[PaddleOCR] No valid positions after filtering; skip cropping.")
+            if need_position:
+                return None, None
+            return
+
+        # All crops share the width of the widest region (never less than 6).
+        max_width = max(np.max([right - left for (_, left, right, _, _) in poss]), 6)
+        GAP = 6
+        # Prepend a context strip located just above the first region.
+        pos = poss[0]
+        first_page_idx = pos[0][0]
+        poss.insert(0, ([first_page_idx], pos[1], pos[2], max(0, pos[3] - 120), max(pos[3] - GAP, 0)))
+        # Append a context strip located just below the last region,
+        # clamped to the height of that region's last page.
+        pos = poss[-1]
+        last_page_idx = pos[0][-1]
+        if not (0 <= last_page_idx < page_count):
+            self.logger.warning(f"[PaddleOCR] Last page index {last_page_idx} out of range for {page_count} pages; skipping crop.")
+            if need_position:
+                return None, None
+            return
+        last_page_height = self.page_images[last_page_idx].size[1]
+        poss.append(
+            (
+                [last_page_idx],
+                pos[1],
+                pos[2],
+                min(last_page_height, pos[4] + GAP),
+                min(last_page_height, pos[4] + 120),
+            )
+        )
+
+        positions = []
+        for ii, (pns, left, right, top, bottom) in enumerate(poss):
+            right = left + max_width
+
+            # Guarantee a non-empty crop height.
+            if bottom <= top:
+                bottom = top + 2
+
+            # For a region spanning several pages, extend `bottom` by the
+            # height of the page preceding each continuation page (pn - 1).
+            for pn in pns[1:]:
+                if 0 <= pn - 1 < page_count:
+                    bottom += self.page_images[pn - 1].size[1]
+                else:
+                    self.logger.warning(f"[PaddleOCR] Page index {pn}-1 out of range for {page_count} pages during crop; skipping height accumulation.")
+
+            if not (0 <= pns[0] < page_count):
+                self.logger.warning(f"[PaddleOCR] Base page index {pns[0]} out of range for {page_count} pages during crop; skipping this segment.")
+                continue
+
+            # Crop the part of the region lying on its first page.
+            img0 = self.page_images[pns[0]]
+            x0, y0, x1, y1 = int(left), int(top), int(right), int(min(bottom, img0.size[1]))
+            crop0 = img0.crop((x0, y0, x1, y1))
+            imgs.append(crop0)
+            # The first and last entries are the context strips; they are not
+            # reported as positions.
+            if 0 < ii < len(poss) - 1:
+                positions.append((pns[0] + self.page_from, x0, x1, y0, y1))
+
+            # Carry the remaining height over to the continuation pages, each
+            # of which is cropped from its top edge.
+            bottom -= img0.size[1]
+            for pn in pns[1:]:
+                if not (0 <= pn < page_count):
+                    self.logger.warning(f"[PaddleOCR] Page index {pn} out of range for {page_count} pages during crop; skipping this page.")
+                    continue
+                page = self.page_images[pn]
+                x0, y0, x1, y1 = int(left), 0, int(right), int(min(bottom, page.size[1]))
+                cimgp = page.crop((x0, y0, x1, y1))
+                imgs.append(cimgp)
+                if 0 < ii < len(poss) - 1:
+                    positions.append((pn + self.page_from, x0, x1, y0, y1))
+                bottom -= page.size[1]
+
+        if not imgs:
+            if need_position:
+                return None, None
+            return
+
+        # Canvas size: crops stacked vertically, each followed by GAP.
+        height = 0
+        for img in imgs:
+            height += img.size[1] + GAP
+        height = int(height)
+        width = int(np.max([i.size[0] for i in imgs]))
+        pic = Image.new("RGB", (width, height), (245, 245, 245))
+        height = 0
+        for ii, img in enumerate(imgs):
+            # The first and last crops are composited with a black overlay
+            # whose alpha is 128 before being pasted.
+            if ii == 0 or ii + 1 == len(imgs):
+                img = img.convert("RGBA")
+                overlay = Image.new("RGBA", img.size, (0, 0, 0, 0))
+                overlay.putalpha(128)
+                img = Image.alpha_composite(img, overlay).convert("RGB")
+            pic.paste(img, (0, int(height)))
+            height += img.size[1] + GAP
+
+        if need_position:
+            return pic, positions
+        return pic
+

 if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
--- a/docs/faq.mdx
+++ b/docs/faq.mdx
@ -566,3 +566,82 @@ RAGFlow supports MinerU's `vlm-http-client` backend, enabling you to delegate do
 :::tip NOTE
 When using the `vlm-http-client` backend, the RAGFlow server requires no GPU, only network connectivity. This enables cost-effective distributed deployment with multiple RAGFlow instances sharing one remote vLLM server.
 :::
+
+### How to use PaddleOCR for document parsing?
+
+From v0.24.0 onwards, RAGFlow includes PaddleOCR as an optional PDF parser. Please note that RAGFlow acts only as a *remote client* for PaddleOCR, calling the PaddleOCR API to parse PDFs and reading the returned files.
+
+There are two main ways to configure and use PaddleOCR in RAGFlow:
+
+#### 1. Using PaddleOCR Official API
+
+This method uses PaddleOCR's official API service with an access token.
+
+**Step 1: Configure RAGFlow**
+- **Via Environment Variables:**
+   ```bash
+   # In your docker/.env file:
+   PADDLEOCR_API_URL=https://your-paddleocr-api-endpoint
+   PADDLEOCR_ALGORITHM=PaddleOCR-VL
+   PADDLEOCR_ACCESS_TOKEN=your-access-token-here
+   ```
+
+- **Via UI:**
+   - Navigate to the **Model providers** page
+   - Add a new OCR model with factory type "PaddleOCR"
+   - Configure the following fields:
+      - **PaddleOCR API URL**: Your PaddleOCR API endpoint
+      - **PaddleOCR Algorithm**: Select the algorithm corresponding to the API endpoint
+      - **AI Studio Access Token**: Your access token for the PaddleOCR API
+
+**Step 2: Usage in Dataset Configuration**
+- In your dataset's **Configuration** page, find the **Ingestion pipeline** section
+- If using the built-in chunking methods that support PDF parsing, select **PaddleOCR** from the **PDF parser** dropdown
+- If using a custom ingestion pipeline, select **PaddleOCR** in the **Parser** component
+
+**Notes:**
+- To obtain the API URL, visit the [PaddleOCR official website](https://aistudio.baidu.com/paddleocr/task), click the **API** button in the upper-left corner, choose the example code for the specific algorithm you want to use (e.g., PaddleOCR-VL), and copy the `API_URL`.
+- Access tokens can be obtained from the [AI Studio platform](https://aistudio.baidu.com/account/accessToken).
+- This method requires internet connectivity to reach the official PaddleOCR API.
+
+#### 2. Using Self-Hosted PaddleOCR Service
+
+This method allows you to deploy your own PaddleOCR service and use it without an access token.
+
+**Step 1: Deploy PaddleOCR Service**
+Follow the [PaddleOCR serving documentation](https://www.paddleocr.ai/latest/en/version3.x/deployment/serving.html) to deploy your own service. For layout parsing, you can use an endpoint like:
+
+```bash
+http://localhost:8080/layout-parsing
+```
+
+**Step 2: Configure RAGFlow**
+- **Via Environment Variables:**
+  ```bash
+  PADDLEOCR_API_URL=http://localhost:8080/layout-parsing
+  PADDLEOCR_ALGORITHM=PaddleOCR-VL
+  # No access token required for self-hosted service
+  ```
+
+- **Via UI:**
+   - Navigate to the **Model providers** page
+   - Add a new OCR model with factory type "PaddleOCR"
+   - Configure the following fields:
+      - **PaddleOCR API URL**: The endpoint of your deployed service
+      - **PaddleOCR Algorithm**: Select the algorithm corresponding to the deployed service
+      - **AI Studio Access Token**: Leave empty
+
+**Step 3: Usage in Dataset Configuration**
+- In your dataset's **Configuration** page, find the **Ingestion pipeline** section
+- If using the built-in chunking methods that support PDF parsing, select **PaddleOCR** from the **PDF parser** dropdown
+- If using a custom ingestion pipeline, select **PaddleOCR** in the **Parser** component
+
+#### Environment Variables Summary
+
+| Environment Variable | Description | Default | Required |
+|---------------------|-------------|---------|----------|
+| `PADDLEOCR_API_URL` | PaddleOCR API endpoint URL | `""` | Yes, when using environment variables |
+| `PADDLEOCR_ALGORITHM` | Algorithm to use for parsing | `"PaddleOCR-VL"` | No |
+| `PADDLEOCR_ACCESS_TOKEN` | Access token for official API | `None` | Only when using official API |
+
+Environment variables can be used for auto-provisioning, but are not required if configuring via UI. When environment variables are set, these values are used to auto-provision a PaddleOCR model for the tenant on first use.
--- a/rag/flow/parser/parser.py
+++ b/rag/flow/parser/parser.py
@ -358,48 +358,14 @@ class Parser(ProcessBase):
                parse_method=conf.get("paddleocr_parse_method", "raw"),
            )
            bboxes = []
-            for section in lines:
-                # PaddleOCRParser returns sections as tuple, different formats based on parse_method:
-                # - "raw": (text, position_tag)
-                # - "manual": (text, label, position_tag)
-                # - "paper": (text_with_tag, label)
-                text = section[0]
-
-                # Parse position tag if exists
-                position_tag = ""
-                if len(section) > 1:
-                    if len(section) == 2:  # raw format: (text, tag)
-                        position_tag = section[1]
-                    elif len(section) == 3:  # manual format: (text, label, tag)
-                        position_tag = section[2]
-                    elif "paper" in conf.get("paddleocr_parse_method", "") and len(section) == 2:
-                        # paper format: text may contain tag
-                        text_with_tag = text
-                        import re
-
-                        tag_match = re.search(r"(@@[0-9-]+\t[0-9.\t]+##)", text_with_tag)
-                        if tag_match:
-                            position_tag = tag_match.group(1)
-                            text = text_with_tag.replace(position_tag, "").strip()
-
-                # Extract coordinate information from position tag
-                page_number, x0, x1, top, bottom = 1, 0, 0, 0, 0
-                if position_tag:
-                    import re
-
-                    tag_match = re.match(r"@@([0-9-]+)\t([0-9.]+)\t([0-9.]+)\t([0-9.]+)\t([0-9.]+)##", position_tag)
-                    if tag_match:
-                        pn, x0_str, x1_str, top_str, bottom_str = tag_match.groups()
-                        page_number = int(pn.split("-")[0])  # Take first page number
-                        x0, x1, top, bottom = float(x0_str), float(x1_str), float(top_str), float(bottom_str)
+            for t, poss in lines:
+                # Get cropped image and positions
+                cropped_image, positions = pdf_parser.crop(poss, need_position=True)

                box = {
-                    "text": text,
-                    "page_number": page_number,
-                    "x0": x0,
-                    "x1": x1,
-                    "top": top,
-                    "bottom": bottom,
+                    "text": t,
+                    "image": cropped_image,
+                    "positions": positions,
                }
                bboxes.append(box)
        else:
--- a/web/src/pages/user-setting/setting-model/modal/paddleocr-modal/index.tsx
+++ b/web/src/pages/user-setting/setting-model/modal/paddleocr-modal/index.tsx
@ -122,7 +122,7 @@ const PaddleOCRModal = ({
                disabled={loading}
                className="btn btn-primary"
              >
-                {loading ? t('common.adding') : t('common.add')}
+                {t('common.add')}
              </button>
            </div>
          </form>