diff --git a/deepdoc/parser/paddleocr_parser.py b/deepdoc/parser/paddleocr_parser.py index fca69da79..f6611e0c4 100644 --- a/deepdoc/parser/paddleocr_parser.py +++ b/deepdoc/parser/paddleocr_parser.py @@ -24,7 +24,10 @@ from os import PathLike from pathlib import Path from typing import Any, Callable, ClassVar, Literal, Optional, Union, Tuple, List +import numpy as np +import pdfplumber import requests +from PIL import Image try: from deepdoc.parser.pdf_parser import RAGFlowPdfParser @@ -60,8 +63,8 @@ def _remove_images_from_markdown(markdown: str) -> str: class PaddleOCRVLConfig: """Configuration for PaddleOCR-VL algorithm.""" - use_doc_orientation_classify: Optional[bool] = None - use_doc_unwarping: Optional[bool] = None + use_doc_orientation_classify: Optional[bool] = False + use_doc_unwarping: Optional[bool] = False use_layout_detection: Optional[bool] = None use_polygon_points: Optional[bool] = None use_chart_recognition: Optional[bool] = None @@ -79,7 +82,7 @@ class PaddleOCRVLConfig: min_pixels: Optional[int] = None max_pixels: Optional[int] = None max_new_tokens: Optional[int] = None - merge_layout_blocks: Optional[bool] = None + merge_layout_blocks: Optional[bool] = False markdown_ignore_labels: Optional[List[str]] = None vlm_extra_args: Optional[dict] = None @@ -116,14 +119,12 @@ class PaddleOCRConfig: if algorithm == "PaddleOCR-VL": # Create default PaddleOCRVLConfig object and convert to dict algorithm_config = asdict(PaddleOCRVLConfig()) - - # Apply user-provided VL config - vl_config = cfg.get("vl") - if isinstance(vl_config, dict): - algorithm_config.update({k: v for k, v in vl_config.items() if v is not None}) + algorithm_config_user = cfg.get("algorithm_config") + if isinstance(algorithm_config_user, dict): + algorithm_config.update({k: v for k, v in algorithm_config_user.items() if v is not None}) # Remove processed keys - cfg.pop("vl", None) + cfg.pop("algorithm_config", None) # Prepare initialization arguments field_names = {field.name for field 
in fields(cls)} @@ -146,6 +147,8 @@ class PaddleOCRConfig: class PaddleOCRParser(RAGFlowPdfParser): """Parser for PDF documents using PaddleOCR API.""" + _ZOOMIN = 2 + _COMMON_FIELD_MAPPING: ClassVar[dict[str, str]] = { "prettify_markdown": "prettifyMarkdown", "show_formula_number": "showFormulaNumber", @@ -188,6 +191,8 @@ class PaddleOCRParser(RAGFlowPdfParser): request_timeout: int = 600, ): """Initialize PaddleOCR parser.""" + super().__init__() + self.api_url = api_url.rstrip("/") if api_url else os.getenv("PADDLEOCR_API_URL", "") self.access_token = access_token or os.getenv("PADDLEOCR_ACCESS_TOKEN") self.algorithm = algorithm @@ -197,6 +202,10 @@ class PaddleOCRParser(RAGFlowPdfParser): # Force PDF file type self.file_type = 0 + # Initialize page images for cropping + self.page_images: list[Image.Image] = [] + self.page_from = 0 + # Public methods def check_installation(self) -> tuple[bool, str]: """Check if the parser is properly installed and configured.""" @@ -222,7 +231,7 @@ class PaddleOCRParser(RAGFlowPdfParser): show_formula_number: Optional[bool] = None, visualize: Optional[bool] = None, additional_params: Optional[dict[str, Any]] = None, - vl_config: Optional[dict[str, Any]] = None, + algorithm_config: Optional[dict[str, Any]] = None, **kwargs: Any, ) -> ParseResult: """Parse PDF document using PaddleOCR API.""" @@ -241,22 +250,24 @@ class PaddleOCRParser(RAGFlowPdfParser): config_dict["visualize"] = visualize if additional_params is not None: config_dict["additional_params"] = additional_params - if vl_config is not None: - config_dict["vl"] = vl_config - - # Add any VL config parameters from kwargs - for key, value in kwargs.items(): - if key in {field.name for field in fields(PaddleOCRVLConfig)}: - config_dict[key] = value + if algorithm_config is not None: + config_dict["algorithm_config"] = algorithm_config cfg = PaddleOCRConfig.from_dict(config_dict) if not cfg.api_url: raise RuntimeError("[PaddleOCR] API URL missing") - # Prepare file data + # 
def __images__(self, fnm, page_from=0, page_to=100, callback=None):
    """Render PDF pages to images so that ``crop`` can cut regions out of them.

    Args:
        fnm: Path to the PDF (``str`` / ``PathLike``); any other value is
            wrapped in ``BytesIO`` and treated as the raw PDF content.
        page_from: Start of the page slice to render (0-based, inclusive).
        page_to: End of the page slice to render (exclusive).
        callback: Accepted for signature compatibility; not used here.

    Rendering is best effort: any failure is logged and leaves
    ``self.page_images`` empty, so ``crop`` returns nothing instead of raising.
    """
    self.page_from = page_from
    self.page_to = page_to
    try:
        source = fnm if isinstance(fnm, (str, PathLike)) else BytesIO(fnm)
        with pdfplumber.open(source) as pdf:
            # NOTE(review): self.pdf outlives this ``with`` block, so anything
            # reading it later is presumably looking at a closed document.
            self.pdf = pdf
            self.page_images = [
                page.to_image(resolution=72, antialias=True).original
                for page in pdf.pages[page_from:page_to]
            ]
    except Exception as e:
        # Keep the attribute a list (as declared in __init__) rather than None.
        self.page_images = []
        self.logger.exception(e)


@staticmethod
def extract_positions(txt: str):
    """Extract position information from ``@@page\\tleft\\tright\\ttop\\tbottom##`` tags.

    Args:
        txt: Text that may contain zero or more position tags.

    Returns:
        A list of ``(pages, left, right, top, bottom)`` tuples, where ``pages``
        is the list of 0-based page indices (the tag stores them 1-based,
        joined by ``-``) and the four coordinates are floats.

    Tags that do not have exactly four coordinate fields, or whose numbers
    cannot be parsed, are skipped instead of raising ``ValueError``.
    """
    poss = []
    tag_pattern = r"@@([0-9-]+)\t([0-9.]+)\t([0-9.]+)\t([0-9.]+)\t([0-9.]+)##"
    for match in re.finditer(tag_pattern, txt):
        pn, *coords = match.groups()
        try:
            pages = [int(p) - 1 for p in pn.split("-")]
            left, right, top, bottom = (float(c) for c in coords)
        except ValueError:
            # e.g. "1-" (empty page number) or "1.2.3" (not a float)
            continue
        poss.append((pages, left, right, top, bottom))
    return poss


def crop(self, text: str, need_position: bool = False):
    """Compose one image from the page regions named by position tags in ``text``.

    The regions are cut from ``self.page_images`` and stacked vertically on a
    light-grey canvas. A strip of up to 120 units above the first region and
    below the last region is added as context; the first and last pasted
    images are composited with a half-transparent black overlay.

    Args:
        text: Text containing position tags understood by ``extract_positions``.
        need_position: When true, also return the positions of the cropped
            regions (context strips excluded).

    Returns:
        The composed image, or ``(image, positions)`` when ``need_position`` is
        true. Each position is ``(page, x0, x1, y0, y1)`` with ``page`` offset
        by ``self.page_from``. When nothing can be cropped the result is
        ``None``, or ``(None, None)`` when ``need_position`` is true.
    """
    # Result for every "nothing to crop" exit; its shape follows need_position.
    nothing = (None, None) if need_position else None

    poss = self.extract_positions(text)
    if not poss:
        return nothing

    if not getattr(self, "page_images", None):
        self.logger.warning("[PaddleOCR] crop called without page images; skipping image generation.")
        return nothing

    page_count = len(self.page_images)

    # Keep only page indices for which a rendered page image exists.
    filtered_poss = []
    for pns, left, right, top, bottom in poss:
        if not pns:
            self.logger.warning("[PaddleOCR] Empty page index list in crop; skipping this position.")
            continue
        valid_pns = [p for p in pns if 0 <= p < page_count]
        if not valid_pns:
            self.logger.warning(f"[PaddleOCR] All page indices {pns} out of range for {page_count} pages; skipping.")
            continue
        filtered_poss.append((valid_pns, left, right, top, bottom))

    poss = filtered_poss
    if not poss:
        self.logger.warning("[PaddleOCR] No valid positions after filtering; skip cropping.")
        return nothing

    # Every crop uses the width of the widest region (at least 6).
    max_width = max(np.max([right - left for (_, left, right, _, _) in poss]), 6)
    GAP = 6

    # Context strip directly above the first region.
    pos = poss[0]
    first_page_idx = pos[0][0]
    poss.insert(0, ([first_page_idx], pos[1], pos[2], max(0, pos[3] - 120), max(pos[3] - GAP, 0)))

    # Context strip directly below the last region, clamped to the page height.
    pos = poss[-1]
    last_page_idx = pos[0][-1]
    if not (0 <= last_page_idx < page_count):
        self.logger.warning(f"[PaddleOCR] Last page index {last_page_idx} out of range for {page_count} pages; skipping crop.")
        return nothing
    last_page_height = self.page_images[last_page_idx].size[1]
    poss.append(
        (
            [last_page_idx],
            pos[1],
            pos[2],
            min(last_page_height, pos[4] + GAP),
            min(last_page_height, pos[4] + 120),
        )
    )

    imgs = []
    positions = []
    for ii, (pns, left, right, top, bottom) in enumerate(poss):
        right = left + max_width

        if bottom <= top:
            bottom = top + 2

        # For a region spanning several pages, extend ``bottom`` by the height
        # of the page preceding each continuation page.
        for pn in pns[1:]:
            if 0 <= pn - 1 < page_count:
                bottom += self.page_images[pn - 1].size[1]
            else:
                self.logger.warning(f"[PaddleOCR] Page index {pn}-1 out of range for {page_count} pages during crop; skipping height accumulation.")

        if not (0 <= pns[0] < page_count):
            self.logger.warning(f"[PaddleOCR] Base page index {pns[0]} out of range for {page_count} pages during crop; skipping this segment.")
            continue

        img0 = self.page_images[pns[0]]
        x0, y0, x1, y1 = int(left), int(top), int(right), int(min(bottom, img0.size[1]))
        imgs.append(img0.crop((x0, y0, x1, y1)))
        # The first and last entries are the context strips; they are not reported.
        if 0 < ii < len(poss) - 1:
            positions.append((pns[0] + self.page_from, x0, x1, y0, y1))

        # Carry the remaining height over to the continuation pages.
        bottom -= img0.size[1]
        for pn in pns[1:]:
            if not (0 <= pn < page_count):
                self.logger.warning(f"[PaddleOCR] Page index {pn} out of range for {page_count} pages during crop; skipping this page.")
                continue
            page = self.page_images[pn]
            x0, y0, x1, y1 = int(left), 0, int(right), int(min(bottom, page.size[1]))
            imgs.append(page.crop((x0, y0, x1, y1)))
            if 0 < ii < len(poss) - 1:
                positions.append((pn + self.page_from, x0, x1, y0, y1))
            bottom -= page.size[1]

    if not imgs:
        return nothing

    # Canvas: as wide as the widest crop, tall enough for every crop plus a gap.
    height = int(sum(img.size[1] + GAP for img in imgs))
    width = int(np.max([i.size[0] for i in imgs]))
    pic = Image.new("RGB", (width, height), (245, 245, 245))
    height = 0
    for ii, img in enumerate(imgs):
        if ii == 0 or ii + 1 == len(imgs):
            # Composite the first and last pasted image with a half-transparent overlay.
            img = img.convert("RGBA")
            overlay = Image.new("RGBA", img.size, (0, 0, 0, 0))
            overlay.putalpha(128)
            img = Image.alpha_composite(img, overlay).convert("RGB")
        pic.paste(img, (0, int(height)))
        height += img.size[1] + GAP

    if need_position:
        return pic, positions
    return pic
MinerU's `vlm-http-client` backend, enabling you to delegate do :::tip NOTE When using the `vlm-http-client` backend, the RAGFlow server requires no GPU, only network connectivity. This enables cost-effective distributed deployment with multiple RAGFlow instances sharing one remote vLLM server. ::: + +### How to use PaddleOCR for document parsing? + +From v0.24.0 onwards, RAGFlow includes PaddleOCR as an optional PDF parser. Please note that RAGFlow acts only as a *remote client* for PaddleOCR, calling the PaddleOCR API to parse PDFs and reading the returned files. + +There are two main ways to configure and use PaddleOCR in RAGFlow: + +#### 1. Using PaddleOCR Official API + +This method uses PaddleOCR's official API service with an access token. + +**Step 1: Configure RAGFlow** +- **Via Environment Variables:** + ```bash + # In your docker/.env file: + PADDLEOCR_API_URL=https://your-paddleocr-api-endpoint + PADDLEOCR_ALGORITHM=PaddleOCR-VL + PADDLEOCR_ACCESS_TOKEN=your-access-token-here + ``` + +- **Via UI:** + - Navigate to the **Model providers** page + - Add a new OCR model with factory type "PaddleOCR" + - Configure the following fields: + - **PaddleOCR API URL**: Your PaddleOCR API endpoint + - **PaddleOCR Algorithm**: Select the algorithm corresponding to the API endpoint + - **AI Studio Access Token**: Your access token for the PaddleOCR API + +**Step 2: Usage in Dataset Configuration** +- In your dataset's **Configuration** page, find the **Ingestion pipeline** section +- If using built-in chunking methods that support PDF parsing, select **PaddleOCR** from the **PDF parser** dropdown +- If using a custom ingestion pipeline, select **PaddleOCR** in the **Parser** component + +**Notes:** +- To obtain the API URL, visit the [PaddleOCR official website](https://aistudio.baidu.com/paddleocr/task), click the **API** button in the upper-left corner, choose the example code for the specific algorithm you want to use (e.g., PaddleOCR-VL), and copy the `API_URL`.
+- Access tokens can be obtained from the [AI Studio platform](https://aistudio.baidu.com/account/accessToken). +- This method requires internet connectivity to reach the official PaddleOCR API. + +#### 2. Using Self-Hosted PaddleOCR Service + +This method allows you to deploy your own PaddleOCR service and use it without an access token. + +**Step 1: Deploy PaddleOCR Service** +Follow the [PaddleOCR serving documentation](https://www.paddleocr.ai/latest/en/version3.x/deployment/serving.html) to deploy your own service. For layout parsing, you can use an endpoint like: + +```bash +http://localhost:8080/layout-parsing +``` + +**Step 2: Configure RAGFlow** +- **Via Environment Variables:** + ```bash + PADDLEOCR_API_URL=http://localhost:8080/layout-parsing + PADDLEOCR_ALGORITHM=PaddleOCR-VL + # No access token required for self-hosted service + ``` + +- **Via UI:** + - Navigate to the **Model providers** page + - Add a new OCR model with factory type "PaddleOCR" + - Configure the following fields: + - **PaddleOCR API URL**: The endpoint of your deployed service + - **PaddleOCR Algorithm**: Select the algorithm corresponding to the deployed service + - **AI Studio Access Token**: Leave empty + +**Step 3: Usage in Dataset Configuration** +- In your dataset's **Configuration** page, find the **Ingestion pipeline** section +- If using built-in chunking methods that support PDF parsing, select **PaddleOCR** from the **PDF parser** dropdown +- If using a custom ingestion pipeline, select **PaddleOCR** in the **Parser** component + +#### Environment Variables Summary + +| Environment Variable | Description | Default | Required | +|---------------------|-------------|---------|----------| +| `PADDLEOCR_API_URL` | PaddleOCR API endpoint URL | `""` | Yes, when using environment variables | +| `PADDLEOCR_ALGORITHM` | Algorithm to use for parsing | `"PaddleOCR-VL"` | No | +| `PADDLEOCR_ACCESS_TOKEN` | Access token for official API | `None` | Only when using official API | +
+Environment variables can be used for auto-provisioning, but are not required if configuring via UI. When environment variables are set, these values are used to auto-provision a PaddleOCR model for the tenant on first use. diff --git a/rag/flow/parser/parser.py b/rag/flow/parser/parser.py index 2cc941b72..b681e5d5d 100644 --- a/rag/flow/parser/parser.py +++ b/rag/flow/parser/parser.py @@ -358,48 +358,14 @@ class Parser(ProcessBase): parse_method=conf.get("paddleocr_parse_method", "raw"), ) bboxes = [] - for section in lines: - # PaddleOCRParser returns sections as tuple, different formats based on parse_method: - # - "raw": (text, position_tag) - # - "manual": (text, label, position_tag) - # - "paper": (text_with_tag, label) - text = section[0] - - # Parse position tag if exists - position_tag = "" - if len(section) > 1: - if len(section) == 2: # raw format: (text, tag) - position_tag = section[1] - elif len(section) == 3: # manual format: (text, label, tag) - position_tag = section[2] - elif "paper" in conf.get("paddleocr_parse_method", "") and len(section) == 2: - # paper format: text may contain tag - text_with_tag = text - import re - - tag_match = re.search(r"(@@[0-9-]+\t[0-9.\t]+##)", text_with_tag) - if tag_match: - position_tag = tag_match.group(1) - text = text_with_tag.replace(position_tag, "").strip() - - # Extract coordinate information from position tag - page_number, x0, x1, top, bottom = 1, 0, 0, 0, 0 - if position_tag: - import re - - tag_match = re.match(r"@@([0-9-]+)\t([0-9.]+)\t([0-9.]+)\t([0-9.]+)\t([0-9.]+)##", position_tag) - if tag_match: - pn, x0_str, x1_str, top_str, bottom_str = tag_match.groups() - page_number = int(pn.split("-")[0]) # Take first page number - x0, x1, top, bottom = float(x0_str), float(x1_str), float(top_str), float(bottom_str) + for t, poss in lines: + # Get cropped image and positions + cropped_image, positions = pdf_parser.crop(poss, need_position=True) box = { - "text": text, - "page_number": page_number, - "x0": 
x0, - "x1": x1, - "top": top, - "bottom": bottom, + "text": t, + "image": cropped_image, + "positions": positions, } bboxes.append(box) else: diff --git a/web/src/pages/user-setting/setting-model/modal/paddleocr-modal/index.tsx b/web/src/pages/user-setting/setting-model/modal/paddleocr-modal/index.tsx index 2df23c3de..5c4fcbfef 100644 --- a/web/src/pages/user-setting/setting-model/modal/paddleocr-modal/index.tsx +++ b/web/src/pages/user-setting/setting-model/modal/paddleocr-modal/index.tsx @@ -122,7 +122,7 @@ const PaddleOCRModal = ({ disabled={loading} className="btn btn-primary" > - {loading ? t('common.adding') : t('common.add')} + {t('common.add')}