From 2e09db02f3a9462379ed9b9a0663e0255f251644 Mon Sep 17 00:00:00 2001 From: Lin Manhui Date: Fri, 9 Jan 2026 17:48:45 +0800 Subject: [PATCH] feat: add paddleocr parser (#12513) ### What problem does this PR solve? Add PaddleOCR as a new PDF parser. ### Type of change - [x] New Feature (non-breaking change which adds functionality) --- api/apps/llm_app.py | 8 +- api/db/services/tenant_llm_service.py | 106 +++-- common/constants.py | 40 +- common/parser_config_utils.py | 3 + conf/llm_factories.json | 8 + deepdoc/parser/paddleocr_parser.py | 400 ++++++++++++++++++ rag/app/book.py | 90 ++-- rag/app/laws.py | 68 ++- rag/app/manual.py | 92 ++-- rag/app/naive.py | 283 ++++++------- rag/app/one.py | 55 +-- rag/app/presentation.py | 69 +-- rag/flow/parser/parser.py | 86 +++- rag/llm/ocr_model.py | 66 ++- web/src/assets/svg/llm/paddleocr.svg | 14 + .../layout-recognize-form-field.tsx | 4 + .../paddleocr-options-form-field.tsx | 95 +++++ web/src/components/svg-icon.tsx | 1 + web/src/constants/llm.ts | 3 + web/src/locales/de.ts | 26 +- web/src/locales/en.ts | 24 +- web/src/locales/es.ts | 25 ++ web/src/locales/fr.ts | 22 + web/src/locales/id.ts | 22 + web/src/locales/it.ts | 22 + web/src/locales/ja.ts | 26 +- web/src/locales/pt-br.ts | 22 + web/src/locales/ru.ts | 24 +- web/src/locales/vi.ts | 22 + web/src/locales/zh-traditional.ts | 22 + web/src/locales/zh.ts | 22 + .../user-setting/setting-model/hooks.tsx | 40 ++ .../user-setting/setting-model/index.tsx | 18 + .../modal/paddleocr-modal/index.tsx | 135 ++++++ 34 files changed, 1510 insertions(+), 453 deletions(-) create mode 100644 deepdoc/parser/paddleocr_parser.py create mode 100644 web/src/assets/svg/llm/paddleocr.svg create mode 100644 web/src/components/paddleocr-options-form-field.tsx create mode 100644 web/src/pages/user-setting/setting-model/modal/paddleocr-modal/index.tsx diff --git a/api/apps/llm_app.py b/api/apps/llm_app.py index 9a68e8256..3272a36ad 100644 --- a/api/apps/llm_app.py +++ b/api/apps/llm_app.py @@ -195,6 +195,9 @@ async def add_llm(): elif factory == "MinerU": api_key = apikey_json(["api_key", "provider_order"]) + elif factory == "PaddleOCR": + api_key = apikey_json(["api_key", "provider_order"]) + llm = { "tenant_id": current_user.id, "llm_factory": factory, @@ -230,8 +233,7 @@ async def add_llm(): **extra, ) try: - m, tc = await mdl.async_chat(None, [{"role": "user", "content": "Hello! How are you doing!"}], - {"temperature": 0.9}) + m, tc = await mdl.async_chat(None, [{"role": "user", "content": "Hello! How are you doing!"}], {"temperature": 0.9}) if not tc and m.find("**ERROR**:") >= 0: raise Exception(m) except Exception as e: @@ -381,7 +383,7 @@ def list_app(): facts = set([o.to_dict()["llm_factory"] for o in objs if o.api_key and o.status == StatusEnum.VALID.value]) status = {(o.llm_name + "@" + o.llm_factory) for o in objs if o.status == StatusEnum.VALID.value} llms = LLMService.get_all() - llms = [m.to_dict() for m in llms if m.status == StatusEnum.VALID.value and m.fid not in weighted and (m.fid == 'Builtin' or (m.llm_name + "@" + m.fid) in status)] + llms = [m.to_dict() for m in llms if m.status == StatusEnum.VALID.value and m.fid not in weighted and (m.fid == "Builtin" or (m.llm_name + "@" + m.fid) in status)] for m in llms: m["available"] = m["fid"] in facts or m["llm_name"].lower() == "flag-embedding" or m["fid"] in self_deployed if "tei-" in os.getenv("COMPOSE_PROFILES", "") and m["model_type"] == LLMType.EMBEDDING and m["fid"] == "Builtin" and m["llm_name"] == os.getenv("TEI_MODEL", ""): diff --git a/api/db/services/tenant_llm_service.py b/api/db/services/tenant_llm_service.py index 65771f60f..43f9107b2 100644 --- a/api/db/services/tenant_llm_service.py +++ b/api/db/services/tenant_llm_service.py @@ -19,7 +19,7 @@ import logging from peewee import IntegrityError from langfuse import Langfuse from common import settings -from common.constants import MINERU_DEFAULT_CONFIG, MINERU_ENV_KEYS, LLMType +from common.constants import MINERU_DEFAULT_CONFIG, MINERU_ENV_KEYS, PADDLEOCR_DEFAULT_CONFIG, PADDLEOCR_ENV_KEYS, LLMType from api.db.db_models import DB, LLMFactories, TenantLLM from api.db.services.common_service import CommonService from api.db.services.langfuse_service import TenantLangfuseService @@ -60,10 +60,8 @@ class TenantLLMService(CommonService): @classmethod @DB.connection_context() def get_my_llms(cls, tenant_id): - fields = [cls.model.llm_factory, LLMFactories.logo, LLMFactories.tags, cls.model.model_type, cls.model.llm_name, - cls.model.used_tokens, cls.model.status] - objs = cls.model.select(*fields).join(LLMFactories, on=(cls.model.llm_factory == LLMFactories.name)).where( - cls.model.tenant_id == tenant_id, ~cls.model.api_key.is_null()).dicts() + fields = [cls.model.llm_factory, LLMFactories.logo, LLMFactories.tags, cls.model.model_type, cls.model.llm_name, cls.model.used_tokens, cls.model.status] + objs = cls.model.select(*fields).join(LLMFactories, on=(cls.model.llm_factory == LLMFactories.name)).where(cls.model.tenant_id == tenant_id, ~cls.model.api_key.is_null()).dicts() return list(objs) @@ -90,6 +88,7 @@ class TenantLLMService(CommonService): @DB.connection_context() def get_model_config(cls, tenant_id, llm_type, llm_name=None): from api.db.services.llm_service import LLMService + e, tenant = TenantService.get_by_id(tenant_id) if not e: raise LookupError("Tenant not found") @@ -119,9 +118,9 @@ class TenantLLMService(CommonService): model_config = cls.get_api_key(tenant_id, mdlnm) if model_config: model_config = model_config.to_dict() - elif llm_type == LLMType.EMBEDDING and fid == 'Builtin' and "tei-" in os.getenv("COMPOSE_PROFILES", "") and mdlnm == os.getenv('TEI_MODEL', ''): + elif llm_type == LLMType.EMBEDDING and fid == "Builtin" and "tei-" in os.getenv("COMPOSE_PROFILES", "") and mdlnm == os.getenv("TEI_MODEL", ""): embedding_cfg = settings.EMBEDDING_CFG - model_config = {"llm_factory": 'Builtin', "api_key": embedding_cfg["api_key"], "llm_name": mdlnm, "api_base": embedding_cfg["base_url"]} + model_config = {"llm_factory": "Builtin", "api_key": embedding_cfg["api_key"], "llm_name": mdlnm, "api_base": embedding_cfg["base_url"]} else: raise LookupError(f"Model({mdlnm}@{fid}) not authorized") @@ -140,33 +139,27 @@ class TenantLLMService(CommonService): if llm_type == LLMType.EMBEDDING.value: if model_config["llm_factory"] not in EmbeddingModel: return None - return EmbeddingModel[model_config["llm_factory"]](model_config["api_key"], model_config["llm_name"], - base_url=model_config["api_base"]) + return EmbeddingModel[model_config["llm_factory"]](model_config["api_key"], model_config["llm_name"], base_url=model_config["api_base"]) elif llm_type == LLMType.RERANK: if model_config["llm_factory"] not in RerankModel: return None - return RerankModel[model_config["llm_factory"]](model_config["api_key"], model_config["llm_name"], - base_url=model_config["api_base"]) + return RerankModel[model_config["llm_factory"]](model_config["api_key"], model_config["llm_name"], base_url=model_config["api_base"]) elif llm_type == LLMType.IMAGE2TEXT.value: if model_config["llm_factory"] not in CvModel: return None - return CvModel[model_config["llm_factory"]](model_config["api_key"], model_config["llm_name"], lang, - base_url=model_config["api_base"], **kwargs) + return CvModel[model_config["llm_factory"]](model_config["api_key"], model_config["llm_name"], lang, base_url=model_config["api_base"], **kwargs) elif llm_type == LLMType.CHAT.value: if model_config["llm_factory"] not in ChatModel: return None - return ChatModel[model_config["llm_factory"]](model_config["api_key"], model_config["llm_name"], - base_url=model_config["api_base"], **kwargs) + return ChatModel[model_config["llm_factory"]](model_config["api_key"], model_config["llm_name"], base_url=model_config["api_base"], **kwargs) elif llm_type == LLMType.SPEECH2TEXT: if model_config["llm_factory"] not in Seq2txtModel: return None - return Seq2txtModel[model_config["llm_factory"]](key=model_config["api_key"], - model_name=model_config["llm_name"], lang=lang, - base_url=model_config["api_base"]) + return Seq2txtModel[model_config["llm_factory"]](key=model_config["api_key"], model_name=model_config["llm_name"], lang=lang, base_url=model_config["api_base"]) elif llm_type == LLMType.TTS: if model_config["llm_factory"] not in TTSModel: return None @@ -216,14 +209,11 @@ class TenantLLMService(CommonService): try: num = ( cls.model.update(used_tokens=cls.model.used_tokens + used_tokens) - .where(cls.model.tenant_id == tenant_id, cls.model.llm_name == llm_name, - cls.model.llm_factory == llm_factory if llm_factory else True) + .where(cls.model.tenant_id == tenant_id, cls.model.llm_name == llm_name, cls.model.llm_factory == llm_factory if llm_factory else True) .execute() ) except Exception: - logging.exception( - "TenantLLMService.increase_usage got exception,Failed to update used_tokens for tenant_id=%s, llm_name=%s", - tenant_id, llm_name) + logging.exception("TenantLLMService.increase_usage got exception,Failed to update used_tokens for tenant_id=%s, llm_name=%s", tenant_id, llm_name) return 0 return num @@ -231,9 +221,7 @@ class TenantLLMService(CommonService): @classmethod @DB.connection_context() def get_openai_models(cls): - objs = cls.model.select().where((cls.model.llm_factory == "OpenAI"), - ~(cls.model.llm_name == "text-embedding-3-small"), - ~(cls.model.llm_name == "text-embedding-3-large")).dicts() + objs = cls.model.select().where((cls.model.llm_factory == "OpenAI"), ~(cls.model.llm_name == "text-embedding-3-small"), ~(cls.model.llm_name == "text-embedding-3-large")).dicts() return list(objs) @classmethod @@ -298,6 +286,68 @@ class TenantLLMService(CommonService): idx += 1 continue + @classmethod + def _collect_paddleocr_env_config(cls) -> dict | None: + cfg = PADDLEOCR_DEFAULT_CONFIG + found = False + for key in PADDLEOCR_ENV_KEYS: + val = os.environ.get(key) + if val: + found = True + cfg[key] = val + return cfg if found else None + + @classmethod + @DB.connection_context() + def ensure_paddleocr_from_env(cls, tenant_id: str) -> str | None: + """ + Ensure a PaddleOCR model exists for the tenant if env variables are present. + Return the existing or newly created llm_name, or None if env not set. + """ + cfg = cls._collect_paddleocr_env_config() + if not cfg: + return None + + saved_paddleocr_models = cls.query(tenant_id=tenant_id, llm_factory="PaddleOCR", model_type=LLMType.OCR.value) + + def _parse_api_key(raw: str) -> dict: + try: + return json.loads(raw or "{}") + except Exception: + return {} + + for item in saved_paddleocr_models: + api_cfg = _parse_api_key(item.api_key) + normalized = {k: api_cfg.get(k, PADDLEOCR_DEFAULT_CONFIG.get(k)) for k in PADDLEOCR_ENV_KEYS} + if normalized == cfg: + return item.llm_name + + used_names = {item.llm_name for item in saved_paddleocr_models} + idx = 1 + base_name = "paddleocr-from-env" + while True: + candidate = f"{base_name}-{idx}" + if candidate in used_names: + idx += 1 + continue + + try: + cls.save( + tenant_id=tenant_id, + llm_factory="PaddleOCR", + llm_name=candidate, + model_type=LLMType.OCR.value, + api_key=json.dumps(cfg), + api_base="", + max_tokens=0, + ) + return candidate + except IntegrityError: + logging.warning("PaddleOCR env model %s already exists for tenant %s, retry with next name", candidate, tenant_id) + used_names.add(candidate) + idx += 1 + continue + @classmethod @DB.connection_context() def delete_by_tenant_id(cls, tenant_id): @@ -306,6 +356,7 @@ class TenantLLMService(CommonService): @staticmethod def llm_id2llm_type(llm_id: str) -> str | None: from api.db.services.llm_service import LLMService + llm_id, *_ = TenantLLMService.split_model_name_and_factory(llm_id) llm_factories = settings.FACTORY_LLM_INFOS for llm_factory in llm_factories: @@ -340,8 +391,7 @@ class LLM4Tenant: langfuse_keys = TenantLangfuseService.filter_by_tenant(tenant_id=tenant_id) self.langfuse = None if langfuse_keys: - langfuse = Langfuse(public_key=langfuse_keys.public_key, secret_key=langfuse_keys.secret_key, - host=langfuse_keys.host) + langfuse = Langfuse(public_key=langfuse_keys.public_key, secret_key=langfuse_keys.secret_key, host=langfuse_keys.host) if langfuse.auth_check(): self.langfuse = langfuse trace_id = self.langfuse.create_trace_id() diff --git a/common/constants.py b/common/constants.py index 4aea764b2..de228bbe8 100644 --- a/common/constants.py +++ b/common/constants.py @@ -20,6 +20,7 @@ from strenum import StrEnum SERVICE_CONF = "service_conf.yaml" RAG_FLOW_SERVICE_NAME = "ragflow" + class CustomEnum(Enum): @classmethod def valid(cls, value): @@ -68,13 +69,13 @@ class ActiveEnum(Enum): class LLMType(StrEnum): - CHAT = 'chat' - EMBEDDING = 'embedding' - SPEECH2TEXT = 'speech2text' - IMAGE2TEXT = 'image2text' - RERANK = 'rerank' - TTS = 'tts' - OCR = 'ocr' + CHAT = "chat" + EMBEDDING = "embedding" + SPEECH2TEXT = "speech2text" + IMAGE2TEXT = "image2text" + RERANK = "rerank" + TTS = "tts" + OCR = "ocr" class TaskStatus(StrEnum): @@ -86,8 +87,7 @@ class TaskStatus(StrEnum): SCHEDULE = "5" -VALID_TASK_STATUS = {TaskStatus.UNSTART, TaskStatus.RUNNING, TaskStatus.CANCEL, TaskStatus.DONE, TaskStatus.FAIL, - TaskStatus.SCHEDULE} +VALID_TASK_STATUS = {TaskStatus.UNSTART, TaskStatus.RUNNING, TaskStatus.CANCEL, TaskStatus.DONE, TaskStatus.FAIL, TaskStatus.SCHEDULE} class ParserType(StrEnum): @@ -136,6 +136,7 @@ class FileSource(StrEnum): BITBUCKET = "bitbucket" ZENDESK = "zendesk" + class PipelineTaskType(StrEnum): PARSE = "Parse" DOWNLOAD = "Download" @@ -145,15 +146,17 @@ class PipelineTaskType(StrEnum): MEMORY = "Memory" -VALID_PIPELINE_TASK_TYPES = {PipelineTaskType.PARSE, PipelineTaskType.DOWNLOAD, PipelineTaskType.RAPTOR, - PipelineTaskType.GRAPH_RAG, PipelineTaskType.MINDMAP} +VALID_PIPELINE_TASK_TYPES = {PipelineTaskType.PARSE, PipelineTaskType.DOWNLOAD, PipelineTaskType.RAPTOR, PipelineTaskType.GRAPH_RAG, PipelineTaskType.MINDMAP} + class MCPServerType(StrEnum): SSE = "sse" STREAMABLE_HTTP = "streamable-http" + VALID_MCP_SERVER_TYPES = {MCPServerType.SSE, MCPServerType.STREAMABLE_HTTP} + class Storage(Enum): MINIO = 1 AZURE_SPN = 2 @@ -165,10 +168,10 @@ class Storage(Enum): class MemoryType(Enum): - RAW = 0b0001 # 1 << 0 = 1 (0b00000001) - SEMANTIC = 0b0010 # 1 << 1 = 2 (0b00000010) - EPISODIC = 0b0100 # 1 << 2 = 4 (0b00000100) - PROCEDURAL = 0b1000 # 1 << 3 = 8 (0b00001000) + RAW = 0b0001 # 1 << 0 = 1 (0b00000001) + SEMANTIC = 0b0010 # 1 << 1 = 2 (0b00000010) + EPISODIC = 0b0100 # 1 << 2 = 4 (0b00000100) + PROCEDURAL = 0b1000 # 1 << 3 = 8 (0b00001000) class MemoryStorageType(StrEnum): @@ -239,3 +242,10 @@ MINERU_DEFAULT_CONFIG = { "MINERU_SERVER_URL": "", "MINERU_DELETE_OUTPUT": 1, } + +PADDLEOCR_ENV_KEYS = ["PADDLEOCR_API_URL", "PADDLEOCR_ACCESS_TOKEN", "PADDLEOCR_ALGORITHM"] +PADDLEOCR_DEFAULT_CONFIG = { + "PADDLEOCR_API_URL": "", + "PADDLEOCR_ACCESS_TOKEN": None, + "PADDLEOCR_ALGORITHM": "PaddleOCR-VL", +} diff --git a/common/parser_config_utils.py b/common/parser_config_utils.py index 0a79f3ad1..0bc7ffc28 100644 --- a/common/parser_config_utils.py +++ b/common/parser_config_utils.py @@ -26,5 +26,8 @@ def normalize_layout_recognizer(layout_recognizer_raw: Any) -> tuple[Any, str | if lowered.endswith("@mineru"): parser_model_name = layout_recognizer_raw.rsplit("@", 1)[0] layout_recognizer = "MinerU" + elif lowered.endswith("@paddleocr"): + parser_model_name = layout_recognizer_raw.rsplit("@", 1)[0] + layout_recognizer = "PaddleOCR" return layout_recognizer, parser_model_name diff --git a/conf/llm_factories.json b/conf/llm_factories.json index 451c8f452..b128f4e67 100644 --- a/conf/llm_factories.json +++ b/conf/llm_factories.json @@ -5531,6 +5531,14 @@ "status": "1", "rank": "900", "llm": [] + }, + { + "name": "PaddleOCR", + "logo": "", + "tags": "OCR", + "status": "1", + "rank": "910", + "llm": [] } ] } diff --git a/deepdoc/parser/paddleocr_parser.py b/deepdoc/parser/paddleocr_parser.py new file mode 100644 index 000000000..fca69da79 --- /dev/null +++ b/deepdoc/parser/paddleocr_parser.py @@ -0,0 +1,400 @@ +# Copyright 2026 The InfiniFlow Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +from __future__ import annotations + +import base64 +import logging +import os +import re +from dataclasses import asdict, dataclass, field, fields +from io import BytesIO +from os import PathLike +from pathlib import Path +from typing import Any, Callable, ClassVar, Literal, Optional, Union, Tuple, List + +import requests + +try: + from deepdoc.parser.pdf_parser import RAGFlowPdfParser +except Exception: + + class RAGFlowPdfParser: + pass + + +AlgorithmType = Literal["PaddleOCR-VL"] +SectionTuple = tuple[str, ...] +TableTuple = tuple[str, ...] +ParseResult = tuple[list[SectionTuple], list[TableTuple]] + + +_MARKDOWN_IMAGE_PATTERN = re.compile( + r""" + ]*>\s* + ]*/>\s* + + | + ]*/> + """, + re.IGNORECASE | re.VERBOSE | re.DOTALL, +) + + +def _remove_images_from_markdown(markdown: str) -> str: + return _MARKDOWN_IMAGE_PATTERN.sub("", markdown) + + +@dataclass +class PaddleOCRVLConfig: + """Configuration for PaddleOCR-VL algorithm.""" + + use_doc_orientation_classify: Optional[bool] = None + use_doc_unwarping: Optional[bool] = None + use_layout_detection: Optional[bool] = None + use_polygon_points: Optional[bool] = None + use_chart_recognition: Optional[bool] = None + use_seal_recognition: Optional[bool] = None + use_ocr_for_image_block: Optional[bool] = None + layout_threshold: Optional[Union[float, dict]] = None + layout_nms: Optional[bool] = None + layout_unclip_ratio: Optional[Union[float, Tuple[float, float], dict]] = None + layout_merge_bboxes_mode: Optional[Union[str, dict]] = None + prompt_label: Optional[str] = None + format_block_content: Optional[bool] = True + repetition_penalty: Optional[float] = None + temperature: Optional[float] = None + top_p: Optional[float] = None + min_pixels: Optional[int] = None + max_pixels: Optional[int] = None + max_new_tokens: Optional[int] = None + merge_layout_blocks: Optional[bool] = None + markdown_ignore_labels: Optional[List[str]] = None + vlm_extra_args: Optional[dict] = None + + +@dataclass +class PaddleOCRConfig: + """Main configuration for PaddleOCR parser.""" + + api_url: str = "" + access_token: Optional[str] = None + algorithm: AlgorithmType = "PaddleOCR-VL" + request_timeout: int = 600 + prettify_markdown: bool = True + show_formula_number: bool = True + visualize: bool = False + additional_params: dict[str, Any] = field(default_factory=dict) + algorithm_config: dict[str, Any] = field(default_factory=dict) + + @classmethod + def from_dict(cls, config: Optional[dict[str, Any]]) -> "PaddleOCRConfig": + """Create configuration from dictionary.""" + if not config: + return cls() + + cfg = config.copy() + algorithm = cfg.get("algorithm", "PaddleOCR-VL") + + # Validate algorithm + if algorithm not in ("PaddleOCR-VL",): + raise ValueError(f"Unsupported algorithm: {algorithm}") + + # Extract algorithm-specific configuration + algorithm_config: dict[str, Any] = {} + if algorithm == "PaddleOCR-VL": + # Create default PaddleOCRVLConfig object and convert to dict + algorithm_config = asdict(PaddleOCRVLConfig()) + + # Apply user-provided VL config + vl_config = cfg.get("vl") + if isinstance(vl_config, dict): + algorithm_config.update({k: v for k, v in vl_config.items() if v is not None}) + + # Remove processed keys + cfg.pop("vl", None) + + # Prepare initialization arguments + field_names = {field.name for field in fields(cls)} + init_kwargs: dict[str, Any] = {} + + for field_name in field_names: + if field_name in cfg: + init_kwargs[field_name] = cfg[field_name] + + init_kwargs["algorithm_config"] = algorithm_config + + return cls(**init_kwargs) + + @classmethod + def from_kwargs(cls, **kwargs: Any) -> "PaddleOCRConfig": + """Create configuration from keyword arguments.""" + return cls.from_dict(kwargs) + + +class PaddleOCRParser(RAGFlowPdfParser): + """Parser for PDF documents using PaddleOCR API.""" + + _COMMON_FIELD_MAPPING: ClassVar[dict[str, str]] = { + "prettify_markdown": "prettifyMarkdown", + "show_formula_number": "showFormulaNumber", + "visualize": "visualize", + } + + _ALGORITHM_FIELD_MAPPINGS: ClassVar[dict[str, dict[str, str]]] = { + "PaddleOCR-VL": { + "use_doc_orientation_classify": "useDocOrientationClassify", + "use_doc_unwarping": "useDocUnwarping", + "use_layout_detection": "useLayoutDetection", + "use_polygon_points": "usePolygonPoints", + "use_chart_recognition": "useChartRecognition", + "use_seal_recognition": "useSealRecognition", + "use_ocr_for_image_block": "useOcrForImageBlock", + "layout_threshold": "layoutThreshold", + "layout_nms": "layoutNms", + "layout_unclip_ratio": "layoutUnclipRatio", + "layout_merge_bboxes_mode": "layoutMergeBboxesMode", + "prompt_label": "promptLabel", + "format_block_content": "formatBlockContent", + "repetition_penalty": "repetitionPenalty", + "temperature": "temperature", + "top_p": "topP", + "min_pixels": "minPixels", + "max_pixels": "maxPixels", + "max_new_tokens": "maxNewTokens", + "merge_layout_blocks": "mergeLayoutBlocks", + "markdown_ignore_labels": "markdownIgnoreLabels", + "vlm_extra_args": "vlmExtraArgs", + }, + } + + def __init__( + self, + api_url: Optional[str] = None, + access_token: Optional[str] = None, + algorithm: AlgorithmType = "PaddleOCR-VL", + *, + request_timeout: int = 600, + ): + """Initialize PaddleOCR parser.""" + self.api_url = api_url.rstrip("/") if api_url else os.getenv("PADDLEOCR_API_URL", "") + self.access_token = access_token or os.getenv("PADDLEOCR_ACCESS_TOKEN") + self.algorithm = algorithm + self.request_timeout = request_timeout + self.logger = logging.getLogger(self.__class__.__name__) + + # Force PDF file type + self.file_type = 0 + + # Public methods + def check_installation(self) -> tuple[bool, str]: + """Check if the parser is properly installed and configured.""" + if not self.api_url: + return False, "[PaddleOCR] API URL not configured" + + # TODO [@Bobholamovic]: Check URL availability and token validity + + return True, "" + + def parse_pdf( + self, + filepath: str | PathLike[str], + binary: BytesIO | bytes | None = None, + callback: Optional[Callable[[float, str], None]] = None, + *, + parse_method: str = "raw", + api_url: Optional[str] = None, + access_token: Optional[str] = None, + algorithm: Optional[AlgorithmType] = None, + request_timeout: Optional[int] = None, + prettify_markdown: Optional[bool] = None, + show_formula_number: Optional[bool] = None, + visualize: Optional[bool] = None, + additional_params: Optional[dict[str, Any]] = None, + vl_config: Optional[dict[str, Any]] = None, + **kwargs: Any, + ) -> ParseResult: + """Parse PDF document using PaddleOCR API.""" + # Create configuration - pass all kwargs to capture VL config parameters + config_dict = { + "api_url": api_url if api_url is not None else self.api_url, + "access_token": access_token if access_token is not None else self.access_token, + "algorithm": algorithm if algorithm is not None else self.algorithm, + "request_timeout": request_timeout if request_timeout is not None else self.request_timeout, + } + if prettify_markdown is not None: + config_dict["prettify_markdown"] = prettify_markdown + if show_formula_number is not None: + config_dict["show_formula_number"] = show_formula_number + if visualize is not None: + config_dict["visualize"] = visualize + if additional_params is not None: + config_dict["additional_params"] = additional_params + if vl_config is not None: + config_dict["vl"] = vl_config + + # Add any VL config parameters from kwargs + for key, value in kwargs.items(): + if key in {field.name for field in fields(PaddleOCRVLConfig)}: + config_dict[key] = value + + cfg = PaddleOCRConfig.from_dict(config_dict) + + if not cfg.api_url: + raise RuntimeError("[PaddleOCR] API URL missing") + + # Prepare file data + data_bytes = self._prepare_file_data(filepath, binary) + + # Build and send request + result = self._send_request(data_bytes, cfg, callback) + + # Process response + sections = self._transfer_to_sections(result, algorithm=cfg.algorithm, parse_method=parse_method) + if callback: + callback(0.9, f"[PaddleOCR] done, sections: {len(sections)}") + + tables = self._transfer_to_tables(result) + if callback: + callback(1.0, f"[PaddleOCR] done, tables: {len(tables)}") + + return sections, tables + + def _prepare_file_data(self, filepath: str | PathLike[str], binary: BytesIO | bytes | None) -> bytes: + """Prepare file data for API request.""" + source_path = Path(filepath) + + if binary is not None: + if isinstance(binary, (bytes, bytearray)): + return binary + return binary.getbuffer().tobytes() + + if not source_path.exists(): + raise FileNotFoundError(f"[PaddleOCR] file not found: {source_path}") + + return source_path.read_bytes() + + def _build_payload(self, data: bytes, file_type: int, config: PaddleOCRConfig) -> dict[str, Any]: + """Build payload for API request.""" + payload: dict[str, Any] = { + "file": base64.b64encode(data).decode("ascii"), + "fileType": file_type, + } + + # Add common parameters + for param_key, param_value in [ + ("prettify_markdown", config.prettify_markdown), + ("show_formula_number", config.show_formula_number), + ("visualize", config.visualize), + ]: + if param_value is not None: + api_param = self._COMMON_FIELD_MAPPING[param_key] + payload[api_param] = param_value + + # Add algorithm-specific parameters + algorithm_mapping = self._ALGORITHM_FIELD_MAPPINGS.get(config.algorithm, {}) + for param_key, param_value in config.algorithm_config.items(): + if param_value is not None and param_key in algorithm_mapping: + api_param = algorithm_mapping[param_key] + payload[api_param] = param_value + + # Add any additional parameters + if config.additional_params: + payload.update(config.additional_params) + + return payload + + def _send_request(self, data: bytes, config: PaddleOCRConfig, callback: Optional[Callable[[float, str], None]]) -> dict[str, Any]: + """Send request to PaddleOCR API and parse response.""" + # Build payload + payload = self._build_payload(data, self.file_type, config) + + # Prepare headers + headers = {"Content-Type": "application/json", "Client-Platform": "ragflow"} + if config.access_token: + headers["Authorization"] = f"token {config.access_token}" + + self.logger.info("[PaddleOCR] invoking API") + if callback: + callback(0.1, "[PaddleOCR] submitting request") + + # Send request + try: + resp = requests.post(config.api_url, json=payload, headers=headers, timeout=self.request_timeout) + resp.raise_for_status() + except Exception as exc: + if callback: + callback(-1, f"[PaddleOCR] request failed: {exc}") + raise RuntimeError(f"[PaddleOCR] request failed: {exc}") + + # Parse response + try: + response_data = resp.json() + except Exception as exc: + raise RuntimeError(f"[PaddleOCR] response is not JSON: {exc}") from exc + + if callback: + callback(0.8, "[PaddleOCR] response received") + + # Validate response format + if response_data.get("errorCode") != 0 or not isinstance(response_data.get("result"), dict): + if callback: + callback(-1, "[PaddleOCR] invalid response format") + raise RuntimeError("[PaddleOCR] invalid response format") + + return response_data["result"] + + def _transfer_to_sections(self, result: dict[str, Any], algorithm: AlgorithmType, parse_method: str) -> list[SectionTuple]: + """Convert API response to section tuples.""" + sections: list[SectionTuple] = [] + + if algorithm == "PaddleOCR-VL": + layout_parsing_results = result.get("layoutParsingResults", []) + + for page_idx, layout_result in enumerate(layout_parsing_results): + pruned_result = layout_result.get("prunedResult", {}) + parsing_res_list = pruned_result.get("parsing_res_list", []) + + for block in parsing_res_list: + block_content = block.get("block_content", "").strip() + if not block_content: + continue + + # Remove images + block_content = _remove_images_from_markdown(block_content) + + label = block.get("block_label", "") + block_bbox = block.get("block_bbox", [0, 0, 0, 0]) + + tag = f"@@{page_idx + 1}\t{block_bbox[0]}\t{block_bbox[2]}\t{block_bbox[1]}\t{block_bbox[3]}##" + + if parse_method == "manual": + sections.append((block_content, label, tag)) + elif parse_method == "paper": + sections.append((block_content + tag, label)) + else: + sections.append((block_content, tag)) + + return sections + + def _transfer_to_tables(self, result: dict[str, Any]) -> list[TableTuple]: + """Convert API response to table tuples.""" + return [] + + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO) + parser = PaddleOCRParser(api_url=os.getenv("PADDLEOCR_API_URL", ""), algorithm=os.getenv("PADDLEOCR_ALGORITHM", "PaddleOCR-VL")) + ok, reason = parser.check_installation() + print("PaddleOCR available:", ok, reason) diff --git a/rag/app/book.py b/rag/app/book.py index 86763adf2..d3c45b425 100644 --- a/rag/app/book.py +++ b/rag/app/book.py @@ -22,9 +22,7 @@ from deepdoc.parser.utils import get_text from rag.app import naive from rag.app.naive import by_plaintext, PARSERS from common.parser_config_utils import normalize_layout_recognizer -from rag.nlp import bullets_category, is_english, remove_contents_table, \ - hierarchical_merge, make_colon_as_title, naive_merge, random_choices, tokenize_table, \ - tokenize_chunks, attach_media_context +from rag.nlp import bullets_category, is_english, remove_contents_table, hierarchical_merge, make_colon_as_title, naive_merge, random_choices, tokenize_table, tokenize_chunks, attach_media_context from rag.nlp import rag_tokenizer from deepdoc.parser import PdfParser, HtmlParser from deepdoc.parser.figure_parser import vision_figure_parser_docx_wrapper @@ -32,17 +30,12 @@ from PIL import Image class Pdf(PdfParser): - def __call__(self, filename, binary=None, from_page=0, - to_page=100000, zoomin=3, callback=None): + def __call__(self, filename, binary=None, from_page=0, to_page=100000, zoomin=3, callback=None): from timeit import default_timer as timer + start = timer() callback(msg="OCR started") - self.__images__( - filename if not binary else binary, - zoomin, - from_page, - to_page, - callback) + self.__images__(filename if not binary else binary, zoomin, from_page, to_page, callback) callback(msg="OCR finished ({:.2f}s)".format(timer() - start)) start = timer() @@ -62,24 +55,17 @@ class Pdf(PdfParser): self._merge_with_same_bullet() callback(0.8, "Text extraction ({:.2f}s)".format(timer() - start)) - return [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", "")) - for b in self.boxes], tbls + return [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", "")) for b in self.boxes], tbls -def chunk(filename, binary=None, from_page=0, to_page=100000, - lang="Chinese", callback=None, **kwargs): +def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs): """ - Supported file formats are docx, pdf, txt. - Since a book is long and not all the parts are useful, if it's a PDF, - please set up the page ranges for every book in order eliminate negative effects and save elapsed computing time. + Supported file formats are docx, pdf, txt. + Since a book is long and not all the parts are useful, if it's a PDF, + please set up the page ranges for every book in order eliminate negative effects and save elapsed computing time. """ - parser_config = kwargs.get( - "parser_config", { - "chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC"}) - doc = { - "docnm_kwd": filename, - "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename)) - } + parser_config = kwargs.get("parser_config", {"chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC"}) + doc = {"docnm_kwd": filename, "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))} doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"]) pdf_parser = None sections, tbls = [], [] @@ -87,28 +73,23 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback(0.1, "Start to parse.") doc_parser = naive.Docx() # TODO: table of contents need to be removed - main_sections = doc_parser( - filename, binary=binary, from_page=from_page, to_page=to_page) - + main_sections = doc_parser(filename, binary=binary, from_page=from_page, to_page=to_page) + sections = [] tbls = [] for text, image, html in main_sections: sections.append((text, image)) tbls.append(((None, html), "")) - - remove_contents_table(sections, eng=is_english( - random_choices([t for t, _ in sections], k=200))) + + remove_contents_table(sections, eng=is_english(random_choices([t for t, _ in sections], k=200))) tbls = vision_figure_parser_docx_wrapper(sections=sections, tbls=tbls, callback=callback, **kwargs) # tbls = [((None, lns), None) for lns in tbls] - sections = [(item[0], item[1] if item[1] is not None else "") for item in sections if - not isinstance(item[1], Image.Image)] + sections = [(item[0], item[1] if item[1] is not None else "") for item in sections if not isinstance(item[1], Image.Image)] callback(0.8, "Finish parsing.") elif re.search(r"\.pdf$", filename, re.IGNORECASE): - layout_recognizer, parser_model_name = normalize_layout_recognizer( - parser_config.get("layout_recognize", "DeepDOC") - ) + layout_recognizer, parser_model_name = normalize_layout_recognizer(parser_config.get("layout_recognize", "DeepDOC")) if isinstance(layout_recognizer, bool): layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text" @@ -127,13 +108,14 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, pdf_cls=Pdf, layout_recognizer=layout_recognizer, mineru_llm_name=parser_model_name, - **kwargs + paddleocr_llm_name=parser_model_name, + **kwargs, ) if not sections and not tables: return [] - if name in ["tcadp", "docling", "mineru"]: + if name in ["tcadp", "docling", "mineru", "paddleocr"]: parser_config["chunk_token_num"] = 0 callback(0.8, "Finish parsing.") @@ -142,16 +124,14 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, txt = get_text(filename, binary) sections = txt.split("\n") sections = [(line, "") for line in sections if line] - remove_contents_table(sections, eng=is_english( - random_choices([t for t, _ in sections], k=200))) + remove_contents_table(sections, eng=is_english(random_choices([t for t, _ in sections], k=200))) callback(0.8, "Finish parsing.") elif re.search(r"\.(htm|html)$", filename, re.IGNORECASE): callback(0.1, "Start to parse.") sections = HtmlParser()(filename, binary) sections = [(line, "") for line in sections if line] - remove_contents_table(sections, eng=is_english( - random_choices([t for t, _ in sections], k=200))) + remove_contents_table(sections, eng=is_english(random_choices([t for t, _ in sections], k=200))) callback(0.8, "Finish parsing.") elif re.search(r"\.doc$", filename, re.IGNORECASE): @@ -165,31 +145,23 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, binary = BytesIO(binary) doc_parsed = tika_parser.from_buffer(binary) - if doc_parsed.get('content', None) is not None: - sections = doc_parsed['content'].split('\n') + if doc_parsed.get("content", None) is not None: + sections = doc_parsed["content"].split("\n") sections = [(line, "") for line in sections if line] - remove_contents_table(sections, eng=is_english( - random_choices([t for t, _ in sections], k=200))) + remove_contents_table(sections, eng=is_english(random_choices([t for t, _ in sections], k=200))) callback(0.8, "Finish parsing.") else: - raise NotImplementedError( - "file type not supported yet(doc, docx, pdf, txt supported)") + raise NotImplementedError("file type not supported yet(doc, docx, pdf, txt supported)") make_colon_as_title(sections) - bull = bullets_category( - [t for t in random_choices([t for t, _ in sections], k=100)]) + bull = bullets_category([t for t in random_choices([t for t, _ in sections], k=100)]) if bull >= 0: - chunks = ["\n".join(ck) - for ck in hierarchical_merge(bull, sections, 5)] + chunks = ["\n".join(ck) for ck in hierarchical_merge(bull, sections, 5)] else: sections = [s.split("@") for s, _ in sections] - sections = [(pr[0], "@" + pr[1]) if len(pr) == 2 else (pr[0], '') for pr in sections] - chunks = naive_merge( - sections, - parser_config.get("chunk_token_num", 256), - parser_config.get("delimiter", "\n。;!?") - ) + sections = [(pr[0], "@" + pr[1]) if len(pr) == 2 else (pr[0], "") for pr in sections] + chunks = naive_merge(sections, parser_config.get("chunk_token_num", 256), parser_config.get("delimiter", "\n。;!?")) # is it English # is_english(random_choices([t for t, _ in sections], k=218)) @@ -208,9 +180,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, if __name__ == "__main__": import sys - def dummy(prog=None, msg=""): pass - chunk(sys.argv[1], from_page=1, to_page=10, callback=dummy) diff --git a/rag/app/laws.py b/rag/app/laws.py index 15c43e368..eb26c154d 100644 --- a/rag/app/laws.py +++ b/rag/app/laws.py @@ -21,8 +21,7 @@ from docx import Document from common.constants import ParserType from deepdoc.parser.utils import get_text -from rag.nlp import bullets_category, remove_contents_table, \ - make_colon_as_title, tokenize_chunks, docx_question_level, tree_merge +from rag.nlp import bullets_category, remove_contents_table, make_colon_as_title, tokenize_chunks, docx_question_level, tree_merge from rag.nlp import rag_tokenizer, Node from deepdoc.parser import PdfParser, DocxParser, HtmlParser from rag.app.naive import by_plaintext, PARSERS @@ -38,8 +37,7 @@ class Docx(DocxParser): return line def old_call(self, filename, binary=None, from_page=0, to_page=100000): - self.doc = Document( - filename) if not binary else Document(BytesIO(binary)) + self.doc = Document(filename) if not binary else Document(BytesIO(binary)) pn = 0 lines = [] for p in self.doc.paragraphs: @@ -48,16 +46,15 @@ class Docx(DocxParser): if from_page <= pn < to_page and p.text.strip(): lines.append(self.__clean(p.text)) for run in p.runs: - if 'lastRenderedPageBreak' in run._element.xml: + if "lastRenderedPageBreak" in run._element.xml: pn += 1 continue - if 'w:br' in run._element.xml and 'type="page"' in run._element.xml: + if "w:br" in run._element.xml and 'type="page"' in run._element.xml: pn += 1 return [line for line in lines if line] def __call__(self, filename, binary=None, from_page=0, to_page=100000): - self.doc = Document( - filename) if not binary else Document(BytesIO(binary)) + self.doc = Document(filename) if not binary else Document(BytesIO(binary)) pn = 0 lines = [] level_set = set() @@ -71,10 +68,10 @@ class Docx(DocxParser): lines.append((question_level, p_text)) level_set.add(question_level) for run in p.runs: - if 'lastRenderedPageBreak' in run._element.xml: + if "lastRenderedPageBreak" in run._element.xml: pn += 1 continue - if 'w:br' in run._element.xml and 'type="page"' in run._element.xml: + if "w:br" in run._element.xml and 'type="page"' in run._element.xml: pn += 1 sorted_levels = sorted(level_set) @@ -88,12 +85,12 @@ class Docx(DocxParser): return [element for element in root.get_tree() if element] def __str__(self) -> str: - return f''' + return f""" question:{self.question}, answer:{self.answer}, level:{self.level}, childs:{self.childs} - ''' + """ class Pdf(PdfParser): @@ -101,18 +98,12 @@ class Pdf(PdfParser): self.model_speciess = ParserType.LAWS.value super().__init__() - def __call__(self, filename, binary=None, from_page=0, - to_page=100000, zoomin=3, callback=None): + def __call__(self, filename, binary=None, from_page=0, to_page=100000, zoomin=3, callback=None): from timeit import default_timer as timer + start = timer() callback(msg="OCR started") - self.__images__( - filename if not binary else binary, - zoomin, - from_page, - to_page, - callback - ) + self.__images__(filename if not binary else binary, zoomin, from_page, to_page, callback) callback(msg="OCR finished ({:.2f}s)".format(timer() - start)) start = timer() @@ -123,22 +114,15 @@ class Pdf(PdfParser): callback(0.8, "Text extraction ({:.2f}s)".format(timer() - start)) - return [(b["text"], self._line_tag(b, zoomin)) - for b in self.boxes], None + return [(b["text"], self._line_tag(b, zoomin)) for b in self.boxes], None -def chunk(filename, binary=None, from_page=0, to_page=100000, - lang="Chinese", callback=None, **kwargs): +def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs): """ - Supported file formats are docx, pdf, txt. + Supported file formats are docx, pdf, txt. """ - parser_config = kwargs.get( - "parser_config", { - "chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC"}) - doc = { - "docnm_kwd": filename, - "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename)) - } + parser_config = kwargs.get("parser_config", {"chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC"}) + doc = {"docnm_kwd": filename, "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))} doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"]) pdf_parser = None sections = [] @@ -152,9 +136,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, return tokenize_chunks(chunks, doc, eng, None) elif re.search(r"\.pdf$", filename, re.IGNORECASE): - layout_recognizer, parser_model_name = normalize_layout_recognizer( - parser_config.get("layout_recognize", "DeepDOC") - ) + layout_recognizer, parser_model_name = normalize_layout_recognizer(parser_config.get("layout_recognize", "DeepDOC")) if isinstance(layout_recognizer, bool): layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text" @@ -173,13 +155,14 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, pdf_cls=Pdf, layout_recognizer=layout_recognizer, mineru_llm_name=parser_model_name, - **kwargs + paddleocr_llm_name=parser_model_name, + **kwargs, ) if not raw_sections and not tables: return [] - if name in ["tcadp", "docling", "mineru"]: + if name in ["tcadp", "docling", "mineru", "paddleocr"]: parser_config["chunk_token_num"] = 0 for txt, poss in raw_sections: @@ -210,8 +193,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, binary = BytesIO(binary) doc_parsed = tika_parser.from_buffer(binary) - if doc_parsed.get('content', None) is not None: - sections = doc_parsed['content'].split('\n') + if doc_parsed.get("content", None) is not None: + sections = doc_parsed["content"].split("\n") sections = [s for s in sections if s] callback(0.8, "Finish parsing.") else: @@ -219,8 +202,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, logging.warning(f"tika.parser got empty content from {filename}.") return [] else: - raise NotImplementedError( - "file type not supported yet(doc, docx, pdf, txt supported)") + raise NotImplementedError("file type not supported yet(doc, docx, pdf, txt supported)") # Remove 'Contents' part remove_contents_table(sections, eng) @@ -241,9 +223,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, if __name__ == "__main__": import sys - def dummy(prog=None, msg=""): pass - chunk(sys.argv[1], callback=dummy) diff --git a/rag/app/manual.py b/rag/app/manual.py index 8a39bffec..5f3b58792 100644 --- a/rag/app/manual.py +++ b/rag/app/manual.py @@ -20,8 +20,7 @@ import re from common.constants import ParserType from io import BytesIO -from rag.nlp import rag_tokenizer, tokenize, tokenize_table, bullets_category, title_frequency, tokenize_chunks, \ - docx_question_level, attach_media_context +from rag.nlp import rag_tokenizer, tokenize, tokenize_table, bullets_category, title_frequency, tokenize_chunks, docx_question_level, attach_media_context from common.token_utils import num_tokens_from_string from deepdoc.parser import PdfParser, DocxParser from deepdoc.parser.figure_parser import vision_figure_parser_pdf_wrapper, vision_figure_parser_docx_wrapper @@ -36,18 +35,12 @@ class Pdf(PdfParser): self.model_speciess = ParserType.MANUAL.value super().__init__() - def __call__(self, filename, binary=None, from_page=0, - to_page=100000, zoomin=3, callback=None): + def __call__(self, filename, binary=None, from_page=0, to_page=100000, zoomin=3, callback=None): from timeit import default_timer as timer + start = timer() callback(msg="OCR started") - self.__images__( - filename if not binary else binary, - zoomin, - from_page, - to_page, - callback - ) + self.__images__(filename if not binary else binary, zoomin, from_page, to_page, callback) callback(msg="OCR finished ({:.2f}s)".format(timer() - start)) logging.debug("OCR: {}".format(timer() - start)) @@ -71,8 +64,7 @@ class Pdf(PdfParser): for b in self.boxes: b["text"] = re.sub(r"([\t  ]|\u3000){2,}", " ", b["text"].strip()) - return [(b["text"], b.get("layoutno", ""), self.get_position(b, zoomin)) - for i, b in enumerate(self.boxes)], tbls + return [(b["text"], b.get("layoutno", ""), self.get_position(b, zoomin)) for i, b in enumerate(self.boxes)], tbls class Docx(DocxParser): @@ -80,12 +72,12 @@ class Docx(DocxParser): pass def get_picture(self, document, paragraph): - img = paragraph._element.xpath('.//pic:pic') + img = paragraph._element.xpath(".//pic:pic") if not img: return None try: img = img[0] - embed = img.xpath('.//a:blip/@r:embed')[0] + embed = img.xpath(".//a:blip/@r:embed")[0] related_part = document.part.related_parts[embed] image = related_part.image if image is not None: @@ -111,7 +103,7 @@ class Docx(DocxParser): new_width = max(width1, width2) new_height = height1 + height2 - new_image = Image.new('RGB', (new_width, new_height)) + new_image = Image.new("RGB", (new_width, new_height)) new_image.paste(img1, (0, 0)) new_image.paste(img2, (0, height1)) @@ -119,8 +111,7 @@ class Docx(DocxParser): return new_image def __call__(self, filename, binary=None, from_page=0, to_page=100000, callback=None): - self.doc = Document( - filename) if not binary else Document(BytesIO(binary)) + self.doc = Document(filename) if not binary else Document(BytesIO(binary)) pn = 0 last_answer, last_image = "", None question_stack, level_stack = [], [] @@ -128,19 +119,19 @@ class Docx(DocxParser): for p in self.doc.paragraphs: if pn > to_page: break - question_level, p_text = 0, '' + question_level, p_text = 0, "" if from_page <= pn < to_page and p.text.strip(): question_level, p_text = docx_question_level(p) if not question_level or question_level > 6: # not a question - last_answer = f'{last_answer}\n{p_text}' + last_answer = f"{last_answer}\n{p_text}" current_image = self.get_picture(self.doc, p) last_image = self.concat_img(last_image, current_image) else: # is a question if last_answer or last_image: - sum_question = '\n'.join(question_stack) + sum_question = "\n".join(question_stack) if sum_question: - ti_list.append((f'{sum_question}\n{last_answer}', last_image)) - last_answer, last_image = '', None + ti_list.append((f"{sum_question}\n{last_answer}", last_image)) + last_answer, last_image = "", None i = question_level while question_stack and i <= level_stack[-1]: @@ -149,15 +140,15 @@ class Docx(DocxParser): question_stack.append(p_text) level_stack.append(question_level) for run in p.runs: - if 'lastRenderedPageBreak' in run._element.xml: + if "lastRenderedPageBreak" in run._element.xml: pn += 1 continue - if 'w:br' in run._element.xml and 'type="page"' in run._element.xml: + if "w:br" in run._element.xml and 'type="page"' in run._element.xml: pn += 1 if last_answer: - sum_question = '\n'.join(question_stack) + sum_question = "\n".join(question_stack) if sum_question: - ti_list.append((f'{sum_question}\n{last_answer}', last_image)) + ti_list.append((f"{sum_question}\n{last_answer}", last_image)) tbls = [] for tb in self.doc.tables: @@ -182,26 +173,19 @@ class Docx(DocxParser): return ti_list, tbls -def chunk(filename, binary=None, from_page=0, to_page=100000, - lang="Chinese", callback=None, **kwargs): +def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs): """ - Only pdf is supported. + Only pdf is supported. """ - parser_config = kwargs.get( - "parser_config", { - "chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC"}) + parser_config = kwargs.get("parser_config", {"chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC"}) pdf_parser = None - doc = { - "docnm_kwd": filename - } + doc = {"docnm_kwd": filename} doc["title_tks"] = rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", doc["docnm_kwd"])) doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"]) # is it English eng = lang.lower() == "english" # pdf_parser.is_english if re.search(r"\.pdf$", filename, re.IGNORECASE): - layout_recognizer, parser_model_name = normalize_layout_recognizer( - parser_config.get("layout_recognize", "DeepDOC") - ) + layout_recognizer, parser_model_name = normalize_layout_recognizer(parser_config.get("layout_recognize", "DeepDOC")) if isinstance(layout_recognizer, bool): layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text" @@ -222,8 +206,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, pdf_cls=Pdf, layout_recognizer=layout_recognizer, mineru_llm_name=parser_model_name, + paddleocr_llm_name=parser_model_name, parse_method="manual", - **kwargs + **kwargs, ) def _normalize_section(section): @@ -252,7 +237,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, if not sections and not tbls: return [] - if name in ["tcadp", "docling", "mineru"]: + if name in ["tcadp", "docling", "mineru", "paddleocr"]: parser_config["chunk_token_num"] = 0 callback(0.8, "Finish parsing.") @@ -264,8 +249,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, for txt, _, _ in sections: for t, lvl in pdf_parser.outlines: tks = set([t[i] + t[i + 1] for i in range(len(t) - 1)]) - tks_ = set([txt[i] + txt[i + 1] - for i in range(min(len(t), len(txt) - 1))]) + tks_ = set([txt[i] + txt[i + 1] for i in range(min(len(t), len(txt) - 1))]) if len(set(tks & tks_)) / max([len(tks), len(tks_), 1]) > 0.8: levels.append(lvl) break @@ -274,8 +258,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, else: bull = bullets_category([txt for txt, _, _ in sections]) - most_level, levels = title_frequency( - bull, [(txt, lvl) for txt, lvl, _ in sections]) + most_level, levels = title_frequency(bull, [(txt, lvl) for txt, lvl, _ in sections]) assert len(sections) == len(levels) sec_ids = [] @@ -285,25 +268,21 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, sid += 1 sec_ids.append(sid) - sections = [(txt, sec_ids[i], poss) - for i, (txt, _, poss) in enumerate(sections)] + sections = [(txt, sec_ids[i], poss) for i, (txt, _, poss) in enumerate(sections)] for (img, rows), poss in tbls: if not rows: continue - sections.append((rows if isinstance(rows, str) else rows[0], -1, - [(p[0] + 1 - from_page, p[1], p[2], p[3], p[4]) for p in poss])) + sections.append((rows if isinstance(rows, str) else rows[0], -1, [(p[0] + 1 - from_page, p[1], p[2], p[3], p[4]) for p in poss])) def tag(pn, left, right, top, bottom): if pn + left + right + top + bottom == 0: return "" - return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##" \ - .format(pn, left, right, top, bottom) + return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##".format(pn, left, right, top, bottom) chunks = [] last_sid = -2 tk_cnt = 0 - for txt, sec_id, poss in sorted(sections, key=lambda x: ( - x[-1][0][0], x[-1][0][3], x[-1][0][1])): + for txt, sec_id, poss in sorted(sections, key=lambda x: (x[-1][0][0], x[-1][0][3], x[-1][0][1])): poss = "\t".join([tag(*pos) for pos in poss]) if tk_cnt < 32 or (tk_cnt < 1024 and (sec_id == last_sid or sec_id == -1)): if chunks: @@ -330,14 +309,13 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, elif re.search(r"\.docx?$", filename, re.IGNORECASE): docx_parser = Docx() - ti_list, tbls = docx_parser(filename, binary, - from_page=0, to_page=10000, callback=callback) + ti_list, tbls = docx_parser(filename, binary, from_page=0, to_page=10000, callback=callback) tbls = vision_figure_parser_docx_wrapper(sections=ti_list, tbls=tbls, callback=callback, **kwargs) res = tokenize_table(tbls, doc, eng) for text, image in ti_list: d = copy.deepcopy(doc) if image: - d['image'] = image + d["image"] = image d["doc_type_kwd"] = "image" tokenize(d, text, eng) res.append(d) @@ -353,9 +331,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, if __name__ == "__main__": import sys - def dummy(prog=None, msg=""): pass - chunk(sys.argv[1], callback=dummy) diff --git a/rag/app/naive.py b/rag/app/naive.py index 05d673e4b..86ac85bc8 100644 --- a/rag/app/naive.py +++ b/rag/app/naive.py @@ -33,29 +33,32 @@ from common.token_utils import num_tokens_from_string from common.constants import LLMType from api.db.services.llm_service import LLMBundle from rag.utils.file_utils import extract_embed_file, extract_links_from_pdf, extract_links_from_docx, extract_html -from deepdoc.parser import DocxParser, ExcelParser, HtmlParser, JsonParser, MarkdownElementExtractor, MarkdownParser, \ - PdfParser, TxtParser -from deepdoc.parser.figure_parser import VisionFigureParser, vision_figure_parser_docx_wrapper_naive, \ - vision_figure_parser_pdf_wrapper +from deepdoc.parser import DocxParser, ExcelParser, HtmlParser, JsonParser, MarkdownElementExtractor, MarkdownParser, PdfParser, TxtParser +from deepdoc.parser.figure_parser import VisionFigureParser, vision_figure_parser_docx_wrapper_naive, vision_figure_parser_pdf_wrapper from deepdoc.parser.pdf_parser import PlainParser, VisionParser from deepdoc.parser.docling_parser import DoclingParser from deepdoc.parser.tcadp_parser import TCADPParser from common.parser_config_utils import normalize_layout_recognizer -from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, rag_tokenizer, \ - tokenize_chunks, doc_tokenize_chunks_with_images, tokenize_table, append_context2table_image4pdf, tokenize_chunks_with_images, \ - attach_media_context # noqa: F401 +from rag.nlp import ( + concat_img, + find_codec, + naive_merge, + naive_merge_with_images, + naive_merge_docx, + rag_tokenizer, + tokenize_chunks, + doc_tokenize_chunks_with_images, + tokenize_table, + append_context2table_image4pdf, + tokenize_chunks_with_images, +) # noqa: F401 -def by_deepdoc(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls=None, - **kwargs): + +def by_deepdoc(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls=None, **kwargs): callback = callback binary = binary pdf_parser = pdf_cls() if pdf_cls else Pdf() - sections, tables = pdf_parser( - filename if not binary else binary, - from_page=from_page, - to_page=to_page, - callback=callback - ) + sections, tables = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page, callback=callback) tables = vision_figure_parser_pdf_wrapper( tbls=tables, @@ -67,17 +70,17 @@ def by_deepdoc(filename, binary=None, from_page=0, to_page=100000, lang="Chinese def by_mineru( - filename, - binary=None, - from_page=0, - to_page=100000, - lang="Chinese", - callback=None, - pdf_cls=None, - parse_method: str = "raw", - mineru_llm_name: str | None = None, - tenant_id: str | None = None, - **kwargs, + filename, + binary=None, + from_page=0, + to_page=100000, + lang="Chinese", + callback=None, + pdf_cls=None, + parse_method: str = "raw", + mineru_llm_name: str | None = None, + tenant_id: str | None = None, + **kwargs, ): pdf_parser = None if tenant_id: @@ -115,8 +118,7 @@ def by_mineru( return None, None, None -def by_docling(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls=None, - **kwargs): +def by_docling(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls=None, **kwargs): pdf_parser = DoclingParser() parse_method = kwargs.get("parse_method", "raw") @@ -130,7 +132,7 @@ def by_docling(filename, binary=None, from_page=0, to_page=100000, lang="Chinese callback=callback, output_dir=os.environ.get("MINERU_OUTPUT_DIR", ""), delete_output=bool(int(os.environ.get("MINERU_DELETE_OUTPUT", 1))), - parse_method=parse_method + parse_method=parse_method, ) return sections, tables, pdf_parser @@ -142,16 +144,60 @@ def by_tcadp(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback(-1, "TCADP parser not available. Please check Tencent Cloud API configuration.") return None, None, tcadp_parser - sections, tables = tcadp_parser.parse_pdf( - filepath=filename, - binary=binary, - callback=callback, - output_dir=os.environ.get("TCADP_OUTPUT_DIR", ""), - file_type="PDF" - ) + sections, tables = tcadp_parser.parse_pdf(filepath=filename, binary=binary, callback=callback, output_dir=os.environ.get("TCADP_OUTPUT_DIR", ""), file_type="PDF") return sections, tables, tcadp_parser +def by_paddleocr( + filename, + binary=None, + from_page=0, + to_page=100000, + lang="Chinese", + callback=None, + pdf_cls=None, + parse_method: str = "raw", + paddleocr_llm_name: str | None = None, + tenant_id: str | None = None, + **kwargs, +): + pdf_parser = None + if tenant_id: + if not paddleocr_llm_name: + try: + from api.db.services.tenant_llm_service import TenantLLMService + + env_name = TenantLLMService.ensure_paddleocr_from_env(tenant_id) + candidates = TenantLLMService.query(tenant_id=tenant_id, llm_factory="PaddleOCR", model_type=LLMType.OCR) + if candidates: + paddleocr_llm_name = candidates[0].llm_name + elif env_name: + paddleocr_llm_name = env_name + except Exception as e: # best-effort fallback + logging.warning(f"fallback to env paddleocr: {e}") + + if paddleocr_llm_name: + try: + ocr_model = LLMBundle(tenant_id=tenant_id, llm_type=LLMType.OCR, llm_name=paddleocr_llm_name, lang=lang) + pdf_parser = ocr_model.mdl + sections, tables = pdf_parser.parse_pdf( + filepath=filename, + binary=binary, + callback=callback, + parse_method=parse_method, + **kwargs, + ) + return sections, tables, pdf_parser + except Exception as e: + logging.error(f"Failed to parse pdf via LLMBundle PaddleOCR ({paddleocr_llm_name}): {e}") + + return None, None, None + + if callback: + callback(-1, "PaddleOCR not found.") + return None, None, None + + def by_plaintext(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs): layout_recognizer = (kwargs.get("layout_recognizer") or "").strip() if (not layout_recognizer) or (layout_recognizer == "Plain Text"): @@ -168,12 +214,7 @@ def by_plaintext(filename, binary=None, from_page=0, to_page=100000, callback=No ) pdf_parser = VisionParser(vision_model=vision_model, **kwargs) - sections, tables = pdf_parser( - filename if not binary else binary, - from_page=from_page, - to_page=to_page, - callback=callback - ) + sections, tables = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page, callback=callback) return sections, tables, pdf_parser @@ -182,6 +223,7 @@ PARSERS = { "mineru": by_mineru, "docling": by_docling, "tcadp": by_tcadp, + "paddleocr": by_paddleocr, "plaintext": by_plaintext, # default } @@ -191,12 +233,12 @@ class Docx(DocxParser): pass def get_picture(self, document, paragraph): - imgs = paragraph._element.xpath('.//pic:pic') + imgs = paragraph._element.xpath(".//pic:pic") if not imgs: return None res_img = None for img in imgs: - embed = img.xpath('.//a:blip/@r:embed') + embed = img.xpath(".//a:blip/@r:embed") if not embed: continue embed = embed[0] @@ -219,7 +261,7 @@ class Docx(DocxParser): logging.warning(f"The recognized image stream appears to be corrupted. Skipping image, exception: {e}") continue try: - image = Image.open(BytesIO(image_blob)).convert('RGB') + image = Image.open(BytesIO(image_blob)).convert("RGB") if res_img is None: res_img = image else: @@ -251,11 +293,11 @@ class Docx(DocxParser): try: # Iterate through all paragraphs and tables in document order for i, block in enumerate(self.doc._element.body): - if block.tag.endswith('p'): # Paragraph + if block.tag.endswith("p"): # Paragraph p = Paragraph(block, self.doc) - blocks.append(('p', i, p)) - elif block.tag.endswith('tbl'): # Table - blocks.append(('t', i, None)) # Table object will be retrieved later + blocks.append(("p", i, p)) + elif block.tag.endswith("tbl"): # Table + blocks.append(("t", i, None)) # Table object will be retrieved later except Exception as e: logging.error(f"Error collecting blocks: {e}") return "" @@ -264,7 +306,7 @@ class Docx(DocxParser): target_table_pos = -1 table_count = 0 for i, (block_type, pos, _) in enumerate(blocks): - if block_type == 't': + if block_type == "t": if table_count == table_index: target_table_pos = pos break @@ -280,7 +322,7 @@ class Docx(DocxParser): if pos >= target_table_pos: # Skip blocks after the table continue - if block_type != 'p': + if block_type != "p": continue if block.style and block.style.name and re.search(r"Heading\s*(\d+)", block.style.name, re.I): @@ -309,7 +351,7 @@ class Docx(DocxParser): if pos >= target_table_pos: # Skip blocks after the table continue - if block_type != 'p': + if block_type != "p": continue if block.style and re.search(r"Heading\s*(\d+)", block.style.name, re.I): @@ -340,8 +382,7 @@ class Docx(DocxParser): return "" def __call__(self, filename, binary=None, from_page=0, to_page=100000): - self.doc = Document( - filename) if not binary else Document(BytesIO(binary)) + self.doc = Document(filename) if not binary else Document(BytesIO(binary)) pn = 0 lines = [] last_image = None @@ -357,7 +398,7 @@ class Docx(DocxParser): if pn > to_page: break - if block.tag.endswith('p'): + if block.tag.endswith("p"): p = Paragraph(block, self.doc) if from_page <= pn < to_page: @@ -417,7 +458,7 @@ class Docx(DocxParser): if "w:br" in xml and 'type="page"' in xml: pn += 1 - elif block.tag.endswith('tbl'): + elif block.tag.endswith("tbl"): if pn < from_page or pn > to_page: table_idx += 1 continue @@ -455,7 +496,6 @@ class Docx(DocxParser): return new_line - def to_markdown(self, filename=None, binary=None, inline_images: bool = True): """ This function uses mammoth, licensed under the BSD 2-Clause License. @@ -486,8 +526,7 @@ class Docx(DocxParser): try: if inline_images: - result = mammoth.convert_to_html(docx_file, - convert_image=mammoth.images.img_element(_convert_image_to_base64)) + result = mammoth.convert_to_html(docx_file, convert_image=mammoth.images.img_element(_convert_image_to_base64)) else: result = mammoth.convert_to_html(docx_file) @@ -505,18 +544,11 @@ class Pdf(PdfParser): def __init__(self): super().__init__() - def __call__(self, filename, binary=None, from_page=0, - to_page=100000, zoomin=3, callback=None, separate_tables_figures=False): + def __call__(self, filename, binary=None, from_page=0, to_page=100000, zoomin=3, callback=None, separate_tables_figures=False): start = timer() first_start = start callback(msg="OCR started") - self.__images__( - filename if not binary else binary, - zoomin, - from_page, - to_page, - callback - ) + self.__images__(filename if not binary else binary, zoomin, from_page, to_page, callback) callback(msg="OCR finished ({:.2f}s)".format(timer() - start)) logging.info("OCR({}~{}): {:.2f}s".format(from_page, to_page, timer() - start)) @@ -559,13 +591,14 @@ class Markdown(MarkdownParser): return [] from bs4 import BeautifulSoup + html_content = markdown(text) - soup = BeautifulSoup(html_content, 'html.parser') + soup = BeautifulSoup(html_content, "html.parser") return soup def get_hyperlink_urls(self, soup): if soup: - return set([a.get('href') for a in soup.find_all('a') if a.get('href')]) + return set([a.get("href") for a in soup.find_all("a") if a.get("href")]) return [] def extract_image_urls_with_lines(self, text): @@ -588,10 +621,10 @@ class Markdown(MarkdownParser): try: from bs4 import BeautifulSoup - soup = BeautifulSoup(text, 'html.parser') + soup = BeautifulSoup(text, "html.parser") newline_offsets = [m.start() for m in re.finditer(r"\n", text)] + [len(text)] - for img_tag in soup.find_all('img'): - src = img_tag.get('src') + for img_tag in soup.find_all("img"): + src = img_tag.get("src") if not src: continue @@ -627,14 +660,14 @@ class Markdown(MarkdownParser): continue img_obj = None try: - if url.startswith(('http://', 'https://')): + if url.startswith(("http://", "https://")): response = requests.get(url, stream=True, timeout=30) - if response.status_code == 200 and response.headers.get('Content-Type', '').startswith('image/'): - img_obj = Image.open(BytesIO(response.content)).convert('RGB') + if response.status_code == 200 and response.headers.get("Content-Type", "").startswith("image/"): + img_obj = Image.open(BytesIO(response.content)).convert("RGB") else: local_path = Path(url) if local_path.exists(): - img_obj = Image.open(url).convert('RGB') + img_obj = Image.open(url).convert("RGB") else: logging.warning(f"Local image file not found: {url}") except Exception as e: @@ -652,7 +685,7 @@ class Markdown(MarkdownParser): with open(filename, "r") as f: txt = f.read() - remainder, tables = self.extract_tables_and_remainder(f'{txt}\n', separate_tables=separate_tables) + remainder, tables = self.extract_tables_and_remainder(f"{txt}\n", separate_tables=separate_tables) # To eliminate duplicate tables in chunking result, uncomment code below and set separate_tables to True in line 410. # extractor = MarkdownElementExtractor(remainder) extractor = MarkdownElementExtractor(txt) @@ -678,7 +711,7 @@ class Markdown(MarkdownParser): tbls = [] for table in tables: - tbls.append(((None, markdown(table, extensions=['markdown.extensions.tables'])), "")) + tbls.append(((None, markdown(table, extensions=["markdown.extensions.tables"])), "")) if return_section_images: return sections, tbls, section_images return sections, tbls @@ -694,7 +727,7 @@ def load_from_xml_v2(baseURI, rels_item_xml): if rels_item_xml is not None: rels_elm = parse_xml(rels_item_xml) for rel_elm in rels_elm.Relationship_lst: - if rel_elm.target_ref in ('../NULL', 'NULL'): + if rel_elm.target_ref in ("../NULL", "NULL"): continue srels._srels.append(_SerializedRelationship(baseURI, rel_elm)) return srels @@ -702,21 +735,18 @@ def load_from_xml_v2(baseURI, rels_item_xml): def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs): """ - Supported file formats are docx, pdf, excel, txt. - This method apply the naive ways to chunk files. - Successive text will be sliced into pieces using 'delimiter'. - Next, these successive pieces are merge into chunks whose token number is no more than 'Max token number'. + Supported file formats are docx, pdf, excel, txt. + This method apply the naive ways to chunk files. + Successive text will be sliced into pieces using 'delimiter'. + Next, these successive pieces are merge into chunks whose token number is no more than 'Max token number'. """ urls = set() url_res = [] is_english = lang.lower() == "english" # is_english(cks) - parser_config = kwargs.get( - "parser_config", { - "chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC", "analyze_hyperlink": True}) + parser_config = kwargs.get("parser_config", {"chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC", "analyze_hyperlink": True}) - child_deli = (parser_config.get("children_delimiter") or "").encode('utf-8').decode('unicode_escape').encode( - 'latin1').decode('utf-8') + child_deli = (parser_config.get("children_delimiter") or "").encode("utf-8").decode("unicode_escape").encode("latin1").decode("utf-8") cust_child_deli = re.findall(r"`([^`]+)`", child_deli) child_deli = "|".join(re.sub(r"`([^`]+)`", "", child_deli)) if cust_child_deli: @@ -728,10 +758,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca table_context_size = max(0, int(parser_config.get("table_context_size", 0) or 0)) image_context_size = max(0, int(parser_config.get("image_context_size", 0) or 0)) - doc = { - "docnm_kwd": filename, - "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename)) - } + doc = {"docnm_kwd": filename, "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))} doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"]) res = [] pdf_parser = None @@ -750,8 +777,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca # Recursively chunk each embedded file and collect results for embed_filename, embed_bytes in embeds: try: - sub_res = chunk(embed_filename, binary=embed_bytes, lang=lang, callback=callback, is_root=False, - **kwargs) or [] + sub_res = chunk(embed_filename, binary=embed_bytes, lang=lang, callback=callback, is_root=False, **kwargs) or [] embed_res.extend(sub_res) except Exception as e: error_msg = f"Failed to chunk embed {embed_filename}: {e}" @@ -772,8 +798,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca sub_url_res = chunk(url, html_bytes, callback=callback, lang=lang, is_root=False, **kwargs) except Exception as e: logging.info(f"Failed to chunk url in registered file type {url}: {e}") - sub_url_res = chunk(f"{index}.html", html_bytes, callback=callback, lang=lang, is_root=False, - **kwargs) + sub_url_res = chunk(f"{index}.html", html_bytes, callback=callback, lang=lang, is_root=False, **kwargs) url_res.extend(sub_url_res) # fix "There is no item named 'word/NULL' in the archive", referring to https://github.com/python-openxml/python-docx/issues/1105#issuecomment-1298075246 @@ -784,11 +809,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca # chunks list[dict] # images list - index of image chunk in chunks - chunks, images = naive_merge_docx( - sections, int(parser_config.get( - "chunk_token_num", 128)), parser_config.get( - "delimiter", "\n!?。;!?"), table_context_size, image_context_size) - + chunks, images = naive_merge_docx(sections, int(parser_config.get("chunk_token_num", 128)), parser_config.get("delimiter", "\n!?。;!?"), table_context_size, image_context_size) + vision_figure_parser_docx_wrapper_naive(chunks=chunks, idx_lst=images, callback=callback, **kwargs) callback(0.8, "Finish parsing.") @@ -801,9 +823,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca return res elif re.search(r"\.pdf$", filename, re.IGNORECASE): - layout_recognizer, parser_model_name = normalize_layout_recognizer( - parser_config.get("layout_recognize", "DeepDOC") - ) + layout_recognizer, parser_model_name = normalize_layout_recognizer(parser_config.get("layout_recognize", "DeepDOC")) if parser_config.get("analyze_hyperlink", False) and is_root: urls = extract_links_from_pdf(binary) @@ -824,7 +844,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca callback=callback, layout_recognizer=layout_recognizer, mineru_llm_name=parser_model_name, - **kwargs + paddleocr_llm_name=parser_model_name, + **kwargs, ) if not sections and not tables: @@ -833,7 +854,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca if table_context_size or image_context_size: tables = append_context2table_image4pdf(sections, tables, image_context_size) - if name in ["tcadp", "docling", "mineru"]: + if name in ["tcadp", "docling", "mineru", "paddleocr"]: parser_config["chunk_token_num"] = 0 res = tokenize_table(tables, doc, is_english) @@ -847,10 +868,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca if layout_recognizer == "TCADP Parser": table_result_type = parser_config.get("table_result_type", "1") markdown_image_response_type = parser_config.get("markdown_image_response_type", "1") - tcadp_parser = TCADPParser( - table_result_type=table_result_type, - markdown_image_response_type=markdown_image_response_type - ) + tcadp_parser = TCADPParser(table_result_type=table_result_type, markdown_image_response_type=markdown_image_response_type) if not tcadp_parser.check_installation(): callback(-1, "TCADP parser not available. Please check Tencent Cloud API configuration.") return res @@ -858,13 +876,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca # Determine file type based on extension file_type = "XLSX" if re.search(r"\.xlsx?$", filename, re.IGNORECASE) else "CSV" - sections, tables = tcadp_parser.parse_pdf( - filepath=filename, - binary=binary, - callback=callback, - output_dir=os.environ.get("TCADP_OUTPUT_DIR", ""), - file_type=file_type - ) + sections, tables = tcadp_parser.parse_pdf(filepath=filename, binary=binary, callback=callback, output_dir=os.environ.get("TCADP_OUTPUT_DIR", ""), file_type=file_type) parser_config["chunk_token_num"] = 0 res = tokenize_table(tables, doc, is_english) callback(0.8, "Finish parsing.") @@ -879,9 +891,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca elif re.search(r"\.(txt|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|sql)$", filename, re.IGNORECASE): callback(0.1, "Start to parse.") - sections = TxtParser()(filename, binary, - parser_config.get("chunk_token_num", 128), - parser_config.get("delimiter", "\n!?;。;!?")) + sections = TxtParser()(filename, binary, parser_config.get("chunk_token_num", 128), parser_config.get("delimiter", "\n!?;。;!?")) callback(0.8, "Finish parsing.") elif re.search(r"\.(md|markdown|mdx)$", filename, re.IGNORECASE): @@ -919,11 +929,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca else: section_images = [None] * len(sections) section_images[idx] = combined_image - markdown_vision_parser = VisionFigureParser(vision_model=vision_model, figures_data=[ - ((combined_image, ["markdown image"]), [(0, 0, 0, 0, 0)])], **kwargs) + markdown_vision_parser = VisionFigureParser(vision_model=vision_model, figures_data=[((combined_image, ["markdown image"]), [(0, 0, 0, 0, 0)])], **kwargs) boosted_figures = markdown_vision_parser(callback=callback) - sections[idx] = (section_text + "\n\n" + "\n\n".join([fig[0][1] for fig in boosted_figures]), - sections[idx][1]) + sections[idx] = (section_text + "\n\n" + "\n\n".join([fig[0][1] for fig in boosted_figures]), sections[idx][1]) else: logging.warning("No visual model detected. Skipping figure parsing enhancement.") @@ -962,8 +970,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca binary = BytesIO(binary) doc_parsed = tika_parser.from_buffer(binary) - if doc_parsed.get('content', None) is not None: - sections = doc_parsed['content'].split('\n') + if doc_parsed.get("content", None) is not None: + sections = doc_parsed["content"].split("\n") sections = [(_, "") for _ in sections if _] callback(0.8, "Finish parsing.") else: @@ -972,8 +980,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca logging.warning(error_msg) return [] else: - raise NotImplementedError( - "file type not supported yet(pdf, xlsx, doc, docx, txt supported)") + raise NotImplementedError("file type not supported yet(pdf, xlsx, doc, docx, txt supported)") st = timer() if is_markdown: @@ -1021,8 +1028,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca has_images = merged_images and any(img is not None for img in merged_images) if has_images: - res.extend(tokenize_chunks_with_images(chunks, doc, is_english, merged_images, - child_delimiters_pattern=child_deli)) + res.extend(tokenize_chunks_with_images(chunks, doc, is_english, merged_images, child_delimiters_pattern=child_deli)) else: res.extend(tokenize_chunks(chunks, doc, is_english, pdf_parser, child_delimiters_pattern=child_deli)) else: @@ -1031,17 +1037,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca section_images = None if section_images: - chunks, images = naive_merge_with_images(sections, section_images, - int(parser_config.get( - "chunk_token_num", 128)), parser_config.get( - "delimiter", "\n!?。;!?")) - res.extend( - tokenize_chunks_with_images(chunks, doc, is_english, images, child_delimiters_pattern=child_deli)) + chunks, images = naive_merge_with_images(sections, section_images, int(parser_config.get("chunk_token_num", 128)), parser_config.get("delimiter", "\n!?。;!?")) + res.extend(tokenize_chunks_with_images(chunks, doc, is_english, images, child_delimiters_pattern=child_deli)) else: - chunks = naive_merge( - sections, int(parser_config.get( - "chunk_token_num", 128)), parser_config.get( - "delimiter", "\n!?。;!?")) + chunks = naive_merge(sections, int(parser_config.get("chunk_token_num", 128)), parser_config.get("delimiter", "\n!?。;!?")) res.extend(tokenize_chunks(chunks, doc, is_english, pdf_parser, child_delimiters_pattern=child_deli)) @@ -1071,9 +1070,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca if __name__ == "__main__": import sys - def dummy(prog=None, msg=""): pass - chunk(sys.argv[1], from_page=0, to_page=10, callback=dummy) diff --git a/rag/app/one.py b/rag/app/one.py index a53d00ea9..e445f881f 100644 --- a/rag/app/one.py +++ b/rag/app/one.py @@ -28,18 +28,12 @@ from common.parser_config_utils import normalize_layout_recognizer class Pdf(PdfParser): - def __call__(self, filename, binary=None, from_page=0, - to_page=100000, zoomin=3, callback=None): + def __call__(self, filename, binary=None, from_page=0, to_page=100000, zoomin=3, callback=None): from timeit import default_timer as timer + start = timer() callback(msg="OCR started") - self.__images__( - filename if not binary else binary, - zoomin, - from_page, - to_page, - callback - ) + self.__images__(filename if not binary else binary, zoomin, from_page, to_page, callback) callback(msg="OCR finished ({:.2f}s)".format(timer() - start)) start = timer() @@ -57,21 +51,16 @@ class Pdf(PdfParser): tbls = self._extract_table_figure(True, zoomin, True, True) self._concat_downward() - sections = [(b["text"], self.get_position(b, zoomin)) - for i, b in enumerate(self.boxes)] - return [(txt, "") for txt, _ in sorted(sections, key=lambda x: ( - x[-1][0][0], x[-1][0][3], x[-1][0][1]))], tbls + sections = [(b["text"], self.get_position(b, zoomin)) for i, b in enumerate(self.boxes)] + return [(txt, "") for txt, _ in sorted(sections, key=lambda x: (x[-1][0][0], x[-1][0][3], x[-1][0][1]))], tbls -def chunk(filename, binary=None, from_page=0, to_page=100000, - lang="Chinese", callback=None, **kwargs): +def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs): """ - Supported file formats are docx, pdf, excel, txt. - One file forms a chunk which maintains original text order. + Supported file formats are docx, pdf, excel, txt. + One file forms a chunk which maintains original text order. """ - parser_config = kwargs.get( - "parser_config", { - "chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC"}) + parser_config = kwargs.get("parser_config", {"chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC"}) eng = lang.lower() == "english" # is_english(cks) if re.search(r"\.docx$", filename, re.IGNORECASE): @@ -99,9 +88,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, callback(0.8, "Finish parsing.") elif re.search(r"\.pdf$", filename, re.IGNORECASE): - layout_recognizer, parser_model_name = normalize_layout_recognizer( - parser_config.get("layout_recognize", "DeepDOC") - ) + layout_recognizer, parser_model_name = normalize_layout_recognizer(parser_config.get("layout_recognize", "DeepDOC")) if isinstance(layout_recognizer, bool): layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text" @@ -120,13 +107,14 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, pdf_cls=Pdf, layout_recognizer=layout_recognizer, mineru_llm_name=parser_model_name, - **kwargs + paddleocr_llm_name=parser_model_name, + **kwargs, ) if not sections and not tbls: return [] - if name in ["tcadp", "docling", "mineru"]: + if name in ["tcadp", "docling", "mineru", "paddleocr"]: parser_config["chunk_token_num"] = 0 callback(0.8, "Finish parsing.") @@ -134,8 +122,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, for (img, rows), poss in tbls: if not rows: continue - sections.append((rows if isinstance(rows, str) else rows[0], - [(p[0] + 1 - from_page, p[1], p[2], p[3], p[4]) for p in poss])) + sections.append((rows if isinstance(rows, str) else rows[0], [(p[0] + 1 - from_page, p[1], p[2], p[3], p[4]) for p in poss])) sections = [s for s, _ in sections if s] elif re.search(r"\.xlsx?$", filename, re.IGNORECASE): @@ -167,19 +154,15 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, binary = BytesIO(binary) doc_parsed = tika_parser.from_buffer(binary) - if doc_parsed.get('content', None) is not None: - sections = doc_parsed['content'].split('\n') + if doc_parsed.get("content", None) is not None: + sections = doc_parsed["content"].split("\n") sections = [s for s in sections if s] callback(0.8, "Finish parsing.") else: - raise NotImplementedError( - "file type not supported yet(doc, docx, pdf, txt supported)") + raise NotImplementedError("file type not supported yet(doc, docx, pdf, txt supported)") - doc = { - "docnm_kwd": filename, - "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename)) - } + doc = {"docnm_kwd": filename, "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))} doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"]) tokenize(doc, "\n".join(sections), eng) return [doc] @@ -188,9 +171,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, if __name__ == "__main__": import sys - def dummy(prog=None, msg=""): pass - chunk(sys.argv[1], from_page=0, to_page=10, callback=dummy) diff --git a/rag/app/presentation.py b/rag/app/presentation.py index 26c08183e..e4247e8cc 100644 --- a/rag/app/presentation.py +++ b/rag/app/presentation.py @@ -36,22 +36,18 @@ class Ppt(PptParser): callback(0.5, "Text extraction finished.") import aspose.slides as slides import aspose.pydrawing as drawing + imgs = [] with slides.Presentation(BytesIO(fnm)) as presentation: - for i, slide in enumerate(presentation.slides[from_page: to_page]): + for i, slide in enumerate(presentation.slides[from_page:to_page]): try: with BytesIO() as buffered: - slide.get_thumbnail( - 0.1, 0.1).save( - buffered, drawing.imaging.ImageFormat.jpeg) + slide.get_thumbnail(0.1, 0.1).save(buffered, drawing.imaging.ImageFormat.jpeg) buffered.seek(0) imgs.append(Image.open(buffered).copy()) except RuntimeError as e: - raise RuntimeError( - f'ppt parse error at page {i + 1}, original error: {str(e)}') from e - assert len(imgs) == len( - txts), "Slides text and image do not match: {} vs. {}".format( - len(imgs), len(txts)) + raise RuntimeError(f"ppt parse error at page {i + 1}, original error: {str(e)}") from e + assert len(imgs) == len(txts), "Slides text and image do not match: {} vs. {}".format(len(imgs), len(txts)) callback(0.9, "Image extraction finished") self.is_english = is_english(txts) return [(txts[i], imgs[i]) for i in range(len(txts))] @@ -61,12 +57,10 @@ class Pdf(PdfParser): def __init__(self): super().__init__() - def __call__(self, filename, binary=None, from_page=0, - to_page=100000, zoomin=3, callback=None, **kwargs): + def __call__(self, filename, binary=None, from_page=0, to_page=100000, zoomin=3, callback=None, **kwargs): # 1. OCR callback(msg="OCR started") - self.__images__(filename if not binary else binary, zoomin, from_page, - to_page, callback) + self.__images__(filename if not binary else binary, zoomin, from_page, to_page, callback) # 2. Layout Analysis callback(msg="Layout Analysis") @@ -91,12 +85,7 @@ class Pdf(PdfParser): global_page_num = b["page_number"] + from_page if not (from_page < global_page_num <= to_page + from_page): continue - page_items[global_page_num].append({ - "top": b["top"], - "x0": b["x0"], - "text": b["text"], - "type": "text" - }) + page_items[global_page_num].append({"top": b["top"], "x0": b["x0"], "text": b["text"], "type": "text"}) # (B) Add table and figure for (img, content), positions in tbls: @@ -127,12 +116,7 @@ class Pdf(PdfParser): top = positions[0][3] left = positions[0][1] - page_items[current_page_num].append({ - "top": top, - "x0": left, - "text": final_text, - "type": "table_or_figure" - }) + page_items[current_page_num].append({"top": top, "x0": left, "text": final_text, "type": "table_or_figure"}) # 7. Generate result res = [] @@ -153,18 +137,16 @@ class Pdf(PdfParser): class PlainPdf(PlainParser): - def __call__(self, filename, binary=None, from_page=0, - to_page=100000, callback=None, **kwargs): + def __call__(self, filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs): self.pdf = pdf2_read(filename if not binary else BytesIO(binary)) page_txt = [] - for page in self.pdf.pages[from_page: to_page]: + for page in self.pdf.pages[from_page:to_page]: page_txt.append(page.extract_text()) callback(0.9, "Parsing finished") return [(txt, None) for txt in page_txt], [] -def chunk(filename, binary=None, from_page=0, to_page=100000, - lang="Chinese", callback=None, parser_config=None, **kwargs): +def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, parser_config=None, **kwargs): """ The supported file formats are pdf, pptx. Every page will be treated as a chunk. And the thumbnail of every page will be stored. @@ -173,18 +155,12 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, if parser_config is None: parser_config = {} eng = lang.lower() == "english" - doc = { - "docnm_kwd": filename, - "title_tks": rag_tokenizer.tokenize( - re.sub(r"\.[a-zA-Z]+$", "", filename)) - } + doc = {"docnm_kwd": filename, "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))} doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"]) res = [] if re.search(r"\.pptx?$", filename, re.IGNORECASE): ppt_parser = Ppt() - for pn, (txt, img) in enumerate(ppt_parser( - filename if not binary else binary, from_page, 1000000, - callback)): + for pn, (txt, img) in enumerate(ppt_parser(filename if not binary else binary, from_page, 1000000, callback)): d = copy.deepcopy(doc) pn += from_page d["image"] = img @@ -196,9 +172,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, res.append(d) return res elif re.search(r"\.pdf$", filename, re.IGNORECASE): - layout_recognizer, parser_model_name = normalize_layout_recognizer( - parser_config.get("layout_recognize", "DeepDOC") - ) + layout_recognizer, parser_model_name = normalize_layout_recognizer(parser_config.get("layout_recognize", "DeepDOC")) if isinstance(layout_recognizer, bool): layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text" @@ -217,13 +191,14 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, pdf_cls=Pdf, layout_recognizer=layout_recognizer, mineru_llm_name=parser_model_name, - **kwargs + paddleocr_llm_name=parser_model_name, + **kwargs, ) if not sections: return [] - if name in ["tcadp", "docling", "mineru"]: + if name in ["tcadp", "docling", "mineru", "paddleocr"]: parser_config["chunk_token_num"] = 0 callback(0.8, "Finish parsing.") @@ -236,22 +211,18 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, d["image"] = img d["page_num_int"] = [pn + 1] d["top_int"] = [0] - d["position_int"] = [(pn + 1, 0, img.size[0] if img else 0, 0, - img.size[1] if img else 0)] + d["position_int"] = [(pn + 1, 0, img.size[0] if img else 0, 0, img.size[1] if img else 0)] tokenize(d, txt, eng) res.append(d) return res - raise NotImplementedError( - "file type not supported yet(pptx, pdf supported)") + raise NotImplementedError("file type not supported yet(pptx, pdf supported)") if __name__ == "__main__": import sys - def dummy(a, b): pass - chunk(sys.argv[1], callback=dummy) diff --git a/rag/flow/parser/parser.py b/rag/flow/parser/parser.py index 1c7154424..2cc941b72 100644 --- a/rag/flow/parser/parser.py +++ b/rag/flow/parser/parser.py @@ -166,7 +166,7 @@ class ParserParam(ProcessParamBase): pdf_parse_method = pdf_config.get("parse_method", "") self.check_empty(pdf_parse_method, "Parse method abnormal.") - if pdf_parse_method.lower() not in ["deepdoc", "plain_text", "mineru", "tcadp parser"]: + if pdf_parse_method.lower() not in ["deepdoc", "plain_text", "mineru", "tcadp parser", "paddleocr"]: self.check_empty(pdf_config.get("lang", ""), "PDF VLM language") pdf_output_format = pdf_config.get("output_format", "") @@ -232,6 +232,9 @@ class Parser(ProcessBase): if lowered.endswith("@mineru"): parser_model_name = raw_parse_method.rsplit("@", 1)[0] parse_method = "MinerU" + elif lowered.endswith("@paddleocr"): + parser_model_name = raw_parse_method.rsplit("@", 1)[0] + parse_method = "PaddleOCR" if parse_method.lower() == "deepdoc": bboxes = RAGFlowPdfParser().parse_into_bboxes(blob, callback=self.callback) @@ -239,6 +242,7 @@ class Parser(ProcessBase): lines, _ = PlainParser()(blob) bboxes = [{"text": t} for t, _ in lines] elif parse_method.lower() == "mineru": + def resolve_mineru_llm_name(): configured = parser_model_name or conf.get("mineru_llm_name") if configured: @@ -320,6 +324,84 @@ class Parser(ProcessBase): bboxes.append({"text": section}) else: bboxes.append({"text": section}) + elif parse_method.lower() == "paddleocr": + + def resolve_paddleocr_llm_name(): + configured = parser_model_name or conf.get("paddleocr_llm_name") + if configured: + return configured + + tenant_id = self._canvas._tenant_id + if not tenant_id: + return None + + from api.db.services.tenant_llm_service import TenantLLMService + + env_name = TenantLLMService.ensure_paddleocr_from_env(tenant_id) + candidates = TenantLLMService.query(tenant_id=tenant_id, llm_factory="PaddleOCR", model_type=LLMType.OCR.value) + if candidates: + return candidates[0].llm_name + return env_name + + parser_model_name = resolve_paddleocr_llm_name() + if not parser_model_name: + raise RuntimeError("PaddleOCR model not configured. Please add PaddleOCR in Model Providers or set PADDLEOCR_* env.") + + tenant_id = self._canvas._tenant_id + ocr_model = LLMBundle(tenant_id, LLMType.OCR, llm_name=parser_model_name) + pdf_parser = ocr_model.mdl + + lines, _ = pdf_parser.parse_pdf( + filepath=name, + binary=blob, + callback=self.callback, + parse_method=conf.get("paddleocr_parse_method", "raw"), + ) + bboxes = [] + for section in lines: + # PaddleOCRParser returns sections as tuple, different formats based on parse_method: + # - "raw": (text, position_tag) + # - "manual": (text, label, position_tag) + # - "paper": (text_with_tag, label) + text = section[0] + + # Parse position tag if exists + position_tag = "" + if len(section) > 1: + if len(section) == 2: # raw format: (text, tag) + position_tag = section[1] + elif len(section) == 3: # manual format: (text, label, tag) + position_tag = section[2] + elif "paper" in conf.get("paddleocr_parse_method", "") and len(section) == 2: + # paper format: text may contain tag + text_with_tag = text + import re + + tag_match = re.search(r"(@@[0-9-]+\t[0-9.\t]+##)", text_with_tag) + if tag_match: + position_tag = tag_match.group(1) + text = text_with_tag.replace(position_tag, "").strip() + + # Extract coordinate information from position tag + page_number, x0, x1, top, bottom = 1, 0, 0, 0, 0 + if position_tag: + import re + + tag_match = re.match(r"@@([0-9-]+)\t([0-9.]+)\t([0-9.]+)\t([0-9.]+)\t([0-9.]+)##", position_tag) + if tag_match: + pn, x0_str, x1_str, top_str, bottom_str = tag_match.groups() + page_number = int(pn.split("-")[0]) # Take first page number + x0, x1, top, bottom = float(x0_str), float(x1_str), float(top_str), float(bottom_str) + + box = { + "text": text, + "page_number": page_number, + "x0": x0, + "x1": x1, + "top": top, + "bottom": bottom, + } + bboxes.append(box) else: vision_model = LLMBundle(self._canvas._tenant_id, LLMType.IMAGE2TEXT, llm_name=conf.get("parse_method"), lang=self._param.setups["pdf"].get("lang")) lines, _ = VisionParser(vision_model=vision_model)(blob, callback=self.callback) @@ -802,7 +884,7 @@ class Parser(ProcessBase): outs = self.output() tasks = [] for d in outs.get("json", []): - tasks.append(asyncio.create_task(image2id(d,partial(settings.STORAGE_IMPL.put, tenant_id=self._canvas._tenant_id),get_uuid()))) + tasks.append(asyncio.create_task(image2id(d, partial(settings.STORAGE_IMPL.put, tenant_id=self._canvas._tenant_id), get_uuid()))) try: await asyncio.gather(*tasks, return_exceptions=False) diff --git a/rag/llm/ocr_model.py b/rag/llm/ocr_model.py index 9b69eb5a5..800935467 100644 --- a/rag/llm/ocr_model.py +++ b/rag/llm/ocr_model.py @@ -19,6 +19,7 @@ import os from typing import Any, Optional from deepdoc.parser.mineru_parser import MinerUParser +from deepdoc.parser.paddleocr_parser import PaddleOCRParser class Base: @@ -60,16 +61,11 @@ class MinerUOcrModel(Base, MinerUParser): # Redact sensitive config keys before logging redacted_config = {} for k, v in config.items(): - if any( - sensitive_word in k.lower() - for sensitive_word in ("key", "password", "token", "secret") - ): + if any(sensitive_word in k.lower() for sensitive_word in ("key", "password", "token", "secret")): redacted_config[k] = "[REDACTED]" else: redacted_config[k] = v - logging.info( - f"Parsed MinerU config (sensitive fields redacted): {redacted_config}" - ) + logging.info(f"Parsed MinerU config (sensitive fields redacted): {redacted_config}") MinerUParser.__init__(self, mineru_api=self.mineru_api, mineru_server_url=self.mineru_server_url) @@ -93,6 +89,60 @@ class MinerUOcrModel(Base, MinerUParser): server_url=self.mineru_server_url, delete_output=self.mineru_delete_output, parse_method=parse_method, - **kwargs + **kwargs, ) return sections, tables + + +class PaddleOCROcrModel(Base, PaddleOCRParser): + _FACTORY_NAME = "PaddleOCR" + + def __init__(self, key: str | dict, model_name: str, **kwargs): + Base.__init__(self, key, model_name, **kwargs) + raw_config = {} + if key: + try: + raw_config = json.loads(key) + except Exception: + raw_config = {} + + # nested {"api_key": {...}} from UI + # flat {"PADDLEOCR_*": "..."} payload auto-provisioned from env vars + config = raw_config.get("api_key", raw_config) + if not isinstance(config, dict): + config = {} + + def _resolve_config(key: str, env_key: str, default=""): + # lower-case keys (UI), upper-case PADDLEOCR_* (env auto-provision), env vars + return config.get(key, config.get(env_key, os.environ.get(env_key, default))) + + self.paddleocr_api_url = _resolve_config("paddleocr_api_url", "PADDLEOCR_API_URL", "") + self.paddleocr_algorithm = _resolve_config("paddleocr_algorithm", "PADDLEOCR_ALGORITHM", "PaddleOCR-VL") + self.paddleocr_access_token = _resolve_config("paddleocr_access_token", "PADDLEOCR_ACCESS_TOKEN", None) + + # Redact sensitive config keys before logging + redacted_config = {} + for k, v in config.items(): + if any(sensitive_word in k.lower() for sensitive_word in ("key", "password", "token", "secret")): + redacted_config[k] = "[REDACTED]" + else: + redacted_config[k] = v + logging.info(f"Parsed PaddleOCR config (sensitive fields redacted): {redacted_config}") + + PaddleOCRParser.__init__( + self, + api_url=self.paddleocr_api_url, + access_token=self.paddleocr_access_token, + algorithm=self.paddleocr_algorithm, + ) + + def check_available(self) -> tuple[bool, str]: + return self.check_installation() + + def parse_pdf(self, filepath: str, binary=None, callback=None, parse_method: str = "raw", **kwargs): + ok, reason = self.check_available() + if not ok: + raise RuntimeError(f"PaddleOCR server not accessible: {reason}") + + sections, tables = PaddleOCRParser.parse_pdf(self, filepath=filepath, binary=binary, callback=callback, parse_method=parse_method, **kwargs) + return sections, tables diff --git a/web/src/assets/svg/llm/paddleocr.svg b/web/src/assets/svg/llm/paddleocr.svg new file mode 100644 index 000000000..e2e3f13e7 --- /dev/null +++ b/web/src/assets/svg/llm/paddleocr.svg @@ -0,0 +1,14 @@ + + + + + + + + + + + + + + diff --git a/web/src/components/layout-recognize-form-field.tsx b/web/src/components/layout-recognize-form-field.tsx index 965eee833..e122055e4 100644 --- a/web/src/components/layout-recognize-form-field.tsx +++ b/web/src/components/layout-recognize-form-field.tsx @@ -6,6 +6,7 @@ import { camelCase } from 'lodash'; import { ReactNode, useMemo } from 'react'; import { useFormContext } from 'react-hook-form'; import { MinerUOptionsFormField } from './mineru-options-form-field'; +import { PaddleOCROptionsFormField } from './paddleocr-options-form-field'; import { SelectWithSearch } from './originui/select-with-search'; import { FormControl, @@ -28,12 +29,14 @@ export function LayoutRecognizeFormField({ optionsWithoutLLM, label, showMineruOptions = true, + showPaddleocrOptions = true, }: { name?: string; horizontal?: boolean; optionsWithoutLLM?: { value: string; label: string }[]; label?: ReactNode; showMineruOptions?: boolean; + showPaddleocrOptions?: boolean; }) { const form = useFormContext(); @@ -113,6 +116,7 @@ export function LayoutRecognizeFormField({ {showMineruOptions && } + {showPaddleocrOptions && } ); }} diff --git a/web/src/components/paddleocr-options-form-field.tsx b/web/src/components/paddleocr-options-form-field.tsx new file mode 100644 index 000000000..0d70519eb --- /dev/null +++ b/web/src/components/paddleocr-options-form-field.tsx @@ -0,0 +1,95 @@ +import { RAGFlowFormItem } from '@/components/ragflow-form'; +import { Input } from '@/components/ui/input'; +import { RAGFlowSelect } from '@/components/ui/select'; +import { LLMFactory } from '@/constants/llm'; +import { buildOptions } from '@/utils/form'; +import { useFormContext, useWatch } from 'react-hook-form'; +import { useTranslation } from 'react-i18next'; + +const algorithmOptions = buildOptions(['PaddleOCR-VL']); + +export function PaddleOCROptionsFormField({ + namePrefix = 'parser_config', +}: { + namePrefix?: string; +}) { + const form = useFormContext(); + const { t } = useTranslation(); + const buildName = (field: string) => + namePrefix ? `${namePrefix}.${field}` : field; + + const layoutRecognize = useWatch({ + control: form.control, + name: 'parser_config.layout_recognize', + }); + + // Check if PaddleOCR is selected (the value contains 'PaddleOCR' or matches the factory name) + const isPaddleOCRSelected = + layoutRecognize?.includes(LLMFactory.PaddleOCR) || + layoutRecognize?.toLowerCase()?.includes('paddleocr'); + + if (!isPaddleOCRSelected) { + return null; + } + + return ( +
+
+ {t('knowledgeConfiguration.paddleocrOptions', 'PaddleOCR Options')} +
+ + + {(field) => ( + + )} + + + + {(field) => ( + + )} + + + + {(field) => ( + + )} + +
+ ); +} diff --git a/web/src/components/svg-icon.tsx b/web/src/components/svg-icon.tsx index b93d4a01b..8931a292f 100644 --- a/web/src/components/svg-icon.tsx +++ b/web/src/components/svg-icon.tsx @@ -105,6 +105,7 @@ export const LlmIcon = ({ LLMFactory.Gemini, LLMFactory.StepFun, LLMFactory.MinerU, + LLMFactory.PaddleOCR, // LLMFactory.DeerAPI, ]; if (svgIcons.includes(name as LLMFactory)) { diff --git a/web/src/constants/llm.ts b/web/src/constants/llm.ts index d603dbf4f..5551ad3ce 100644 --- a/web/src/constants/llm.ts +++ b/web/src/constants/llm.ts @@ -61,6 +61,7 @@ export enum LLMFactory { JiekouAI = 'Jiekou.AI', Builtin = 'Builtin', MinerU = 'MinerU', + PaddleOCR = 'PaddleOCR', } // Please lowercase the file name @@ -127,6 +128,7 @@ export const IconMap = { [LLMFactory.JiekouAI]: 'jiekouai', [LLMFactory.Builtin]: 'builtin', [LLMFactory.MinerU]: 'mineru', + [LLMFactory.PaddleOCR]: 'paddleocr', }; export const APIMapUrl = { @@ -178,4 +180,5 @@ export const APIMapUrl = { [LLMFactory.DeerAPI]: 'https://api.deerapi.com/token', [LLMFactory.TokenPony]: 'https://www.tokenpony.cn/#/user/keys', [LLMFactory.DeepInfra]: 'https://deepinfra.com/dash/api_keys', + [LLMFactory.PaddleOCR]: 'https://www.paddleocr.ai/latest/', }; diff --git a/web/src/locales/de.ts b/web/src/locales/de.ts index a3860f5ab..a0bae495b 100644 --- a/web/src/locales/de.ts +++ b/web/src/locales/de.ts @@ -385,6 +385,17 @@ Prozedurales Gedächtnis: Erlernte Fähigkeiten, Gewohnheiten und automatisierte 'Formelerkennung aktivieren. Hinweis: Dies funktioniert möglicherweise nicht korrekt bei kyrillischen Dokumenten.', mineruTableEnable: 'Tabellenerkennung', mineruTableEnableTip: 'Tabellenerkennung und -extraktion aktivieren.', + paddleocrOptions: 'PaddleOCR-Optionen', + paddleocrApiUrl: 'PaddleOCR API-URL', + paddleocrApiUrlTip: 'API-Endpunkt-URL des PaddleOCR-Dienstes', + paddleocrApiUrlPlaceholder: 'Zum Beispiel: https://paddleocr-server.com/layout-parsing', + paddleocrAccessToken: 'AI Studio-Zugriffstoken', + paddleocrAccessTokenTip: 'Zugriffstoken für die PaddleOCR-API (optional)', + paddleocrAccessTokenPlaceholder: 'Ihr AI Studio-Token (optional)', + paddleocrAlgorithm: 'PaddleOCR-Algorithmus', + paddleocrAlgorithmTip: 'Algorithmus, der für die PaddleOCR-Verarbeitung verwendet wird', + paddleocrSelectAlgorithm: 'Algorithmus auswählen', + paddleocrModelNamePlaceholder: 'Zum Beispiel: paddleocr-umgebung-1', overlappedPercent: 'Überlappungsprozent(%)', generationScopeTip: 'Bestimmt, ob RAPTOR für den gesamten Datensatz oder für eine einzelne Datei generiert wird.', @@ -475,7 +486,7 @@ Prozedurales Gedächtnis: Erlernte Fähigkeiten, Gewohnheiten und automatisierte book: `

Unterstützte Dateiformate sind DOCX, PDF, TXT.

Für jedes Buch im PDF-Format stellen Sie bitte die Seitenbereiche ein, um unerwünschte Informationen zu entfernen und die Analysezeit zu reduzieren.

`, laws: `

Unterstützte Dateiformate sind DOCX, PDF, TXT.

- Rechtliche Dokumente folgen in der Regel einem strengen Schreibformat. Wir verwenden Textmerkmale, um Teilungspunkte zu identifizieren. + Rechtliche Dokumente folgen in der Regel einem strengen Schreibformat. Wir verwenden Textmerkmale, um Teilungspunkte zu identifizieren.

Der Chunk hat eine Granularität, die mit 'ARTIKEL' übereinstimmt, wobei sichergestellt wird, dass der gesamte übergeordnete Text im Chunk enthalten ist.

`, @@ -489,7 +500,7 @@ Prozedurales Gedächtnis: Erlernte Fähigkeiten, Gewohnheiten und automatisierte
  • Dann werden benachbarte Segmente kombiniert, bis die Token-Anzahl den durch 'Chunk-Token-Anzahl' festgelegten Schwellenwert überschreitet, woraufhin ein Chunk erstellt wird.
  • `, paper: `

    Nur PDF-Dateien werden unterstützt.

    Papers werden nach Abschnitten wie abstract, 1.1, 1.2 aufgeteilt.

    - Dieser Ansatz ermöglicht es dem LLM, das Paper effektiver zusammenzufassen und umfassendere, verständlichere Antworten zu liefern. + Dieser Ansatz ermöglicht es dem LLM, das Paper effektiver zusammenzufassen und umfassendere, verständlichere Antworten zu liefern. Es erhöht jedoch auch den Kontext für KI-Gespräche und die Rechenkosten für das LLM. Daher sollten Sie während eines Gesprächs erwägen, den Wert von 'topN' zu reduzieren.

    `, presentation: `

    Unterstützte Dateiformate sind PDF, PPTX.

    Jede Seite in den Folien wird als Chunk behandelt, wobei ihr Vorschaubild gespeichert wird.

    @@ -1108,6 +1119,17 @@ Beispiel: Virtual Hosted Style`, modelTypeMessage: 'Bitte geben Sie Ihren Modelltyp ein!', addLlmBaseUrl: 'Basis-URL', baseUrlNameMessage: 'Bitte geben Sie Ihre Basis-URL ein!', + paddleocr: { + apiUrl: 'PaddleOCR API-URL', + apiUrlPlaceholder: 'Zum Beispiel: https://paddleocr-server.com/layout-parsing', + accessToken: 'AI Studio-Zugriffstoken', + accessTokenPlaceholder: 'Ihr AI Studio-Token (optional)', + algorithm: 'PaddleOCR-Algorithmus', + selectAlgorithm: 'Algorithmus auswählen', + modelNamePlaceholder: 'Zum Beispiel: paddleocr-from-env-1', + modelNameRequired: 'Der Modellname ist ein Pflichtfeld', + apiUrlRequired: 'Die PaddleOCR API-URL ist ein Pflichtfeld' + }, vision: 'Unterstützt es Vision?', ollamaLink: 'Wie integriere ich {{name}}', FishAudioLink: 'Wie verwende ich FishAudio', diff --git a/web/src/locales/en.ts b/web/src/locales/en.ts index a15edf75d..89416aea7 100644 --- a/web/src/locales/en.ts +++ b/web/src/locales/en.ts @@ -148,7 +148,7 @@ Procedural Memory: Learned skills, habits, and automated procedures.`, action: 'Action', }, config: { - memorySizeTooltip: `Accounts for each message's content + its embedding vector (≈ Content + Dimensions × 8 Bytes). + memorySizeTooltip: `Accounts for each message's content + its embedding vector (≈ Content + Dimensions × 8 Bytes). Example: A 1 KB message with 1024-dim embedding uses ~9 KB. The 5 MB default limit holds ~500 such messages.`, avatar: 'Avatar', description: 'Description', @@ -424,6 +424,17 @@ Example: A 1 KB message with 1024-dim embedding uses ~9 KB. The 5 MB default lim 'Enable formula recognition. Note: This may not work correctly for Cyrillic documents.', mineruTableEnable: 'Table recognition', mineruTableEnableTip: 'Enable table recognition and extraction.', + paddleocrOptions: 'PaddleOCR Options', + paddleocrApiUrl: 'PaddleOCR API URL', + paddleocrApiUrlTip: 'The API endpoint URL for PaddleOCR service', + paddleocrApiUrlPlaceholder: 'e.g. https://paddleocr-server.com/layout-parsing', + paddleocrAccessToken: 'AI Studio Access Token', + paddleocrAccessTokenTip: 'Access token for PaddleOCR API (optional)', + paddleocrAccessTokenPlaceholder: 'Your AI Studio token (optional)', + paddleocrAlgorithm: 'PaddleOCR Algorithm', + paddleocrAlgorithmTip: 'Algorithm to use for PaddleOCR parsing', + paddleocrSelectAlgorithm: 'Select Algorithm', + paddleocrModelNamePlaceholder: 'e.g. paddleocr-from-env-1', overlappedPercent: 'Overlapped percent(%)', generationScopeTip: 'Determines whether RAPTOR is generated for the entire dataset or for a single file.', @@ -1094,6 +1105,17 @@ Example: Virtual Hosted Style`, modelTypeMessage: 'Please input your model type!', addLlmBaseUrl: 'Base url', baseUrlNameMessage: 'Please input your base url!', + paddleocr: { + apiUrl: 'PaddleOCR API URL', + apiUrlPlaceholder: 'For example: https://paddleocr-server.com/layout-parsing', + accessToken: 'AI Studio Access Token', + accessTokenPlaceholder: 'Your AI Studio token (optional)', + algorithm: 'PaddleOCR Algorithm', + selectAlgorithm: 'Select Algorithm', + modelNamePlaceholder: 'For example: paddleocr-from-env-1', + modelNameRequired: 'Model name is required', + apiUrlRequired: 'PaddleOCR API URL is required' + }, vision: 'Does it support Vision?', ollamaLink: 'How to integrate {{name}}', FishAudioLink: 'How to use FishAudio', diff --git a/web/src/locales/es.ts b/web/src/locales/es.ts index a666adb59..a94caea81 100644 --- a/web/src/locales/es.ts +++ b/web/src/locales/es.ts @@ -159,6 +159,20 @@ export default { html4excelTip: `Usar junto con el método de fragmentación General. Cuando está desactivado, los archivos de hoja de cálculo (XLSX, XLS (Excel 97-2003)) se analizan línea por línea como pares clave-valor. Cuando está activado, los archivos de hoja de cálculo se convierten en tablas HTML. Si la tabla original tiene más de 12 filas, el sistema la dividirá automáticamente en varias tablas HTML cada 12 filas. Para más información, consulte https://ragflow.io/docs/dev/enable_excel2html.`, }, + knowledgeConfiguration: { + paddleocrOptions: 'Opciones de PaddleOCR', + paddleocrApiUrl: 'URL de API de PaddleOCR', + paddleocrApiUrlTip: 'La URL del endpoint de la API para el servicio PaddleOCR', + paddleocrApiUrlPlaceholder: 'ej: https://servidor-paddleocr.com/api', + paddleocrAccessToken: 'Token de acceso de AI Studio', + paddleocrAccessTokenTip: 'Token de acceso para la API de PaddleOCR (opcional)', + paddleocrAccessTokenPlaceholder: 'Su token de AI Studio (opcional)', + paddleocrAlgorithm: 'Algoritmo de PaddleOCR', + paddleocrAlgorithmTip: 'Algoritmo a utilizar para el análisis de PaddleOCR', + paddleocrSelectAlgorithm: 'Seleccionar algoritmo', + paddleocrModelNamePlaceholder: 'ej: paddleocr-desde-env-1', + }, + // Otros bloques de traducción // Continua con la misma estructura chat: { @@ -379,6 +393,17 @@ export default { modelTypeMessage: '¡Por favor ingresa el tipo de tu modelo!', addLlmBaseUrl: 'URL base', baseUrlNameMessage: '¡Por favor ingresa tu URL base!', + paddleocr: { + apiUrl: 'URL de la API de PaddleOCR', + apiUrlPlaceholder: 'Por ejemplo: https://paddleocr-server.com/layout-parsing', + accessToken: 'Token de acceso de AI Studio', + accessTokenPlaceholder: 'Su token de AI Studio (opcional)', + algorithm: 'Algoritmo de PaddleOCR', + selectAlgorithm: 'Seleccionar algoritmo', + modelNamePlaceholder: 'Por ejemplo: paddleocr-from-env-1', + modelNameRequired: 'El nombre del modelo es obligatorio', + apiUrlRequired: 'La URL de la API de PaddleOCR es obligatoria' + }, vision: '¿Soporta visión?', ollamaLink: 'Cómo integrar {{name}}', FishAudioLink: 'Cómo usar FishAudio', diff --git a/web/src/locales/fr.ts b/web/src/locales/fr.ts index a664bc349..d89a69e4a 100644 --- a/web/src/locales/fr.ts +++ b/web/src/locales/fr.ts @@ -293,6 +293,17 @@ export default { communityTip: `Un "community" est un groupe d’entités liées. Le LLM peut générer un résumé pour chaque groupe. Voir plus ici : https: //www.microsoft.com/en-us/research/blog/graphrag-improving-global-search-via-dynamic-community-selection/`, theDocumentBeingParsedCannotBeDeleted: 'Le document en cours d’analyse ne peut pas être supprimé', + paddleocrOptions: 'Options PaddleOCR', + paddleocrApiUrl: 'URL de l’API PaddleOCR', + paddleocrApiUrlTip: 'URL du point de terminaison de l’API du service PaddleOCR', + paddleocrApiUrlPlaceholder: 'Par exemple : https://paddleocr-server.com/layout-parsing', + paddleocrAccessToken: 'Jeton d’accès AI Studio', + paddleocrAccessTokenTip: 'Jeton d’accès à l’API PaddleOCR (optionnel)', + paddleocrAccessTokenPlaceholder: 'Votre jeton AI Studio (optionnel)', + paddleocrAlgorithm: 'Algorithme PaddleOCR', + paddleocrAlgorithmTip: 'Algorithme utilisé pour l’analyse PaddleOCR', + paddleocrSelectAlgorithm: 'Sélectionner un algorithme', + paddleocrModelNamePlaceholder: 'Par exemple : paddleocr-environnement-1', }, chunk: { chunk: 'Segment', @@ -566,6 +577,17 @@ export default { modelTypeMessage: 'Veuillez saisir le type de votre modèle !', addLlmBaseUrl: 'URL de base', baseUrlNameMessage: 'Veuillez saisir votre URL de base !', + paddleocr: { + apiUrl: 'URL de l’API PaddleOCR', + apiUrlPlaceholder: 'Par exemple : https://paddleocr-server.com/layout-parsing', + accessToken: 'Jeton d’accès AI Studio', + accessTokenPlaceholder: 'Votre jeton AI Studio (optionnel)', + algorithm: 'Algorithme PaddleOCR', + selectAlgorithm: 'Sélectionner un algorithme', + modelNamePlaceholder: 'Par exemple : paddleocr-from-env-1', + modelNameRequired: 'Le nom du modèle est obligatoire', + apiUrlRequired: 'L’URL de l’API PaddleOCR est obligatoire' + }, vision: 'Supporte-t-il la vision ?', ollamaLink: 'Comment intégrer {{name}}', FishAudioLink: 'Comment utiliser FishAudio', diff --git a/web/src/locales/id.ts b/web/src/locales/id.ts index 61a728b1b..975760941 100644 --- a/web/src/locales/id.ts +++ b/web/src/locales/id.ts @@ -316,6 +316,17 @@ export default { randomSeed: 'Benih acak', randomSeedMessage: 'Benih acak diperlukan', entityTypes: 'Jenis entitas', + paddleocrOptions: 'Opsi PaddleOCR', + paddleocrApiUrl: 'URL API PaddleOCR', + paddleocrApiUrlTip: 'URL endpoint API layanan PaddleOCR', + paddleocrApiUrlPlaceholder: 'Contoh: https://paddleocr-server.com/layout-parsing', + paddleocrAccessToken: 'Token Akses AI Studio', + paddleocrAccessTokenTip: 'Token akses untuk API PaddleOCR (opsional)', + paddleocrAccessTokenPlaceholder: 'Token AI Studio Anda (opsional)', + paddleocrAlgorithm: 'Algoritma PaddleOCR', + paddleocrAlgorithmTip: 'Algoritma yang digunakan untuk pemrosesan PaddleOCR', + paddleocrSelectAlgorithm: 'Pilih algoritma', + paddleocrModelNamePlaceholder: 'Contoh: paddleocr-lingkungan-1', }, chunk: { chunk: 'Potongan', @@ -553,6 +564,17 @@ export default { modelTypeMessage: 'Silakan masukkan jenis model Anda!', addLlmBaseUrl: 'Base url', baseUrlNameMessage: 'Silakan masukkan base url Anda!', + paddleocr: { + apiUrl: 'URL API PaddleOCR', + apiUrlPlaceholder: 'Contoh: https://paddleocr-server.com/layout-parsing', + accessToken: 'Token Akses AI Studio', + accessTokenPlaceholder: 'Token AI Studio Anda (opsional)', + algorithm: 'Algoritma PaddleOCR', + selectAlgorithm: 'Pilih algoritma', + modelNamePlaceholder: 'Contoh: paddleocr-from-env-1', + modelNameRequired: 'Nama model wajib diisi', + apiUrlRequired: 'URL API PaddleOCR wajib diisi' + }, vision: 'Apakah mendukung Vision?', ollamaLink: 'Cara mengintegrasikan {{name}}', FishAudioLink: 'Cara menggunakan FishAudio', diff --git a/web/src/locales/it.ts b/web/src/locales/it.ts index cb44b0753..b2c6f3330 100644 --- a/web/src/locales/it.ts +++ b/web/src/locales/it.ts @@ -488,6 +488,17 @@ Quanto sopra è il contenuto che devi riassumere.`, 'In un grafo della conoscenza, una comunità è un cluster di entità collegate da relazioni. Puoi far generare al LLM un abstract per ogni comunità, noto come report comunità.', theDocumentBeingParsedCannotBeDeleted: 'Il documento in fase di analisi non può essere eliminato', + paddleocrOptions: 'Opzioni PaddleOCR', + paddleocrApiUrl: 'URL API di PaddleOCR', + paddleocrApiUrlTip: 'URL dell’endpoint API del servizio PaddleOCR', + paddleocrApiUrlPlaceholder: 'Ad esempio: https://paddleocr-server.com/layout-parsing', + paddleocrAccessToken: 'Token di accesso AI Studio', + paddleocrAccessTokenTip: 'Token di accesso per l’API PaddleOCR (facoltativo)', + paddleocrAccessTokenPlaceholder: 'Il tuo token AI Studio (facoltativo)', + paddleocrAlgorithm: 'Algoritmo PaddleOCR', + paddleocrAlgorithmTip: 'Algoritmo utilizzato per l’elaborazione PaddleOCR', + paddleocrSelectAlgorithm: 'Seleziona algoritmo', + paddleocrModelNamePlaceholder: 'Ad esempio: paddleocr-ambiente-1', }, chunk: { chunk: 'Chunk', @@ -785,6 +796,17 @@ Quanto sopra è il contenuto che devi riassumere.`, modelTypeMessage: 'Inserisci il tuo tipo di modello!', addLlmBaseUrl: 'URL base', baseUrlNameMessage: 'Inserisci il tuo URL base!', + paddleocr: { + apiUrl: 'URL API di PaddleOCR', + apiUrlPlaceholder: 'Ad esempio: https://paddleocr-server.com/layout-parsing', + accessToken: 'Token di accesso AI Studio', + accessTokenPlaceholder: 'Il tuo token AI Studio (facoltativo)', + algorithm: 'Algoritmo PaddleOCR', + selectAlgorithm: 'Seleziona algoritmo', + modelNamePlaceholder: 'Ad esempio: paddleocr-from-env-1', + modelNameRequired: 'Il nome del modello è obbligatorio', + apiUrlRequired: 'L’URL API di PaddleOCR è obbligatorio' + }, vision: 'Supporta Vision?', ollamaLink: 'Come integrare {{name}}', FishAudioLink: 'Come usare FishAudio', diff --git a/web/src/locales/ja.ts b/web/src/locales/ja.ts index 9eda792e2..5b10f3596 100644 --- a/web/src/locales/ja.ts +++ b/web/src/locales/ja.ts @@ -240,7 +240,7 @@ export default { XLSX形式のファイルには、ヘッダーのない2つの 列が必要です: 1つは質問の列でもう1つは回答の列です (質問列が先行)。複数のシートも可能です。 - +

  • CSV/TXT形式のファイルは、TABで区切られたUTF-8エンコードである必要があります。 @@ -285,7 +285,7 @@ export default { LLMがその量のコンテキスト長を処理できる場合に、ドキュメント全体を要約する必要があるときに適用されます。

    `, knowledgeGraph: `

    対応ファイル形式はDOCX, EXCEL, PPT, IMAGE, PDF, TXT, MD, JSON, EMLです。 - +

    このアプローチでは、ファイルを'ナイーブ'/'一般'メソッドを使用してチャンクに分割します。ドキュメントをセグメントに分割し、隣接するセグメントを結合してトークン数が'チャンクトークン数'で指定されたしきい値を超えるまで続け、その時点でチャンクが作成されます。

    その後、チャンクはLLMに入力され、ナレッジグラフとマインドマップのエンティティと関係を抽出します。

    エンティティタイプを設定することを忘れないでください。

    `, @@ -314,6 +314,17 @@ export default { entityTypes: 'エンティティタイプ', pageRank: 'ページランク', pageRankTip: `検索時に特定の知識ベースにより高いPageRankスコアを割り当てることができます。対応するスコアは、これらの知識ベースから取得されたチャンクのハイブリッド類似度スコアに加算され、ランキングが向上します。詳細については、https://ragflow.io/docs/dev/set_page_rank を参照してください。`, + paddleocrOptions: 'PaddleOCRオプション', + paddleocrApiUrl: 'PaddleOCR API URL', + paddleocrApiUrlTip: 'PaddleOCRサービスのAPIエンドポイントURL', + paddleocrApiUrlPlaceholder: '例: https://paddleocr-server.com/api', + paddleocrAccessToken: 'AI Studioアクセストークン', + paddleocrAccessTokenTip: 'PaddleOCR APIのアクセストークン(オプション)', + paddleocrAccessTokenPlaceholder: 'AI Studioトークン(オプション)', + paddleocrAlgorithm: 'PaddleOCRアルゴリズム', + paddleocrAlgorithmTip: 'PaddleOCR解析に使用するアルゴリズム', + paddleocrSelectAlgorithm: 'アルゴリズムを選択', + paddleocrModelNamePlaceholder: '例: paddleocr-from-env-1', }, chunk: { chunk: 'チャンク', @@ -596,6 +607,17 @@ export default { modelTypeMessage: 'モデルタイプを入力してください!', addLlmBaseUrl: 'ベースURL', baseUrlNameMessage: 'ベースURLを入力してください!', + paddleocr: { + apiUrl: 'PaddleOCR API URL', + apiUrlPlaceholder: '例:https://paddleocr-server.com/layout-parsing', + accessToken: 'AI Studio アクセストークン', + accessTokenPlaceholder: 'AI Studio のトークン(任意)', + algorithm: 'PaddleOCR アルゴリズム', + selectAlgorithm: 'アルゴリズムを選択', + modelNamePlaceholder: '例:paddleocr-from-env-1', + modelNameRequired: 'モデル名は必須です', + apiUrlRequired: 'PaddleOCR API URL は必須です' + }, vision: 'ビジョンをサポートしていますか?', ollamaLink: '{{name}}を統合する方法', FishAudioLink: 'FishAudioの使用方法', diff --git a/web/src/locales/pt-br.ts b/web/src/locales/pt-br.ts index 25bbab94f..d5df6fd81 100644 --- a/web/src/locales/pt-br.ts +++ b/web/src/locales/pt-br.ts @@ -310,6 +310,17 @@ export default { topnTags: 'Top-N Etiquetas', tags: 'Etiquetas', addTag: 'Adicionar etiqueta', + paddleocrOptions: 'Opções do PaddleOCR', + paddleocrApiUrl: 'URL da API do PaddleOCR', + paddleocrApiUrlTip: 'A URL do endpoint da API para o serviço PaddleOCR', + paddleocrApiUrlPlaceholder: 'ex: https://servidor-paddleocr.com/api', + paddleocrAccessToken: 'Token de Acesso do AI Studio', + paddleocrAccessTokenTip: 'Token de acesso para a API do PaddleOCR (opcional)', + paddleocrAccessTokenPlaceholder: 'Seu token do AI Studio (opcional)', + paddleocrAlgorithm: 'Algoritmo do PaddleOCR', + paddleocrAlgorithmTip: 'Algoritmo a ser usado para a análise do PaddleOCR', + paddleocrSelectAlgorithm: 'Selecionar algoritmo', + paddleocrModelNamePlaceholder: 'ex: paddleocr-do-ambiente-1', }, chunk: { chunk: 'Fragmento', @@ -546,6 +557,17 @@ export default { modelTypeMessage: 'Por favor, insira o tipo do seu modelo!', addLlmBaseUrl: 'URL base', baseUrlNameMessage: 'Por favor, insira sua URL base!', + paddleocr: { + apiUrl: 'URL da API do PaddleOCR', + apiUrlPlaceholder: 'Por exemplo: https://paddleocr-server.com/layout-parsing', + accessToken: 'Token de acesso do AI Studio', + accessTokenPlaceholder: 'Seu token do AI Studio (opcional)', + algorithm: 'Algoritmo do PaddleOCR', + selectAlgorithm: 'Selecionar algoritmo', + modelNamePlaceholder: 'Por exemplo: paddleocr-from-env-1', + modelNameRequired: 'O nome do modelo é obrigatório', + apiUrlRequired: 'A URL da API do PaddleOCR é obrigatória' + }, vision: 'Suporta visão?', ollamaLink: 'Como integrar {{name}}', FishAudioLink: 'Como usar FishAudio', diff --git a/web/src/locales/ru.ts b/web/src/locales/ru.ts index 404577a85..4e676ddce 100644 --- a/web/src/locales/ru.ts +++ b/web/src/locales/ru.ts @@ -510,6 +510,17 @@ export default { 'В графе знаний сообщество - это кластер сущностей, связанных отношениями. Вы можете поручить LLM генерировать аннотацию для каждого сообщества, известную как отчет сообщества. Более подробная информация здесь: https://www.microsoft.com/en-us/research/blog/graphrag-improving-global-search-via-dynamic-community-selection/', theDocumentBeingParsedCannotBeDeleted: 'Документ, который в данный момент парсится, не может быть удален', + paddleocrOptions: 'Параметры PaddleOCR', + paddleocrApiUrl: 'URL API PaddleOCR', + paddleocrApiUrlTip: 'URL конечной точки API сервиса PaddleOCR', + paddleocrApiUrlPlaceholder: 'Например: https://paddleocr-server.com/layout-parsing', + paddleocrAccessToken: 'Токен доступа AI Studio', + paddleocrAccessTokenTip: 'Токен доступа к API PaddleOCR (необязательно)', + paddleocrAccessTokenPlaceholder: 'Ваш токен AI Studio (необязательно)', + paddleocrAlgorithm: 'Алгоритм PaddleOCR', + paddleocrAlgorithmTip: 'Алгоритм, используемый для обработки PaddleOCR', + paddleocrSelectAlgorithm: 'Выбрать алгоритм', + paddleocrModelNamePlaceholder: 'Например: paddleocr-среда-1', }, chunk: { chunk: 'Чанк', @@ -716,7 +727,7 @@ export default { 'Базовый URL вашего экземпляра Confluence (например, https://your-domain.atlassian.net/wiki)', confluenceSpaceKeyTip: 'Необязательно: Укажите ключ пространства для синхронизации только определенного пространства. Оставьте пустым для синхронизации всех доступных пространств. Для нескольких пространств разделите запятыми (например, DEV,DOCS,HR)', - s3PrefixTip: `Укажите путь к папке в вашем S3 бакете для получения файлов. + s3PrefixTip: `Укажите путь к папке в вашем S3 бакете для получения файлов. Пример: general/v2/`, S3CompatibleEndpointUrlTip: `Требуется для S3 совместимого Storage Box. Укажите URL конечной точки, совместимой с S3. Пример: https://fsn1.your-objectstorage.com`, @@ -1034,6 +1045,17 @@ export default { modelsToBeAddedTooltip: 'Если ваш провайдер моделей не указан, но заявляет о "совместимости с OpenAI-API", выберите карточку OpenAI-API-compatible, чтобы добавить соответствующие модели. ', mcp: 'MCP', + paddleocr: { + apiUrl: 'URL API PaddleOCR', + apiUrlPlaceholder: 'Например: https://paddleocr-server.com/layout-parsing', + accessToken: 'Токен доступа AI Studio', + accessTokenPlaceholder: 'Ваш токен AI Studio (необязательно)', + algorithm: 'Алгоритм PaddleOCR', + selectAlgorithm: 'Выбрать алгоритм', + modelNamePlaceholder: 'Например: paddleocr-from-env-1', + modelNameRequired: 'Имя модели является обязательным', + apiUrlRequired: 'URL API PaddleOCR является обязательным' + }, }, message: { registered: 'Зарегистрирован!', diff --git a/web/src/locales/vi.ts b/web/src/locales/vi.ts index b57dad6e7..1c831f57b 100644 --- a/web/src/locales/vi.ts +++ b/web/src/locales/vi.ts @@ -354,6 +354,17 @@ export default { community: 'Xây dựng mối quan hệ cộng đồng', communityTip: 'Các liên kết được nhóm lại thành các cộng đồng phân cấp, với các thực thể và mối quan hệ kết nối từng phân đoạn lên các cấp độ trừu tượng cao hơn. Sau đó, chúng tôi sử dụng một LLM để tạo ra bản tóm tắt cho mỗi cộng đồng, được gọi là báo cáo cộng đồng. Xem thêm: https://www.microsoft.com/en-us/research/blog/graphrag-improving-global-search-via-dynamic-community-selection/', + paddleocrOptions: 'Tùy chọn PaddleOCR', + paddleocrApiUrl: 'URL API PaddleOCR', + paddleocrApiUrlTip: 'URL điểm cuối API của dịch vụ PaddleOCR', + paddleocrApiUrlPlaceholder: 'Ví dụ: https://paddleocr-server.com/layout-parsing', + paddleocrAccessToken: 'Token truy cập AI Studio', + paddleocrAccessTokenTip: 'Token truy cập cho API PaddleOCR (tùy chọn)', + paddleocrAccessTokenPlaceholder: 'Token AI Studio của bạn (tùy chọn)', + paddleocrAlgorithm: 'Thuật toán PaddleOCR', + paddleocrAlgorithmTip: 'Thuật toán được sử dụng để xử lý PaddleOCR', + paddleocrSelectAlgorithm: 'Chọn thuật toán', + paddleocrModelNamePlaceholder: 'Ví dụ: paddleocr-môi-trường-1', }, chunk: { chunk: 'Khối', @@ -595,6 +606,17 @@ export default { modelTypeMessage: 'Vui lòng nhập loại mô hình của bạn!', addLlmBaseUrl: 'URL cơ sở', baseUrlNameMessage: 'Vui lòng nhập URL cơ sở của bạn!', + paddleocr: { + apiUrl: 'URL API PaddleOCR', + apiUrlPlaceholder: 'Ví dụ: https://paddleocr-server.com/layout-parsing', + accessToken: 'Token truy cập AI Studio', + accessTokenPlaceholder: 'Token AI Studio của bạn (tùy chọn)', + algorithm: 'Thuật toán PaddleOCR', + selectAlgorithm: 'Chọn thuật toán', + modelNamePlaceholder: 'Ví dụ: paddleocr-from-env-1', + modelNameRequired: 'Tên mô hình là bắt buộc', + apiUrlRequired: 'URL API PaddleOCR là bắt buộc' + }, vision: 'Có hỗ trợ Tầm nhìn không?', ollamaLink: 'Cách tích hợp {{name}}', FishAudioLink: 'Cách sử dụng FishAudio', diff --git a/web/src/locales/zh-traditional.ts b/web/src/locales/zh-traditional.ts index 65bb4e08b..8113ca549 100644 --- a/web/src/locales/zh-traditional.ts +++ b/web/src/locales/zh-traditional.ts @@ -367,6 +367,17 @@ export default { `, tags: '標籤', addTag: '增加標籤', + paddleocrOptions: 'PaddleOCR 選項', + paddleocrApiUrl: 'PaddleOCR API URL', + paddleocrApiUrlTip: 'PaddleOCR 服務的 API 端點 URL', + paddleocrApiUrlPlaceholder: '例如:https://paddleocr-server.com/layout-parsing', + paddleocrAccessToken: 'AI Studio 訪問令牌', + paddleocrAccessTokenTip: 'PaddleOCR API 的訪問令牌(可選)', + paddleocrAccessTokenPlaceholder: '您的 AI Studio 令牌(可選)', + paddleocrAlgorithm: 'PaddleOCR 算法', + paddleocrAlgorithmTip: '用於 PaddleOCR 解析的算法', + paddleocrSelectAlgorithm: '選擇算法', + paddleocrModelNamePlaceholder: '例如:paddleocr-環境-1', useGraphRag: '提取知識圖譜', useGraphRagTip: '基於知識庫內所有切好的文本塊構建知識圖譜,用以提升多跳和複雜問題回答的正確率。請注意:構建知識圖譜將消耗大量 token 和時間。詳見 https://ragflow.io/docs/dev/construct_knowledge_graph。', @@ -644,6 +655,17 @@ export default { modelNameMessage: '請輸入模型名稱!', modelTypeMessage: '請輸入模型類型!', baseUrlNameMessage: '請輸入基礎 Url!', + paddleocr: { + apiUrl: 'PaddleOCR API URL', + apiUrlPlaceholder: '例如:https://paddleocr-server.com/layout-parsing', + accessToken: 'AI Studio 存取權杖', + accessTokenPlaceholder: '您的 AI Studio 權杖(選填)', + algorithm: 'PaddleOCR 演算法', + selectAlgorithm: '選擇演算法', + modelNamePlaceholder: '例如:paddleocr-from-env-1', + modelNameRequired: '模型名稱為必填項目', + apiUrlRequired: 'PaddleOCR API URL 為必填項目' + }, ollamaLink: '如何集成 {{name}}', FishAudioLink: '如何使用Fish Audio', TencentCloudLink: '如何使用騰訊雲語音識別', diff --git a/web/src/locales/zh.ts b/web/src/locales/zh.ts index 8cbb409aa..9abca9ded 100644 --- a/web/src/locales/zh.ts +++ b/web/src/locales/zh.ts @@ -390,6 +390,17 @@ export default { '启用公式识别。注意:对于西里尔文档可能无法正常工作。', mineruTableEnable: '表格识别', mineruTableEnableTip: '启用表格识别和提取。', + paddleocrOptions: 'PaddleOCR 选项', + paddleocrApiUrl: 'PaddleOCR API URL', + paddleocrApiUrlTip: 'PaddleOCR 服务的 API 端点 URL', + paddleocrApiUrlPlaceholder: '例如:https://paddleocr-server.com/layout-parsing', + paddleocrAccessToken: 'AI Studio 访问令牌', + paddleocrAccessTokenTip: 'PaddleOCR API 的访问令牌(可选)', + paddleocrAccessTokenPlaceholder: '您的 AI Studio 令牌(可选)', + paddleocrAlgorithm: 'PaddleOCR 算法', + paddleocrAlgorithmTip: '用于 PaddleOCR 解析的算法', + paddleocrSelectAlgorithm: '选择算法', + paddleocrModelNamePlaceholder: '例如:paddleocr-环境-1', generationScopeTip: '选择 RAPTOR 的生成范围:整个知识库或单个文件。', generationScope: '生成范围', scopeSingleFile: '单文件', @@ -1113,6 +1124,17 @@ General:实体和关系提取提示来自 GitHub - microsoft/graphrag:基于 vlmLmdeployEngine: '基于LMDeploy引擎的视觉语言模型(实验性)', }, }, + paddleocr: { + apiUrl: 'PaddleOCR API URL', + apiUrlPlaceholder: '例如:https://paddleocr-server.com/layout-parsing', + accessToken: 'AI Studio访问令牌', + accessTokenPlaceholder: '您的 AI Studio 令牌(可选)', + algorithm: 'PaddleOCR算法', + selectAlgorithm: '选择算法', + modelNamePlaceholder: '例如:paddleocr-from-env-1', + modelNameRequired: '模型名称为必填项', + apiUrlRequired: 'PaddleOCR API URL 为必填项' + }, }, message: { registered: '注册成功', diff --git a/web/src/pages/user-setting/setting-model/hooks.tsx b/web/src/pages/user-setting/setting-model/hooks.tsx index 237999fef..68d73326a 100644 --- a/web/src/pages/user-setting/setting-model/hooks.tsx +++ b/web/src/pages/user-setting/setting-model/hooks.tsx @@ -504,3 +504,43 @@ export const useSubmitMinerU = () => { mineruLoading: loading, }; }; + +export const useSubmitPaddleOCR = () => { + const { addLlm, loading } = useAddLlm(); + const { + visible: paddleocrVisible, + hideModal: hidePaddleOCRModal, + showModal: showPaddleOCRModal, + } = useSetModalState(); + + const onPaddleOCROk = useCallback( + async (payload: any) => { + const cfg: any = { + ...payload, + }; + const req: IAddLlmRequestBody = { + llm_factory: LLMFactory.PaddleOCR, + llm_name: payload.llm_name, + model_type: 'ocr', + api_key: cfg, + api_base: '', + max_tokens: 0, + }; + const ret = await addLlm(req); + if (ret === 0) { + hidePaddleOCRModal(); + return true; + } + return false; + }, + [addLlm, hidePaddleOCRModal], + ); + + return { + paddleocrVisible, + hidePaddleOCRModal, + showPaddleOCRModal, + onPaddleOCROk, + paddleocrLoading: loading, + }; +}; diff --git a/web/src/pages/user-setting/setting-model/index.tsx b/web/src/pages/user-setting/setting-model/index.tsx index 1b549496c..41224a16a 100644 --- a/web/src/pages/user-setting/setting-model/index.tsx +++ b/web/src/pages/user-setting/setting-model/index.tsx @@ -15,6 +15,7 @@ import { useSubmitHunyuan, useSubmitMinerU, useSubmitOllama, + useSubmitPaddleOCR, useSubmitSpark, useSubmitSystemModelSetting, useSubmitTencentCloud, @@ -28,6 +29,7 @@ import FishAudioModal from './modal/fish-audio-modal'; import GoogleModal from './modal/google-modal'; import HunyuanModal from './modal/hunyuan-modal'; import MinerUModal from './modal/mineru-modal'; +import PaddleOCRModal from './modal/paddleocr-modal'; import TencentCloudModal from './modal/next-tencent-modal'; import OllamaModal from './modal/ollama-modal'; import SparkModal from './modal/spark-modal'; @@ -138,6 +140,14 @@ const ModelProviders = () => { mineruLoading, } = useSubmitMinerU(); + const { + paddleocrVisible, + hidePaddleOCRModal, + showPaddleOCRModal, + onPaddleOCROk, + paddleocrLoading, + } = useSubmitPaddleOCR(); + const ModalMap = useMemo( () => ({ [LLMFactory.Bedrock]: showBedrockAddingModal, @@ -150,6 +160,7 @@ const ModelProviders = () => { [LLMFactory.GoogleCloud]: showGoogleAddingModal, [LLMFactory.AzureOpenAI]: showAzureAddingModal, [LLMFactory.MinerU]: showMineruModal, + [LLMFactory.PaddleOCR]: showPaddleOCRModal, }), [ showBedrockAddingModal, @@ -162,6 +173,7 @@ const ModelProviders = () => { showGoogleAddingModal, showAzureAddingModal, showMineruModal, + showPaddleOCRModal, ], ); @@ -309,6 +321,12 @@ const ModelProviders = () => { onOk={onMineruOk} loading={mineruLoading} > + ); }; diff --git a/web/src/pages/user-setting/setting-model/modal/paddleocr-modal/index.tsx b/web/src/pages/user-setting/setting-model/modal/paddleocr-modal/index.tsx new file mode 100644 index 000000000..2df23c3de --- /dev/null +++ b/web/src/pages/user-setting/setting-model/modal/paddleocr-modal/index.tsx @@ -0,0 +1,135 @@ +import { useForm } from 'react-hook-form'; +import { useTranslation } from 'react-i18next'; +import { z } from 'zod'; +import { zodResolver } from '@hookform/resolvers/zod'; +import { t } from 'i18next'; +import { + Dialog, + DialogContent, + DialogHeader, + DialogTitle, +} from '@/components/ui/dialog'; +import { RAGFlowFormItem } from '@/components/ragflow-form'; +import { RAGFlowSelect, RAGFlowSelectOptionType } from '@/components/ui/select'; +import { Input } from '@/components/ui/input'; +import { Form } from '@/components/ui/form'; +import { LLMHeader } from '../../components/llm-header'; +import { LLMFactory } from '@/constants/llm'; + +const FormSchema = z.object({ + llm_name: z.string().min(1, { + message: t('setting.paddleocr.modelNameRequired'), + }), + paddleocr_api_url: z.string().min(1, { + message: t('setting.paddleocr.apiUrlRequired'), + }), + paddleocr_access_token: z.string().optional(), + paddleocr_algorithm: z.string().default('PaddleOCR-VL'), +}); + +export type PaddleOCRFormValues = z.infer; + +export interface IModalProps { + visible: boolean; + hideModal: () => void; + onOk?: (data: T) => Promise; + loading?: boolean; +} + +const algorithmOptions: RAGFlowSelectOptionType[] = [ + { label: 'PaddleOCR-VL', value: 'PaddleOCR-VL' }, +]; + +const PaddleOCRModal = ({ + visible, + hideModal, + onOk, + loading, +}: IModalProps) => { + const { t } = useTranslation(); + + const form = useForm({ + resolver: zodResolver(FormSchema), + defaultValues: { + paddleocr_algorithm: 'PaddleOCR-VL', + }, + }); + + const handleOk = async (values: PaddleOCRFormValues) => { + const ret = await onOk?.(values as any); + if (ret) { + hideModal?.(); + } + }; + + return ( + + + + + + + +
    + + + + + + + + + + + + {(field) => ( + + )} + +
    + + +
    +
    + +
    +
    + ); +}; + +export default PaddleOCRModal;