feat: add paddleocr parser (#12513)

### What problem does this PR solve?

Add PaddleOCR as a new PDF parser.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
Lin Manhui
2026-01-09 17:48:45 +08:00
committed by GitHub
parent 6abf55c048
commit 2e09db02f3
34 changed files with 1510 additions and 453 deletions

View File

@ -195,6 +195,9 @@ async def add_llm():
elif factory == "MinerU":
api_key = apikey_json(["api_key", "provider_order"])
elif factory == "PaddleOCR":
api_key = apikey_json(["api_key", "provider_order"])
llm = {
"tenant_id": current_user.id,
"llm_factory": factory,
@ -230,8 +233,7 @@ async def add_llm():
**extra,
)
try:
m, tc = await mdl.async_chat(None, [{"role": "user", "content": "Hello! How are you doing!"}],
{"temperature": 0.9})
m, tc = await mdl.async_chat(None, [{"role": "user", "content": "Hello! How are you doing!"}], {"temperature": 0.9})
if not tc and m.find("**ERROR**:") >= 0:
raise Exception(m)
except Exception as e:
@ -381,7 +383,7 @@ def list_app():
facts = set([o.to_dict()["llm_factory"] for o in objs if o.api_key and o.status == StatusEnum.VALID.value])
status = {(o.llm_name + "@" + o.llm_factory) for o in objs if o.status == StatusEnum.VALID.value}
llms = LLMService.get_all()
llms = [m.to_dict() for m in llms if m.status == StatusEnum.VALID.value and m.fid not in weighted and (m.fid == 'Builtin' or (m.llm_name + "@" + m.fid) in status)]
llms = [m.to_dict() for m in llms if m.status == StatusEnum.VALID.value and m.fid not in weighted and (m.fid == "Builtin" or (m.llm_name + "@" + m.fid) in status)]
for m in llms:
m["available"] = m["fid"] in facts or m["llm_name"].lower() == "flag-embedding" or m["fid"] in self_deployed
if "tei-" in os.getenv("COMPOSE_PROFILES", "") and m["model_type"] == LLMType.EMBEDDING and m["fid"] == "Builtin" and m["llm_name"] == os.getenv("TEI_MODEL", ""):

View File

@ -19,7 +19,7 @@ import logging
from peewee import IntegrityError
from langfuse import Langfuse
from common import settings
from common.constants import MINERU_DEFAULT_CONFIG, MINERU_ENV_KEYS, LLMType
from common.constants import MINERU_DEFAULT_CONFIG, MINERU_ENV_KEYS, PADDLEOCR_DEFAULT_CONFIG, PADDLEOCR_ENV_KEYS, LLMType
from api.db.db_models import DB, LLMFactories, TenantLLM
from api.db.services.common_service import CommonService
from api.db.services.langfuse_service import TenantLangfuseService
@ -60,10 +60,8 @@ class TenantLLMService(CommonService):
@classmethod
@DB.connection_context()
def get_my_llms(cls, tenant_id):
fields = [cls.model.llm_factory, LLMFactories.logo, LLMFactories.tags, cls.model.model_type, cls.model.llm_name,
cls.model.used_tokens, cls.model.status]
objs = cls.model.select(*fields).join(LLMFactories, on=(cls.model.llm_factory == LLMFactories.name)).where(
cls.model.tenant_id == tenant_id, ~cls.model.api_key.is_null()).dicts()
fields = [cls.model.llm_factory, LLMFactories.logo, LLMFactories.tags, cls.model.model_type, cls.model.llm_name, cls.model.used_tokens, cls.model.status]
objs = cls.model.select(*fields).join(LLMFactories, on=(cls.model.llm_factory == LLMFactories.name)).where(cls.model.tenant_id == tenant_id, ~cls.model.api_key.is_null()).dicts()
return list(objs)
@ -90,6 +88,7 @@ class TenantLLMService(CommonService):
@DB.connection_context()
def get_model_config(cls, tenant_id, llm_type, llm_name=None):
from api.db.services.llm_service import LLMService
e, tenant = TenantService.get_by_id(tenant_id)
if not e:
raise LookupError("Tenant not found")
@ -119,9 +118,9 @@ class TenantLLMService(CommonService):
model_config = cls.get_api_key(tenant_id, mdlnm)
if model_config:
model_config = model_config.to_dict()
elif llm_type == LLMType.EMBEDDING and fid == 'Builtin' and "tei-" in os.getenv("COMPOSE_PROFILES", "") and mdlnm == os.getenv('TEI_MODEL', ''):
elif llm_type == LLMType.EMBEDDING and fid == "Builtin" and "tei-" in os.getenv("COMPOSE_PROFILES", "") and mdlnm == os.getenv("TEI_MODEL", ""):
embedding_cfg = settings.EMBEDDING_CFG
model_config = {"llm_factory": 'Builtin', "api_key": embedding_cfg["api_key"], "llm_name": mdlnm, "api_base": embedding_cfg["base_url"]}
model_config = {"llm_factory": "Builtin", "api_key": embedding_cfg["api_key"], "llm_name": mdlnm, "api_base": embedding_cfg["base_url"]}
else:
raise LookupError(f"Model({mdlnm}@{fid}) not authorized")
@ -140,33 +139,27 @@ class TenantLLMService(CommonService):
if llm_type == LLMType.EMBEDDING.value:
if model_config["llm_factory"] not in EmbeddingModel:
return None
return EmbeddingModel[model_config["llm_factory"]](model_config["api_key"], model_config["llm_name"],
base_url=model_config["api_base"])
return EmbeddingModel[model_config["llm_factory"]](model_config["api_key"], model_config["llm_name"], base_url=model_config["api_base"])
elif llm_type == LLMType.RERANK:
if model_config["llm_factory"] not in RerankModel:
return None
return RerankModel[model_config["llm_factory"]](model_config["api_key"], model_config["llm_name"],
base_url=model_config["api_base"])
return RerankModel[model_config["llm_factory"]](model_config["api_key"], model_config["llm_name"], base_url=model_config["api_base"])
elif llm_type == LLMType.IMAGE2TEXT.value:
if model_config["llm_factory"] not in CvModel:
return None
return CvModel[model_config["llm_factory"]](model_config["api_key"], model_config["llm_name"], lang,
base_url=model_config["api_base"], **kwargs)
return CvModel[model_config["llm_factory"]](model_config["api_key"], model_config["llm_name"], lang, base_url=model_config["api_base"], **kwargs)
elif llm_type == LLMType.CHAT.value:
if model_config["llm_factory"] not in ChatModel:
return None
return ChatModel[model_config["llm_factory"]](model_config["api_key"], model_config["llm_name"],
base_url=model_config["api_base"], **kwargs)
return ChatModel[model_config["llm_factory"]](model_config["api_key"], model_config["llm_name"], base_url=model_config["api_base"], **kwargs)
elif llm_type == LLMType.SPEECH2TEXT:
if model_config["llm_factory"] not in Seq2txtModel:
return None
return Seq2txtModel[model_config["llm_factory"]](key=model_config["api_key"],
model_name=model_config["llm_name"], lang=lang,
base_url=model_config["api_base"])
return Seq2txtModel[model_config["llm_factory"]](key=model_config["api_key"], model_name=model_config["llm_name"], lang=lang, base_url=model_config["api_base"])
elif llm_type == LLMType.TTS:
if model_config["llm_factory"] not in TTSModel:
return None
@ -216,14 +209,11 @@ class TenantLLMService(CommonService):
try:
num = (
cls.model.update(used_tokens=cls.model.used_tokens + used_tokens)
.where(cls.model.tenant_id == tenant_id, cls.model.llm_name == llm_name,
cls.model.llm_factory == llm_factory if llm_factory else True)
.where(cls.model.tenant_id == tenant_id, cls.model.llm_name == llm_name, cls.model.llm_factory == llm_factory if llm_factory else True)
.execute()
)
except Exception:
logging.exception(
"TenantLLMService.increase_usage got exception,Failed to update used_tokens for tenant_id=%s, llm_name=%s",
tenant_id, llm_name)
logging.exception("TenantLLMService.increase_usage got exception,Failed to update used_tokens for tenant_id=%s, llm_name=%s", tenant_id, llm_name)
return 0
return num
@ -231,9 +221,7 @@ class TenantLLMService(CommonService):
@classmethod
@DB.connection_context()
def get_openai_models(cls):
objs = cls.model.select().where((cls.model.llm_factory == "OpenAI"),
~(cls.model.llm_name == "text-embedding-3-small"),
~(cls.model.llm_name == "text-embedding-3-large")).dicts()
objs = cls.model.select().where((cls.model.llm_factory == "OpenAI"), ~(cls.model.llm_name == "text-embedding-3-small"), ~(cls.model.llm_name == "text-embedding-3-large")).dicts()
return list(objs)
@classmethod
@ -298,6 +286,68 @@ class TenantLLMService(CommonService):
idx += 1
continue
@classmethod
def _collect_paddleocr_env_config(cls) -> dict | None:
    """
    Build a PaddleOCR configuration from environment variables.

    Starts from a copy of PADDLEOCR_DEFAULT_CONFIG and overrides every key listed in
    PADDLEOCR_ENV_KEYS with the non-empty value of the same-named environment variable.

    Returns:
        The merged configuration dict, or None when none of the variables is set
        to a non-empty value.
    """
    # Work on a copy: writing into the module-level defaults would leak one call's
    # environment values into every later comparison against the defaults.
    cfg = dict(PADDLEOCR_DEFAULT_CONFIG)
    found = False
    for key in PADDLEOCR_ENV_KEYS:
        val = os.environ.get(key)
        if val:
            found = True
            cfg[key] = val
    return cfg if found else None
@classmethod
@DB.connection_context()
def ensure_paddleocr_from_env(cls, tenant_id: str) -> str | None:
    """
    Ensure a PaddleOCR model exists for the tenant if env variables are present.

    If a saved PaddleOCR OCR model of the tenant already carries the same settings as
    the environment, its name is reused; otherwise a new record named
    "paddleocr-from-env-<n>" is saved with the environment settings as its api_key JSON.

    Args:
        tenant_id: Tenant the model record belongs to.

    Returns:
        The existing or newly created llm_name, or None if env not set.
    """
    cfg = cls._collect_paddleocr_env_config()
    if not cfg:
        return None

    saved_paddleocr_models = cls.query(tenant_id=tenant_id, llm_factory="PaddleOCR", model_type=LLMType.OCR.value)

    def _parse_api_key(raw: str) -> dict:
        """Decode a stored api_key; anything that is not a JSON object yields {}."""
        try:
            parsed = json.loads(raw or "{}")
        except Exception:
            return {}
        # Valid JSON that is not an object (e.g. a numeric or quoted key) has no
        # settings to compare and must not break the .get() lookups below.
        return parsed if isinstance(parsed, dict) else {}

    for item in saved_paddleocr_models:
        api_cfg = _parse_api_key(item.api_key)
        # Fill missing keys with defaults so records are compared on the same key set as cfg.
        normalized = {k: api_cfg.get(k, PADDLEOCR_DEFAULT_CONFIG.get(k)) for k in PADDLEOCR_ENV_KEYS}
        if normalized == cfg:
            return item.llm_name

    used_names = {item.llm_name for item in saved_paddleocr_models}
    idx = 1
    base_name = "paddleocr-from-env"
    while True:
        candidate = f"{base_name}-{idx}"
        if candidate in used_names:
            idx += 1
            continue

        try:
            cls.save(
                tenant_id=tenant_id,
                llm_factory="PaddleOCR",
                llm_name=candidate,
                model_type=LLMType.OCR.value,
                api_key=json.dumps(cfg),
                api_base="",
                max_tokens=0,
            )
            return candidate
        except IntegrityError:
            # The name was taken between the lookup and the save; move on to the next index.
            logging.warning("PaddleOCR env model %s already exists for tenant %s, retry with next name", candidate, tenant_id)
            used_names.add(candidate)
            idx += 1
            continue
@classmethod
@DB.connection_context()
def delete_by_tenant_id(cls, tenant_id):
@ -306,6 +356,7 @@ class TenantLLMService(CommonService):
@staticmethod
def llm_id2llm_type(llm_id: str) -> str | None:
from api.db.services.llm_service import LLMService
llm_id, *_ = TenantLLMService.split_model_name_and_factory(llm_id)
llm_factories = settings.FACTORY_LLM_INFOS
for llm_factory in llm_factories:
@ -340,8 +391,7 @@ class LLM4Tenant:
langfuse_keys = TenantLangfuseService.filter_by_tenant(tenant_id=tenant_id)
self.langfuse = None
if langfuse_keys:
langfuse = Langfuse(public_key=langfuse_keys.public_key, secret_key=langfuse_keys.secret_key,
host=langfuse_keys.host)
langfuse = Langfuse(public_key=langfuse_keys.public_key, secret_key=langfuse_keys.secret_key, host=langfuse_keys.host)
if langfuse.auth_check():
self.langfuse = langfuse
trace_id = self.langfuse.create_trace_id()

View File

@ -20,6 +20,7 @@ from strenum import StrEnum
SERVICE_CONF = "service_conf.yaml"
RAG_FLOW_SERVICE_NAME = "ragflow"
class CustomEnum(Enum):
@classmethod
def valid(cls, value):
@ -68,13 +69,13 @@ class ActiveEnum(Enum):
class LLMType(StrEnum):
CHAT = 'chat'
EMBEDDING = 'embedding'
SPEECH2TEXT = 'speech2text'
IMAGE2TEXT = 'image2text'
RERANK = 'rerank'
TTS = 'tts'
OCR = 'ocr'
CHAT = "chat"
EMBEDDING = "embedding"
SPEECH2TEXT = "speech2text"
IMAGE2TEXT = "image2text"
RERANK = "rerank"
TTS = "tts"
OCR = "ocr"
class TaskStatus(StrEnum):
@ -86,8 +87,7 @@ class TaskStatus(StrEnum):
SCHEDULE = "5"
VALID_TASK_STATUS = {TaskStatus.UNSTART, TaskStatus.RUNNING, TaskStatus.CANCEL, TaskStatus.DONE, TaskStatus.FAIL,
TaskStatus.SCHEDULE}
VALID_TASK_STATUS = {TaskStatus.UNSTART, TaskStatus.RUNNING, TaskStatus.CANCEL, TaskStatus.DONE, TaskStatus.FAIL, TaskStatus.SCHEDULE}
class ParserType(StrEnum):
@ -136,6 +136,7 @@ class FileSource(StrEnum):
BITBUCKET = "bitbucket"
ZENDESK = "zendesk"
class PipelineTaskType(StrEnum):
PARSE = "Parse"
DOWNLOAD = "Download"
@ -145,15 +146,17 @@ class PipelineTaskType(StrEnum):
MEMORY = "Memory"
VALID_PIPELINE_TASK_TYPES = {PipelineTaskType.PARSE, PipelineTaskType.DOWNLOAD, PipelineTaskType.RAPTOR,
PipelineTaskType.GRAPH_RAG, PipelineTaskType.MINDMAP}
VALID_PIPELINE_TASK_TYPES = {PipelineTaskType.PARSE, PipelineTaskType.DOWNLOAD, PipelineTaskType.RAPTOR, PipelineTaskType.GRAPH_RAG, PipelineTaskType.MINDMAP}
class MCPServerType(StrEnum):
SSE = "sse"
STREAMABLE_HTTP = "streamable-http"
VALID_MCP_SERVER_TYPES = {MCPServerType.SSE, MCPServerType.STREAMABLE_HTTP}
class Storage(Enum):
MINIO = 1
AZURE_SPN = 2
@ -165,10 +168,10 @@ class Storage(Enum):
class MemoryType(Enum):
RAW = 0b0001 # 1 << 0 = 1 (0b00000001)
SEMANTIC = 0b0010 # 1 << 1 = 2 (0b00000010)
EPISODIC = 0b0100 # 1 << 2 = 4 (0b00000100)
PROCEDURAL = 0b1000 # 1 << 3 = 8 (0b00001000)
RAW = 0b0001 # 1 << 0 = 1 (0b00000001)
SEMANTIC = 0b0010 # 1 << 1 = 2 (0b00000010)
EPISODIC = 0b0100 # 1 << 2 = 4 (0b00000100)
PROCEDURAL = 0b1000 # 1 << 3 = 8 (0b00001000)
class MemoryStorageType(StrEnum):
@ -239,3 +242,10 @@ MINERU_DEFAULT_CONFIG = {
"MINERU_SERVER_URL": "",
"MINERU_DELETE_OUTPUT": 1,
}
# Names of the environment variables that are read when a PaddleOCR configuration
# is assembled from the environment.
PADDLEOCR_ENV_KEYS = ["PADDLEOCR_API_URL", "PADDLEOCR_ACCESS_TOKEN", "PADDLEOCR_ALGORITHM"]
# Fallback value for each of the keys above, used when a setting is not provided.
PADDLEOCR_DEFAULT_CONFIG = {
"PADDLEOCR_API_URL": "",
"PADDLEOCR_ACCESS_TOKEN": None,
"PADDLEOCR_ALGORITHM": "PaddleOCR-VL",
}

View File

@ -26,5 +26,8 @@ def normalize_layout_recognizer(layout_recognizer_raw: Any) -> tuple[Any, str |
if lowered.endswith("@mineru"):
parser_model_name = layout_recognizer_raw.rsplit("@", 1)[0]
layout_recognizer = "MinerU"
elif lowered.endswith("@paddleocr"):
parser_model_name = layout_recognizer_raw.rsplit("@", 1)[0]
layout_recognizer = "PaddleOCR"
return layout_recognizer, parser_model_name

View File

@ -5531,6 +5531,14 @@
"status": "1",
"rank": "900",
"llm": []
},
{
"name": "PaddleOCR",
"logo": "",
"tags": "OCR",
"status": "1",
"rank": "910",
"llm": []
}
]
}

View File

@ -0,0 +1,400 @@
# Copyright 2026 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from __future__ import annotations
import base64
import logging
import os
import re
from dataclasses import asdict, dataclass, field, fields
from io import BytesIO
from os import PathLike
from pathlib import Path
from typing import Any, Callable, ClassVar, Literal, Optional, Union, Tuple, List
import requests
try:
from deepdoc.parser.pdf_parser import RAGFlowPdfParser
except Exception:
class RAGFlowPdfParser:
pass
# Algorithm identifiers accepted by this module; "PaddleOCR-VL" is the only one listed.
AlgorithmType = Literal["PaddleOCR-VL"]
# One parsed text block as a tuple of strings (text together with a position tag and/or layout label).
SectionTuple = tuple[str, ...]
# One parsed table as a tuple of strings.
TableTuple = tuple[str, ...]
# Value returned by parse_pdf: (sections, tables).
ParseResult = tuple[list[SectionTuple], list[TableTuple]]
_MARKDOWN_IMAGE_PATTERN = re.compile(
r"""
<div[^>]*>\s*
<img[^>]*/>\s*
</div>
|
<img[^>]*/>
""",
re.IGNORECASE | re.VERBOSE | re.DOTALL,
)
def _remove_images_from_markdown(markdown: str) -> str:
return _MARKDOWN_IMAGE_PATTERN.sub("", markdown)
@dataclass
class PaddleOCRVLConfig:
    """Option set for the PaddleOCR-VL algorithm.

    Each field corresponds to one entry of the "PaddleOCR-VL" table in
    ``PaddleOCRParser._ALGORITHM_FIELD_MAPPINGS``. Every option defaults to ``None``
    except ``format_block_content``, which defaults to ``True``.
    """

    use_doc_orientation_classify: bool | None = None
    use_doc_unwarping: bool | None = None
    use_layout_detection: bool | None = None
    use_polygon_points: bool | None = None
    use_chart_recognition: bool | None = None
    use_seal_recognition: bool | None = None
    use_ocr_for_image_block: bool | None = None

    layout_threshold: float | dict | None = None
    layout_nms: bool | None = None
    layout_unclip_ratio: float | tuple[float, float] | dict | None = None
    layout_merge_bboxes_mode: str | dict | None = None

    prompt_label: str | None = None
    format_block_content: bool | None = True

    repetition_penalty: float | None = None
    temperature: float | None = None
    top_p: float | None = None
    min_pixels: int | None = None
    max_pixels: int | None = None
    max_new_tokens: int | None = None

    merge_layout_blocks: bool | None = None
    markdown_ignore_labels: list[str] | None = None
    vlm_extra_args: dict | None = None
@dataclass
class PaddleOCRConfig:
    """Main configuration for PaddleOCR parser.

    Holds the endpoint/credential settings, the output switches shared by all
    algorithms, free-form ``additional_params`` and the algorithm-specific option
    dict ``algorithm_config``.
    """

    api_url: str = ""
    access_token: Optional[str] = None
    algorithm: AlgorithmType = "PaddleOCR-VL"
    request_timeout: int = 600
    prettify_markdown: bool = True
    show_formula_number: bool = True
    visualize: bool = False
    additional_params: dict[str, Any] = field(default_factory=dict)
    algorithm_config: dict[str, Any] = field(default_factory=dict)

    @classmethod
    def from_dict(cls, config: Optional[dict[str, Any]]) -> "PaddleOCRConfig":
        """Create configuration from dictionary.

        Args:
            config: Mapping of settings. Keys named after fields of this class are
                used directly. PaddleOCR-VL options may be given either flat (keys
                named after ``PaddleOCRVLConfig`` fields) or nested under ``"vl"``;
                a non-None nested value wins over a flat one. Other keys are ignored.
                ``None`` or an empty mapping yields the defaults.

        Returns:
            The populated configuration; ``algorithm_config`` always starts from the
            defaults of the selected algorithm.

        Raises:
            ValueError: If ``algorithm`` is not a supported algorithm name.
        """
        # An empty/None config goes through the same path so that the algorithm
        # defaults (e.g. format_block_content=True) are not lost.
        cfg = dict(config or {})

        algorithm = cfg.get("algorithm", "PaddleOCR-VL")
        if algorithm not in ("PaddleOCR-VL",):
            raise ValueError(f"Unsupported algorithm: {algorithm}")

        algorithm_config: dict[str, Any] = {}
        if algorithm == "PaddleOCR-VL":
            # Start from the dataclass defaults.
            algorithm_config = asdict(PaddleOCRVLConfig())

            # Flat options: previously these were dropped because they are not
            # fields of this class.
            for option in list(algorithm_config):
                value = cfg.pop(option, None)
                if value is not None:
                    algorithm_config[option] = value

            # Nested options take precedence over flat ones.
            vl_config = cfg.get("vl")
            if isinstance(vl_config, dict):
                algorithm_config.update({k: v for k, v in vl_config.items() if v is not None})

        # "vl" is not a field of this class and must not reach the constructor.
        cfg.pop("vl", None)

        init_kwargs: dict[str, Any] = {f.name: cfg[f.name] for f in fields(cls) if f.name in cfg}
        init_kwargs["algorithm_config"] = algorithm_config
        return cls(**init_kwargs)

    @classmethod
    def from_kwargs(cls, **kwargs: Any) -> "PaddleOCRConfig":
        """Create configuration from keyword arguments (same rules as ``from_dict``)."""
        return cls.from_dict(kwargs)
class PaddleOCRParser(RAGFlowPdfParser):
    """Parser for PDF documents using PaddleOCR API.

    The document is base64-encoded and POSTed as JSON to the configured endpoint.
    The ``layoutParsingResults`` of the response are converted into section tuples.
    """

    # snake_case option name -> camelCase request parameter, shared by all algorithms.
    _COMMON_FIELD_MAPPING: ClassVar[dict[str, str]] = {
        "prettify_markdown": "prettifyMarkdown",
        "show_formula_number": "showFormulaNumber",
        "visualize": "visualize",
    }

    # Per-algorithm snake_case option name -> camelCase request parameter.
    _ALGORITHM_FIELD_MAPPINGS: ClassVar[dict[str, dict[str, str]]] = {
        "PaddleOCR-VL": {
            "use_doc_orientation_classify": "useDocOrientationClassify",
            "use_doc_unwarping": "useDocUnwarping",
            "use_layout_detection": "useLayoutDetection",
            "use_polygon_points": "usePolygonPoints",
            "use_chart_recognition": "useChartRecognition",
            "use_seal_recognition": "useSealRecognition",
            "use_ocr_for_image_block": "useOcrForImageBlock",
            "layout_threshold": "layoutThreshold",
            "layout_nms": "layoutNms",
            "layout_unclip_ratio": "layoutUnclipRatio",
            "layout_merge_bboxes_mode": "layoutMergeBboxesMode",
            "prompt_label": "promptLabel",
            "format_block_content": "formatBlockContent",
            "repetition_penalty": "repetitionPenalty",
            "temperature": "temperature",
            "top_p": "topP",
            "min_pixels": "minPixels",
            "max_pixels": "maxPixels",
            "max_new_tokens": "maxNewTokens",
            "merge_layout_blocks": "mergeLayoutBlocks",
            "markdown_ignore_labels": "markdownIgnoreLabels",
            "vlm_extra_args": "vlmExtraArgs",
        },
    }

    def __init__(
        self,
        api_url: Optional[str] = None,
        access_token: Optional[str] = None,
        algorithm: AlgorithmType = "PaddleOCR-VL",
        *,
        request_timeout: int = 600,
    ):
        """Initialize PaddleOCR parser.

        Args:
            api_url: Endpoint URL; trailing slashes are stripped. Falls back to the
                ``PADDLEOCR_API_URL`` environment variable when empty.
            access_token: Token sent in the Authorization header. Falls back to the
                ``PADDLEOCR_ACCESS_TOKEN`` environment variable when empty.
            algorithm: Default algorithm name used by ``parse_pdf``.
            request_timeout: Default HTTP timeout in seconds.
        """
        # NOTE(review): the base-class initializer is not invoked here — confirm this is intentional.
        self.api_url = api_url.rstrip("/") if api_url else os.getenv("PADDLEOCR_API_URL", "")
        self.access_token = access_token or os.getenv("PADDLEOCR_ACCESS_TOKEN")
        self.algorithm = algorithm
        self.request_timeout = request_timeout
        self.logger = logging.getLogger(self.__class__.__name__)

        # Force PDF file type
        self.file_type = 0

    # Public methods
    def check_installation(self) -> tuple[bool, str]:
        """Check if the parser is properly installed and configured.

        Returns:
            ``(True, "")`` when an API URL is configured, otherwise ``(False, reason)``.
        """
        if not self.api_url:
            return False, "[PaddleOCR] API URL not configured"

        # TODO [@Bobholamovic]: Check URL availability and token validity

        return True, ""

    def parse_pdf(
        self,
        filepath: str | PathLike[str],
        binary: BytesIO | bytes | None = None,
        callback: Optional[Callable[[float, str], None]] = None,
        *,
        parse_method: str = "raw",
        api_url: Optional[str] = None,
        access_token: Optional[str] = None,
        algorithm: Optional[AlgorithmType] = None,
        request_timeout: Optional[int] = None,
        prettify_markdown: Optional[bool] = None,
        show_formula_number: Optional[bool] = None,
        visualize: Optional[bool] = None,
        additional_params: Optional[dict[str, Any]] = None,
        vl_config: Optional[dict[str, Any]] = None,
        **kwargs: Any,
    ) -> ParseResult:
        """Parse PDF document using PaddleOCR API.

        Args:
            filepath: Path of the document; read from disk only when ``binary`` is None.
            binary: Document content as bytes or a BytesIO, taking precedence over ``filepath``.
            callback: Optional progress reporter called as ``callback(progress, message)``;
                a progress of -1 reports a failure.
            parse_method: Shape of the section tuples: "manual" gives
                ``(text, label, tag)``, "paper" gives ``(text + tag, label)``, anything
                else gives ``(text, tag)``.
            api_url, access_token, algorithm, request_timeout: Per-call overrides of the
                values given to the constructor.
            prettify_markdown, show_formula_number, visualize: Optional output switches.
            additional_params: Extra entries merged into the request payload as-is.
            vl_config: PaddleOCR-VL options keyed by ``PaddleOCRVLConfig`` field name.
            **kwargs: Keyword arguments named after ``PaddleOCRVLConfig`` fields are
                treated as PaddleOCR-VL options (``vl_config`` wins on conflict); any
                other keyword is ignored.

        Returns:
            ``(sections, tables)``.

        Raises:
            RuntimeError: If no API URL is available, the request fails, or the
                response is malformed.
            FileNotFoundError: If ``binary`` is None and ``filepath`` does not exist.
            ValueError: If the algorithm is not supported.
        """
        config_dict: dict[str, Any] = {
            "api_url": api_url if api_url is not None else self.api_url,
            "access_token": access_token if access_token is not None else self.access_token,
            "algorithm": algorithm if algorithm is not None else self.algorithm,
            "request_timeout": request_timeout if request_timeout is not None else self.request_timeout,
        }
        if prettify_markdown is not None:
            config_dict["prettify_markdown"] = prettify_markdown
        if show_formula_number is not None:
            config_dict["show_formula_number"] = show_formula_number
        if visualize is not None:
            config_dict["visualize"] = visualize
        if additional_params is not None:
            config_dict["additional_params"] = additional_params

        # Collect PaddleOCR-VL options under the "vl" key. That is the key
        # PaddleOCRConfig.from_dict reads, so options passed as plain keyword
        # arguments are no longer lost.
        vl_field_names = {vl_field.name for vl_field in fields(PaddleOCRVLConfig)}
        vl_options: dict[str, Any] = {key: value for key, value in kwargs.items() if key in vl_field_names}
        if isinstance(vl_config, dict):
            vl_options.update({key: value for key, value in vl_config.items() if value is not None})
        if vl_options:
            config_dict["vl"] = vl_options

        cfg = PaddleOCRConfig.from_dict(config_dict)

        if not cfg.api_url:
            raise RuntimeError("[PaddleOCR] API URL missing")

        # Prepare file data
        data_bytes = self._prepare_file_data(filepath, binary)

        # Build and send request
        result = self._send_request(data_bytes, cfg, callback)

        # Process response
        sections = self._transfer_to_sections(result, algorithm=cfg.algorithm, parse_method=parse_method)
        if callback:
            callback(0.9, f"[PaddleOCR] done, sections: {len(sections)}")

        tables = self._transfer_to_tables(result)
        if callback:
            callback(1.0, f"[PaddleOCR] done, tables: {len(tables)}")

        return sections, tables

    def _prepare_file_data(self, filepath: str | PathLike[str], binary: BytesIO | bytes | None) -> bytes:
        """Prepare file data for API request.

        Returns ``binary`` as bytes when given, otherwise the content of ``filepath``.

        Raises:
            FileNotFoundError: If ``binary`` is None and ``filepath`` does not exist.
        """
        if binary is not None:
            if isinstance(binary, (bytes, bytearray)):
                return bytes(binary)
            return binary.getbuffer().tobytes()

        # The path is only needed (and only validated) when no in-memory content was given.
        source_path = Path(filepath)
        if not source_path.exists():
            raise FileNotFoundError(f"[PaddleOCR] file not found: {source_path}")

        return source_path.read_bytes()

    def _build_payload(self, data: bytes, file_type: int, config: PaddleOCRConfig) -> dict[str, Any]:
        """Build payload for API request.

        The payload carries the base64 file and file type, then the common and
        algorithm-specific options that are not None (renamed through the mapping
        tables), then ``config.additional_params`` verbatim.
        """
        payload: dict[str, Any] = {
            "file": base64.b64encode(data).decode("ascii"),
            "fileType": file_type,
        }

        # Add common parameters
        for param_key, param_value in [
            ("prettify_markdown", config.prettify_markdown),
            ("show_formula_number", config.show_formula_number),
            ("visualize", config.visualize),
        ]:
            if param_value is not None:
                api_param = self._COMMON_FIELD_MAPPING[param_key]
                payload[api_param] = param_value

        # Add algorithm-specific parameters; options without a mapping entry are skipped
        algorithm_mapping = self._ALGORITHM_FIELD_MAPPINGS.get(config.algorithm, {})
        for param_key, param_value in config.algorithm_config.items():
            if param_value is not None and param_key in algorithm_mapping:
                api_param = algorithm_mapping[param_key]
                payload[api_param] = param_value

        # Add any additional parameters (these may overwrite the entries above)
        if config.additional_params:
            payload.update(config.additional_params)

        return payload

    def _send_request(self, data: bytes, config: PaddleOCRConfig, callback: Optional[Callable[[float, str], None]]) -> dict[str, Any]:
        """Send request to PaddleOCR API and parse response.

        Returns:
            The ``result`` object of the JSON response.

        Raises:
            RuntimeError: If the HTTP request fails, the body is not JSON, ``errorCode``
                is not 0, or ``result`` is not an object.
        """
        # Build payload
        payload = self._build_payload(data, self.file_type, config)

        # Prepare headers
        headers = {"Content-Type": "application/json", "Client-Platform": "ragflow"}
        if config.access_token:
            headers["Authorization"] = f"token {config.access_token}"

        self.logger.info("[PaddleOCR] invoking API")
        if callback:
            callback(0.1, "[PaddleOCR] submitting request")

        # Send request. The timeout comes from the per-call configuration so that a
        # request_timeout passed to parse_pdf is honoured.
        try:
            resp = requests.post(config.api_url, json=payload, headers=headers, timeout=config.request_timeout)
            resp.raise_for_status()
        except Exception as exc:
            if callback:
                callback(-1, f"[PaddleOCR] request failed: {exc}")
            raise RuntimeError(f"[PaddleOCR] request failed: {exc}") from exc

        # Parse response
        try:
            response_data = resp.json()
        except Exception as exc:
            raise RuntimeError(f"[PaddleOCR] response is not JSON: {exc}") from exc

        if callback:
            callback(0.8, "[PaddleOCR] response received")

        # Validate response format
        if response_data.get("errorCode") != 0 or not isinstance(response_data.get("result"), dict):
            if callback:
                callback(-1, "[PaddleOCR] invalid response format")
            raise RuntimeError("[PaddleOCR] invalid response format")

        return response_data["result"]

    def _transfer_to_sections(self, result: dict[str, Any], algorithm: AlgorithmType, parse_method: str) -> list[SectionTuple]:
        """Convert API response to section tuples.

        Every non-empty block of every page becomes one tuple whose shape depends on
        ``parse_method`` (see ``parse_pdf``). The tag encodes the 1-based page number
        and the block bounding box.
        """
        sections: list[SectionTuple] = []

        if algorithm == "PaddleOCR-VL":
            layout_parsing_results = result.get("layoutParsingResults", [])

            for page_idx, layout_result in enumerate(layout_parsing_results):
                pruned_result = layout_result.get("prunedResult", {})
                parsing_res_list = pruned_result.get("parsing_res_list", [])

                for block in parsing_res_list:
                    # A missing or null content is treated as empty.
                    block_content = (block.get("block_content") or "").strip()
                    if not block_content:
                        continue

                    # Remove images
                    block_content = _remove_images_from_markdown(block_content)
                    # A block that consisted only of image markup carries no text.
                    if not block_content.strip():
                        continue

                    label = block.get("block_label", "")
                    block_bbox = block.get("block_bbox") or [0, 0, 0, 0]

                    # bbox entries are emitted in the order [0], [2], [1], [3]
                    tag = f"@@{page_idx + 1}\t{block_bbox[0]}\t{block_bbox[2]}\t{block_bbox[1]}\t{block_bbox[3]}##"

                    if parse_method == "manual":
                        sections.append((block_content, label, tag))
                    elif parse_method == "paper":
                        sections.append((block_content + tag, label))
                    else:
                        sections.append((block_content, tag))

        return sections

    def _transfer_to_tables(self, result: dict[str, Any]) -> list[TableTuple]:
        """Convert API response to table tuples.

        Currently always returns an empty list; ``result`` is not inspected.
        """
        return []
if __name__ == "__main__":
    # Manual smoke check: report whether a parser built from the environment is configured.
    logging.basicConfig(level=logging.INFO)
    env_api_url = os.getenv("PADDLEOCR_API_URL", "")
    env_algorithm = os.getenv("PADDLEOCR_ALGORITHM", "PaddleOCR-VL")
    parser = PaddleOCRParser(api_url=env_api_url, algorithm=env_algorithm)
    ok, reason = parser.check_installation()
    print("PaddleOCR available:", ok, reason)

View File

@ -22,9 +22,7 @@ from deepdoc.parser.utils import get_text
from rag.app import naive
from rag.app.naive import by_plaintext, PARSERS
from common.parser_config_utils import normalize_layout_recognizer
from rag.nlp import bullets_category, is_english, remove_contents_table, \
hierarchical_merge, make_colon_as_title, naive_merge, random_choices, tokenize_table, \
tokenize_chunks, attach_media_context
from rag.nlp import bullets_category, is_english, remove_contents_table, hierarchical_merge, make_colon_as_title, naive_merge, random_choices, tokenize_table, tokenize_chunks, attach_media_context
from rag.nlp import rag_tokenizer
from deepdoc.parser import PdfParser, HtmlParser
from deepdoc.parser.figure_parser import vision_figure_parser_docx_wrapper
@ -32,17 +30,12 @@ from PIL import Image
class Pdf(PdfParser):
def __call__(self, filename, binary=None, from_page=0,
to_page=100000, zoomin=3, callback=None):
def __call__(self, filename, binary=None, from_page=0, to_page=100000, zoomin=3, callback=None):
from timeit import default_timer as timer
start = timer()
callback(msg="OCR started")
self.__images__(
filename if not binary else binary,
zoomin,
from_page,
to_page,
callback)
self.__images__(filename if not binary else binary, zoomin, from_page, to_page, callback)
callback(msg="OCR finished ({:.2f}s)".format(timer() - start))
start = timer()
@ -62,24 +55,17 @@ class Pdf(PdfParser):
self._merge_with_same_bullet()
callback(0.8, "Text extraction ({:.2f}s)".format(timer() - start))
return [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", ""))
for b in self.boxes], tbls
return [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", "")) for b in self.boxes], tbls
def chunk(filename, binary=None, from_page=0, to_page=100000,
lang="Chinese", callback=None, **kwargs):
def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
"""
Supported file formats are docx, pdf, txt.
Since a book is long and not all the parts are useful, if it's a PDF,
please set up the page ranges for every book in order eliminate negative effects and save elapsed computing time.
Supported file formats are docx, pdf, txt.
Since a book is long and not all the parts are useful, if it's a PDF,
please set up the page ranges for every book in order eliminate negative effects and save elapsed computing time.
"""
parser_config = kwargs.get(
"parser_config", {
"chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC"})
doc = {
"docnm_kwd": filename,
"title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
}
parser_config = kwargs.get("parser_config", {"chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC"})
doc = {"docnm_kwd": filename, "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))}
doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
pdf_parser = None
sections, tbls = [], []
@ -87,28 +73,23 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
callback(0.1, "Start to parse.")
doc_parser = naive.Docx()
# TODO: table of contents need to be removed
main_sections = doc_parser(
filename, binary=binary, from_page=from_page, to_page=to_page)
main_sections = doc_parser(filename, binary=binary, from_page=from_page, to_page=to_page)
sections = []
tbls = []
for text, image, html in main_sections:
sections.append((text, image))
tbls.append(((None, html), ""))
remove_contents_table(sections, eng=is_english(
random_choices([t for t, _ in sections], k=200)))
remove_contents_table(sections, eng=is_english(random_choices([t for t, _ in sections], k=200)))
tbls = vision_figure_parser_docx_wrapper(sections=sections, tbls=tbls, callback=callback, **kwargs)
# tbls = [((None, lns), None) for lns in tbls]
sections = [(item[0], item[1] if item[1] is not None else "") for item in sections if
not isinstance(item[1], Image.Image)]
sections = [(item[0], item[1] if item[1] is not None else "") for item in sections if not isinstance(item[1], Image.Image)]
callback(0.8, "Finish parsing.")
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
layout_recognizer, parser_model_name = normalize_layout_recognizer(
parser_config.get("layout_recognize", "DeepDOC")
)
layout_recognizer, parser_model_name = normalize_layout_recognizer(parser_config.get("layout_recognize", "DeepDOC"))
if isinstance(layout_recognizer, bool):
layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
@ -127,13 +108,14 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
pdf_cls=Pdf,
layout_recognizer=layout_recognizer,
mineru_llm_name=parser_model_name,
**kwargs
paddleocr_llm_name=parser_model_name,
**kwargs,
)
if not sections and not tables:
return []
if name in ["tcadp", "docling", "mineru"]:
if name in ["tcadp", "docling", "mineru", "paddleocr"]:
parser_config["chunk_token_num"] = 0
callback(0.8, "Finish parsing.")
@ -142,16 +124,14 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
txt = get_text(filename, binary)
sections = txt.split("\n")
sections = [(line, "") for line in sections if line]
remove_contents_table(sections, eng=is_english(
random_choices([t for t, _ in sections], k=200)))
remove_contents_table(sections, eng=is_english(random_choices([t for t, _ in sections], k=200)))
callback(0.8, "Finish parsing.")
elif re.search(r"\.(htm|html)$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
sections = HtmlParser()(filename, binary)
sections = [(line, "") for line in sections if line]
remove_contents_table(sections, eng=is_english(
random_choices([t for t, _ in sections], k=200)))
remove_contents_table(sections, eng=is_english(random_choices([t for t, _ in sections], k=200)))
callback(0.8, "Finish parsing.")
elif re.search(r"\.doc$", filename, re.IGNORECASE):
@ -165,31 +145,23 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
binary = BytesIO(binary)
doc_parsed = tika_parser.from_buffer(binary)
if doc_parsed.get('content', None) is not None:
sections = doc_parsed['content'].split('\n')
if doc_parsed.get("content", None) is not None:
sections = doc_parsed["content"].split("\n")
sections = [(line, "") for line in sections if line]
remove_contents_table(sections, eng=is_english(
random_choices([t for t, _ in sections], k=200)))
remove_contents_table(sections, eng=is_english(random_choices([t for t, _ in sections], k=200)))
callback(0.8, "Finish parsing.")
else:
raise NotImplementedError(
"file type not supported yet(doc, docx, pdf, txt supported)")
raise NotImplementedError("file type not supported yet(doc, docx, pdf, txt supported)")
make_colon_as_title(sections)
bull = bullets_category(
[t for t in random_choices([t for t, _ in sections], k=100)])
bull = bullets_category([t for t in random_choices([t for t, _ in sections], k=100)])
if bull >= 0:
chunks = ["\n".join(ck)
for ck in hierarchical_merge(bull, sections, 5)]
chunks = ["\n".join(ck) for ck in hierarchical_merge(bull, sections, 5)]
else:
sections = [s.split("@") for s, _ in sections]
sections = [(pr[0], "@" + pr[1]) if len(pr) == 2 else (pr[0], '') for pr in sections]
chunks = naive_merge(
sections,
parser_config.get("chunk_token_num", 256),
parser_config.get("delimiter", "\n。;!?")
)
sections = [(pr[0], "@" + pr[1]) if len(pr) == 2 else (pr[0], "") for pr in sections]
chunks = naive_merge(sections, parser_config.get("chunk_token_num", 256), parser_config.get("delimiter", "\n。;!?"))
# is it English
# is_english(random_choices([t for t, _ in sections], k=218))
@ -208,9 +180,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
# Command-line smoke run: chunk the file given as the first argument,
# restricted to pages 1-10, ignoring every progress report.
if __name__ == "__main__":
    import sys

    def _ignore_progress(prog=None, msg=""):
        """Progress callback that accepts the usual arguments and does nothing."""
        return None

    chunk(sys.argv[1], from_page=1, to_page=10, callback=_ignore_progress)

View File

@ -21,8 +21,7 @@ from docx import Document
from common.constants import ParserType
from deepdoc.parser.utils import get_text
from rag.nlp import bullets_category, remove_contents_table, \
make_colon_as_title, tokenize_chunks, docx_question_level, tree_merge
from rag.nlp import bullets_category, remove_contents_table, make_colon_as_title, tokenize_chunks, docx_question_level, tree_merge
from rag.nlp import rag_tokenizer, Node
from deepdoc.parser import PdfParser, DocxParser, HtmlParser
from rag.app.naive import by_plaintext, PARSERS
@ -38,8 +37,7 @@ class Docx(DocxParser):
return line
def old_call(self, filename, binary=None, from_page=0, to_page=100000):
self.doc = Document(
filename) if not binary else Document(BytesIO(binary))
self.doc = Document(filename) if not binary else Document(BytesIO(binary))
pn = 0
lines = []
for p in self.doc.paragraphs:
@ -48,16 +46,15 @@ class Docx(DocxParser):
if from_page <= pn < to_page and p.text.strip():
lines.append(self.__clean(p.text))
for run in p.runs:
if 'lastRenderedPageBreak' in run._element.xml:
if "lastRenderedPageBreak" in run._element.xml:
pn += 1
continue
if 'w:br' in run._element.xml and 'type="page"' in run._element.xml:
if "w:br" in run._element.xml and 'type="page"' in run._element.xml:
pn += 1
return [line for line in lines if line]
def __call__(self, filename, binary=None, from_page=0, to_page=100000):
self.doc = Document(
filename) if not binary else Document(BytesIO(binary))
self.doc = Document(filename) if not binary else Document(BytesIO(binary))
pn = 0
lines = []
level_set = set()
@ -71,10 +68,10 @@ class Docx(DocxParser):
lines.append((question_level, p_text))
level_set.add(question_level)
for run in p.runs:
if 'lastRenderedPageBreak' in run._element.xml:
if "lastRenderedPageBreak" in run._element.xml:
pn += 1
continue
if 'w:br' in run._element.xml and 'type="page"' in run._element.xml:
if "w:br" in run._element.xml and 'type="page"' in run._element.xml:
pn += 1
sorted_levels = sorted(level_set)
@ -88,12 +85,12 @@ class Docx(DocxParser):
return [element for element in root.get_tree() if element]
def __str__(self) -> str:
return f'''
return f"""
question:{self.question},
answer:{self.answer},
level:{self.level},
childs:{self.childs}
'''
"""
class Pdf(PdfParser):
@ -101,18 +98,12 @@ class Pdf(PdfParser):
self.model_speciess = ParserType.LAWS.value
super().__init__()
def __call__(self, filename, binary=None, from_page=0,
to_page=100000, zoomin=3, callback=None):
def __call__(self, filename, binary=None, from_page=0, to_page=100000, zoomin=3, callback=None):
from timeit import default_timer as timer
start = timer()
callback(msg="OCR started")
self.__images__(
filename if not binary else binary,
zoomin,
from_page,
to_page,
callback
)
self.__images__(filename if not binary else binary, zoomin, from_page, to_page, callback)
callback(msg="OCR finished ({:.2f}s)".format(timer() - start))
start = timer()
@ -123,22 +114,15 @@ class Pdf(PdfParser):
callback(0.8, "Text extraction ({:.2f}s)".format(timer() - start))
return [(b["text"], self._line_tag(b, zoomin))
for b in self.boxes], None
return [(b["text"], self._line_tag(b, zoomin)) for b in self.boxes], None
def chunk(filename, binary=None, from_page=0, to_page=100000,
lang="Chinese", callback=None, **kwargs):
def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
"""
Supported file formats are docx, pdf, txt.
Supported file formats are docx, pdf, txt.
"""
parser_config = kwargs.get(
"parser_config", {
"chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC"})
doc = {
"docnm_kwd": filename,
"title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
}
parser_config = kwargs.get("parser_config", {"chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC"})
doc = {"docnm_kwd": filename, "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))}
doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
pdf_parser = None
sections = []
@ -152,9 +136,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
return tokenize_chunks(chunks, doc, eng, None)
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
layout_recognizer, parser_model_name = normalize_layout_recognizer(
parser_config.get("layout_recognize", "DeepDOC")
)
layout_recognizer, parser_model_name = normalize_layout_recognizer(parser_config.get("layout_recognize", "DeepDOC"))
if isinstance(layout_recognizer, bool):
layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
@ -173,13 +155,14 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
pdf_cls=Pdf,
layout_recognizer=layout_recognizer,
mineru_llm_name=parser_model_name,
**kwargs
paddleocr_llm_name=parser_model_name,
**kwargs,
)
if not raw_sections and not tables:
return []
if name in ["tcadp", "docling", "mineru"]:
if name in ["tcadp", "docling", "mineru", "paddleocr"]:
parser_config["chunk_token_num"] = 0
for txt, poss in raw_sections:
@ -210,8 +193,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
binary = BytesIO(binary)
doc_parsed = tika_parser.from_buffer(binary)
if doc_parsed.get('content', None) is not None:
sections = doc_parsed['content'].split('\n')
if doc_parsed.get("content", None) is not None:
sections = doc_parsed["content"].split("\n")
sections = [s for s in sections if s]
callback(0.8, "Finish parsing.")
else:
@ -219,8 +202,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
logging.warning(f"tika.parser got empty content from {filename}.")
return []
else:
raise NotImplementedError(
"file type not supported yet(doc, docx, pdf, txt supported)")
raise NotImplementedError("file type not supported yet(doc, docx, pdf, txt supported)")
# Remove 'Contents' part
remove_contents_table(sections, eng)
@ -241,9 +223,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
# Command-line smoke run: chunk the file given as the first argument,
# ignoring every progress report.
if __name__ == "__main__":
    import sys

    def _ignore_progress(prog=None, msg=""):
        """Progress callback that accepts the usual arguments and does nothing."""
        return None

    chunk(sys.argv[1], callback=_ignore_progress)

View File

@ -20,8 +20,7 @@ import re
from common.constants import ParserType
from io import BytesIO
from rag.nlp import rag_tokenizer, tokenize, tokenize_table, bullets_category, title_frequency, tokenize_chunks, \
docx_question_level, attach_media_context
from rag.nlp import rag_tokenizer, tokenize, tokenize_table, bullets_category, title_frequency, tokenize_chunks, docx_question_level, attach_media_context
from common.token_utils import num_tokens_from_string
from deepdoc.parser import PdfParser, DocxParser
from deepdoc.parser.figure_parser import vision_figure_parser_pdf_wrapper, vision_figure_parser_docx_wrapper
@ -36,18 +35,12 @@ class Pdf(PdfParser):
self.model_speciess = ParserType.MANUAL.value
super().__init__()
def __call__(self, filename, binary=None, from_page=0,
to_page=100000, zoomin=3, callback=None):
def __call__(self, filename, binary=None, from_page=0, to_page=100000, zoomin=3, callback=None):
from timeit import default_timer as timer
start = timer()
callback(msg="OCR started")
self.__images__(
filename if not binary else binary,
zoomin,
from_page,
to_page,
callback
)
self.__images__(filename if not binary else binary, zoomin, from_page, to_page, callback)
callback(msg="OCR finished ({:.2f}s)".format(timer() - start))
logging.debug("OCR: {}".format(timer() - start))
@ -71,8 +64,7 @@ class Pdf(PdfParser):
for b in self.boxes:
b["text"] = re.sub(r"([\t  ]|\u3000){2,}", " ", b["text"].strip())
return [(b["text"], b.get("layoutno", ""), self.get_position(b, zoomin))
for i, b in enumerate(self.boxes)], tbls
return [(b["text"], b.get("layoutno", ""), self.get_position(b, zoomin)) for i, b in enumerate(self.boxes)], tbls
class Docx(DocxParser):
@ -80,12 +72,12 @@ class Docx(DocxParser):
pass
def get_picture(self, document, paragraph):
img = paragraph._element.xpath('.//pic:pic')
img = paragraph._element.xpath(".//pic:pic")
if not img:
return None
try:
img = img[0]
embed = img.xpath('.//a:blip/@r:embed')[0]
embed = img.xpath(".//a:blip/@r:embed")[0]
related_part = document.part.related_parts[embed]
image = related_part.image
if image is not None:
@ -111,7 +103,7 @@ class Docx(DocxParser):
new_width = max(width1, width2)
new_height = height1 + height2
new_image = Image.new('RGB', (new_width, new_height))
new_image = Image.new("RGB", (new_width, new_height))
new_image.paste(img1, (0, 0))
new_image.paste(img2, (0, height1))
@ -119,8 +111,7 @@ class Docx(DocxParser):
return new_image
def __call__(self, filename, binary=None, from_page=0, to_page=100000, callback=None):
self.doc = Document(
filename) if not binary else Document(BytesIO(binary))
self.doc = Document(filename) if not binary else Document(BytesIO(binary))
pn = 0
last_answer, last_image = "", None
question_stack, level_stack = [], []
@ -128,19 +119,19 @@ class Docx(DocxParser):
for p in self.doc.paragraphs:
if pn > to_page:
break
question_level, p_text = 0, ''
question_level, p_text = 0, ""
if from_page <= pn < to_page and p.text.strip():
question_level, p_text = docx_question_level(p)
if not question_level or question_level > 6: # not a question
last_answer = f'{last_answer}\n{p_text}'
last_answer = f"{last_answer}\n{p_text}"
current_image = self.get_picture(self.doc, p)
last_image = self.concat_img(last_image, current_image)
else: # is a question
if last_answer or last_image:
sum_question = '\n'.join(question_stack)
sum_question = "\n".join(question_stack)
if sum_question:
ti_list.append((f'{sum_question}\n{last_answer}', last_image))
last_answer, last_image = '', None
ti_list.append((f"{sum_question}\n{last_answer}", last_image))
last_answer, last_image = "", None
i = question_level
while question_stack and i <= level_stack[-1]:
@ -149,15 +140,15 @@ class Docx(DocxParser):
question_stack.append(p_text)
level_stack.append(question_level)
for run in p.runs:
if 'lastRenderedPageBreak' in run._element.xml:
if "lastRenderedPageBreak" in run._element.xml:
pn += 1
continue
if 'w:br' in run._element.xml and 'type="page"' in run._element.xml:
if "w:br" in run._element.xml and 'type="page"' in run._element.xml:
pn += 1
if last_answer:
sum_question = '\n'.join(question_stack)
sum_question = "\n".join(question_stack)
if sum_question:
ti_list.append((f'{sum_question}\n{last_answer}', last_image))
ti_list.append((f"{sum_question}\n{last_answer}", last_image))
tbls = []
for tb in self.doc.tables:
@ -182,26 +173,19 @@ class Docx(DocxParser):
return ti_list, tbls
def chunk(filename, binary=None, from_page=0, to_page=100000,
lang="Chinese", callback=None, **kwargs):
def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
"""
Only pdf is supported.
Only pdf is supported.
"""
parser_config = kwargs.get(
"parser_config", {
"chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC"})
parser_config = kwargs.get("parser_config", {"chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC"})
pdf_parser = None
doc = {
"docnm_kwd": filename
}
doc = {"docnm_kwd": filename}
doc["title_tks"] = rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", doc["docnm_kwd"]))
doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
# is it English
eng = lang.lower() == "english" # pdf_parser.is_english
if re.search(r"\.pdf$", filename, re.IGNORECASE):
layout_recognizer, parser_model_name = normalize_layout_recognizer(
parser_config.get("layout_recognize", "DeepDOC")
)
layout_recognizer, parser_model_name = normalize_layout_recognizer(parser_config.get("layout_recognize", "DeepDOC"))
if isinstance(layout_recognizer, bool):
layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
@ -222,8 +206,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
pdf_cls=Pdf,
layout_recognizer=layout_recognizer,
mineru_llm_name=parser_model_name,
paddleocr_llm_name=parser_model_name,
parse_method="manual",
**kwargs
**kwargs,
)
def _normalize_section(section):
@ -252,7 +237,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
if not sections and not tbls:
return []
if name in ["tcadp", "docling", "mineru"]:
if name in ["tcadp", "docling", "mineru", "paddleocr"]:
parser_config["chunk_token_num"] = 0
callback(0.8, "Finish parsing.")
@ -264,8 +249,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
for txt, _, _ in sections:
for t, lvl in pdf_parser.outlines:
tks = set([t[i] + t[i + 1] for i in range(len(t) - 1)])
tks_ = set([txt[i] + txt[i + 1]
for i in range(min(len(t), len(txt) - 1))])
tks_ = set([txt[i] + txt[i + 1] for i in range(min(len(t), len(txt) - 1))])
if len(set(tks & tks_)) / max([len(tks), len(tks_), 1]) > 0.8:
levels.append(lvl)
break
@ -274,8 +258,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
else:
bull = bullets_category([txt for txt, _, _ in sections])
most_level, levels = title_frequency(
bull, [(txt, lvl) for txt, lvl, _ in sections])
most_level, levels = title_frequency(bull, [(txt, lvl) for txt, lvl, _ in sections])
assert len(sections) == len(levels)
sec_ids = []
@ -285,25 +268,21 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
sid += 1
sec_ids.append(sid)
sections = [(txt, sec_ids[i], poss)
for i, (txt, _, poss) in enumerate(sections)]
sections = [(txt, sec_ids[i], poss) for i, (txt, _, poss) in enumerate(sections)]
for (img, rows), poss in tbls:
if not rows:
continue
sections.append((rows if isinstance(rows, str) else rows[0], -1,
[(p[0] + 1 - from_page, p[1], p[2], p[3], p[4]) for p in poss]))
sections.append((rows if isinstance(rows, str) else rows[0], -1, [(p[0] + 1 - from_page, p[1], p[2], p[3], p[4]) for p in poss]))
def tag(pn, left, right, top, bottom):
if pn + left + right + top + bottom == 0:
return ""
return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##" \
.format(pn, left, right, top, bottom)
return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##".format(pn, left, right, top, bottom)
chunks = []
last_sid = -2
tk_cnt = 0
for txt, sec_id, poss in sorted(sections, key=lambda x: (
x[-1][0][0], x[-1][0][3], x[-1][0][1])):
for txt, sec_id, poss in sorted(sections, key=lambda x: (x[-1][0][0], x[-1][0][3], x[-1][0][1])):
poss = "\t".join([tag(*pos) for pos in poss])
if tk_cnt < 32 or (tk_cnt < 1024 and (sec_id == last_sid or sec_id == -1)):
if chunks:
@ -330,14 +309,13 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
elif re.search(r"\.docx?$", filename, re.IGNORECASE):
docx_parser = Docx()
ti_list, tbls = docx_parser(filename, binary,
from_page=0, to_page=10000, callback=callback)
ti_list, tbls = docx_parser(filename, binary, from_page=0, to_page=10000, callback=callback)
tbls = vision_figure_parser_docx_wrapper(sections=ti_list, tbls=tbls, callback=callback, **kwargs)
res = tokenize_table(tbls, doc, eng)
for text, image in ti_list:
d = copy.deepcopy(doc)
if image:
d['image'] = image
d["image"] = image
d["doc_type_kwd"] = "image"
tokenize(d, text, eng)
res.append(d)
@ -353,9 +331,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
# Command-line smoke run: chunk the file given as the first argument,
# ignoring every progress report.
if __name__ == "__main__":
    import sys

    def _ignore_progress(prog=None, msg=""):
        """Progress callback that accepts the usual arguments and does nothing."""
        return None

    chunk(sys.argv[1], callback=_ignore_progress)

View File

@ -33,29 +33,32 @@ from common.token_utils import num_tokens_from_string
from common.constants import LLMType
from api.db.services.llm_service import LLMBundle
from rag.utils.file_utils import extract_embed_file, extract_links_from_pdf, extract_links_from_docx, extract_html
from deepdoc.parser import DocxParser, ExcelParser, HtmlParser, JsonParser, MarkdownElementExtractor, MarkdownParser, \
PdfParser, TxtParser
from deepdoc.parser.figure_parser import VisionFigureParser, vision_figure_parser_docx_wrapper_naive, \
vision_figure_parser_pdf_wrapper
from deepdoc.parser import DocxParser, ExcelParser, HtmlParser, JsonParser, MarkdownElementExtractor, MarkdownParser, PdfParser, TxtParser
from deepdoc.parser.figure_parser import VisionFigureParser, vision_figure_parser_docx_wrapper_naive, vision_figure_parser_pdf_wrapper
from deepdoc.parser.pdf_parser import PlainParser, VisionParser
from deepdoc.parser.docling_parser import DoclingParser
from deepdoc.parser.tcadp_parser import TCADPParser
from common.parser_config_utils import normalize_layout_recognizer
from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, rag_tokenizer, \
tokenize_chunks, doc_tokenize_chunks_with_images, tokenize_table, append_context2table_image4pdf, tokenize_chunks_with_images, \
attach_media_context # noqa: F401
from rag.nlp import (
concat_img,
find_codec,
naive_merge,
naive_merge_with_images,
naive_merge_docx,
rag_tokenizer,
tokenize_chunks,
doc_tokenize_chunks_with_images,
tokenize_table,
append_context2table_image4pdf,
tokenize_chunks_with_images,
) # noqa: F401
def by_deepdoc(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls=None,
**kwargs):
def by_deepdoc(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls=None, **kwargs):
callback = callback
binary = binary
pdf_parser = pdf_cls() if pdf_cls else Pdf()
sections, tables = pdf_parser(
filename if not binary else binary,
from_page=from_page,
to_page=to_page,
callback=callback
)
sections, tables = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page, callback=callback)
tables = vision_figure_parser_pdf_wrapper(
tbls=tables,
@ -67,17 +70,17 @@ def by_deepdoc(filename, binary=None, from_page=0, to_page=100000, lang="Chinese
def by_mineru(
filename,
binary=None,
from_page=0,
to_page=100000,
lang="Chinese",
callback=None,
pdf_cls=None,
parse_method: str = "raw",
mineru_llm_name: str | None = None,
tenant_id: str | None = None,
**kwargs,
filename,
binary=None,
from_page=0,
to_page=100000,
lang="Chinese",
callback=None,
pdf_cls=None,
parse_method: str = "raw",
mineru_llm_name: str | None = None,
tenant_id: str | None = None,
**kwargs,
):
pdf_parser = None
if tenant_id:
@ -115,8 +118,7 @@ def by_mineru(
return None, None, None
def by_docling(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls=None,
**kwargs):
def by_docling(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls=None, **kwargs):
pdf_parser = DoclingParser()
parse_method = kwargs.get("parse_method", "raw")
@ -130,7 +132,7 @@ def by_docling(filename, binary=None, from_page=0, to_page=100000, lang="Chinese
callback=callback,
output_dir=os.environ.get("MINERU_OUTPUT_DIR", ""),
delete_output=bool(int(os.environ.get("MINERU_DELETE_OUTPUT", 1))),
parse_method=parse_method
parse_method=parse_method,
)
return sections, tables, pdf_parser
@ -142,16 +144,60 @@ def by_tcadp(filename, binary=None, from_page=0, to_page=100000, lang="Chinese",
callback(-1, "TCADP parser not available. Please check Tencent Cloud API configuration.")
return None, None, tcadp_parser
sections, tables = tcadp_parser.parse_pdf(
filepath=filename,
binary=binary,
callback=callback,
output_dir=os.environ.get("TCADP_OUTPUT_DIR", ""),
file_type="PDF"
)
sections, tables = tcadp_parser.parse_pdf(filepath=filename, binary=binary, callback=callback, output_dir=os.environ.get("TCADP_OUTPUT_DIR", ""), file_type="PDF")
return sections, tables, tcadp_parser
def by_paddleocr(
filename,
binary=None,
from_page=0,
to_page=100000,
lang="Chinese",
callback=None,
pdf_cls=None,
parse_method: str = "raw",
paddleocr_llm_name: str | None = None,
tenant_id: str | None = None,
**kwargs,
):
pdf_parser = None
if tenant_id:
if not paddleocr_llm_name:
try:
from api.db.services.tenant_llm_service import TenantLLMService
env_name = TenantLLMService.ensure_paddleocr_from_env(tenant_id)
candidates = TenantLLMService.query(tenant_id=tenant_id, llm_factory="PaddleOCR", model_type=LLMType.OCR)
if candidates:
paddleocr_llm_name = candidates[0].llm_name
elif env_name:
paddleocr_llm_name = env_name
except Exception as e: # best-effort fallback
logging.warning(f"fallback to env paddleocr: {e}")
if paddleocr_llm_name:
try:
ocr_model = LLMBundle(tenant_id=tenant_id, llm_type=LLMType.OCR, llm_name=paddleocr_llm_name, lang=lang)
pdf_parser = ocr_model.mdl
sections, tables = pdf_parser.parse_pdf(
filepath=filename,
binary=binary,
callback=callback,
parse_method=parse_method,
**kwargs,
)
return sections, tables, pdf_parser
except Exception as e:
logging.error(f"Failed to parse pdf via LLMBundle PaddleOCR ({paddleocr_llm_name}): {e}")
return None, None, None
if callback:
callback(-1, "PaddleOCR not found.")
return None, None, None
def by_plaintext(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
layout_recognizer = (kwargs.get("layout_recognizer") or "").strip()
if (not layout_recognizer) or (layout_recognizer == "Plain Text"):
@ -168,12 +214,7 @@ def by_plaintext(filename, binary=None, from_page=0, to_page=100000, callback=No
)
pdf_parser = VisionParser(vision_model=vision_model, **kwargs)
sections, tables = pdf_parser(
filename if not binary else binary,
from_page=from_page,
to_page=to_page,
callback=callback
)
sections, tables = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page, callback=callback)
return sections, tables, pdf_parser
@ -182,6 +223,7 @@ PARSERS = {
"mineru": by_mineru,
"docling": by_docling,
"tcadp": by_tcadp,
"paddleocr": by_paddleocr,
"plaintext": by_plaintext, # default
}
@ -191,12 +233,12 @@ class Docx(DocxParser):
pass
def get_picture(self, document, paragraph):
imgs = paragraph._element.xpath('.//pic:pic')
imgs = paragraph._element.xpath(".//pic:pic")
if not imgs:
return None
res_img = None
for img in imgs:
embed = img.xpath('.//a:blip/@r:embed')
embed = img.xpath(".//a:blip/@r:embed")
if not embed:
continue
embed = embed[0]
@ -219,7 +261,7 @@ class Docx(DocxParser):
logging.warning(f"The recognized image stream appears to be corrupted. Skipping image, exception: {e}")
continue
try:
image = Image.open(BytesIO(image_blob)).convert('RGB')
image = Image.open(BytesIO(image_blob)).convert("RGB")
if res_img is None:
res_img = image
else:
@ -251,11 +293,11 @@ class Docx(DocxParser):
try:
# Iterate through all paragraphs and tables in document order
for i, block in enumerate(self.doc._element.body):
if block.tag.endswith('p'): # Paragraph
if block.tag.endswith("p"): # Paragraph
p = Paragraph(block, self.doc)
blocks.append(('p', i, p))
elif block.tag.endswith('tbl'): # Table
blocks.append(('t', i, None)) # Table object will be retrieved later
blocks.append(("p", i, p))
elif block.tag.endswith("tbl"): # Table
blocks.append(("t", i, None)) # Table object will be retrieved later
except Exception as e:
logging.error(f"Error collecting blocks: {e}")
return ""
@ -264,7 +306,7 @@ class Docx(DocxParser):
target_table_pos = -1
table_count = 0
for i, (block_type, pos, _) in enumerate(blocks):
if block_type == 't':
if block_type == "t":
if table_count == table_index:
target_table_pos = pos
break
@ -280,7 +322,7 @@ class Docx(DocxParser):
if pos >= target_table_pos: # Skip blocks after the table
continue
if block_type != 'p':
if block_type != "p":
continue
if block.style and block.style.name and re.search(r"Heading\s*(\d+)", block.style.name, re.I):
@ -309,7 +351,7 @@ class Docx(DocxParser):
if pos >= target_table_pos: # Skip blocks after the table
continue
if block_type != 'p':
if block_type != "p":
continue
if block.style and re.search(r"Heading\s*(\d+)", block.style.name, re.I):
@ -340,8 +382,7 @@ class Docx(DocxParser):
return ""
def __call__(self, filename, binary=None, from_page=0, to_page=100000):
self.doc = Document(
filename) if not binary else Document(BytesIO(binary))
self.doc = Document(filename) if not binary else Document(BytesIO(binary))
pn = 0
lines = []
last_image = None
@ -357,7 +398,7 @@ class Docx(DocxParser):
if pn > to_page:
break
if block.tag.endswith('p'):
if block.tag.endswith("p"):
p = Paragraph(block, self.doc)
if from_page <= pn < to_page:
@ -417,7 +458,7 @@ class Docx(DocxParser):
if "w:br" in xml and 'type="page"' in xml:
pn += 1
elif block.tag.endswith('tbl'):
elif block.tag.endswith("tbl"):
if pn < from_page or pn > to_page:
table_idx += 1
continue
@ -455,7 +496,6 @@ class Docx(DocxParser):
return new_line
def to_markdown(self, filename=None, binary=None, inline_images: bool = True):
"""
This function uses mammoth, licensed under the BSD 2-Clause License.
@ -486,8 +526,7 @@ class Docx(DocxParser):
try:
if inline_images:
result = mammoth.convert_to_html(docx_file,
convert_image=mammoth.images.img_element(_convert_image_to_base64))
result = mammoth.convert_to_html(docx_file, convert_image=mammoth.images.img_element(_convert_image_to_base64))
else:
result = mammoth.convert_to_html(docx_file)
@ -505,18 +544,11 @@ class Pdf(PdfParser):
def __init__(self):
super().__init__()
def __call__(self, filename, binary=None, from_page=0,
to_page=100000, zoomin=3, callback=None, separate_tables_figures=False):
def __call__(self, filename, binary=None, from_page=0, to_page=100000, zoomin=3, callback=None, separate_tables_figures=False):
start = timer()
first_start = start
callback(msg="OCR started")
self.__images__(
filename if not binary else binary,
zoomin,
from_page,
to_page,
callback
)
self.__images__(filename if not binary else binary, zoomin, from_page, to_page, callback)
callback(msg="OCR finished ({:.2f}s)".format(timer() - start))
logging.info("OCR({}~{}): {:.2f}s".format(from_page, to_page, timer() - start))
@ -559,13 +591,14 @@ class Markdown(MarkdownParser):
return []
from bs4 import BeautifulSoup
html_content = markdown(text)
soup = BeautifulSoup(html_content, 'html.parser')
soup = BeautifulSoup(html_content, "html.parser")
return soup
def get_hyperlink_urls(self, soup):
if soup:
return set([a.get('href') for a in soup.find_all('a') if a.get('href')])
return set([a.get("href") for a in soup.find_all("a") if a.get("href")])
return []
def extract_image_urls_with_lines(self, text):
@ -588,10 +621,10 @@ class Markdown(MarkdownParser):
try:
from bs4 import BeautifulSoup
soup = BeautifulSoup(text, 'html.parser')
soup = BeautifulSoup(text, "html.parser")
newline_offsets = [m.start() for m in re.finditer(r"\n", text)] + [len(text)]
for img_tag in soup.find_all('img'):
src = img_tag.get('src')
for img_tag in soup.find_all("img"):
src = img_tag.get("src")
if not src:
continue
@ -627,14 +660,14 @@ class Markdown(MarkdownParser):
continue
img_obj = None
try:
if url.startswith(('http://', 'https://')):
if url.startswith(("http://", "https://")):
response = requests.get(url, stream=True, timeout=30)
if response.status_code == 200 and response.headers.get('Content-Type', '').startswith('image/'):
img_obj = Image.open(BytesIO(response.content)).convert('RGB')
if response.status_code == 200 and response.headers.get("Content-Type", "").startswith("image/"):
img_obj = Image.open(BytesIO(response.content)).convert("RGB")
else:
local_path = Path(url)
if local_path.exists():
img_obj = Image.open(url).convert('RGB')
img_obj = Image.open(url).convert("RGB")
else:
logging.warning(f"Local image file not found: {url}")
except Exception as e:
@ -652,7 +685,7 @@ class Markdown(MarkdownParser):
with open(filename, "r") as f:
txt = f.read()
remainder, tables = self.extract_tables_and_remainder(f'{txt}\n', separate_tables=separate_tables)
remainder, tables = self.extract_tables_and_remainder(f"{txt}\n", separate_tables=separate_tables)
# To eliminate duplicate tables in chunking result, uncomment code below and set separate_tables to True in line 410.
# extractor = MarkdownElementExtractor(remainder)
extractor = MarkdownElementExtractor(txt)
@ -678,7 +711,7 @@ class Markdown(MarkdownParser):
tbls = []
for table in tables:
tbls.append(((None, markdown(table, extensions=['markdown.extensions.tables'])), ""))
tbls.append(((None, markdown(table, extensions=["markdown.extensions.tables"])), ""))
if return_section_images:
return sections, tbls, section_images
return sections, tbls
@ -694,7 +727,7 @@ def load_from_xml_v2(baseURI, rels_item_xml):
if rels_item_xml is not None:
rels_elm = parse_xml(rels_item_xml)
for rel_elm in rels_elm.Relationship_lst:
if rel_elm.target_ref in ('../NULL', 'NULL'):
if rel_elm.target_ref in ("../NULL", "NULL"):
continue
srels._srels.append(_SerializedRelationship(baseURI, rel_elm))
return srels
@ -702,21 +735,18 @@ def load_from_xml_v2(baseURI, rels_item_xml):
def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
"""
Supported file formats are docx, pdf, excel, txt.
This method apply the naive ways to chunk files.
Successive text will be sliced into pieces using 'delimiter'.
Next, these successive pieces are merge into chunks whose token number is no more than 'Max token number'.
Supported file formats are docx, pdf, excel, txt.
This method apply the naive ways to chunk files.
Successive text will be sliced into pieces using 'delimiter'.
Next, these successive pieces are merge into chunks whose token number is no more than 'Max token number'.
"""
urls = set()
url_res = []
is_english = lang.lower() == "english" # is_english(cks)
parser_config = kwargs.get(
"parser_config", {
"chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC", "analyze_hyperlink": True})
parser_config = kwargs.get("parser_config", {"chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC", "analyze_hyperlink": True})
child_deli = (parser_config.get("children_delimiter") or "").encode('utf-8').decode('unicode_escape').encode(
'latin1').decode('utf-8')
child_deli = (parser_config.get("children_delimiter") or "").encode("utf-8").decode("unicode_escape").encode("latin1").decode("utf-8")
cust_child_deli = re.findall(r"`([^`]+)`", child_deli)
child_deli = "|".join(re.sub(r"`([^`]+)`", "", child_deli))
if cust_child_deli:
@ -728,10 +758,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
table_context_size = max(0, int(parser_config.get("table_context_size", 0) or 0))
image_context_size = max(0, int(parser_config.get("image_context_size", 0) or 0))
doc = {
"docnm_kwd": filename,
"title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
}
doc = {"docnm_kwd": filename, "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))}
doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
res = []
pdf_parser = None
@ -750,8 +777,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
# Recursively chunk each embedded file and collect results
for embed_filename, embed_bytes in embeds:
try:
sub_res = chunk(embed_filename, binary=embed_bytes, lang=lang, callback=callback, is_root=False,
**kwargs) or []
sub_res = chunk(embed_filename, binary=embed_bytes, lang=lang, callback=callback, is_root=False, **kwargs) or []
embed_res.extend(sub_res)
except Exception as e:
error_msg = f"Failed to chunk embed {embed_filename}: {e}"
@ -772,8 +798,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
sub_url_res = chunk(url, html_bytes, callback=callback, lang=lang, is_root=False, **kwargs)
except Exception as e:
logging.info(f"Failed to chunk url in registered file type {url}: {e}")
sub_url_res = chunk(f"{index}.html", html_bytes, callback=callback, lang=lang, is_root=False,
**kwargs)
sub_url_res = chunk(f"{index}.html", html_bytes, callback=callback, lang=lang, is_root=False, **kwargs)
url_res.extend(sub_url_res)
# fix "There is no item named 'word/NULL' in the archive", referring to https://github.com/python-openxml/python-docx/issues/1105#issuecomment-1298075246
@ -784,11 +809,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
# chunks list[dict]
# images list - index of image chunk in chunks
chunks, images = naive_merge_docx(
sections, int(parser_config.get(
"chunk_token_num", 128)), parser_config.get(
"delimiter", "\n!?。;!?"), table_context_size, image_context_size)
chunks, images = naive_merge_docx(sections, int(parser_config.get("chunk_token_num", 128)), parser_config.get("delimiter", "\n!?。;!?"), table_context_size, image_context_size)
vision_figure_parser_docx_wrapper_naive(chunks=chunks, idx_lst=images, callback=callback, **kwargs)
callback(0.8, "Finish parsing.")
@ -801,9 +823,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
return res
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
layout_recognizer, parser_model_name = normalize_layout_recognizer(
parser_config.get("layout_recognize", "DeepDOC")
)
layout_recognizer, parser_model_name = normalize_layout_recognizer(parser_config.get("layout_recognize", "DeepDOC"))
if parser_config.get("analyze_hyperlink", False) and is_root:
urls = extract_links_from_pdf(binary)
@ -824,7 +844,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
callback=callback,
layout_recognizer=layout_recognizer,
mineru_llm_name=parser_model_name,
**kwargs
paddleocr_llm_name=parser_model_name,
**kwargs,
)
if not sections and not tables:
@ -833,7 +854,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
if table_context_size or image_context_size:
tables = append_context2table_image4pdf(sections, tables, image_context_size)
if name in ["tcadp", "docling", "mineru"]:
if name in ["tcadp", "docling", "mineru", "paddleocr"]:
parser_config["chunk_token_num"] = 0
res = tokenize_table(tables, doc, is_english)
@ -847,10 +868,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
if layout_recognizer == "TCADP Parser":
table_result_type = parser_config.get("table_result_type", "1")
markdown_image_response_type = parser_config.get("markdown_image_response_type", "1")
tcadp_parser = TCADPParser(
table_result_type=table_result_type,
markdown_image_response_type=markdown_image_response_type
)
tcadp_parser = TCADPParser(table_result_type=table_result_type, markdown_image_response_type=markdown_image_response_type)
if not tcadp_parser.check_installation():
callback(-1, "TCADP parser not available. Please check Tencent Cloud API configuration.")
return res
@ -858,13 +876,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
# Determine file type based on extension
file_type = "XLSX" if re.search(r"\.xlsx?$", filename, re.IGNORECASE) else "CSV"
sections, tables = tcadp_parser.parse_pdf(
filepath=filename,
binary=binary,
callback=callback,
output_dir=os.environ.get("TCADP_OUTPUT_DIR", ""),
file_type=file_type
)
sections, tables = tcadp_parser.parse_pdf(filepath=filename, binary=binary, callback=callback, output_dir=os.environ.get("TCADP_OUTPUT_DIR", ""), file_type=file_type)
parser_config["chunk_token_num"] = 0
res = tokenize_table(tables, doc, is_english)
callback(0.8, "Finish parsing.")
@ -879,9 +891,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
elif re.search(r"\.(txt|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|sql)$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
sections = TxtParser()(filename, binary,
parser_config.get("chunk_token_num", 128),
parser_config.get("delimiter", "\n!?;。;!?"))
sections = TxtParser()(filename, binary, parser_config.get("chunk_token_num", 128), parser_config.get("delimiter", "\n!?;。;!?"))
callback(0.8, "Finish parsing.")
elif re.search(r"\.(md|markdown|mdx)$", filename, re.IGNORECASE):
@ -919,11 +929,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
else:
section_images = [None] * len(sections)
section_images[idx] = combined_image
markdown_vision_parser = VisionFigureParser(vision_model=vision_model, figures_data=[
((combined_image, ["markdown image"]), [(0, 0, 0, 0, 0)])], **kwargs)
markdown_vision_parser = VisionFigureParser(vision_model=vision_model, figures_data=[((combined_image, ["markdown image"]), [(0, 0, 0, 0, 0)])], **kwargs)
boosted_figures = markdown_vision_parser(callback=callback)
sections[idx] = (section_text + "\n\n" + "\n\n".join([fig[0][1] for fig in boosted_figures]),
sections[idx][1])
sections[idx] = (section_text + "\n\n" + "\n\n".join([fig[0][1] for fig in boosted_figures]), sections[idx][1])
else:
logging.warning("No visual model detected. Skipping figure parsing enhancement.")
@ -962,8 +970,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
binary = BytesIO(binary)
doc_parsed = tika_parser.from_buffer(binary)
if doc_parsed.get('content', None) is not None:
sections = doc_parsed['content'].split('\n')
if doc_parsed.get("content", None) is not None:
sections = doc_parsed["content"].split("\n")
sections = [(_, "") for _ in sections if _]
callback(0.8, "Finish parsing.")
else:
@ -972,8 +980,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
logging.warning(error_msg)
return []
else:
raise NotImplementedError(
"file type not supported yet(pdf, xlsx, doc, docx, txt supported)")
raise NotImplementedError("file type not supported yet(pdf, xlsx, doc, docx, txt supported)")
st = timer()
if is_markdown:
@ -1021,8 +1028,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
has_images = merged_images and any(img is not None for img in merged_images)
if has_images:
res.extend(tokenize_chunks_with_images(chunks, doc, is_english, merged_images,
child_delimiters_pattern=child_deli))
res.extend(tokenize_chunks_with_images(chunks, doc, is_english, merged_images, child_delimiters_pattern=child_deli))
else:
res.extend(tokenize_chunks(chunks, doc, is_english, pdf_parser, child_delimiters_pattern=child_deli))
else:
@ -1031,17 +1037,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
section_images = None
if section_images:
chunks, images = naive_merge_with_images(sections, section_images,
int(parser_config.get(
"chunk_token_num", 128)), parser_config.get(
"delimiter", "\n!?。;!?"))
res.extend(
tokenize_chunks_with_images(chunks, doc, is_english, images, child_delimiters_pattern=child_deli))
chunks, images = naive_merge_with_images(sections, section_images, int(parser_config.get("chunk_token_num", 128)), parser_config.get("delimiter", "\n!?。;!?"))
res.extend(tokenize_chunks_with_images(chunks, doc, is_english, images, child_delimiters_pattern=child_deli))
else:
chunks = naive_merge(
sections, int(parser_config.get(
"chunk_token_num", 128)), parser_config.get(
"delimiter", "\n!?。;!?"))
chunks = naive_merge(sections, int(parser_config.get("chunk_token_num", 128)), parser_config.get("delimiter", "\n!?。;!?"))
res.extend(tokenize_chunks(chunks, doc, is_english, pdf_parser, child_delimiters_pattern=child_deli))
@ -1071,9 +1070,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
if __name__ == "__main__":
import sys
def dummy(prog=None, msg=""):
pass
chunk(sys.argv[1], from_page=0, to_page=10, callback=dummy)

View File

@ -28,18 +28,12 @@ from common.parser_config_utils import normalize_layout_recognizer
class Pdf(PdfParser):
def __call__(self, filename, binary=None, from_page=0,
to_page=100000, zoomin=3, callback=None):
def __call__(self, filename, binary=None, from_page=0, to_page=100000, zoomin=3, callback=None):
from timeit import default_timer as timer
start = timer()
callback(msg="OCR started")
self.__images__(
filename if not binary else binary,
zoomin,
from_page,
to_page,
callback
)
self.__images__(filename if not binary else binary, zoomin, from_page, to_page, callback)
callback(msg="OCR finished ({:.2f}s)".format(timer() - start))
start = timer()
@ -57,21 +51,16 @@ class Pdf(PdfParser):
tbls = self._extract_table_figure(True, zoomin, True, True)
self._concat_downward()
sections = [(b["text"], self.get_position(b, zoomin))
for i, b in enumerate(self.boxes)]
return [(txt, "") for txt, _ in sorted(sections, key=lambda x: (
x[-1][0][0], x[-1][0][3], x[-1][0][1]))], tbls
sections = [(b["text"], self.get_position(b, zoomin)) for i, b in enumerate(self.boxes)]
return [(txt, "") for txt, _ in sorted(sections, key=lambda x: (x[-1][0][0], x[-1][0][3], x[-1][0][1]))], tbls
def chunk(filename, binary=None, from_page=0, to_page=100000,
lang="Chinese", callback=None, **kwargs):
def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
"""
Supported file formats are docx, pdf, excel, txt.
One file forms a chunk which maintains original text order.
Supported file formats are docx, pdf, excel, txt.
One file forms a chunk which maintains original text order.
"""
parser_config = kwargs.get(
"parser_config", {
"chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC"})
parser_config = kwargs.get("parser_config", {"chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC"})
eng = lang.lower() == "english" # is_english(cks)
if re.search(r"\.docx$", filename, re.IGNORECASE):
@ -99,9 +88,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
callback(0.8, "Finish parsing.")
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
layout_recognizer, parser_model_name = normalize_layout_recognizer(
parser_config.get("layout_recognize", "DeepDOC")
)
layout_recognizer, parser_model_name = normalize_layout_recognizer(parser_config.get("layout_recognize", "DeepDOC"))
if isinstance(layout_recognizer, bool):
layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
@ -120,13 +107,14 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
pdf_cls=Pdf,
layout_recognizer=layout_recognizer,
mineru_llm_name=parser_model_name,
**kwargs
paddleocr_llm_name=parser_model_name,
**kwargs,
)
if not sections and not tbls:
return []
if name in ["tcadp", "docling", "mineru"]:
if name in ["tcadp", "docling", "mineru", "paddleocr"]:
parser_config["chunk_token_num"] = 0
callback(0.8, "Finish parsing.")
@ -134,8 +122,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
for (img, rows), poss in tbls:
if not rows:
continue
sections.append((rows if isinstance(rows, str) else rows[0],
[(p[0] + 1 - from_page, p[1], p[2], p[3], p[4]) for p in poss]))
sections.append((rows if isinstance(rows, str) else rows[0], [(p[0] + 1 - from_page, p[1], p[2], p[3], p[4]) for p in poss]))
sections = [s for s, _ in sections if s]
elif re.search(r"\.xlsx?$", filename, re.IGNORECASE):
@ -167,19 +154,15 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
binary = BytesIO(binary)
doc_parsed = tika_parser.from_buffer(binary)
if doc_parsed.get('content', None) is not None:
sections = doc_parsed['content'].split('\n')
if doc_parsed.get("content", None) is not None:
sections = doc_parsed["content"].split("\n")
sections = [s for s in sections if s]
callback(0.8, "Finish parsing.")
else:
raise NotImplementedError(
"file type not supported yet(doc, docx, pdf, txt supported)")
raise NotImplementedError("file type not supported yet(doc, docx, pdf, txt supported)")
doc = {
"docnm_kwd": filename,
"title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
}
doc = {"docnm_kwd": filename, "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))}
doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
tokenize(doc, "\n".join(sections), eng)
return [doc]
@ -188,9 +171,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
if __name__ == "__main__":
import sys
def dummy(prog=None, msg=""):
pass
chunk(sys.argv[1], from_page=0, to_page=10, callback=dummy)

View File

@ -36,22 +36,18 @@ class Ppt(PptParser):
callback(0.5, "Text extraction finished.")
import aspose.slides as slides
import aspose.pydrawing as drawing
imgs = []
with slides.Presentation(BytesIO(fnm)) as presentation:
for i, slide in enumerate(presentation.slides[from_page: to_page]):
for i, slide in enumerate(presentation.slides[from_page:to_page]):
try:
with BytesIO() as buffered:
slide.get_thumbnail(
0.1, 0.1).save(
buffered, drawing.imaging.ImageFormat.jpeg)
slide.get_thumbnail(0.1, 0.1).save(buffered, drawing.imaging.ImageFormat.jpeg)
buffered.seek(0)
imgs.append(Image.open(buffered).copy())
except RuntimeError as e:
raise RuntimeError(
f'ppt parse error at page {i + 1}, original error: {str(e)}') from e
assert len(imgs) == len(
txts), "Slides text and image do not match: {} vs. {}".format(
len(imgs), len(txts))
raise RuntimeError(f"ppt parse error at page {i + 1}, original error: {str(e)}") from e
assert len(imgs) == len(txts), "Slides text and image do not match: {} vs. {}".format(len(imgs), len(txts))
callback(0.9, "Image extraction finished")
self.is_english = is_english(txts)
return [(txts[i], imgs[i]) for i in range(len(txts))]
@ -61,12 +57,10 @@ class Pdf(PdfParser):
def __init__(self):
super().__init__()
def __call__(self, filename, binary=None, from_page=0,
to_page=100000, zoomin=3, callback=None, **kwargs):
def __call__(self, filename, binary=None, from_page=0, to_page=100000, zoomin=3, callback=None, **kwargs):
# 1. OCR
callback(msg="OCR started")
self.__images__(filename if not binary else binary, zoomin, from_page,
to_page, callback)
self.__images__(filename if not binary else binary, zoomin, from_page, to_page, callback)
# 2. Layout Analysis
callback(msg="Layout Analysis")
@ -91,12 +85,7 @@ class Pdf(PdfParser):
global_page_num = b["page_number"] + from_page
if not (from_page < global_page_num <= to_page + from_page):
continue
page_items[global_page_num].append({
"top": b["top"],
"x0": b["x0"],
"text": b["text"],
"type": "text"
})
page_items[global_page_num].append({"top": b["top"], "x0": b["x0"], "text": b["text"], "type": "text"})
# (B) Add table and figure
for (img, content), positions in tbls:
@ -127,12 +116,7 @@ class Pdf(PdfParser):
top = positions[0][3]
left = positions[0][1]
page_items[current_page_num].append({
"top": top,
"x0": left,
"text": final_text,
"type": "table_or_figure"
})
page_items[current_page_num].append({"top": top, "x0": left, "text": final_text, "type": "table_or_figure"})
# 7. Generate result
res = []
@ -153,18 +137,16 @@ class Pdf(PdfParser):
class PlainPdf(PlainParser):
def __call__(self, filename, binary=None, from_page=0,
to_page=100000, callback=None, **kwargs):
def __call__(self, filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
self.pdf = pdf2_read(filename if not binary else BytesIO(binary))
page_txt = []
for page in self.pdf.pages[from_page: to_page]:
for page in self.pdf.pages[from_page:to_page]:
page_txt.append(page.extract_text())
callback(0.9, "Parsing finished")
return [(txt, None) for txt in page_txt], []
def chunk(filename, binary=None, from_page=0, to_page=100000,
lang="Chinese", callback=None, parser_config=None, **kwargs):
def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, parser_config=None, **kwargs):
"""
The supported file formats are pdf, pptx.
Every page will be treated as a chunk. And the thumbnail of every page will be stored.
@ -173,18 +155,12 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
if parser_config is None:
parser_config = {}
eng = lang.lower() == "english"
doc = {
"docnm_kwd": filename,
"title_tks": rag_tokenizer.tokenize(
re.sub(r"\.[a-zA-Z]+$", "", filename))
}
doc = {"docnm_kwd": filename, "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))}
doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
res = []
if re.search(r"\.pptx?$", filename, re.IGNORECASE):
ppt_parser = Ppt()
for pn, (txt, img) in enumerate(ppt_parser(
filename if not binary else binary, from_page, 1000000,
callback)):
for pn, (txt, img) in enumerate(ppt_parser(filename if not binary else binary, from_page, 1000000, callback)):
d = copy.deepcopy(doc)
pn += from_page
d["image"] = img
@ -196,9 +172,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
res.append(d)
return res
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
layout_recognizer, parser_model_name = normalize_layout_recognizer(
parser_config.get("layout_recognize", "DeepDOC")
)
layout_recognizer, parser_model_name = normalize_layout_recognizer(parser_config.get("layout_recognize", "DeepDOC"))
if isinstance(layout_recognizer, bool):
layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
@ -217,13 +191,14 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
pdf_cls=Pdf,
layout_recognizer=layout_recognizer,
mineru_llm_name=parser_model_name,
**kwargs
paddleocr_llm_name=parser_model_name,
**kwargs,
)
if not sections:
return []
if name in ["tcadp", "docling", "mineru"]:
if name in ["tcadp", "docling", "mineru", "paddleocr"]:
parser_config["chunk_token_num"] = 0
callback(0.8, "Finish parsing.")
@ -236,22 +211,18 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
d["image"] = img
d["page_num_int"] = [pn + 1]
d["top_int"] = [0]
d["position_int"] = [(pn + 1, 0, img.size[0] if img else 0, 0,
img.size[1] if img else 0)]
d["position_int"] = [(pn + 1, 0, img.size[0] if img else 0, 0, img.size[1] if img else 0)]
tokenize(d, txt, eng)
res.append(d)
return res
raise NotImplementedError(
"file type not supported yet(pptx, pdf supported)")
raise NotImplementedError("file type not supported yet(pptx, pdf supported)")
if __name__ == "__main__":
import sys
def dummy(a, b):
pass
chunk(sys.argv[1], callback=dummy)

View File

@ -166,7 +166,7 @@ class ParserParam(ProcessParamBase):
pdf_parse_method = pdf_config.get("parse_method", "")
self.check_empty(pdf_parse_method, "Parse method abnormal.")
if pdf_parse_method.lower() not in ["deepdoc", "plain_text", "mineru", "tcadp parser"]:
if pdf_parse_method.lower() not in ["deepdoc", "plain_text", "mineru", "tcadp parser", "paddleocr"]:
self.check_empty(pdf_config.get("lang", ""), "PDF VLM language")
pdf_output_format = pdf_config.get("output_format", "")
@ -232,6 +232,9 @@ class Parser(ProcessBase):
if lowered.endswith("@mineru"):
parser_model_name = raw_parse_method.rsplit("@", 1)[0]
parse_method = "MinerU"
elif lowered.endswith("@paddleocr"):
parser_model_name = raw_parse_method.rsplit("@", 1)[0]
parse_method = "PaddleOCR"
if parse_method.lower() == "deepdoc":
bboxes = RAGFlowPdfParser().parse_into_bboxes(blob, callback=self.callback)
@ -239,6 +242,7 @@ class Parser(ProcessBase):
lines, _ = PlainParser()(blob)
bboxes = [{"text": t} for t, _ in lines]
elif parse_method.lower() == "mineru":
def resolve_mineru_llm_name():
configured = parser_model_name or conf.get("mineru_llm_name")
if configured:
@ -320,6 +324,84 @@ class Parser(ProcessBase):
bboxes.append({"text": section})
else:
bboxes.append({"text": section})
elif parse_method.lower() == "paddleocr":
def resolve_paddleocr_llm_name():
configured = parser_model_name or conf.get("paddleocr_llm_name")
if configured:
return configured
tenant_id = self._canvas._tenant_id
if not tenant_id:
return None
from api.db.services.tenant_llm_service import TenantLLMService
env_name = TenantLLMService.ensure_paddleocr_from_env(tenant_id)
candidates = TenantLLMService.query(tenant_id=tenant_id, llm_factory="PaddleOCR", model_type=LLMType.OCR.value)
if candidates:
return candidates[0].llm_name
return env_name
parser_model_name = resolve_paddleocr_llm_name()
if not parser_model_name:
raise RuntimeError("PaddleOCR model not configured. Please add PaddleOCR in Model Providers or set PADDLEOCR_* env.")
tenant_id = self._canvas._tenant_id
ocr_model = LLMBundle(tenant_id, LLMType.OCR, llm_name=parser_model_name)
pdf_parser = ocr_model.mdl
lines, _ = pdf_parser.parse_pdf(
filepath=name,
binary=blob,
callback=self.callback,
parse_method=conf.get("paddleocr_parse_method", "raw"),
)
bboxes = []
for section in lines:
# PaddleOCRParser returns sections as tuple, different formats based on parse_method:
# - "raw": (text, position_tag)
# - "manual": (text, label, position_tag)
# - "paper": (text_with_tag, label)
text = section[0]
# Parse position tag if exists
position_tag = ""
if len(section) > 1:
if len(section) == 2: # raw format: (text, tag)
position_tag = section[1]
elif len(section) == 3: # manual format: (text, label, tag)
position_tag = section[2]
elif "paper" in conf.get("paddleocr_parse_method", "") and len(section) == 2:
# paper format: text may contain tag
text_with_tag = text
import re
tag_match = re.search(r"(@@[0-9-]+\t[0-9.\t]+##)", text_with_tag)
if tag_match:
position_tag = tag_match.group(1)
text = text_with_tag.replace(position_tag, "").strip()
# Extract coordinate information from position tag
page_number, x0, x1, top, bottom = 1, 0, 0, 0, 0
if position_tag:
import re
tag_match = re.match(r"@@([0-9-]+)\t([0-9.]+)\t([0-9.]+)\t([0-9.]+)\t([0-9.]+)##", position_tag)
if tag_match:
pn, x0_str, x1_str, top_str, bottom_str = tag_match.groups()
page_number = int(pn.split("-")[0]) # Take first page number
x0, x1, top, bottom = float(x0_str), float(x1_str), float(top_str), float(bottom_str)
box = {
"text": text,
"page_number": page_number,
"x0": x0,
"x1": x1,
"top": top,
"bottom": bottom,
}
bboxes.append(box)
else:
vision_model = LLMBundle(self._canvas._tenant_id, LLMType.IMAGE2TEXT, llm_name=conf.get("parse_method"), lang=self._param.setups["pdf"].get("lang"))
lines, _ = VisionParser(vision_model=vision_model)(blob, callback=self.callback)
@ -802,7 +884,7 @@ class Parser(ProcessBase):
outs = self.output()
tasks = []
for d in outs.get("json", []):
tasks.append(asyncio.create_task(image2id(d,partial(settings.STORAGE_IMPL.put, tenant_id=self._canvas._tenant_id),get_uuid())))
tasks.append(asyncio.create_task(image2id(d, partial(settings.STORAGE_IMPL.put, tenant_id=self._canvas._tenant_id), get_uuid())))
try:
await asyncio.gather(*tasks, return_exceptions=False)

View File

@ -19,6 +19,7 @@ import os
from typing import Any, Optional
from deepdoc.parser.mineru_parser import MinerUParser
from deepdoc.parser.paddleocr_parser import PaddleOCRParser
class Base:
@ -60,16 +61,11 @@ class MinerUOcrModel(Base, MinerUParser):
# Redact sensitive config keys before logging
redacted_config = {}
for k, v in config.items():
if any(
sensitive_word in k.lower()
for sensitive_word in ("key", "password", "token", "secret")
):
if any(sensitive_word in k.lower() for sensitive_word in ("key", "password", "token", "secret")):
redacted_config[k] = "[REDACTED]"
else:
redacted_config[k] = v
logging.info(
f"Parsed MinerU config (sensitive fields redacted): {redacted_config}"
)
logging.info(f"Parsed MinerU config (sensitive fields redacted): {redacted_config}")
MinerUParser.__init__(self, mineru_api=self.mineru_api, mineru_server_url=self.mineru_server_url)
@ -93,6 +89,60 @@ class MinerUOcrModel(Base, MinerUParser):
server_url=self.mineru_server_url,
delete_output=self.mineru_delete_output,
parse_method=parse_method,
**kwargs
**kwargs,
)
return sections, tables
class PaddleOCROcrModel(Base, PaddleOCRParser):
    """OCR model backed by a PaddleOCR service.

    Combines the generic ``Base`` model interface with ``PaddleOCRParser``.
    Connection settings are resolved from the provider key (JSON string or
    dict) and fall back to ``PADDLEOCR_*`` environment variables.
    """

    _FACTORY_NAME = "PaddleOCR"

    def __init__(self, key: str | dict, model_name: str, **kwargs):
        """Resolve the PaddleOCR settings and initialise the parser.

        Args:
            key: Provider configuration, either a JSON string or an already
                decoded dict. Two layouts are accepted: nested
                ``{"api_key": {...}}`` (from the UI) or a flat mapping of
                ``paddleocr_*`` / ``PADDLEOCR_*`` entries.
            model_name: Name of the model, forwarded to ``Base``.
            **kwargs: Forwarded to ``Base.__init__``.
        """
        Base.__init__(self, key, model_name, **kwargs)

        # Accept a dict as-is (the signature allows it); only strings need
        # decoding. An undecodable key falls back to an empty config so the
        # environment variables can still supply the settings.
        if isinstance(key, dict):
            raw_config = key
        elif key:
            try:
                raw_config = json.loads(key)
            except Exception:
                raw_config = {}
        else:
            raw_config = {}
        # Valid JSON is not necessarily an object (e.g. '"abc"' or '[1]');
        # guard so the `.get` below cannot fail on it.
        if not isinstance(raw_config, dict):
            raw_config = {}

        # nested {"api_key": {...}} from UI
        # flat {"PADDLEOCR_*": "..."} payload auto-provisioned from env vars
        config = raw_config.get("api_key", raw_config)
        if not isinstance(config, dict):
            config = {}

        def _resolve_config(name: str, env_key: str, default=""):
            # lower-case keys (UI), upper-case PADDLEOCR_* (env auto-provision), env vars
            return config.get(name, config.get(env_key, os.environ.get(env_key, default)))

        self.paddleocr_api_url = _resolve_config("paddleocr_api_url", "PADDLEOCR_API_URL", "")
        self.paddleocr_algorithm = _resolve_config("paddleocr_algorithm", "PADDLEOCR_ALGORITHM", "PaddleOCR-VL")
        self.paddleocr_access_token = _resolve_config("paddleocr_access_token", "PADDLEOCR_ACCESS_TOKEN", None)

        # Redact sensitive config keys before logging
        sensitive_words = ("key", "password", "token", "secret")
        redacted_config = {
            k: "[REDACTED]" if any(word in k.lower() for word in sensitive_words) else v
            for k, v in config.items()
        }
        logging.info(f"Parsed PaddleOCR config (sensitive fields redacted): {redacted_config}")

        PaddleOCRParser.__init__(
            self,
            api_url=self.paddleocr_api_url,
            access_token=self.paddleocr_access_token,
            algorithm=self.paddleocr_algorithm,
        )

    def check_available(self) -> tuple[bool, str]:
        """Return ``(ok, reason)`` as reported by ``check_installation``."""
        return self.check_installation()

    def parse_pdf(self, filepath: str, binary=None, callback=None, parse_method: str = "raw", **kwargs):
        """Parse a PDF through PaddleOCR after checking availability.

        Args:
            filepath: Path or name of the PDF, forwarded to the parser.
            binary: Optional PDF content, forwarded to the parser.
            callback: Optional progress callback, forwarded to the parser.
            parse_method: Output layout requested from the parser.
            **kwargs: Forwarded to ``PaddleOCRParser.parse_pdf``.

        Returns:
            The ``(sections, tables)`` pair produced by ``PaddleOCRParser.parse_pdf``.

        Raises:
            RuntimeError: If ``check_available`` reports the service as unavailable.
        """
        ok, reason = self.check_available()
        if not ok:
            raise RuntimeError(f"PaddleOCR server not accessible: {reason}")

        sections, tables = PaddleOCRParser.parse_pdf(self, filepath=filepath, binary=binary, callback=callback, parse_method=parse_method, **kwargs)
        return sections, tables

View File

@ -0,0 +1,14 @@
<?xml version="1.0" encoding="UTF-8"?>
<svg version="1.1" xmlns="http://www.w3.org/2000/svg" width="204" height="204">
<path d="M0 0 C5.28 0 10.56 0 16 0 C16.15427797 5.78542375 16.22313666 9.66447202 14 15 C13.4176812 17.42810911 12.8778743 19.86670228 12.375 22.3125 C12.11460938 23.56675781 11.85421875 24.82101563 11.5859375 26.11328125 C11.39257812 27.06589844 11.19921875 28.01851562 11 29 C10.34 29 9.68 29 9 29 C9.04125 29.86625 9.0825 30.7325 9.125 31.625 C8.99380505 35.16726367 8.15585649 37.66841364 7 41 C6.65596692 43.33177975 6.32050798 45.66487041 6 48 C5.67 48.33 5.34 48.66 5 49 C3.55833967 54.24854466 2.68835557 59.60788136 2 65 C1.34 65 0.68 65 0 65 C-0.09796875 65.63164063 -0.1959375 66.26328125 -0.296875 66.9140625 C-1.22159497 72.44643887 -2.45538005 77.74845326 -4.0625 83.125 C-5.64813561 88.477257 -6.61641909 93.43807684 -7 99 C-7.66 99 -8.32 99 -9 99 C-8.95875 100.0725 -8.9175 101.145 -8.875 102.25 C-8.99814042 105.94421261 -9.40497 106.89631346 -11 110 C-11.52482966 111.79980854 -11.9996991 113.61456629 -12.4375 115.4375 C-12.69273438 116.49453125 -12.94796875 117.5515625 -13.2109375 118.640625 C-13.47132812 119.74921875 -13.73171875 120.8578125 -14 122 C-14.26039062 123.10859375 -14.52078125 124.2171875 -14.7890625 125.359375 C-16.23108798 131.56445435 -17.62923242 137.77882404 -19 144 C-19.66 144 -20.32 144 -21 144 C-21.12375 145.19625 -21.2475 146.3925 -21.375 147.625 C-21.77380462 151.48011134 -22.70431438 154.11294313 -24 158 C-29.28 158 -34.56 158 -40 158 C-39.50048992 151.75612399 -37.93599284 146.12864787 -36.16015625 140.1328125 C-34.83094643 135.52383644 -33.76462788 130.86061164 -32.6875 126.1875 C-30.51622401 116.78617956 -27.95798491 107.51855943 -25.23510742 98.26245117 C-24.94482666 97.25802979 -24.6545459 96.2536084 -24.35546875 95.21875 C-24.09403076 94.33123047 -23.83259277 93.44371094 -23.56323242 92.52929688 C-22.81886716 89.6184652 -22.81886716 89.6184652 -23 85 C-32.9 85 -42.8 85 -53 85 C-51.89473684 75.05263158 -51.89473684 75.05263158 -50 70 C-49.76023438 69.23042969 -49.52046875 68.46085937 -49.2734375 67.66796875 C-48.89058594 
66.44013672 -48.89058594 66.44013672 -48.5 65.1875 C-48.2628125 64.41792969 -48.025625 63.64835937 -47.78125 62.85546875 C-47 61 -47 61 -45 60 C-42.67583704 59.91413191 -40.34916693 59.89288957 -38.0234375 59.90234375 C-36.99556625 59.90446617 -36.99556625 59.90446617 -35.94692993 59.90663147 C-33.75624239 59.91223325 -31.5656567 59.92478594 -29.375 59.9375 C-27.89062648 59.94251478 -26.40625135 59.94707772 -24.921875 59.95117188 C-21.28121685 59.96220323 -17.64061836 59.97946901 -14 60 C-13.94376465 59.35498535 -13.8875293 58.7099707 -13.82958984 58.04541016 C-12.96693744 48.72910992 -11.24508629 39.82715762 -7.74609375 31.125 C-6.4362897 27.39445861 -5.68000873 23.5319145 -4.828125 19.67578125 C-3.34357442 13.08633739 -1.65158 6.54936895 0 0 Z " fill="#2831DF" transform="translate(53,46)"/>
<path d="M0 0 C5.28 0 10.56 0 16 0 C15.235399 10.70441398 12.26943536 20.9810415 9.08203125 31.1875 C7.02129712 37.82748353 5.22384225 44.51569215 3.5 51.25 C3.19739258 52.42224121 2.89478516 53.59448242 2.58300781 54.80224609 C0.71701016 62.07123696 -1.04512446 69.35696753 -2.71826172 76.67236328 C-3.85473071 81.57050729 -5.31273138 86.26962192 -7 91 C-8.59119364 96.03877987 -9.44914797 100.76690573 -10 106 C-10.66 106 -11.32 106 -12 106 C-12.03738281 106.70125 -12.07476562 107.4025 -12.11328125 108.125 C-12.48264852 113.45547666 -13.1249218 117.94818256 -15.18359375 122.87109375 C-16.93149684 127.42902286 -17.86250115 132.26131996 -19 137 C-19.34588869 138.38455735 -19.69363488 139.76865163 -20.04296875 141.15234375 C-20.222229 141.86261719 -20.40148926 142.57289063 -20.58618164 143.3046875 C-20.93785233 144.69671066 -21.28988648 146.08864207 -21.64233398 147.48046875 C-22.52090977 150.97902837 -23.29193239 154.45966195 -24 158 C-29.61 158 -35.22 158 -41 158 C-40.24791147 152.73538029 -39.34974669 147.7690517 -38.125 142.625 C-37.94718994 141.87186523 -37.76937988 141.11873047 -37.58618164 140.34277344 C-36.50974786 135.8706144 -35.25275445 131.54235526 -33.7109375 127.20703125 C-32.48712712 123.40783969 -31.6904287 119.50981261 -30.8203125 115.6171875 C-29.90622587 111.58650076 -28.93409458 107.5734341 -27.9375 103.5625 C-27.77515869 102.90048584 -27.61281738 102.23847168 -27.44555664 101.55639648 C-26.70403999 98.58317455 -25.89147112 95.67598058 -24.91796875 92.76953125 C-23.84587401 89.53499862 -23.40522292 86.37685768 -23 83 C-22.34 83 -21.68 83 -21 83 C-20.896875 81.741875 -20.79375 80.48375 -20.6875 79.1875 C-20.17690392 74.48853603 -19.00603931 70.13009242 -17.69921875 65.59765625 C-16.83798598 62.39810434 -16.36546873 59.2892186 -16 56 C-15.34 56 -14.68 56 -14 56 C-13.96624268 55.33282959 -13.93248535 54.66565918 -13.89770508 53.97827148 C-13.41579319 46.91496121 -11.9250941 40.75547542 -9.6875 34.0625 C-6.29781688 23.49159445 -3.92624433 12.9233607 -2 2 
C-1.34 2 -0.68 2 0 2 C0 1.34 0 0.68 0 0 Z " fill="#2831DF" transform="translate(134,46)"/>
<path d="M0 0 C2.43757413 -0.02698422 4.87484915 -0.04683099 7.3125 -0.0625 C8.00279297 -0.07087891 8.69308594 -0.07925781 9.40429688 -0.08789062 C12.5483025 -0.10307906 14.99420019 -0.00193327 18 1 C18 1.66 18 2.32 18 3 C18.66 3 19.32 3 20 3 C21.0064275 4.66279327 22.00585407 6.32983484 23 8 C23.99 8.66 24.98 9.32 26 10 C27.08301035 12.29598193 28.07796291 14.63477442 29 17 C29.350625 17.763125 29.70125 18.52625 30.0625 19.3125 C34.88894127 33.14829831 35.10756712 51.23425219 30 65 C28.24403886 68.52667826 26.22195061 71.75208274 24 75 C23.62488281 75.63164063 23.24976563 76.26328125 22.86328125 76.9140625 C21.38081833 79.38678722 20.48665592 80.81074492 17.75 81.875 C17.1725 81.91625 16.595 81.9575 16 82 C16 82.66 16 83.32 16 84 C6.01354054 84.86352013 -3.85054488 85.12644402 -13.875 85.0625 C-15.25650876 85.05746511 -16.63801933 85.05290579 -18.01953125 85.04882812 C-21.34640246 85.03719571 -24.67317602 85.0208528 -28 85 C-25.88008565 64.88008565 -25.88008565 64.88008565 -22 61 C-14.05344286 59.6982022 -6.19562196 60.02377079 1.82421875 60.390625 C7.89338929 60.57880147 7.89338929 60.57880147 13.078125 57.671875 C19.2379721 49.10818513 18.49422308 39.18370502 17 29 C16.34 29 15.68 29 15 29 C14.4740625 27.824375 14.4740625 27.824375 13.9375 26.625 C11.4077885 23.19764893 9.50197264 22.98385012 5.4296875 22.26953125 C1.91534221 21.87967623 -1.46670023 21.87151637 -5 22 C-4.76233962 20.09871698 -4.52388673 18.19752936 -4.28125 16.296875 C-4.09907454 14.80910876 -3.93145621 13.31945809 -3.78125 11.828125 C-3.39419678 8.15111943 -2.95070301 5.16989238 -1 2 C-0.67 1.34 -0.34 0.68 0 0 Z " fill="#2932DF" transform="translate(82,46)"/>
<path d="M0 0 C15.71406556 -1.02980527 15.71406556 -1.02980527 20.5 2 C28.76609987 9.71502654 33.63280425 23.17948938 34.09179688 34.20507812 C34.57281659 49.21054692 33.72804383 61.68962202 26 75 C24.38541822 76.71717245 22.72413232 78.39285243 21 80 C20.38125 80.7425 19.7625 81.485 19.125 82.25 C14.68229769 85.90869602 8.01215129 85.1279183 2.4921875 85.1328125 C1.45942184 85.13424759 1.45942184 85.13424759 0.40579224 85.13571167 C-1.03790377 85.13638732 -2.48160293 85.13457633 -3.92529297 85.13037109 C-6.1260461 85.12499745 -8.32659672 85.13034546 -10.52734375 85.13671875 C-11.93750045 85.13605818 -13.34765702 85.13477746 -14.7578125 85.1328125 C-16.66264404 85.13112061 -16.66264404 85.13112061 -18.60595703 85.12939453 C-21.54244033 85.01744401 -24.15099128 84.68422413 -27 84 C-26.31697126 78.58609855 -25.22293698 73.36322421 -23.9375 68.0625 C-23.75123047 67.28326172 -23.56496094 66.50402344 -23.37304688 65.70117188 C-22.91818991 63.80010303 -22.4594938 61.89995343 -22 60 C-21.01604248 60.00523682 -20.03208496 60.01047363 -19.01831055 60.01586914 C-15.36744089 60.03397029 -11.71657521 60.04545075 -8.06567383 60.05493164 C-6.48558892 60.05996014 -4.90550866 60.06678378 -3.32543945 60.07543945 C-1.05418754 60.08756992 1.21700285 60.09324313 3.48828125 60.09765625 C4.54780754 60.10539818 4.54780754 60.10539818 5.6287384 60.11329651 C8.70254391 60.11364803 11.0590442 59.9803186 14 59 C15.10836423 57.11683841 15.10836423 57.11683841 16 55 C16.66 54.01 17.32 53.02 18 52 C20.37643288 44.15777151 19.63162649 35.30648664 16 28 C13.87039364 25.37894602 12.27236626 23.57037108 9.1875 22.1875 C7.08572233 22.00734763 5.04601849 21.98845702 2.9375 22 C-1 22 -1 22 -4 21 C-3.25428948 16.35197907 -2.3717461 11.7381195 -1.4375 7.125 C-1.29892578 6.43664063 -1.16035156 5.74828125 -1.01757812 5.0390625 C-0.67927173 3.35919626 -0.33976088 1.67957266 0 0 Z " fill="#2831DF" transform="translate(161,46)"/>
<path d="M0 0 C2.8125 1.0625 2.8125 1.0625 5 3 C5.8125 6.6875 5.8125 6.6875 6 10 C6.66 10 7.32 10 8 10 C7.40672298 19.64855779 6.96022527 26.87405508 0 34 C-2 35.125 -2 35.125 -4 35 C-8.45279556 32.34952645 -9.59522559 29.86268066 -11 25 C-11.66 24.67 -12.32 24.34 -13 24 C-13.05425226 21.60385874 -13.09379027 19.20896147 -13.125 16.8125 C-13.14175781 16.13896484 -13.15851563 15.46542969 -13.17578125 14.77148438 C-13.22959846 9.25521997 -12.63330947 5.67867918 -8.75 1.6875 C-5.44146305 -0.34273858 -3.8200098 -0.55708476 0 0 Z " fill="#2832DE" transform="translate(102,2)"/>
<path d="M0 0 C4.83670567 4.83670567 5.20245233 9.25686475 5.25 15.8125 C5.270625 16.51181641 5.29125 17.21113281 5.3125 17.93164062 C5.34861844 22.75796677 4.39813869 25.85170455 2 30 C1.67 30.99 1.34 31.98 1 33 C-2.3 33 -5.6 33 -9 33 C-13.20796432 26.12699161 -16.24978351 19.27600545 -15 11 C-13.68274847 6.82417685 -12.03704153 3.21584179 -9.1875 -0.125 C-5.75721453 -1.49711419 -3.50872971 -1.0024942 0 0 Z " fill="#2931DF" transform="translate(184,3)"/>
<path d="M0 0 C1.0219043 -0.00676758 1.0219043 -0.00676758 2.06445312 -0.01367188 C5.73102226 -0.00285604 9.12254501 0.25617151 12.6875 1.125 C12.6875 1.785 12.6875 2.445 12.6875 3.125 C2.4575 3.125 -7.7725 3.125 -18.3125 3.125 C-18.3125 2.465 -18.3125 1.805 -18.3125 1.125 C-12.2065942 0.30305114 -6.15338301 0.01692039 0 0 Z " fill="#2C35ED" transform="translate(159.3125,105.875)"/>
<path d="M0 0 C3.8544062 0.13989034 7.70845629 0.28798006 11.5625 0.4375 C12.64853516 0.47681641 13.73457031 0.51613281 14.85351562 0.55664062 C20.30410746 0.77201355 25.62075441 1.08159222 31 2 C31 2.33 31 2.66 31 3 C20.44 3 9.88 3 -1 3 C-0.67 2.01 -0.34 1.02 0 0 Z " fill="#2B35EB" transform="translate(8,106)"/>
<path d="M0 0 C0 0.33 0 0.66 0 1 C-10.23 1 -20.46 1 -31 1 C-31 0.67 -31 0.34 -31 0 C-20.35831508 -2.10743678 -10.627168 -2.28562253 0 0 Z " fill="#2C35EF" transform="translate(92,108)"/>
<path d="" fill="#0000FF" transform="translate(0,0)"/>
<path d="" fill="#000000" transform="translate(0,0)"/>
</svg>

After

Width:  |  Height:  |  Size: 10 KiB

View File

@ -6,6 +6,7 @@ import { camelCase } from 'lodash';
import { ReactNode, useMemo } from 'react';
import { useFormContext } from 'react-hook-form';
import { MinerUOptionsFormField } from './mineru-options-form-field';
import { PaddleOCROptionsFormField } from './paddleocr-options-form-field';
import { SelectWithSearch } from './originui/select-with-search';
import {
FormControl,
@ -28,12 +29,14 @@ export function LayoutRecognizeFormField({
optionsWithoutLLM,
label,
showMineruOptions = true,
showPaddleocrOptions = true,
}: {
name?: string;
horizontal?: boolean;
optionsWithoutLLM?: { value: string; label: string }[];
label?: ReactNode;
showMineruOptions?: boolean;
showPaddleocrOptions?: boolean;
}) {
const form = useFormContext();
@ -113,6 +116,7 @@ export function LayoutRecognizeFormField({
</div>
</FormItem>
{showMineruOptions && <MinerUOptionsFormField />}
{showPaddleocrOptions && <PaddleOCROptionsFormField />}
</>
);
}}

View File

@ -0,0 +1,95 @@
import { RAGFlowFormItem } from '@/components/ragflow-form';
import { Input } from '@/components/ui/input';
import { RAGFlowSelect } from '@/components/ui/select';
import { LLMFactory } from '@/constants/llm';
import { buildOptions } from '@/utils/form';
import { useFormContext, useWatch } from 'react-hook-form';
import { useTranslation } from 'react-i18next';
// Options offered by the PaddleOCR algorithm select below; 'PaddleOCR-VL' is currently the only entry.
const algorithmOptions = buildOptions(['PaddleOCR-VL']);
/**
 * Form section with the PaddleOCR-specific parser settings (API URL, access
 * token and algorithm).
 *
 * Renders nothing unless the watched `parser_config.layout_recognize` value is
 * a string that names PaddleOCR.
 *
 * @param namePrefix - Prefix prepended (dot-separated) to each field name;
 *   an empty string registers the fields at the form root.
 */
export function PaddleOCROptionsFormField({
  namePrefix = 'parser_config',
}: {
  namePrefix?: string;
}) {
  const form = useFormContext();
  const { t } = useTranslation();

  const buildName = (field: string) =>
    namePrefix ? `${namePrefix}.${field}` : field;

  // The form is untyped here, so the watched value may be anything (undefined,
  // a boolean, ...). Treat it as unknown and narrow before using string methods.
  const layoutRecognize: unknown = useWatch({
    control: form.control,
    name: 'parser_config.layout_recognize',
  });

  // Check if PaddleOCR is selected (the value contains 'PaddleOCR' or matches the factory name)
  const isPaddleOCRSelected =
    typeof layoutRecognize === 'string' &&
    (layoutRecognize.includes(LLMFactory.PaddleOCR) ||
      layoutRecognize.toLowerCase().includes('paddleocr'));

  if (!isPaddleOCRSelected) {
    return null;
  }

  return (
    <div className="space-y-4 border-l-2 border-primary/30 pl-4 ml-2">
      <div className="text-sm font-medium text-text-secondary">
        {t('knowledgeConfiguration.paddleocrOptions', 'PaddleOCR Options')}
      </div>

      <RAGFlowFormItem
        name={buildName('paddleocr_api_url')}
        label={t('knowledgeConfiguration.paddleocrApiUrl', 'PaddleOCR API URL')}
        tooltip={t(
          'knowledgeConfiguration.paddleocrApiUrlTip',
          'The API endpoint URL for PaddleOCR service',
        )}
        horizontal={true}
      >
        {(field) => (
          <Input
            {...field}
            placeholder={t(
              'knowledgeConfiguration.paddleocrApiUrlPlaceholder',
              'e.g. https://paddleocr-server.com/layout-parsing',
            )}
          />
        )}
      </RAGFlowFormItem>

      <RAGFlowFormItem
        name={buildName('paddleocr_access_token')}
        label={t(
          'knowledgeConfiguration.paddleocrAccessToken',
          'AI Studio Access Token',
        )}
        tooltip={t(
          'knowledgeConfiguration.paddleocrAccessTokenTip',
          'Access token for PaddleOCR API (optional)',
        )}
        horizontal={true}
      >
        {(field) => (
          <Input
            {...field}
            placeholder={t(
              'knowledgeConfiguration.paddleocrAccessTokenPlaceholder',
              'Your AI Studio token (optional)',
            )}
          />
        )}
      </RAGFlowFormItem>

      <RAGFlowFormItem
        name={buildName('paddleocr_algorithm')}
        label={t(
          'knowledgeConfiguration.paddleocrAlgorithm',
          'PaddleOCR Algorithm',
        )}
        tooltip={t(
          'knowledgeConfiguration.paddleocrAlgorithmTip',
          'Algorithm to use for PaddleOCR parsing',
        )}
        horizontal={true}
      >
        {(field) => (
          <RAGFlowSelect
            value={field.value || 'PaddleOCR-VL'}
            onChange={field.onChange}
            options={algorithmOptions}
            placeholder={t('common.selectPlaceholder', 'Select value')}
          />
        )}
      </RAGFlowFormItem>
    </div>
  );
}

View File

@ -105,6 +105,7 @@ export const LlmIcon = ({
LLMFactory.Gemini,
LLMFactory.StepFun,
LLMFactory.MinerU,
LLMFactory.PaddleOCR,
// LLMFactory.DeerAPI,
];
if (svgIcons.includes(name as LLMFactory)) {

View File

@ -61,6 +61,7 @@ export enum LLMFactory {
JiekouAI = 'Jiekou.AI',
Builtin = 'Builtin',
MinerU = 'MinerU',
PaddleOCR = 'PaddleOCR',
}
// Please lowercase the file name
@ -127,6 +128,7 @@ export const IconMap = {
[LLMFactory.JiekouAI]: 'jiekouai',
[LLMFactory.Builtin]: 'builtin',
[LLMFactory.MinerU]: 'mineru',
[LLMFactory.PaddleOCR]: 'paddleocr',
};
export const APIMapUrl = {
@ -178,4 +180,5 @@ export const APIMapUrl = {
[LLMFactory.DeerAPI]: 'https://api.deerapi.com/token',
[LLMFactory.TokenPony]: 'https://www.tokenpony.cn/#/user/keys',
[LLMFactory.DeepInfra]: 'https://deepinfra.com/dash/api_keys',
[LLMFactory.PaddleOCR]: 'https://www.paddleocr.ai/latest/',
};

View File

@ -385,6 +385,17 @@ Prozedurales Gedächtnis: Erlernte Fähigkeiten, Gewohnheiten und automatisierte
'Formelerkennung aktivieren. Hinweis: Dies funktioniert möglicherweise nicht korrekt bei kyrillischen Dokumenten.',
mineruTableEnable: 'Tabellenerkennung',
mineruTableEnableTip: 'Tabellenerkennung und -extraktion aktivieren.',
paddleocrOptions: 'PaddleOCR-Optionen',
paddleocrApiUrl: 'PaddleOCR API-URL',
paddleocrApiUrlTip: 'API-Endpunkt-URL des PaddleOCR-Dienstes',
paddleocrApiUrlPlaceholder: 'Zum Beispiel: https://paddleocr-server.com/layout-parsing',
paddleocrAccessToken: 'AI Studio-Zugriffstoken',
paddleocrAccessTokenTip: 'Zugriffstoken für die PaddleOCR-API (optional)',
paddleocrAccessTokenPlaceholder: 'Ihr AI Studio-Token (optional)',
paddleocrAlgorithm: 'PaddleOCR-Algorithmus',
paddleocrAlgorithmTip: 'Algorithmus, der für die PaddleOCR-Verarbeitung verwendet wird',
paddleocrSelectAlgorithm: 'Algorithmus auswählen',
paddleocrModelNamePlaceholder: 'Zum Beispiel: paddleocr-umgebung-1',
overlappedPercent: 'Überlappungsprozent(%)',
generationScopeTip:
'Bestimmt, ob RAPTOR für den gesamten Datensatz oder für eine einzelne Datei generiert wird.',
@ -475,7 +486,7 @@ Prozedurales Gedächtnis: Erlernte Fähigkeiten, Gewohnheiten und automatisierte
book: `<p>Unterstützte Dateiformate sind <b>DOCX</b>, <b>PDF</b>, <b>TXT</b>.</p><p>
Für jedes Buch im PDF-Format stellen Sie bitte die <i>Seitenbereiche</i> ein, um unerwünschte Informationen zu entfernen und die Analysezeit zu reduzieren.</p>`,
laws: `<p>Unterstützte Dateiformate sind <b>DOCX</b>, <b>PDF</b>, <b>TXT</b>.</p><p>
Rechtliche Dokumente folgen in der Regel einem strengen Schreibformat. Wir verwenden Textmerkmale, um Teilungspunkte zu identifizieren.
Rechtliche Dokumente folgen in der Regel einem strengen Schreibformat. Wir verwenden Textmerkmale, um Teilungspunkte zu identifizieren.
</p><p>
Der Chunk hat eine Granularität, die mit 'ARTIKEL' übereinstimmt, wobei sichergestellt wird, dass der gesamte übergeordnete Text im Chunk enthalten ist.
</p>`,
@ -489,7 +500,7 @@ Prozedurales Gedächtnis: Erlernte Fähigkeiten, Gewohnheiten und automatisierte
<li>Dann werden benachbarte Segmente kombiniert, bis die Token-Anzahl den durch 'Chunk-Token-Anzahl' festgelegten Schwellenwert überschreitet, woraufhin ein Chunk erstellt wird.</li></p>`,
paper: `<p>Nur <b>PDF</b>-Dateien werden unterstützt.</p><p>
Papers werden nach Abschnitten wie <i>abstract, 1.1, 1.2</i> aufgeteilt. </p><p>
Dieser Ansatz ermöglicht es dem LLM, das Paper effektiver zusammenzufassen und umfassendere, verständlichere Antworten zu liefern.
Dieser Ansatz ermöglicht es dem LLM, das Paper effektiver zusammenzufassen und umfassendere, verständlichere Antworten zu liefern.
Es erhöht jedoch auch den Kontext für KI-Gespräche und die Rechenkosten für das LLM. Daher sollten Sie während eines Gesprächs erwägen, den Wert von '<b>topN</b>' zu reduzieren.</p>`,
presentation: `<p>Unterstützte Dateiformate sind <b>PDF</b>, <b>PPTX</b>.</p><p>
Jede Seite in den Folien wird als Chunk behandelt, wobei ihr Vorschaubild gespeichert wird.</p><p>
@ -1108,6 +1119,17 @@ Beispiel: Virtual Hosted Style`,
modelTypeMessage: 'Bitte geben Sie Ihren Modelltyp ein!',
addLlmBaseUrl: 'Basis-URL',
baseUrlNameMessage: 'Bitte geben Sie Ihre Basis-URL ein!',
paddleocr: {
apiUrl: 'PaddleOCR API-URL',
apiUrlPlaceholder: 'Zum Beispiel: https://paddleocr-server.com/layout-parsing',
accessToken: 'AI Studio-Zugriffstoken',
accessTokenPlaceholder: 'Ihr AI Studio-Token (optional)',
algorithm: 'PaddleOCR-Algorithmus',
selectAlgorithm: 'Algorithmus auswählen',
modelNamePlaceholder: 'Zum Beispiel: paddleocr-from-env-1',
modelNameRequired: 'Der Modellname ist ein Pflichtfeld',
apiUrlRequired: 'Die PaddleOCR API-URL ist ein Pflichtfeld'
},
vision: 'Unterstützt es Vision?',
ollamaLink: 'Wie integriere ich {{name}}',
FishAudioLink: 'Wie verwende ich FishAudio',

View File

@ -148,7 +148,7 @@ Procedural Memory: Learned skills, habits, and automated procedures.`,
action: 'Action',
},
config: {
memorySizeTooltip: `Accounts for each message's content + its embedding vector (≈ Content + Dimensions × 8 Bytes).
memorySizeTooltip: `Accounts for each message's content + its embedding vector (≈ Content + Dimensions × 8 Bytes).
Example: A 1 KB message with 1024-dim embedding uses ~9 KB. The 5 MB default limit holds ~500 such messages.`,
avatar: 'Avatar',
description: 'Description',
@ -424,6 +424,17 @@ Example: A 1 KB message with 1024-dim embedding uses ~9 KB. The 5 MB default lim
'Enable formula recognition. Note: This may not work correctly for Cyrillic documents.',
mineruTableEnable: 'Table recognition',
mineruTableEnableTip: 'Enable table recognition and extraction.',
paddleocrOptions: 'PaddleOCR Options',
paddleocrApiUrl: 'PaddleOCR API URL',
paddleocrApiUrlTip: 'The API endpoint URL for PaddleOCR service',
paddleocrApiUrlPlaceholder: 'e.g. https://paddleocr-server.com/layout-parsing',
paddleocrAccessToken: 'AI Studio Access Token',
paddleocrAccessTokenTip: 'Access token for PaddleOCR API (optional)',
paddleocrAccessTokenPlaceholder: 'Your AI Studio token (optional)',
paddleocrAlgorithm: 'PaddleOCR Algorithm',
paddleocrAlgorithmTip: 'Algorithm to use for PaddleOCR parsing',
paddleocrSelectAlgorithm: 'Select Algorithm',
paddleocrModelNamePlaceholder: 'e.g. paddleocr-from-env-1',
overlappedPercent: 'Overlapped percent(%)',
generationScopeTip:
'Determines whether RAPTOR is generated for the entire dataset or for a single file.',
@ -1094,6 +1105,17 @@ Example: Virtual Hosted Style`,
modelTypeMessage: 'Please input your model type!',
addLlmBaseUrl: 'Base url',
baseUrlNameMessage: 'Please input your base url!',
paddleocr: {
apiUrl: 'PaddleOCR API URL',
apiUrlPlaceholder: 'For example: https://paddleocr-server.com/layout-parsing',
accessToken: 'AI Studio Access Token',
accessTokenPlaceholder: 'Your AI Studio token (optional)',
algorithm: 'PaddleOCR Algorithm',
selectAlgorithm: 'Select Algorithm',
modelNamePlaceholder: 'For example: paddleocr-from-env-1',
modelNameRequired: 'Model name is required',
apiUrlRequired: 'PaddleOCR API URL is required'
},
vision: 'Does it support Vision?',
ollamaLink: 'How to integrate {{name}}',
FishAudioLink: 'How to use FishAudio',

View File

@ -159,6 +159,20 @@ export default {
html4excelTip: `Usar junto con el método de fragmentación General. Cuando está desactivado, los archivos de hoja de cálculo (XLSX, XLS (Excel 97-2003)) se analizan línea por línea como pares clave-valor. Cuando está activado, los archivos de hoja de cálculo se convierten en tablas HTML. Si la tabla original tiene más de 12 filas, el sistema la dividirá automáticamente en varias tablas HTML cada 12 filas. Para más información, consulte https://ragflow.io/docs/dev/enable_excel2html.`,
},
knowledgeConfiguration: {
paddleocrOptions: 'Opciones de PaddleOCR',
paddleocrApiUrl: 'URL de API de PaddleOCR',
paddleocrApiUrlTip: 'La URL del endpoint de la API para el servicio PaddleOCR',
paddleocrApiUrlPlaceholder: 'ej: https://servidor-paddleocr.com/api',
paddleocrAccessToken: 'Token de acceso de AI Studio',
paddleocrAccessTokenTip: 'Token de acceso para la API de PaddleOCR (opcional)',
paddleocrAccessTokenPlaceholder: 'Su token de AI Studio (opcional)',
paddleocrAlgorithm: 'Algoritmo de PaddleOCR',
paddleocrAlgorithmTip: 'Algoritmo a utilizar para el análisis de PaddleOCR',
paddleocrSelectAlgorithm: 'Seleccionar algoritmo',
paddleocrModelNamePlaceholder: 'ej: paddleocr-desde-env-1',
},
// Otros bloques de traducción
// Continua con la misma estructura
chat: {
@ -379,6 +393,17 @@ export default {
modelTypeMessage: '¡Por favor ingresa el tipo de tu modelo!',
addLlmBaseUrl: 'URL base',
baseUrlNameMessage: '¡Por favor ingresa tu URL base!',
paddleocr: {
apiUrl: 'URL de la API de PaddleOCR',
apiUrlPlaceholder: 'Por ejemplo: https://paddleocr-server.com/layout-parsing',
accessToken: 'Token de acceso de AI Studio',
accessTokenPlaceholder: 'Su token de AI Studio (opcional)',
algorithm: 'Algoritmo de PaddleOCR',
selectAlgorithm: 'Seleccionar algoritmo',
modelNamePlaceholder: 'Por ejemplo: paddleocr-from-env-1',
modelNameRequired: 'El nombre del modelo es obligatorio',
apiUrlRequired: 'La URL de la API de PaddleOCR es obligatoria'
},
vision: '¿Soporta visión?',
ollamaLink: 'Cómo integrar {{name}}',
FishAudioLink: 'Cómo usar FishAudio',

View File

@ -293,6 +293,17 @@ export default {
communityTip: `Un "community" est un groupe dentités liées. Le LLM peut générer un résumé pour chaque groupe. Voir plus ici : https: //www.microsoft.com/en-us/research/blog/graphrag-improving-global-search-via-dynamic-community-selection/`,
theDocumentBeingParsedCannotBeDeleted:
'Le document en cours danalyse ne peut pas être supprimé',
paddleocrOptions: 'Options PaddleOCR',
paddleocrApiUrl: 'URL de l’API PaddleOCR',
paddleocrApiUrlTip: 'URL du point de terminaison de l’API du service PaddleOCR',
paddleocrApiUrlPlaceholder: 'Par exemple : https://paddleocr-server.com/layout-parsing',
paddleocrAccessToken: 'Jeton d’accès AI Studio',
paddleocrAccessTokenTip: 'Jeton d’accès à l’API PaddleOCR (optionnel)',
paddleocrAccessTokenPlaceholder: 'Votre jeton AI Studio (optionnel)',
paddleocrAlgorithm: 'Algorithme PaddleOCR',
paddleocrAlgorithmTip: 'Algorithme utilisé pour l’analyse PaddleOCR',
paddleocrSelectAlgorithm: 'Sélectionner un algorithme',
paddleocrModelNamePlaceholder: 'Par exemple : paddleocr-environnement-1',
},
chunk: {
chunk: 'Segment',
@ -566,6 +577,17 @@ export default {
modelTypeMessage: 'Veuillez saisir le type de votre modèle !',
addLlmBaseUrl: 'URL de base',
baseUrlNameMessage: 'Veuillez saisir votre URL de base !',
paddleocr: {
apiUrl: 'URL de l’API PaddleOCR',
apiUrlPlaceholder: 'Par exemple : https://paddleocr-server.com/layout-parsing',
accessToken: 'Jeton d’accès AI Studio',
accessTokenPlaceholder: 'Votre jeton AI Studio (optionnel)',
algorithm: 'Algorithme PaddleOCR',
selectAlgorithm: 'Sélectionner un algorithme',
modelNamePlaceholder: 'Par exemple : paddleocr-from-env-1',
modelNameRequired: 'Le nom du modèle est obligatoire',
apiUrlRequired: 'L’URL de l’API PaddleOCR est obligatoire'
},
vision: 'Supporte-t-il la vision ?',
ollamaLink: 'Comment intégrer {{name}}',
FishAudioLink: 'Comment utiliser FishAudio',

View File

@ -316,6 +316,17 @@ export default {
randomSeed: 'Benih acak',
randomSeedMessage: 'Benih acak diperlukan',
entityTypes: 'Jenis entitas',
paddleocrOptions: 'Opsi PaddleOCR',
paddleocrApiUrl: 'URL API PaddleOCR',
paddleocrApiUrlTip: 'URL endpoint API layanan PaddleOCR',
paddleocrApiUrlPlaceholder: 'Contoh: https://paddleocr-server.com/layout-parsing',
paddleocrAccessToken: 'Token Akses AI Studio',
paddleocrAccessTokenTip: 'Token akses untuk API PaddleOCR (opsional)',
paddleocrAccessTokenPlaceholder: 'Token AI Studio Anda (opsional)',
paddleocrAlgorithm: 'Algoritma PaddleOCR',
paddleocrAlgorithmTip: 'Algoritma yang digunakan untuk pemrosesan PaddleOCR',
paddleocrSelectAlgorithm: 'Pilih algoritma',
paddleocrModelNamePlaceholder: 'Contoh: paddleocr-lingkungan-1',
},
chunk: {
chunk: 'Potongan',
@ -553,6 +564,17 @@ export default {
modelTypeMessage: 'Silakan masukkan jenis model Anda!',
addLlmBaseUrl: 'Base url',
baseUrlNameMessage: 'Silakan masukkan base url Anda!',
paddleocr: {
apiUrl: 'URL API PaddleOCR',
apiUrlPlaceholder: 'Contoh: https://paddleocr-server.com/layout-parsing',
accessToken: 'Token Akses AI Studio',
accessTokenPlaceholder: 'Token AI Studio Anda (opsional)',
algorithm: 'Algoritma PaddleOCR',
selectAlgorithm: 'Pilih algoritma',
modelNamePlaceholder: 'Contoh: paddleocr-from-env-1',
modelNameRequired: 'Nama model wajib diisi',
apiUrlRequired: 'URL API PaddleOCR wajib diisi'
},
vision: 'Apakah mendukung Vision?',
ollamaLink: 'Cara mengintegrasikan {{name}}',
FishAudioLink: 'Cara menggunakan FishAudio',

View File

@ -488,6 +488,17 @@ Quanto sopra è il contenuto che devi riassumere.`,
'In un grafo della conoscenza, una comunità è un cluster di entità collegate da relazioni. Puoi far generare al LLM un abstract per ogni comunità, noto come report comunità.',
theDocumentBeingParsedCannotBeDeleted:
'Il documento in fase di analisi non può essere eliminato',
paddleocrOptions: 'Opzioni PaddleOCR',
paddleocrApiUrl: 'URL API di PaddleOCR',
paddleocrApiUrlTip: 'URL dell’endpoint API del servizio PaddleOCR',
paddleocrApiUrlPlaceholder: 'Ad esempio: https://paddleocr-server.com/layout-parsing',
paddleocrAccessToken: 'Token di accesso AI Studio',
paddleocrAccessTokenTip: 'Token di accesso per l’API PaddleOCR (facoltativo)',
paddleocrAccessTokenPlaceholder: 'Il tuo token AI Studio (facoltativo)',
paddleocrAlgorithm: 'Algoritmo PaddleOCR',
paddleocrAlgorithmTip: 'Algoritmo utilizzato per l’elaborazione PaddleOCR',
paddleocrSelectAlgorithm: 'Seleziona algoritmo',
paddleocrModelNamePlaceholder: 'Ad esempio: paddleocr-ambiente-1',
},
chunk: {
chunk: 'Chunk',
@ -785,6 +796,17 @@ Quanto sopra è il contenuto che devi riassumere.`,
modelTypeMessage: 'Inserisci il tuo tipo di modello!',
addLlmBaseUrl: 'URL base',
baseUrlNameMessage: 'Inserisci il tuo URL base!',
paddleocr: {
apiUrl: 'URL API di PaddleOCR',
apiUrlPlaceholder: 'Ad esempio: https://paddleocr-server.com/layout-parsing',
accessToken: 'Token di accesso AI Studio',
accessTokenPlaceholder: 'Il tuo token AI Studio (facoltativo)',
algorithm: 'Algoritmo PaddleOCR',
selectAlgorithm: 'Seleziona algoritmo',
modelNamePlaceholder: 'Ad esempio: paddleocr-from-env-1',
modelNameRequired: 'Il nome del modello è obbligatorio',
apiUrlRequired: 'L’URL API di PaddleOCR è obbligatorio'
},
vision: 'Supporta Vision?',
ollamaLink: 'Come integrare {{name}}',
FishAudioLink: 'Come usare FishAudio',

View File

@ -240,7 +240,7 @@ export default {
<b>XLSX</b>形式のファイルには、ヘッダーのない2つの
列が必要です: 1つは質問の列でもう1つは回答の列です
(質問列が先行)。複数のシートも可能です。
</li>
<li>
<b>CSV/TXT</b>形式のファイルは、TABで区切られたUTF-8エンコードである必要があります。
@ -285,7 +285,7 @@ export default {
LLMがその量のコンテキスト長を処理できる場合に、ドキュメント全体を要約する必要があるときに適用されます。
</p>`,
knowledgeGraph: `<p>対応ファイル形式は<b>DOCX, EXCEL, PPT, IMAGE, PDF, TXT, MD, JSON, EML</b>です。
<p>このアプローチでは、ファイルを'ナイーブ'/'一般'メソッドを使用してチャンクに分割します。ドキュメントをセグメントに分割し、隣接するセグメントを結合してトークン数が'チャンクトークン数'で指定されたしきい値を超えるまで続け、その時点でチャンクが作成されます。</p>
<p>その後、チャンクはLLMに入力され、ナレッジグラフとマインドマップのエンティティと関係を抽出します。</p>
<p><b>エンティティタイプ</b>を設定することを忘れないでください。</p>`,
@ -314,6 +314,17 @@ export default {
entityTypes: 'エンティティタイプ',
pageRank: 'ページランク',
pageRankTip: `検索時に特定の知識ベースにより高いPageRankスコアを割り当てることができます。対応するスコアは、これらの知識ベースから取得されたチャンクのハイブリッド類似度スコアに加算され、ランキングが向上します。詳細については、https://ragflow.io/docs/dev/set_page_rank を参照してください。`,
paddleocrOptions: 'PaddleOCRオプション',
paddleocrApiUrl: 'PaddleOCR API URL',
paddleocrApiUrlTip: 'PaddleOCRサービスのAPIエンドポイントURL',
paddleocrApiUrlPlaceholder: '例: https://paddleocr-server.com/api',
paddleocrAccessToken: 'AI Studioアクセストークン',
paddleocrAccessTokenTip: 'PaddleOCR APIのアクセストークン（オプション）',
paddleocrAccessTokenPlaceholder: 'AI Studioトークン（オプション）',
paddleocrAlgorithm: 'PaddleOCRアルゴリズム',
paddleocrAlgorithmTip: 'PaddleOCR解析に使用するアルゴリズム',
paddleocrSelectAlgorithm: 'アルゴリズムを選択',
paddleocrModelNamePlaceholder: '例: paddleocr-from-env-1',
},
chunk: {
chunk: 'チャンク',
@ -596,6 +607,17 @@ export default {
modelTypeMessage: 'モデルタイプを入力してください!',
addLlmBaseUrl: 'ベースURL',
baseUrlNameMessage: 'ベースURLを入力してください',
paddleocr: {
apiUrl: 'PaddleOCR API URL',
apiUrlPlaceholder: '例: https://paddleocr-server.com/layout-parsing',
accessToken: 'AI Studio アクセストークン',
accessTokenPlaceholder: 'AI Studio のトークン(任意)',
algorithm: 'PaddleOCR アルゴリズム',
selectAlgorithm: 'アルゴリズムを選択',
modelNamePlaceholder: '例: paddleocr-from-env-1',
modelNameRequired: 'モデル名は必須です',
apiUrlRequired: 'PaddleOCR API URL は必須です'
},
vision: 'ビジョンをサポートしていますか?',
ollamaLink: '{{name}}を統合する方法',
FishAudioLink: 'FishAudioの使用方法',

View File

@ -310,6 +310,17 @@ export default {
topnTags: 'Top-N Etiquetas',
tags: 'Etiquetas',
addTag: 'Adicionar etiqueta',
paddleocrOptions: 'Opções do PaddleOCR',
paddleocrApiUrl: 'URL da API do PaddleOCR',
paddleocrApiUrlTip: 'A URL do endpoint da API para o serviço PaddleOCR',
paddleocrApiUrlPlaceholder: 'ex: https://servidor-paddleocr.com/api',
paddleocrAccessToken: 'Token de Acesso do AI Studio',
paddleocrAccessTokenTip: 'Token de acesso para a API do PaddleOCR (opcional)',
paddleocrAccessTokenPlaceholder: 'Seu token do AI Studio (opcional)',
paddleocrAlgorithm: 'Algoritmo do PaddleOCR',
paddleocrAlgorithmTip: 'Algoritmo a ser usado para a análise do PaddleOCR',
paddleocrSelectAlgorithm: 'Selecionar algoritmo',
paddleocrModelNamePlaceholder: 'ex: paddleocr-do-ambiente-1',
},
chunk: {
chunk: 'Fragmento',
@ -546,6 +557,17 @@ export default {
modelTypeMessage: 'Por favor, insira o tipo do seu modelo!',
addLlmBaseUrl: 'URL base',
baseUrlNameMessage: 'Por favor, insira sua URL base!',
// Strings for the PaddleOCR model dialog: labels, placeholders and
// required-field messages.
paddleocr: {
apiUrl: 'URL da API do PaddleOCR',
apiUrlPlaceholder: 'Por exemplo: https://paddleocr-server.com/layout-parsing',
accessToken: 'Token de acesso do AI Studio',
accessTokenPlaceholder: 'Seu token do AI Studio (opcional)',
algorithm: 'Algoritmo do PaddleOCR',
selectAlgorithm: 'Selecionar algoritmo',
modelNamePlaceholder: 'Por exemplo: paddleocr-from-env-1',
modelNameRequired: 'O nome do modelo é obrigatório',
apiUrlRequired: 'A URL da API do PaddleOCR é obrigatória'
},
vision: 'Suporta visão?',
ollamaLink: 'Como integrar {{name}}',
FishAudioLink: 'Como usar FishAudio',

View File

@ -510,6 +510,17 @@ export default {
'В графе знаний сообщество - это кластер сущностей, связанных отношениями. Вы можете поручить LLM генерировать аннотацию для каждого сообщества, известную как отчет сообщества. Более подробная информация здесь: https://www.microsoft.com/en-us/research/blog/graphrag-improving-global-search-via-dynamic-community-selection/',
theDocumentBeingParsedCannotBeDeleted:
'Документ, который в данный момент парсится, не может быть удален',
// PaddleOCR parser options: labels, tooltips and placeholders.
paddleocrOptions: 'Параметры PaddleOCR',
paddleocrApiUrl: 'URL API PaddleOCR',
paddleocrApiUrlTip: 'URL конечной точки API сервиса PaddleOCR',
paddleocrApiUrlPlaceholder: 'Например: https://paddleocr-server.com/layout-parsing',
paddleocrAccessToken: 'Токен доступа AI Studio',
paddleocrAccessTokenTip: 'Токен доступа к API PaddleOCR (необязательно)',
paddleocrAccessTokenPlaceholder: 'Ваш токен AI Studio (необязательно)',
paddleocrAlgorithm: 'Алгоритм PaddleOCR',
paddleocrAlgorithmTip: 'Алгоритм, используемый для обработки PaddleOCR',
paddleocrSelectAlgorithm: 'Выбрать алгоритм',
// The example model name is an identifier, so it is left untranslated.
paddleocrModelNamePlaceholder: 'Например: paddleocr-from-env-1',
},
chunk: {
chunk: 'Чанк',
@ -716,7 +727,7 @@ export default {
'Базовый URL вашего экземпляра Confluence (например, https://your-domain.atlassian.net/wiki)',
confluenceSpaceKeyTip:
'Необязательно: Укажите ключ пространства для синхронизации только определенного пространства. Оставьте пустым для синхронизации всех доступных пространств. Для нескольких пространств разделите запятыми (например, DEV,DOCS,HR)',
s3PrefixTip: `Укажите путь к папке в вашем S3 бакете для получения файлов.
s3PrefixTip: `Укажите путь к папке в вашем S3 бакете для получения файлов.
Пример: general/v2/`,
S3CompatibleEndpointUrlTip: `Требуется для S3 совместимого Storage Box. Укажите URL конечной точки, совместимой с S3.
Пример: https://fsn1.your-objectstorage.com`,
@ -1034,6 +1045,17 @@ export default {
modelsToBeAddedTooltip:
'Если ваш провайдер моделей не указан, но заявляет о "совместимости с OpenAI-API", выберите карточку OpenAI-API-compatible, чтобы добавить соответствующие модели. ',
mcp: 'MCP',
// Strings for the PaddleOCR model dialog: labels, placeholders and
// required-field messages.
paddleocr: {
apiUrl: 'URL API PaddleOCR',
apiUrlPlaceholder: 'Например: https://paddleocr-server.com/layout-parsing',
accessToken: 'Токен доступа AI Studio',
accessTokenPlaceholder: 'Ваш токен AI Studio (необязательно)',
algorithm: 'Алгоритм PaddleOCR',
selectAlgorithm: 'Выбрать алгоритм',
modelNamePlaceholder: 'Например: paddleocr-from-env-1',
modelNameRequired: 'Имя модели является обязательным',
apiUrlRequired: 'URL API PaddleOCR является обязательным'
},
},
message: {
registered: 'Зарегистрирован!',

View File

@ -354,6 +354,17 @@ export default {
community: 'Xây dựng mối quan hệ cộng đồng',
communityTip:
'Các liên kết được nhóm lại thành các cộng đồng phân cấp, với các thực thể và mối quan hệ kết nối từng phân đoạn lên các cấp độ trừu tượng cao hơn. Sau đó, chúng tôi sử dụng một LLM để tạo ra bản tóm tắt cho mỗi cộng đồng, được gọi là báo cáo cộng đồng. Xem thêm: https://www.microsoft.com/en-us/research/blog/graphrag-improving-global-search-via-dynamic-community-selection/',
// PaddleOCR parser options: labels, tooltips and placeholders.
paddleocrOptions: 'Tùy chọn PaddleOCR',
paddleocrApiUrl: 'URL API PaddleOCR',
paddleocrApiUrlTip: 'URL điểm cuối API của dịch vụ PaddleOCR',
paddleocrApiUrlPlaceholder: 'Ví dụ: https://paddleocr-server.com/layout-parsing',
paddleocrAccessToken: 'Token truy cập AI Studio',
paddleocrAccessTokenTip: 'Token truy cập cho API PaddleOCR (tùy chọn)',
paddleocrAccessTokenPlaceholder: 'Token AI Studio của bạn (tùy chọn)',
paddleocrAlgorithm: 'Thuật toán PaddleOCR',
paddleocrAlgorithmTip: 'Thuật toán được sử dụng để xử lý PaddleOCR',
paddleocrSelectAlgorithm: 'Chọn thuật toán',
// The example model name is an identifier, so it is left untranslated.
paddleocrModelNamePlaceholder: 'Ví dụ: paddleocr-from-env-1',
},
chunk: {
chunk: 'Khối',
@ -595,6 +606,17 @@ export default {
modelTypeMessage: 'Vui lòng nhập loại mô hình của bạn!',
addLlmBaseUrl: 'URL cơ sở',
baseUrlNameMessage: 'Vui lòng nhập URL cơ sở của bạn!',
// Strings for the PaddleOCR model dialog: labels, placeholders and
// required-field messages.
paddleocr: {
apiUrl: 'URL API PaddleOCR',
apiUrlPlaceholder: 'Ví dụ: https://paddleocr-server.com/layout-parsing',
accessToken: 'Token truy cập AI Studio',
accessTokenPlaceholder: 'Token AI Studio của bạn (tùy chọn)',
algorithm: 'Thuật toán PaddleOCR',
selectAlgorithm: 'Chọn thuật toán',
modelNamePlaceholder: 'Ví dụ: paddleocr-from-env-1',
modelNameRequired: 'Tên mô hình là bắt buộc',
apiUrlRequired: 'URL API PaddleOCR là bắt buộc'
},
vision: 'Có hỗ trợ Tầm nhìn không?',
ollamaLink: 'Cách tích hợp {{name}}',
FishAudioLink: 'Cách sử dụng FishAudio',

View File

@ -367,6 +367,17 @@ export default {
`,
tags: '標籤',
addTag: '增加標籤',
// PaddleOCR parser options: labels, tooltips and placeholders.
// Terminology (存取權杖 / 演算法 / 選填) follows the `paddleocr` settings block
// of this same locale file.
paddleocrOptions: 'PaddleOCR 選項',
paddleocrApiUrl: 'PaddleOCR API URL',
paddleocrApiUrlTip: 'PaddleOCR 服務的 API 端點 URL',
paddleocrApiUrlPlaceholder: '例如：https://paddleocr-server.com/layout-parsing',
paddleocrAccessToken: 'AI Studio 存取權杖',
paddleocrAccessTokenTip: 'PaddleOCR API 的存取權杖（選填）',
paddleocrAccessTokenPlaceholder: '您的 AI Studio 權杖（選填）',
paddleocrAlgorithm: 'PaddleOCR 演算法',
paddleocrAlgorithmTip: '用於 PaddleOCR 解析的演算法',
paddleocrSelectAlgorithm: '選擇演算法',
// The example model name is an identifier, so it is left untranslated.
paddleocrModelNamePlaceholder: '例如：paddleocr-from-env-1',
useGraphRag: '提取知識圖譜',
useGraphRagTip:
'基於知識庫內所有切好的文本塊構建知識圖譜,用以提升多跳和複雜問題回答的正確率。請注意:構建知識圖譜將消耗大量 token 和時間。詳見 https://ragflow.io/docs/dev/construct_knowledge_graph。',
@ -644,6 +655,17 @@ export default {
modelNameMessage: '請輸入模型名稱!',
modelTypeMessage: '請輸入模型類型!',
baseUrlNameMessage: '請輸入基礎 Url',
// Strings for the PaddleOCR model dialog: labels, placeholders and
// required-field messages.
paddleocr: {
  apiUrl: 'PaddleOCR API URL',
  apiUrlPlaceholder: '例如：https://paddleocr-server.com/layout-parsing',
  accessToken: 'AI Studio 存取權杖',
  accessTokenPlaceholder: '您的 AI Studio 權杖（選填）',
  algorithm: 'PaddleOCR 演算法',
  selectAlgorithm: '選擇演算法',
  modelNamePlaceholder: '例如：paddleocr-from-env-1',
  modelNameRequired: '模型名稱為必填項目',
  apiUrlRequired: 'PaddleOCR API URL 為必填項目'
},
ollamaLink: '如何集成 {{name}}',
FishAudioLink: '如何使用Fish Audio',
TencentCloudLink: '如何使用騰訊雲語音識別',

View File

@ -390,6 +390,17 @@ export default {
'启用公式识别。注意:对于西里尔文档可能无法正常工作。',
mineruTableEnable: '表格识别',
mineruTableEnableTip: '启用表格识别和提取。',
// PaddleOCR parser options: labels, tooltips and placeholders.
paddleocrOptions: 'PaddleOCR 选项',
paddleocrApiUrl: 'PaddleOCR API URL',
paddleocrApiUrlTip: 'PaddleOCR 服务的 API 端点 URL',
paddleocrApiUrlPlaceholder: '例如：https://paddleocr-server.com/layout-parsing',
paddleocrAccessToken: 'AI Studio 访问令牌',
paddleocrAccessTokenTip: 'PaddleOCR API 的访问令牌（可选）',
paddleocrAccessTokenPlaceholder: '您的 AI Studio 令牌（可选）',
paddleocrAlgorithm: 'PaddleOCR 算法',
paddleocrAlgorithmTip: '用于 PaddleOCR 解析的算法',
paddleocrSelectAlgorithm: '选择算法',
// The example model name is an identifier, so it is left untranslated.
paddleocrModelNamePlaceholder: '例如：paddleocr-from-env-1',
generationScopeTip: '选择 RAPTOR 的生成范围:整个知识库或单个文件。',
generationScope: '生成范围',
scopeSingleFile: '单文件',
@ -1113,6 +1124,17 @@ General实体和关系提取提示来自 GitHub - microsoft/graphrag基于
vlmLmdeployEngine: '基于LMDeploy引擎的视觉语言模型实验性',
},
},
// Strings for the PaddleOCR model dialog: labels, placeholders and
// required-field messages.
paddleocr: {
  apiUrl: 'PaddleOCR API URL',
  apiUrlPlaceholder: '例如：https://paddleocr-server.com/layout-parsing',
  accessToken: 'AI Studio 访问令牌',
  accessTokenPlaceholder: '您的 AI Studio 令牌（可选）',
  algorithm: 'PaddleOCR 算法',
  selectAlgorithm: '选择算法',
  modelNamePlaceholder: '例如：paddleocr-from-env-1',
  modelNameRequired: '模型名称为必填项',
  apiUrlRequired: 'PaddleOCR API URL 为必填项'
},
},
message: {
registered: '注册成功',

View File

@ -504,3 +504,43 @@ export const useSubmitMinerU = () => {
mineruLoading: loading,
};
};
/**
 * State and submit handler for the "add PaddleOCR model" dialog.
 *
 * Exposes the dialog's visibility flag with its show/hide callbacks, the
 * `loading` flag reported by `useAddLlm`, and `onPaddleOCROk`, which sends the
 * form values as a new `ocr` model of the PaddleOCR factory. A result of `0`
 * from `addLlm` is treated as success: the dialog is hidden and the handler
 * resolves to `true`; any other result resolves to `false`.
 */
export const useSubmitPaddleOCR = () => {
  const { addLlm, loading } = useAddLlm();
  const { visible, hideModal, showModal } = useSetModalState();

  const onPaddleOCROk = useCallback(
    async (payload: any) => {
      // The complete form payload (llm_name included) is forwarded as the
      // api_key object of the request.
      const apiKeyConfig: any = { ...payload };
      const requestBody: IAddLlmRequestBody = {
        llm_factory: LLMFactory.PaddleOCR,
        llm_name: payload.llm_name,
        model_type: 'ocr',
        api_key: apiKeyConfig,
        api_base: '',
        max_tokens: 0,
      };

      const code = await addLlm(requestBody);
      if (code !== 0) {
        return false;
      }
      hideModal();
      return true;
    },
    [addLlm, hideModal],
  );

  return {
    paddleocrVisible: visible,
    hidePaddleOCRModal: hideModal,
    showPaddleOCRModal: showModal,
    onPaddleOCROk,
    paddleocrLoading: loading,
  };
};

View File

@ -15,6 +15,7 @@ import {
useSubmitHunyuan,
useSubmitMinerU,
useSubmitOllama,
useSubmitPaddleOCR,
useSubmitSpark,
useSubmitSystemModelSetting,
useSubmitTencentCloud,
@ -28,6 +29,7 @@ import FishAudioModal from './modal/fish-audio-modal';
import GoogleModal from './modal/google-modal';
import HunyuanModal from './modal/hunyuan-modal';
import MinerUModal from './modal/mineru-modal';
import PaddleOCRModal from './modal/paddleocr-modal';
import TencentCloudModal from './modal/next-tencent-modal';
import OllamaModal from './modal/ollama-modal';
import SparkModal from './modal/spark-modal';
@ -138,6 +140,14 @@ const ModelProviders = () => {
mineruLoading,
} = useSubmitMinerU();
const {
paddleocrVisible,
hidePaddleOCRModal,
showPaddleOCRModal,
onPaddleOCROk,
paddleocrLoading,
} = useSubmitPaddleOCR();
const ModalMap = useMemo(
() => ({
[LLMFactory.Bedrock]: showBedrockAddingModal,
@ -150,6 +160,7 @@ const ModelProviders = () => {
[LLMFactory.GoogleCloud]: showGoogleAddingModal,
[LLMFactory.AzureOpenAI]: showAzureAddingModal,
[LLMFactory.MinerU]: showMineruModal,
[LLMFactory.PaddleOCR]: showPaddleOCRModal,
}),
[
showBedrockAddingModal,
@ -162,6 +173,7 @@ const ModelProviders = () => {
showGoogleAddingModal,
showAzureAddingModal,
showMineruModal,
showPaddleOCRModal,
],
);
@ -309,6 +321,12 @@ const ModelProviders = () => {
onOk={onMineruOk}
loading={mineruLoading}
></MinerUModal>
<PaddleOCRModal
visible={paddleocrVisible}
hideModal={hidePaddleOCRModal}
onOk={onPaddleOCROk}
loading={paddleocrLoading}
></PaddleOCRModal>
</div>
);
};

View File

@ -0,0 +1,135 @@
import { useForm } from 'react-hook-form';
import { useTranslation } from 'react-i18next';
import { z } from 'zod';
import { zodResolver } from '@hookform/resolvers/zod';
import { t } from 'i18next';
import {
Dialog,
DialogContent,
DialogHeader,
DialogTitle,
} from '@/components/ui/dialog';
import { RAGFlowFormItem } from '@/components/ragflow-form';
import { RAGFlowSelect, RAGFlowSelectOptionType } from '@/components/ui/select';
import { Input } from '@/components/ui/input';
import { Form } from '@/components/ui/form';
import { LLMHeader } from '../../components/llm-header';
import { LLMFactory } from '@/constants/llm';
// Validation rules for the PaddleOCR dialog.
// NOTE(review): `t` here is the module-level i18next function, so both
// messages are resolved once, when this module is first evaluated; they will
// not follow a later language switch. Confirm this is acceptable.
const modelNameSchema = z.string().min(1, {
  message: t('setting.paddleocr.modelNameRequired'),
});

const apiUrlSchema = z.string().min(1, {
  message: t('setting.paddleocr.apiUrlRequired'),
});

const FormSchema = z.object({
  // Required: name under which the model is registered.
  llm_name: modelNameSchema,
  // Required: URL of the PaddleOCR API.
  paddleocr_api_url: apiUrlSchema,
  // Optional access token.
  paddleocr_access_token: z.string().optional(),
  // Falls back to 'PaddleOCR-VL' when no algorithm is supplied.
  paddleocr_algorithm: z.string().default('PaddleOCR-VL'),
});

/** Values produced by the PaddleOCR form after validation. */
export type PaddleOCRFormValues = z.infer<typeof FormSchema>;
/**
 * Props of the PaddleOCR dialog; `T` is the type of the submitted form values.
 */
export interface IModalProps<T> {
/** Whether the dialog is open. */
visible: boolean;
/** Closes the dialog; also used as the dialog's open-change handler and by the cancel button. */
hideModal: () => void;
/** Submit handler; when it resolves to a truthy value the dialog is closed. */
onOk?: (data: T) => Promise<boolean>;
/** While true the submit button is disabled and shows the "adding" label. */
loading?: boolean;
}
// Choices offered by the algorithm select. The single entry matches the
// 'PaddleOCR-VL' default used by the form schema and the form's default values.
const algorithmOptions: RAGFlowSelectOptionType[] = [
{ label: 'PaddleOCR-VL', value: 'PaddleOCR-VL' },
];
/**
 * Dialog for registering a PaddleOCR model.
 *
 * Renders a form with the model name, the PaddleOCR API URL, an optional
 * access token and the algorithm select. The values are validated against
 * `FormSchema` and handed to `onOk`; when `onOk` resolves to a truthy value
 * the dialog is closed through `hideModal`.
 *
 * @param visible   Whether the dialog is open.
 * @param hideModal Closes the dialog (open-change handler and cancel button).
 * @param onOk      Async submit handler receiving the validated form values.
 * @param loading   Disables the submit button and switches its label while true.
 */
const PaddleOCRModal = ({
  visible,
  hideModal,
  onOk,
  loading,
}: IModalProps<PaddleOCRFormValues>) => {
  const { t } = useTranslation();

  const form = useForm<PaddleOCRFormValues>({
    resolver: zodResolver(FormSchema),
    defaultValues: {
      // Required text fields start as '' rather than undefined. With an
      // undefined value zod rejects the field with its generic type error
      // before `min(1)` runs, so the translated "required" messages defined
      // in FormSchema would never be shown for an untouched field.
      llm_name: '',
      paddleocr_api_url: '',
      paddleocr_algorithm: 'PaddleOCR-VL',
    },
  });

  const handleOk = async (values: PaddleOCRFormValues) => {
    const ret = await onOk?.(values);
    if (ret) {
      hideModal();
    }
  };

  return (
    <Dialog open={visible} onOpenChange={hideModal}>
      <DialogContent>
        <DialogHeader>
          <DialogTitle>
            <LLMHeader name={LLMFactory.PaddleOCR} />
          </DialogTitle>
        </DialogHeader>
        <Form {...form}>
          <form
            onSubmit={form.handleSubmit(handleOk)}
            className="space-y-6"
            id="paddleocr-form"
          >
            <RAGFlowFormItem
              name="llm_name"
              label={t('setting.modelName')}
              required
            >
              <Input placeholder={t('setting.paddleocr.modelNamePlaceholder')} />
            </RAGFlowFormItem>
            <RAGFlowFormItem
              name="paddleocr_api_url"
              label={t('setting.paddleocr.apiUrl')}
              required
            >
              <Input placeholder={t('setting.paddleocr.apiUrlPlaceholder')} />
            </RAGFlowFormItem>
            <RAGFlowFormItem
              name="paddleocr_access_token"
              label={t('setting.paddleocr.accessToken')}
            >
              <Input placeholder={t('setting.paddleocr.accessTokenPlaceholder')} />
            </RAGFlowFormItem>
            <RAGFlowFormItem
              name="paddleocr_algorithm"
              label={t('setting.paddleocr.algorithm')}
            >
              {(field) => (
                <RAGFlowSelect
                  value={field.value}
                  onChange={field.onChange}
                  options={algorithmOptions}
                  placeholder={t('setting.paddleocr.selectAlgorithm')}
                />
              )}
            </RAGFlowFormItem>
            <div className="flex justify-end space-x-2">
              <button
                type="button"
                onClick={hideModal}
                className="btn btn-secondary"
              >
                {t('common.cancel')}
              </button>
              <button
                type="submit"
                disabled={loading}
                className="btn btn-primary"
              >
                {loading ? t('common.adding') : t('common.add')}
              </button>
            </div>
          </form>
        </Form>
      </DialogContent>
    </Dialog>
  );
};
export default PaddleOCRModal;