Mirror of https://github.com/infiniflow/ragflow.git (synced 2026-01-19 03:35:11 +08:00)
feat: add paddleocr parser (#12513)
### What problem does this PR solve?

Add PaddleOCR as a new PDF parser.

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
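For orientation before the diff, a minimal usage sketch of the parser this PR adds. The endpoint URL below is a placeholder, not part of the PR; inside RAGFlow the parser is normally selected through a `layout_recognize` value ending in `@paddleocr` rather than instantiated directly.

```python
# Hedged usage sketch of the new PaddleOCRParser (assumes a reachable PaddleOCR
# serving endpoint; the URL is a placeholder).
import os

from deepdoc.parser.paddleocr_parser import PaddleOCRParser

parser = PaddleOCRParser(
    api_url=os.getenv("PADDLEOCR_API_URL", "http://paddleocr.example.internal/layout-parsing"),
    access_token=os.getenv("PADDLEOCR_ACCESS_TOKEN"),
    algorithm="PaddleOCR-VL",
)

ok, reason = parser.check_installation()
if not ok:
    raise SystemExit(reason)

# parse_pdf() returns (sections, tables); with the default parse_method="raw",
# each section is a (text, position_tag) tuple.
sections, tables = parser.parse_pdf("sample.pdf", callback=lambda prog=None, msg="": print(prog, msg))
print(f"{len(sections)} sections, {len(tables)} tables")
```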
@@ -195,6 +195,9 @@ async def add_llm():
elif factory == "MinerU":
api_key = apikey_json(["api_key", "provider_order"])

elif factory == "PaddleOCR":
api_key = apikey_json(["api_key", "provider_order"])

llm = {
"tenant_id": current_user.id,
"llm_factory": factory,
@@ -230,8 +233,7 @@ async def add_llm():
**extra,
)
try:
m, tc = await mdl.async_chat(None, [{"role": "user", "content": "Hello! How are you doing!"}],
{"temperature": 0.9})
m, tc = await mdl.async_chat(None, [{"role": "user", "content": "Hello! How are you doing!"}], {"temperature": 0.9})
if not tc and m.find("**ERROR**:") >= 0:
raise Exception(m)
except Exception as e:
@@ -381,7 +383,7 @@ def list_app():
facts = set([o.to_dict()["llm_factory"] for o in objs if o.api_key and o.status == StatusEnum.VALID.value])
status = {(o.llm_name + "@" + o.llm_factory) for o in objs if o.status == StatusEnum.VALID.value}
llms = LLMService.get_all()
llms = [m.to_dict() for m in llms if m.status == StatusEnum.VALID.value and m.fid not in weighted and (m.fid == 'Builtin' or (m.llm_name + "@" + m.fid) in status)]
llms = [m.to_dict() for m in llms if m.status == StatusEnum.VALID.value and m.fid not in weighted and (m.fid == "Builtin" or (m.llm_name + "@" + m.fid) in status)]
for m in llms:
m["available"] = m["fid"] in facts or m["llm_name"].lower() == "flag-embedding" or m["fid"] in self_deployed
if "tei-" in os.getenv("COMPOSE_PROFILES", "") and m["model_type"] == LLMType.EMBEDDING and m["fid"] == "Builtin" and m["llm_name"] == os.getenv("TEI_MODEL", ""):

@@ -19,7 +19,7 @@ import logging
from peewee import IntegrityError
from langfuse import Langfuse
from common import settings
from common.constants import MINERU_DEFAULT_CONFIG, MINERU_ENV_KEYS, LLMType
from common.constants import MINERU_DEFAULT_CONFIG, MINERU_ENV_KEYS, PADDLEOCR_DEFAULT_CONFIG, PADDLEOCR_ENV_KEYS, LLMType
from api.db.db_models import DB, LLMFactories, TenantLLM
from api.db.services.common_service import CommonService
from api.db.services.langfuse_service import TenantLangfuseService
@@ -60,10 +60,8 @@ class TenantLLMService(CommonService):
@classmethod
@DB.connection_context()
def get_my_llms(cls, tenant_id):
fields = [cls.model.llm_factory, LLMFactories.logo, LLMFactories.tags, cls.model.model_type, cls.model.llm_name,
cls.model.used_tokens, cls.model.status]
objs = cls.model.select(*fields).join(LLMFactories, on=(cls.model.llm_factory == LLMFactories.name)).where(
cls.model.tenant_id == tenant_id, ~cls.model.api_key.is_null()).dicts()
fields = [cls.model.llm_factory, LLMFactories.logo, LLMFactories.tags, cls.model.model_type, cls.model.llm_name, cls.model.used_tokens, cls.model.status]
objs = cls.model.select(*fields).join(LLMFactories, on=(cls.model.llm_factory == LLMFactories.name)).where(cls.model.tenant_id == tenant_id, ~cls.model.api_key.is_null()).dicts()

return list(objs)

@@ -90,6 +88,7 @@ class TenantLLMService(CommonService):
@DB.connection_context()
def get_model_config(cls, tenant_id, llm_type, llm_name=None):
from api.db.services.llm_service import LLMService

e, tenant = TenantService.get_by_id(tenant_id)
if not e:
raise LookupError("Tenant not found")
@@ -119,9 +118,9 @@ class TenantLLMService(CommonService):
model_config = cls.get_api_key(tenant_id, mdlnm)
if model_config:
model_config = model_config.to_dict()
elif llm_type == LLMType.EMBEDDING and fid == 'Builtin' and "tei-" in os.getenv("COMPOSE_PROFILES", "") and mdlnm == os.getenv('TEI_MODEL', ''):
elif llm_type == LLMType.EMBEDDING and fid == "Builtin" and "tei-" in os.getenv("COMPOSE_PROFILES", "") and mdlnm == os.getenv("TEI_MODEL", ""):
embedding_cfg = settings.EMBEDDING_CFG
model_config = {"llm_factory": 'Builtin', "api_key": embedding_cfg["api_key"], "llm_name": mdlnm, "api_base": embedding_cfg["base_url"]}
model_config = {"llm_factory": "Builtin", "api_key": embedding_cfg["api_key"], "llm_name": mdlnm, "api_base": embedding_cfg["base_url"]}
else:
raise LookupError(f"Model({mdlnm}@{fid}) not authorized")

@@ -140,33 +139,27 @@ class TenantLLMService(CommonService):
if llm_type == LLMType.EMBEDDING.value:
if model_config["llm_factory"] not in EmbeddingModel:
return None
return EmbeddingModel[model_config["llm_factory"]](model_config["api_key"], model_config["llm_name"],
base_url=model_config["api_base"])
return EmbeddingModel[model_config["llm_factory"]](model_config["api_key"], model_config["llm_name"], base_url=model_config["api_base"])

elif llm_type == LLMType.RERANK:
if model_config["llm_factory"] not in RerankModel:
return None
return RerankModel[model_config["llm_factory"]](model_config["api_key"], model_config["llm_name"],
base_url=model_config["api_base"])
return RerankModel[model_config["llm_factory"]](model_config["api_key"], model_config["llm_name"], base_url=model_config["api_base"])

elif llm_type == LLMType.IMAGE2TEXT.value:
if model_config["llm_factory"] not in CvModel:
return None
return CvModel[model_config["llm_factory"]](model_config["api_key"], model_config["llm_name"], lang,
base_url=model_config["api_base"], **kwargs)
return CvModel[model_config["llm_factory"]](model_config["api_key"], model_config["llm_name"], lang, base_url=model_config["api_base"], **kwargs)

elif llm_type == LLMType.CHAT.value:
if model_config["llm_factory"] not in ChatModel:
return None
return ChatModel[model_config["llm_factory"]](model_config["api_key"], model_config["llm_name"],
base_url=model_config["api_base"], **kwargs)
return ChatModel[model_config["llm_factory"]](model_config["api_key"], model_config["llm_name"], base_url=model_config["api_base"], **kwargs)

elif llm_type == LLMType.SPEECH2TEXT:
if model_config["llm_factory"] not in Seq2txtModel:
return None
return Seq2txtModel[model_config["llm_factory"]](key=model_config["api_key"],
model_name=model_config["llm_name"], lang=lang,
base_url=model_config["api_base"])
return Seq2txtModel[model_config["llm_factory"]](key=model_config["api_key"], model_name=model_config["llm_name"], lang=lang, base_url=model_config["api_base"])
elif llm_type == LLMType.TTS:
if model_config["llm_factory"] not in TTSModel:
return None
@@ -216,14 +209,11 @@ class TenantLLMService(CommonService):
try:
num = (
cls.model.update(used_tokens=cls.model.used_tokens + used_tokens)
.where(cls.model.tenant_id == tenant_id, cls.model.llm_name == llm_name,
cls.model.llm_factory == llm_factory if llm_factory else True)
.where(cls.model.tenant_id == tenant_id, cls.model.llm_name == llm_name, cls.model.llm_factory == llm_factory if llm_factory else True)
.execute()
)
except Exception:
logging.exception(
"TenantLLMService.increase_usage got exception,Failed to update used_tokens for tenant_id=%s, llm_name=%s",
tenant_id, llm_name)
logging.exception("TenantLLMService.increase_usage got exception,Failed to update used_tokens for tenant_id=%s, llm_name=%s", tenant_id, llm_name)
return 0

return num
@@ -231,9 +221,7 @@ class TenantLLMService(CommonService):
@classmethod
@DB.connection_context()
def get_openai_models(cls):
objs = cls.model.select().where((cls.model.llm_factory == "OpenAI"),
~(cls.model.llm_name == "text-embedding-3-small"),
~(cls.model.llm_name == "text-embedding-3-large")).dicts()
objs = cls.model.select().where((cls.model.llm_factory == "OpenAI"), ~(cls.model.llm_name == "text-embedding-3-small"), ~(cls.model.llm_name == "text-embedding-3-large")).dicts()
return list(objs)

@classmethod
@@ -298,6 +286,68 @@ class TenantLLMService(CommonService):
idx += 1
continue

@classmethod
def _collect_paddleocr_env_config(cls) -> dict | None:
cfg = PADDLEOCR_DEFAULT_CONFIG
found = False
for key in PADDLEOCR_ENV_KEYS:
val = os.environ.get(key)
if val:
found = True
cfg[key] = val
return cfg if found else None

@classmethod
@DB.connection_context()
def ensure_paddleocr_from_env(cls, tenant_id: str) -> str | None:
"""
Ensure a PaddleOCR model exists for the tenant if env variables are present.
Return the existing or newly created llm_name, or None if env not set.
"""
cfg = cls._collect_paddleocr_env_config()
if not cfg:
return None

saved_paddleocr_models = cls.query(tenant_id=tenant_id, llm_factory="PaddleOCR", model_type=LLMType.OCR.value)

def _parse_api_key(raw: str) -> dict:
try:
return json.loads(raw or "{}")
except Exception:
return {}

for item in saved_paddleocr_models:
api_cfg = _parse_api_key(item.api_key)
normalized = {k: api_cfg.get(k, PADDLEOCR_DEFAULT_CONFIG.get(k)) for k in PADDLEOCR_ENV_KEYS}
if normalized == cfg:
return item.llm_name

used_names = {item.llm_name for item in saved_paddleocr_models}
idx = 1
base_name = "paddleocr-from-env"
while True:
candidate = f"{base_name}-{idx}"
if candidate in used_names:
idx += 1
continue

try:
cls.save(
tenant_id=tenant_id,
llm_factory="PaddleOCR",
llm_name=candidate,
model_type=LLMType.OCR.value,
api_key=json.dumps(cfg),
api_base="",
max_tokens=0,
)
return candidate
except IntegrityError:
logging.warning("PaddleOCR env model %s already exists for tenant %s, retry with next name", candidate, tenant_id)
used_names.add(candidate)
idx += 1
continue

@classmethod
@DB.connection_context()
def delete_by_tenant_id(cls, tenant_id):
@@ -306,6 +356,7 @@ class TenantLLMService(CommonService):
@staticmethod
def llm_id2llm_type(llm_id: str) -> str | None:
from api.db.services.llm_service import LLMService

llm_id, *_ = TenantLLMService.split_model_name_and_factory(llm_id)
llm_factories = settings.FACTORY_LLM_INFOS
for llm_factory in llm_factories:
@@ -340,8 +391,7 @@ class LLM4Tenant:
langfuse_keys = TenantLangfuseService.filter_by_tenant(tenant_id=tenant_id)
self.langfuse = None
if langfuse_keys:
langfuse = Langfuse(public_key=langfuse_keys.public_key, secret_key=langfuse_keys.secret_key,
host=langfuse_keys.host)
langfuse = Langfuse(public_key=langfuse_keys.public_key, secret_key=langfuse_keys.secret_key, host=langfuse_keys.host)
if langfuse.auth_check():
self.langfuse = langfuse
trace_id = self.langfuse.create_trace_id()

@@ -20,6 +20,7 @@ from strenum import StrEnum
SERVICE_CONF = "service_conf.yaml"
RAG_FLOW_SERVICE_NAME = "ragflow"


class CustomEnum(Enum):
@classmethod
def valid(cls, value):
@@ -68,13 +69,13 @@ class ActiveEnum(Enum):


class LLMType(StrEnum):
CHAT = 'chat'
EMBEDDING = 'embedding'
SPEECH2TEXT = 'speech2text'
IMAGE2TEXT = 'image2text'
RERANK = 'rerank'
TTS = 'tts'
OCR = 'ocr'
CHAT = "chat"
EMBEDDING = "embedding"
SPEECH2TEXT = "speech2text"
IMAGE2TEXT = "image2text"
RERANK = "rerank"
TTS = "tts"
OCR = "ocr"


class TaskStatus(StrEnum):
@@ -86,8 +87,7 @@ class TaskStatus(StrEnum):
SCHEDULE = "5"


VALID_TASK_STATUS = {TaskStatus.UNSTART, TaskStatus.RUNNING, TaskStatus.CANCEL, TaskStatus.DONE, TaskStatus.FAIL,
TaskStatus.SCHEDULE}
VALID_TASK_STATUS = {TaskStatus.UNSTART, TaskStatus.RUNNING, TaskStatus.CANCEL, TaskStatus.DONE, TaskStatus.FAIL, TaskStatus.SCHEDULE}


class ParserType(StrEnum):
@@ -136,6 +136,7 @@ class FileSource(StrEnum):
BITBUCKET = "bitbucket"
ZENDESK = "zendesk"


class PipelineTaskType(StrEnum):
PARSE = "Parse"
DOWNLOAD = "Download"
@@ -145,15 +146,17 @@ class PipelineTaskType(StrEnum):
MEMORY = "Memory"


VALID_PIPELINE_TASK_TYPES = {PipelineTaskType.PARSE, PipelineTaskType.DOWNLOAD, PipelineTaskType.RAPTOR,
PipelineTaskType.GRAPH_RAG, PipelineTaskType.MINDMAP}
VALID_PIPELINE_TASK_TYPES = {PipelineTaskType.PARSE, PipelineTaskType.DOWNLOAD, PipelineTaskType.RAPTOR, PipelineTaskType.GRAPH_RAG, PipelineTaskType.MINDMAP}


class MCPServerType(StrEnum):
SSE = "sse"
STREAMABLE_HTTP = "streamable-http"


VALID_MCP_SERVER_TYPES = {MCPServerType.SSE, MCPServerType.STREAMABLE_HTTP}


class Storage(Enum):
MINIO = 1
AZURE_SPN = 2
@@ -165,10 +168,10 @@ class Storage(Enum):


class MemoryType(Enum):
RAW = 0b0001 # 1 << 0 = 1 (0b00000001)
SEMANTIC = 0b0010 # 1 << 1 = 2 (0b00000010)
EPISODIC = 0b0100 # 1 << 2 = 4 (0b00000100)
PROCEDURAL = 0b1000 # 1 << 3 = 8 (0b00001000)
RAW = 0b0001 # 1 << 0 = 1 (0b00000001)
SEMANTIC = 0b0010 # 1 << 1 = 2 (0b00000010)
EPISODIC = 0b0100 # 1 << 2 = 4 (0b00000100)
PROCEDURAL = 0b1000 # 1 << 3 = 8 (0b00001000)


class MemoryStorageType(StrEnum):
@@ -239,3 +242,10 @@ MINERU_DEFAULT_CONFIG = {
"MINERU_SERVER_URL": "",
"MINERU_DELETE_OUTPUT": 1,
}

PADDLEOCR_ENV_KEYS = ["PADDLEOCR_API_URL", "PADDLEOCR_ACCESS_TOKEN", "PADDLEOCR_ALGORITHM"]
PADDLEOCR_DEFAULT_CONFIG = {
"PADDLEOCR_API_URL": "",
"PADDLEOCR_ACCESS_TOKEN": None,
"PADDLEOCR_ALGORITHM": "PaddleOCR-VL",
}

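These keys mirror the MinerU block just above. When the `PADDLEOCR_*` variables are present, `TenantLLMService.ensure_paddleocr_from_env` (added earlier in this diff) registers or reuses a `paddleocr-from-env-N` model for the tenant. A rough sketch of that flow, with a made-up tenant id and placeholder URL (the import path and a ready database context are assumptions):

```python
# Sketch only: how the env-driven auto-registration added in this PR is meant to
# behave. Requires an initialized RAGFlow DB; tenant id and URL are examples.
import os

os.environ["PADDLEOCR_API_URL"] = "http://paddleocr.example.internal/layout-parsing"  # placeholder
os.environ["PADDLEOCR_ALGORITHM"] = "PaddleOCR-VL"

from api.db.services.tenant_llm_service import TenantLLMService  # import path assumed

llm_name = TenantLLMService.ensure_paddleocr_from_env("tenant-0001")
# Returns an existing or newly created name such as "paddleocr-from-env-1",
# or None when none of the PADDLEOCR_* variables are set.
print(llm_name)
```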
@@ -26,5 +26,8 @@ def normalize_layout_recognizer(layout_recognizer_raw: Any) -> tuple[Any, str |
if lowered.endswith("@mineru"):
parser_model_name = layout_recognizer_raw.rsplit("@", 1)[0]
layout_recognizer = "MinerU"
elif lowered.endswith("@paddleocr"):
parser_model_name = layout_recognizer_raw.rsplit("@", 1)[0]
layout_recognizer = "PaddleOCR"

return layout_recognizer, parser_model_name

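For illustration, the new `@paddleocr` suffix routes a configured model name to the PaddleOCR parser, mirroring the existing `@mineru` handling (model names below are hypothetical):

```python
# Hypothetical inputs; shows the (layout_recognizer, parser_model_name) mapping.
from common.parser_config_utils import normalize_layout_recognizer

print(normalize_layout_recognizer("paddleocr-from-env-1@paddleocr"))  # ("PaddleOCR", "paddleocr-from-env-1")
print(normalize_layout_recognizer("my-mineru-model@mineru"))          # ("MinerU", "my-mineru-model")
```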
@@ -5531,6 +5531,14 @@
"status": "1",
"rank": "900",
"llm": []
},
{
"name": "PaddleOCR",
"logo": "",
"tags": "OCR",
"status": "1",
"rank": "910",
"llm": []
}
]
}

deepdoc/parser/paddleocr_parser.py (new file, 400 lines)
@@ -0,0 +1,400 @@
# Copyright 2026 The InfiniFlow Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from __future__ import annotations

import base64
import logging
import os
import re
from dataclasses import asdict, dataclass, field, fields
from io import BytesIO
from os import PathLike
from pathlib import Path
from typing import Any, Callable, ClassVar, Literal, Optional, Union, Tuple, List

import requests

try:
    from deepdoc.parser.pdf_parser import RAGFlowPdfParser
except Exception:

    class RAGFlowPdfParser:
        pass


AlgorithmType = Literal["PaddleOCR-VL"]
SectionTuple = tuple[str, ...]
TableTuple = tuple[str, ...]
ParseResult = tuple[list[SectionTuple], list[TableTuple]]


_MARKDOWN_IMAGE_PATTERN = re.compile(
    r"""
    <div[^>]*>\s*
    <img[^>]*/>\s*
    </div>
    |
    <img[^>]*/>
    """,
    re.IGNORECASE | re.VERBOSE | re.DOTALL,
)


def _remove_images_from_markdown(markdown: str) -> str:
    return _MARKDOWN_IMAGE_PATTERN.sub("", markdown)


@dataclass
class PaddleOCRVLConfig:
    """Configuration for PaddleOCR-VL algorithm."""

    use_doc_orientation_classify: Optional[bool] = None
    use_doc_unwarping: Optional[bool] = None
    use_layout_detection: Optional[bool] = None
    use_polygon_points: Optional[bool] = None
    use_chart_recognition: Optional[bool] = None
    use_seal_recognition: Optional[bool] = None
    use_ocr_for_image_block: Optional[bool] = None
    layout_threshold: Optional[Union[float, dict]] = None
    layout_nms: Optional[bool] = None
    layout_unclip_ratio: Optional[Union[float, Tuple[float, float], dict]] = None
    layout_merge_bboxes_mode: Optional[Union[str, dict]] = None
    prompt_label: Optional[str] = None
    format_block_content: Optional[bool] = True
    repetition_penalty: Optional[float] = None
    temperature: Optional[float] = None
    top_p: Optional[float] = None
    min_pixels: Optional[int] = None
    max_pixels: Optional[int] = None
    max_new_tokens: Optional[int] = None
    merge_layout_blocks: Optional[bool] = None
    markdown_ignore_labels: Optional[List[str]] = None
    vlm_extra_args: Optional[dict] = None


@dataclass
class PaddleOCRConfig:
    """Main configuration for PaddleOCR parser."""

    api_url: str = ""
    access_token: Optional[str] = None
    algorithm: AlgorithmType = "PaddleOCR-VL"
    request_timeout: int = 600
    prettify_markdown: bool = True
    show_formula_number: bool = True
    visualize: bool = False
    additional_params: dict[str, Any] = field(default_factory=dict)
    algorithm_config: dict[str, Any] = field(default_factory=dict)

    @classmethod
    def from_dict(cls, config: Optional[dict[str, Any]]) -> "PaddleOCRConfig":
        """Create configuration from dictionary."""
        if not config:
            return cls()

        cfg = config.copy()
        algorithm = cfg.get("algorithm", "PaddleOCR-VL")

        # Validate algorithm
        if algorithm not in ("PaddleOCR-VL",):
            raise ValueError(f"Unsupported algorithm: {algorithm}")

        # Extract algorithm-specific configuration
        algorithm_config: dict[str, Any] = {}
        if algorithm == "PaddleOCR-VL":
            # Create default PaddleOCRVLConfig object and convert to dict
            algorithm_config = asdict(PaddleOCRVLConfig())

            # Apply user-provided VL config
            vl_config = cfg.get("vl")
            if isinstance(vl_config, dict):
                algorithm_config.update({k: v for k, v in vl_config.items() if v is not None})

        # Remove processed keys
        cfg.pop("vl", None)

        # Prepare initialization arguments
        field_names = {field.name for field in fields(cls)}
        init_kwargs: dict[str, Any] = {}

        for field_name in field_names:
            if field_name in cfg:
                init_kwargs[field_name] = cfg[field_name]

        init_kwargs["algorithm_config"] = algorithm_config

        return cls(**init_kwargs)

    @classmethod
    def from_kwargs(cls, **kwargs: Any) -> "PaddleOCRConfig":
        """Create configuration from keyword arguments."""
        return cls.from_dict(kwargs)


class PaddleOCRParser(RAGFlowPdfParser):
    """Parser for PDF documents using PaddleOCR API."""

    _COMMON_FIELD_MAPPING: ClassVar[dict[str, str]] = {
        "prettify_markdown": "prettifyMarkdown",
        "show_formula_number": "showFormulaNumber",
        "visualize": "visualize",
    }

    _ALGORITHM_FIELD_MAPPINGS: ClassVar[dict[str, dict[str, str]]] = {
        "PaddleOCR-VL": {
            "use_doc_orientation_classify": "useDocOrientationClassify",
            "use_doc_unwarping": "useDocUnwarping",
            "use_layout_detection": "useLayoutDetection",
            "use_polygon_points": "usePolygonPoints",
            "use_chart_recognition": "useChartRecognition",
            "use_seal_recognition": "useSealRecognition",
            "use_ocr_for_image_block": "useOcrForImageBlock",
            "layout_threshold": "layoutThreshold",
            "layout_nms": "layoutNms",
            "layout_unclip_ratio": "layoutUnclipRatio",
            "layout_merge_bboxes_mode": "layoutMergeBboxesMode",
            "prompt_label": "promptLabel",
            "format_block_content": "formatBlockContent",
            "repetition_penalty": "repetitionPenalty",
            "temperature": "temperature",
            "top_p": "topP",
            "min_pixels": "minPixels",
            "max_pixels": "maxPixels",
            "max_new_tokens": "maxNewTokens",
            "merge_layout_blocks": "mergeLayoutBlocks",
            "markdown_ignore_labels": "markdownIgnoreLabels",
            "vlm_extra_args": "vlmExtraArgs",
        },
    }

    def __init__(
        self,
        api_url: Optional[str] = None,
        access_token: Optional[str] = None,
        algorithm: AlgorithmType = "PaddleOCR-VL",
        *,
        request_timeout: int = 600,
    ):
        """Initialize PaddleOCR parser."""
        self.api_url = api_url.rstrip("/") if api_url else os.getenv("PADDLEOCR_API_URL", "")
        self.access_token = access_token or os.getenv("PADDLEOCR_ACCESS_TOKEN")
        self.algorithm = algorithm
        self.request_timeout = request_timeout
        self.logger = logging.getLogger(self.__class__.__name__)

        # Force PDF file type
        self.file_type = 0

    # Public methods
    def check_installation(self) -> tuple[bool, str]:
        """Check if the parser is properly installed and configured."""
        if not self.api_url:
            return False, "[PaddleOCR] API URL not configured"

        # TODO [@Bobholamovic]: Check URL availability and token validity

        return True, ""

    def parse_pdf(
        self,
        filepath: str | PathLike[str],
        binary: BytesIO | bytes | None = None,
        callback: Optional[Callable[[float, str], None]] = None,
        *,
        parse_method: str = "raw",
        api_url: Optional[str] = None,
        access_token: Optional[str] = None,
        algorithm: Optional[AlgorithmType] = None,
        request_timeout: Optional[int] = None,
        prettify_markdown: Optional[bool] = None,
        show_formula_number: Optional[bool] = None,
        visualize: Optional[bool] = None,
        additional_params: Optional[dict[str, Any]] = None,
        vl_config: Optional[dict[str, Any]] = None,
        **kwargs: Any,
    ) -> ParseResult:
        """Parse PDF document using PaddleOCR API."""
        # Create configuration - pass all kwargs to capture VL config parameters
        config_dict = {
            "api_url": api_url if api_url is not None else self.api_url,
            "access_token": access_token if access_token is not None else self.access_token,
            "algorithm": algorithm if algorithm is not None else self.algorithm,
            "request_timeout": request_timeout if request_timeout is not None else self.request_timeout,
        }
        if prettify_markdown is not None:
            config_dict["prettify_markdown"] = prettify_markdown
        if show_formula_number is not None:
            config_dict["show_formula_number"] = show_formula_number
        if visualize is not None:
            config_dict["visualize"] = visualize
        if additional_params is not None:
            config_dict["additional_params"] = additional_params
        if vl_config is not None:
            config_dict["vl"] = vl_config

        # Add any VL config parameters from kwargs
        for key, value in kwargs.items():
            if key in {field.name for field in fields(PaddleOCRVLConfig)}:
                config_dict[key] = value

        cfg = PaddleOCRConfig.from_dict(config_dict)

        if not cfg.api_url:
            raise RuntimeError("[PaddleOCR] API URL missing")

        # Prepare file data
        data_bytes = self._prepare_file_data(filepath, binary)

        # Build and send request
        result = self._send_request(data_bytes, cfg, callback)

        # Process response
        sections = self._transfer_to_sections(result, algorithm=cfg.algorithm, parse_method=parse_method)
        if callback:
            callback(0.9, f"[PaddleOCR] done, sections: {len(sections)}")

        tables = self._transfer_to_tables(result)
        if callback:
            callback(1.0, f"[PaddleOCR] done, tables: {len(tables)}")

        return sections, tables

    def _prepare_file_data(self, filepath: str | PathLike[str], binary: BytesIO | bytes | None) -> bytes:
        """Prepare file data for API request."""
        source_path = Path(filepath)

        if binary is not None:
            if isinstance(binary, (bytes, bytearray)):
                return binary
            return binary.getbuffer().tobytes()

        if not source_path.exists():
            raise FileNotFoundError(f"[PaddleOCR] file not found: {source_path}")

        return source_path.read_bytes()

    def _build_payload(self, data: bytes, file_type: int, config: PaddleOCRConfig) -> dict[str, Any]:
        """Build payload for API request."""
        payload: dict[str, Any] = {
            "file": base64.b64encode(data).decode("ascii"),
            "fileType": file_type,
        }

        # Add common parameters
        for param_key, param_value in [
            ("prettify_markdown", config.prettify_markdown),
            ("show_formula_number", config.show_formula_number),
            ("visualize", config.visualize),
        ]:
            if param_value is not None:
                api_param = self._COMMON_FIELD_MAPPING[param_key]
                payload[api_param] = param_value

        # Add algorithm-specific parameters
        algorithm_mapping = self._ALGORITHM_FIELD_MAPPINGS.get(config.algorithm, {})
        for param_key, param_value in config.algorithm_config.items():
            if param_value is not None and param_key in algorithm_mapping:
                api_param = algorithm_mapping[param_key]
                payload[api_param] = param_value

        # Add any additional parameters
        if config.additional_params:
            payload.update(config.additional_params)

        return payload

    def _send_request(self, data: bytes, config: PaddleOCRConfig, callback: Optional[Callable[[float, str], None]]) -> dict[str, Any]:
        """Send request to PaddleOCR API and parse response."""
        # Build payload
        payload = self._build_payload(data, self.file_type, config)

        # Prepare headers
        headers = {"Content-Type": "application/json", "Client-Platform": "ragflow"}
        if config.access_token:
            headers["Authorization"] = f"token {config.access_token}"

        self.logger.info("[PaddleOCR] invoking API")
        if callback:
            callback(0.1, "[PaddleOCR] submitting request")

        # Send request
        try:
            resp = requests.post(config.api_url, json=payload, headers=headers, timeout=self.request_timeout)
            resp.raise_for_status()
        except Exception as exc:
            if callback:
                callback(-1, f"[PaddleOCR] request failed: {exc}")
            raise RuntimeError(f"[PaddleOCR] request failed: {exc}")

        # Parse response
        try:
            response_data = resp.json()
        except Exception as exc:
            raise RuntimeError(f"[PaddleOCR] response is not JSON: {exc}") from exc

        if callback:
            callback(0.8, "[PaddleOCR] response received")

        # Validate response format
        if response_data.get("errorCode") != 0 or not isinstance(response_data.get("result"), dict):
            if callback:
                callback(-1, "[PaddleOCR] invalid response format")
            raise RuntimeError("[PaddleOCR] invalid response format")

        return response_data["result"]

    def _transfer_to_sections(self, result: dict[str, Any], algorithm: AlgorithmType, parse_method: str) -> list[SectionTuple]:
        """Convert API response to section tuples."""
        sections: list[SectionTuple] = []

        if algorithm == "PaddleOCR-VL":
            layout_parsing_results = result.get("layoutParsingResults", [])

            for page_idx, layout_result in enumerate(layout_parsing_results):
                pruned_result = layout_result.get("prunedResult", {})
                parsing_res_list = pruned_result.get("parsing_res_list", [])

                for block in parsing_res_list:
                    block_content = block.get("block_content", "").strip()
                    if not block_content:
                        continue

                    # Remove images
                    block_content = _remove_images_from_markdown(block_content)

                    label = block.get("block_label", "")
                    block_bbox = block.get("block_bbox", [0, 0, 0, 0])

                    tag = f"@@{page_idx + 1}\t{block_bbox[0]}\t{block_bbox[2]}\t{block_bbox[1]}\t{block_bbox[3]}##"

                    if parse_method == "manual":
                        sections.append((block_content, label, tag))
                    elif parse_method == "paper":
                        sections.append((block_content + tag, label))
                    else:
                        sections.append((block_content, tag))

        return sections

    def _transfer_to_tables(self, result: dict[str, Any]) -> list[TableTuple]:
        """Convert API response to table tuples."""
        return []


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    parser = PaddleOCRParser(api_url=os.getenv("PADDLEOCR_API_URL", ""), algorithm=os.getenv("PADDLEOCR_ALGORITHM", "PaddleOCR-VL"))
    ok, reason = parser.check_installation()
    print("PaddleOCR available:", ok, reason)
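To make the section format concrete, a sketch of how `_transfer_to_sections` flattens a service result; the payload below is fabricated and far smaller than a real PaddleOCR-VL response:

```python
# Fabricated minimal payload shaped after the fields read in _transfer_to_sections;
# it is not a real PaddleOCR response.
from deepdoc.parser.paddleocr_parser import PaddleOCRParser

fake_result = {
    "layoutParsingResults": [
        {
            "prunedResult": {
                "parsing_res_list": [
                    {"block_content": "# Title", "block_label": "doc_title", "block_bbox": [50, 40, 500, 80]},
                    {"block_content": "First paragraph.", "block_label": "text", "block_bbox": [50, 100, 500, 160]},
                ]
            }
        }
    ]
}

parser = PaddleOCRParser(api_url="http://example.invalid")  # URL unused for this conversion
sections = parser._transfer_to_sections(fake_result, algorithm="PaddleOCR-VL", parse_method="raw")
# With parse_method="raw" each section is (text, position_tag), e.g.
# ("# Title", "@@1\t50\t500\t40\t80##") for the first block above.
print(sections)
```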
@@ -22,9 +22,7 @@ from deepdoc.parser.utils import get_text
from rag.app import naive
from rag.app.naive import by_plaintext, PARSERS
from common.parser_config_utils import normalize_layout_recognizer
from rag.nlp import bullets_category, is_english, remove_contents_table, \
hierarchical_merge, make_colon_as_title, naive_merge, random_choices, tokenize_table, \
tokenize_chunks, attach_media_context
from rag.nlp import bullets_category, is_english, remove_contents_table, hierarchical_merge, make_colon_as_title, naive_merge, random_choices, tokenize_table, tokenize_chunks, attach_media_context
from rag.nlp import rag_tokenizer
from deepdoc.parser import PdfParser, HtmlParser
from deepdoc.parser.figure_parser import vision_figure_parser_docx_wrapper
@@ -32,17 +30,12 @@ from PIL import Image


class Pdf(PdfParser):
def __call__(self, filename, binary=None, from_page=0,
to_page=100000, zoomin=3, callback=None):
def __call__(self, filename, binary=None, from_page=0, to_page=100000, zoomin=3, callback=None):
from timeit import default_timer as timer

start = timer()
callback(msg="OCR started")
self.__images__(
filename if not binary else binary,
zoomin,
from_page,
to_page,
callback)
self.__images__(filename if not binary else binary, zoomin, from_page, to_page, callback)
callback(msg="OCR finished ({:.2f}s)".format(timer() - start))

start = timer()
@@ -62,24 +55,17 @@ class Pdf(PdfParser):
self._merge_with_same_bullet()
callback(0.8, "Text extraction ({:.2f}s)".format(timer() - start))

return [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", ""))
for b in self.boxes], tbls
return [(b["text"] + self._line_tag(b, zoomin), b.get("layoutno", "")) for b in self.boxes], tbls


def chunk(filename, binary=None, from_page=0, to_page=100000,
lang="Chinese", callback=None, **kwargs):
def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
"""
Supported file formats are docx, pdf, txt.
Since a book is long and not all the parts are useful, if it's a PDF,
please set up the page ranges for every book in order eliminate negative effects and save elapsed computing time.
Supported file formats are docx, pdf, txt.
Since a book is long and not all the parts are useful, if it's a PDF,
please set up the page ranges for every book in order eliminate negative effects and save elapsed computing time.
"""
parser_config = kwargs.get(
"parser_config", {
"chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC"})
doc = {
"docnm_kwd": filename,
"title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
}
parser_config = kwargs.get("parser_config", {"chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC"})
doc = {"docnm_kwd": filename, "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))}
doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
pdf_parser = None
sections, tbls = [], []
@@ -87,28 +73,23 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
callback(0.1, "Start to parse.")
doc_parser = naive.Docx()
# TODO: table of contents need to be removed
main_sections = doc_parser(
filename, binary=binary, from_page=from_page, to_page=to_page)

main_sections = doc_parser(filename, binary=binary, from_page=from_page, to_page=to_page)

sections = []
tbls = []
for text, image, html in main_sections:
sections.append((text, image))
tbls.append(((None, html), ""))

remove_contents_table(sections, eng=is_english(
random_choices([t for t, _ in sections], k=200)))

remove_contents_table(sections, eng=is_english(random_choices([t for t, _ in sections], k=200)))

tbls = vision_figure_parser_docx_wrapper(sections=sections, tbls=tbls, callback=callback, **kwargs)
# tbls = [((None, lns), None) for lns in tbls]
sections = [(item[0], item[1] if item[1] is not None else "") for item in sections if
not isinstance(item[1], Image.Image)]
sections = [(item[0], item[1] if item[1] is not None else "") for item in sections if not isinstance(item[1], Image.Image)]
callback(0.8, "Finish parsing.")

elif re.search(r"\.pdf$", filename, re.IGNORECASE):
layout_recognizer, parser_model_name = normalize_layout_recognizer(
parser_config.get("layout_recognize", "DeepDOC")
)
layout_recognizer, parser_model_name = normalize_layout_recognizer(parser_config.get("layout_recognize", "DeepDOC"))

if isinstance(layout_recognizer, bool):
layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
@@ -127,13 +108,14 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
pdf_cls=Pdf,
layout_recognizer=layout_recognizer,
mineru_llm_name=parser_model_name,
**kwargs
paddleocr_llm_name=parser_model_name,
**kwargs,
)

if not sections and not tables:
return []

if name in ["tcadp", "docling", "mineru"]:
if name in ["tcadp", "docling", "mineru", "paddleocr"]:
parser_config["chunk_token_num"] = 0

callback(0.8, "Finish parsing.")
@@ -142,16 +124,14 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
txt = get_text(filename, binary)
sections = txt.split("\n")
sections = [(line, "") for line in sections if line]
remove_contents_table(sections, eng=is_english(
random_choices([t for t, _ in sections], k=200)))
remove_contents_table(sections, eng=is_english(random_choices([t for t, _ in sections], k=200)))
callback(0.8, "Finish parsing.")

elif re.search(r"\.(htm|html)$", filename, re.IGNORECASE):
callback(0.1, "Start to parse.")
sections = HtmlParser()(filename, binary)
sections = [(line, "") for line in sections if line]
remove_contents_table(sections, eng=is_english(
random_choices([t for t, _ in sections], k=200)))
remove_contents_table(sections, eng=is_english(random_choices([t for t, _ in sections], k=200)))
callback(0.8, "Finish parsing.")

elif re.search(r"\.doc$", filename, re.IGNORECASE):
@@ -165,31 +145,23 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,

binary = BytesIO(binary)
doc_parsed = tika_parser.from_buffer(binary)
if doc_parsed.get('content', None) is not None:
sections = doc_parsed['content'].split('\n')
if doc_parsed.get("content", None) is not None:
sections = doc_parsed["content"].split("\n")
sections = [(line, "") for line in sections if line]
remove_contents_table(sections, eng=is_english(
random_choices([t for t, _ in sections], k=200)))
remove_contents_table(sections, eng=is_english(random_choices([t for t, _ in sections], k=200)))
callback(0.8, "Finish parsing.")

else:
raise NotImplementedError(
"file type not supported yet(doc, docx, pdf, txt supported)")
raise NotImplementedError("file type not supported yet(doc, docx, pdf, txt supported)")

make_colon_as_title(sections)
bull = bullets_category(
[t for t in random_choices([t for t, _ in sections], k=100)])
bull = bullets_category([t for t in random_choices([t for t, _ in sections], k=100)])
if bull >= 0:
chunks = ["\n".join(ck)
for ck in hierarchical_merge(bull, sections, 5)]
chunks = ["\n".join(ck) for ck in hierarchical_merge(bull, sections, 5)]
else:
sections = [s.split("@") for s, _ in sections]
sections = [(pr[0], "@" + pr[1]) if len(pr) == 2 else (pr[0], '') for pr in sections]
chunks = naive_merge(
sections,
parser_config.get("chunk_token_num", 256),
parser_config.get("delimiter", "\n。;!?")
)
sections = [(pr[0], "@" + pr[1]) if len(pr) == 2 else (pr[0], "") for pr in sections]
chunks = naive_merge(sections, parser_config.get("chunk_token_num", 256), parser_config.get("delimiter", "\n。;!?"))

# is it English
# is_english(random_choices([t for t, _ in sections], k=218))
@@ -208,9 +180,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
if __name__ == "__main__":
import sys


def dummy(prog=None, msg=""):
pass


chunk(sys.argv[1], from_page=1, to_page=10, callback=dummy)

@@ -21,8 +21,7 @@ from docx import Document

from common.constants import ParserType
from deepdoc.parser.utils import get_text
from rag.nlp import bullets_category, remove_contents_table, \
make_colon_as_title, tokenize_chunks, docx_question_level, tree_merge
from rag.nlp import bullets_category, remove_contents_table, make_colon_as_title, tokenize_chunks, docx_question_level, tree_merge
from rag.nlp import rag_tokenizer, Node
from deepdoc.parser import PdfParser, DocxParser, HtmlParser
from rag.app.naive import by_plaintext, PARSERS
@@ -38,8 +37,7 @@ class Docx(DocxParser):
return line

def old_call(self, filename, binary=None, from_page=0, to_page=100000):
self.doc = Document(
filename) if not binary else Document(BytesIO(binary))
self.doc = Document(filename) if not binary else Document(BytesIO(binary))
pn = 0
lines = []
for p in self.doc.paragraphs:
@@ -48,16 +46,15 @@ class Docx(DocxParser):
if from_page <= pn < to_page and p.text.strip():
lines.append(self.__clean(p.text))
for run in p.runs:
if 'lastRenderedPageBreak' in run._element.xml:
if "lastRenderedPageBreak" in run._element.xml:
pn += 1
continue
if 'w:br' in run._element.xml and 'type="page"' in run._element.xml:
if "w:br" in run._element.xml and 'type="page"' in run._element.xml:
pn += 1
return [line for line in lines if line]

def __call__(self, filename, binary=None, from_page=0, to_page=100000):
self.doc = Document(
filename) if not binary else Document(BytesIO(binary))
self.doc = Document(filename) if not binary else Document(BytesIO(binary))
pn = 0
lines = []
level_set = set()
@@ -71,10 +68,10 @@ class Docx(DocxParser):
lines.append((question_level, p_text))
level_set.add(question_level)
for run in p.runs:
if 'lastRenderedPageBreak' in run._element.xml:
if "lastRenderedPageBreak" in run._element.xml:
pn += 1
continue
if 'w:br' in run._element.xml and 'type="page"' in run._element.xml:
if "w:br" in run._element.xml and 'type="page"' in run._element.xml:
pn += 1

sorted_levels = sorted(level_set)
@@ -88,12 +85,12 @@ class Docx(DocxParser):
return [element for element in root.get_tree() if element]

def __str__(self) -> str:
return f'''
return f"""
question:{self.question},
answer:{self.answer},
level:{self.level},
childs:{self.childs}
'''
"""


class Pdf(PdfParser):
@@ -101,18 +98,12 @@ class Pdf(PdfParser):
self.model_speciess = ParserType.LAWS.value
super().__init__()

def __call__(self, filename, binary=None, from_page=0,
to_page=100000, zoomin=3, callback=None):
def __call__(self, filename, binary=None, from_page=0, to_page=100000, zoomin=3, callback=None):
from timeit import default_timer as timer

start = timer()
callback(msg="OCR started")
self.__images__(
filename if not binary else binary,
zoomin,
from_page,
to_page,
callback
)
self.__images__(filename if not binary else binary, zoomin, from_page, to_page, callback)
callback(msg="OCR finished ({:.2f}s)".format(timer() - start))

start = timer()
@@ -123,22 +114,15 @@ class Pdf(PdfParser):

callback(0.8, "Text extraction ({:.2f}s)".format(timer() - start))

return [(b["text"], self._line_tag(b, zoomin))
for b in self.boxes], None
return [(b["text"], self._line_tag(b, zoomin)) for b in self.boxes], None


def chunk(filename, binary=None, from_page=0, to_page=100000,
lang="Chinese", callback=None, **kwargs):
def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
"""
Supported file formats are docx, pdf, txt.
Supported file formats are docx, pdf, txt.
"""
parser_config = kwargs.get(
"parser_config", {
"chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC"})
doc = {
"docnm_kwd": filename,
"title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
}
parser_config = kwargs.get("parser_config", {"chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC"})
doc = {"docnm_kwd": filename, "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))}
doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
pdf_parser = None
sections = []
@@ -152,9 +136,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
return tokenize_chunks(chunks, doc, eng, None)

elif re.search(r"\.pdf$", filename, re.IGNORECASE):
layout_recognizer, parser_model_name = normalize_layout_recognizer(
parser_config.get("layout_recognize", "DeepDOC")
)
layout_recognizer, parser_model_name = normalize_layout_recognizer(parser_config.get("layout_recognize", "DeepDOC"))

if isinstance(layout_recognizer, bool):
layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
@@ -173,13 +155,14 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
pdf_cls=Pdf,
layout_recognizer=layout_recognizer,
mineru_llm_name=parser_model_name,
**kwargs
paddleocr_llm_name=parser_model_name,
**kwargs,
)

if not raw_sections and not tables:
return []

if name in ["tcadp", "docling", "mineru"]:
if name in ["tcadp", "docling", "mineru", "paddleocr"]:
parser_config["chunk_token_num"] = 0

for txt, poss in raw_sections:
@@ -210,8 +193,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,

binary = BytesIO(binary)
doc_parsed = tika_parser.from_buffer(binary)
if doc_parsed.get('content', None) is not None:
sections = doc_parsed['content'].split('\n')
if doc_parsed.get("content", None) is not None:
sections = doc_parsed["content"].split("\n")
sections = [s for s in sections if s]
callback(0.8, "Finish parsing.")
else:
@@ -219,8 +202,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
logging.warning(f"tika.parser got empty content from {filename}.")
return []
else:
raise NotImplementedError(
"file type not supported yet(doc, docx, pdf, txt supported)")
raise NotImplementedError("file type not supported yet(doc, docx, pdf, txt supported)")

# Remove 'Contents' part
remove_contents_table(sections, eng)
@@ -241,9 +223,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
if __name__ == "__main__":
import sys


def dummy(prog=None, msg=""):
pass


chunk(sys.argv[1], callback=dummy)

@@ -20,8 +20,7 @@ import re

from common.constants import ParserType
from io import BytesIO
from rag.nlp import rag_tokenizer, tokenize, tokenize_table, bullets_category, title_frequency, tokenize_chunks, \
docx_question_level, attach_media_context
from rag.nlp import rag_tokenizer, tokenize, tokenize_table, bullets_category, title_frequency, tokenize_chunks, docx_question_level, attach_media_context
from common.token_utils import num_tokens_from_string
from deepdoc.parser import PdfParser, DocxParser
from deepdoc.parser.figure_parser import vision_figure_parser_pdf_wrapper, vision_figure_parser_docx_wrapper
@@ -36,18 +35,12 @@ class Pdf(PdfParser):
self.model_speciess = ParserType.MANUAL.value
super().__init__()

def __call__(self, filename, binary=None, from_page=0,
to_page=100000, zoomin=3, callback=None):
def __call__(self, filename, binary=None, from_page=0, to_page=100000, zoomin=3, callback=None):
from timeit import default_timer as timer

start = timer()
callback(msg="OCR started")
self.__images__(
filename if not binary else binary,
zoomin,
from_page,
to_page,
callback
)
self.__images__(filename if not binary else binary, zoomin, from_page, to_page, callback)
callback(msg="OCR finished ({:.2f}s)".format(timer() - start))
logging.debug("OCR: {}".format(timer() - start))

@@ -71,8 +64,7 @@ class Pdf(PdfParser):
for b in self.boxes:
b["text"] = re.sub(r"([\t ]|\u3000){2,}", " ", b["text"].strip())

return [(b["text"], b.get("layoutno", ""), self.get_position(b, zoomin))
for i, b in enumerate(self.boxes)], tbls
return [(b["text"], b.get("layoutno", ""), self.get_position(b, zoomin)) for i, b in enumerate(self.boxes)], tbls


class Docx(DocxParser):
@@ -80,12 +72,12 @@ class Docx(DocxParser):
pass

def get_picture(self, document, paragraph):
img = paragraph._element.xpath('.//pic:pic')
img = paragraph._element.xpath(".//pic:pic")
if not img:
return None
try:
img = img[0]
embed = img.xpath('.//a:blip/@r:embed')[0]
embed = img.xpath(".//a:blip/@r:embed")[0]
related_part = document.part.related_parts[embed]
image = related_part.image
if image is not None:
@@ -111,7 +103,7 @@ class Docx(DocxParser):

new_width = max(width1, width2)
new_height = height1 + height2
new_image = Image.new('RGB', (new_width, new_height))
new_image = Image.new("RGB", (new_width, new_height))

new_image.paste(img1, (0, 0))
new_image.paste(img2, (0, height1))
@@ -119,8 +111,7 @@ class Docx(DocxParser):
return new_image

def __call__(self, filename, binary=None, from_page=0, to_page=100000, callback=None):
self.doc = Document(
filename) if not binary else Document(BytesIO(binary))
self.doc = Document(filename) if not binary else Document(BytesIO(binary))
pn = 0
last_answer, last_image = "", None
question_stack, level_stack = [], []
@@ -128,19 +119,19 @@ class Docx(DocxParser):
for p in self.doc.paragraphs:
if pn > to_page:
break
question_level, p_text = 0, ''
question_level, p_text = 0, ""
if from_page <= pn < to_page and p.text.strip():
question_level, p_text = docx_question_level(p)
if not question_level or question_level > 6: # not a question
last_answer = f'{last_answer}\n{p_text}'
last_answer = f"{last_answer}\n{p_text}"
current_image = self.get_picture(self.doc, p)
last_image = self.concat_img(last_image, current_image)
else: # is a question
if last_answer or last_image:
sum_question = '\n'.join(question_stack)
sum_question = "\n".join(question_stack)
if sum_question:
ti_list.append((f'{sum_question}\n{last_answer}', last_image))
last_answer, last_image = '', None
ti_list.append((f"{sum_question}\n{last_answer}", last_image))
last_answer, last_image = "", None

i = question_level
while question_stack and i <= level_stack[-1]:
@@ -149,15 +140,15 @@ class Docx(DocxParser):
question_stack.append(p_text)
level_stack.append(question_level)
for run in p.runs:
if 'lastRenderedPageBreak' in run._element.xml:
if "lastRenderedPageBreak" in run._element.xml:
pn += 1
continue
if 'w:br' in run._element.xml and 'type="page"' in run._element.xml:
if "w:br" in run._element.xml and 'type="page"' in run._element.xml:
pn += 1
if last_answer:
sum_question = '\n'.join(question_stack)
sum_question = "\n".join(question_stack)
if sum_question:
ti_list.append((f'{sum_question}\n{last_answer}', last_image))
ti_list.append((f"{sum_question}\n{last_answer}", last_image))

tbls = []
for tb in self.doc.tables:
@@ -182,26 +173,19 @@ class Docx(DocxParser):
return ti_list, tbls


def chunk(filename, binary=None, from_page=0, to_page=100000,
lang="Chinese", callback=None, **kwargs):
def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
"""
Only pdf is supported.
Only pdf is supported.
"""
parser_config = kwargs.get(
"parser_config", {
"chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC"})
parser_config = kwargs.get("parser_config", {"chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC"})
pdf_parser = None
doc = {
"docnm_kwd": filename
}
doc = {"docnm_kwd": filename}
doc["title_tks"] = rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", doc["docnm_kwd"]))
doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
# is it English
eng = lang.lower() == "english" # pdf_parser.is_english
if re.search(r"\.pdf$", filename, re.IGNORECASE):
layout_recognizer, parser_model_name = normalize_layout_recognizer(
parser_config.get("layout_recognize", "DeepDOC")
)
layout_recognizer, parser_model_name = normalize_layout_recognizer(parser_config.get("layout_recognize", "DeepDOC"))

if isinstance(layout_recognizer, bool):
layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
@@ -222,8 +206,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
pdf_cls=Pdf,
layout_recognizer=layout_recognizer,
mineru_llm_name=parser_model_name,
paddleocr_llm_name=parser_model_name,
parse_method="manual",
**kwargs
**kwargs,
)

def _normalize_section(section):
@@ -252,7 +237,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
if not sections and not tbls:
return []

if name in ["tcadp", "docling", "mineru"]:
if name in ["tcadp", "docling", "mineru", "paddleocr"]:
parser_config["chunk_token_num"] = 0

callback(0.8, "Finish parsing.")
@@ -264,8 +249,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
for txt, _, _ in sections:
for t, lvl in pdf_parser.outlines:
tks = set([t[i] + t[i + 1] for i in range(len(t) - 1)])
tks_ = set([txt[i] + txt[i + 1]
for i in range(min(len(t), len(txt) - 1))])
tks_ = set([txt[i] + txt[i + 1] for i in range(min(len(t), len(txt) - 1))])
if len(set(tks & tks_)) / max([len(tks), len(tks_), 1]) > 0.8:
levels.append(lvl)
break
@@ -274,8 +258,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,

else:
bull = bullets_category([txt for txt, _, _ in sections])
most_level, levels = title_frequency(
bull, [(txt, lvl) for txt, lvl, _ in sections])
most_level, levels = title_frequency(bull, [(txt, lvl) for txt, lvl, _ in sections])

assert len(sections) == len(levels)
sec_ids = []
@@ -285,25 +268,21 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
sid += 1
sec_ids.append(sid)

sections = [(txt, sec_ids[i], poss)
for i, (txt, _, poss) in enumerate(sections)]
sections = [(txt, sec_ids[i], poss) for i, (txt, _, poss) in enumerate(sections)]
for (img, rows), poss in tbls:
if not rows:
continue
sections.append((rows if isinstance(rows, str) else rows[0], -1,
[(p[0] + 1 - from_page, p[1], p[2], p[3], p[4]) for p in poss]))
sections.append((rows if isinstance(rows, str) else rows[0], -1, [(p[0] + 1 - from_page, p[1], p[2], p[3], p[4]) for p in poss]))

def tag(pn, left, right, top, bottom):
if pn + left + right + top + bottom == 0:
return ""
return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##" \
.format(pn, left, right, top, bottom)
return "@@{}\t{:.1f}\t{:.1f}\t{:.1f}\t{:.1f}##".format(pn, left, right, top, bottom)

chunks = []
last_sid = -2
tk_cnt = 0
for txt, sec_id, poss in sorted(sections, key=lambda x: (
x[-1][0][0], x[-1][0][3], x[-1][0][1])):
for txt, sec_id, poss in sorted(sections, key=lambda x: (x[-1][0][0], x[-1][0][3], x[-1][0][1])):
poss = "\t".join([tag(*pos) for pos in poss])
if tk_cnt < 32 or (tk_cnt < 1024 and (sec_id == last_sid or sec_id == -1)):
if chunks:
@@ -330,14 +309,13 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,

elif re.search(r"\.docx?$", filename, re.IGNORECASE):
docx_parser = Docx()
ti_list, tbls = docx_parser(filename, binary,
from_page=0, to_page=10000, callback=callback)
ti_list, tbls = docx_parser(filename, binary, from_page=0, to_page=10000, callback=callback)
tbls = vision_figure_parser_docx_wrapper(sections=ti_list, tbls=tbls, callback=callback, **kwargs)
res = tokenize_table(tbls, doc, eng)
for text, image in ti_list:
d = copy.deepcopy(doc)
if image:
d['image'] = image
d["image"] = image
d["doc_type_kwd"] = "image"
tokenize(d, text, eng)
res.append(d)
@@ -353,9 +331,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
if __name__ == "__main__":
import sys


def dummy(prog=None, msg=""):
pass


chunk(sys.argv[1], callback=dummy)

283 rag/app/naive.py
@ -33,29 +33,32 @@ from common.token_utils import num_tokens_from_string
|
||||
from common.constants import LLMType
|
||||
from api.db.services.llm_service import LLMBundle
|
||||
from rag.utils.file_utils import extract_embed_file, extract_links_from_pdf, extract_links_from_docx, extract_html
|
||||
from deepdoc.parser import DocxParser, ExcelParser, HtmlParser, JsonParser, MarkdownElementExtractor, MarkdownParser, \
|
||||
PdfParser, TxtParser
|
||||
from deepdoc.parser.figure_parser import VisionFigureParser, vision_figure_parser_docx_wrapper_naive, \
|
||||
vision_figure_parser_pdf_wrapper
|
||||
from deepdoc.parser import DocxParser, ExcelParser, HtmlParser, JsonParser, MarkdownElementExtractor, MarkdownParser, PdfParser, TxtParser
|
||||
from deepdoc.parser.figure_parser import VisionFigureParser, vision_figure_parser_docx_wrapper_naive, vision_figure_parser_pdf_wrapper
|
||||
from deepdoc.parser.pdf_parser import PlainParser, VisionParser
|
||||
from deepdoc.parser.docling_parser import DoclingParser
|
||||
from deepdoc.parser.tcadp_parser import TCADPParser
|
||||
from common.parser_config_utils import normalize_layout_recognizer
|
||||
from rag.nlp import concat_img, find_codec, naive_merge, naive_merge_with_images, naive_merge_docx, rag_tokenizer, \
|
||||
tokenize_chunks, doc_tokenize_chunks_with_images, tokenize_table, append_context2table_image4pdf, tokenize_chunks_with_images, \
|
||||
attach_media_context # noqa: F401
|
||||
from rag.nlp import (
|
||||
concat_img,
|
||||
find_codec,
|
||||
naive_merge,
|
||||
naive_merge_with_images,
|
||||
naive_merge_docx,
|
||||
rag_tokenizer,
|
||||
tokenize_chunks,
|
||||
doc_tokenize_chunks_with_images,
|
||||
tokenize_table,
|
||||
append_context2table_image4pdf,
|
||||
tokenize_chunks_with_images,
|
||||
) # noqa: F401
|
||||
|
||||
def by_deepdoc(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls=None,
|
||||
**kwargs):
|
||||
|
||||
def by_deepdoc(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls=None, **kwargs):
|
||||
callback = callback
|
||||
binary = binary
|
||||
pdf_parser = pdf_cls() if pdf_cls else Pdf()
|
||||
sections, tables = pdf_parser(
|
||||
filename if not binary else binary,
|
||||
from_page=from_page,
|
||||
to_page=to_page,
|
||||
callback=callback
|
||||
)
|
||||
sections, tables = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page, callback=callback)
|
||||
|
||||
tables = vision_figure_parser_pdf_wrapper(
|
||||
tbls=tables,
|
||||
@ -67,17 +70,17 @@ def by_deepdoc(filename, binary=None, from_page=0, to_page=100000, lang="Chinese
|
||||
|
||||
|
||||
def by_mineru(
|
||||
filename,
|
||||
binary=None,
|
||||
from_page=0,
|
||||
to_page=100000,
|
||||
lang="Chinese",
|
||||
callback=None,
|
||||
pdf_cls=None,
|
||||
parse_method: str = "raw",
|
||||
mineru_llm_name: str | None = None,
|
||||
tenant_id: str | None = None,
|
||||
**kwargs,
|
||||
filename,
|
||||
binary=None,
|
||||
from_page=0,
|
||||
to_page=100000,
|
||||
lang="Chinese",
|
||||
callback=None,
|
||||
pdf_cls=None,
|
||||
parse_method: str = "raw",
|
||||
mineru_llm_name: str | None = None,
|
||||
tenant_id: str | None = None,
|
||||
**kwargs,
|
||||
):
|
||||
pdf_parser = None
|
||||
if tenant_id:
|
||||
@ -115,8 +118,7 @@ def by_mineru(
|
||||
return None, None, None
|
||||
|
||||
|
||||
def by_docling(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls=None,
|
||||
**kwargs):
|
||||
def by_docling(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, pdf_cls=None, **kwargs):
|
||||
pdf_parser = DoclingParser()
|
||||
parse_method = kwargs.get("parse_method", "raw")
|
||||
|
||||
@ -130,7 +132,7 @@ def by_docling(filename, binary=None, from_page=0, to_page=100000, lang="Chinese
|
||||
callback=callback,
|
||||
output_dir=os.environ.get("MINERU_OUTPUT_DIR", ""),
|
||||
delete_output=bool(int(os.environ.get("MINERU_DELETE_OUTPUT", 1))),
|
||||
parse_method=parse_method
|
||||
parse_method=parse_method,
|
||||
)
|
||||
return sections, tables, pdf_parser
|
||||
|
||||
@ -142,16 +144,60 @@ def by_tcadp(filename, binary=None, from_page=0, to_page=100000, lang="Chinese",
|
||||
callback(-1, "TCADP parser not available. Please check Tencent Cloud API configuration.")
|
||||
return None, None, tcadp_parser
|
||||
|
||||
sections, tables = tcadp_parser.parse_pdf(
|
||||
filepath=filename,
|
||||
binary=binary,
|
||||
callback=callback,
|
||||
output_dir=os.environ.get("TCADP_OUTPUT_DIR", ""),
|
||||
file_type="PDF"
|
||||
)
|
||||
sections, tables = tcadp_parser.parse_pdf(filepath=filename, binary=binary, callback=callback, output_dir=os.environ.get("TCADP_OUTPUT_DIR", ""), file_type="PDF")
|
||||
return sections, tables, tcadp_parser
|
||||
|
||||
|
||||
def by_paddleocr(
    filename,
    binary=None,
    from_page=0,
    to_page=100000,
    lang="Chinese",
    callback=None,
    pdf_cls=None,
    parse_method: str = "raw",
    paddleocr_llm_name: str | None = None,
    tenant_id: str | None = None,
    **kwargs,
):
    pdf_parser = None
    if tenant_id:
        if not paddleocr_llm_name:
            try:
                from api.db.services.tenant_llm_service import TenantLLMService

                env_name = TenantLLMService.ensure_paddleocr_from_env(tenant_id)
                candidates = TenantLLMService.query(tenant_id=tenant_id, llm_factory="PaddleOCR", model_type=LLMType.OCR)
                if candidates:
                    paddleocr_llm_name = candidates[0].llm_name
                elif env_name:
                    paddleocr_llm_name = env_name
            except Exception as e:  # best-effort fallback
                logging.warning(f"fallback to env paddleocr: {e}")

        if paddleocr_llm_name:
            try:
                ocr_model = LLMBundle(tenant_id=tenant_id, llm_type=LLMType.OCR, llm_name=paddleocr_llm_name, lang=lang)
                pdf_parser = ocr_model.mdl
                sections, tables = pdf_parser.parse_pdf(
                    filepath=filename,
                    binary=binary,
                    callback=callback,
                    parse_method=parse_method,
                    **kwargs,
                )
                return sections, tables, pdf_parser
            except Exception as e:
                logging.error(f"Failed to parse pdf via LLMBundle PaddleOCR ({paddleocr_llm_name}): {e}")

            return None, None, None

    if callback:
        callback(-1, "PaddleOCR not found.")
    return None, None, None
|
||||
|
||||
|
||||
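A hedged usage sketch of the new entry point above (file name, tenant id, and callback are made up for illustration; it returns (None, None, None) when no PaddleOCR model can be resolved):

def _noop(prog=None, msg=""):
    pass

sections, tables, parser = by_paddleocr(
    "example.pdf",                       # hypothetical file
    binary=open("example.pdf", "rb").read(),
    tenant_id="tenant-123",              # hypothetical tenant id
    parse_method="manual",
    callback=_noop,
)
if parser is None:
    print("PaddleOCR is not configured for this tenant")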
def by_plaintext(filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
|
||||
layout_recognizer = (kwargs.get("layout_recognizer") or "").strip()
|
||||
if (not layout_recognizer) or (layout_recognizer == "Plain Text"):
|
||||
@ -168,12 +214,7 @@ def by_plaintext(filename, binary=None, from_page=0, to_page=100000, callback=No
|
||||
)
|
||||
pdf_parser = VisionParser(vision_model=vision_model, **kwargs)
|
||||
|
||||
sections, tables = pdf_parser(
|
||||
filename if not binary else binary,
|
||||
from_page=from_page,
|
||||
to_page=to_page,
|
||||
callback=callback
|
||||
)
|
||||
sections, tables = pdf_parser(filename if not binary else binary, from_page=from_page, to_page=to_page, callback=callback)
|
||||
return sections, tables, pdf_parser
|
||||
|
||||
|
||||
@ -182,6 +223,7 @@ PARSERS = {
|
||||
"mineru": by_mineru,
|
||||
"docling": by_docling,
|
||||
"tcadp": by_tcadp,
|
||||
"paddleocr": by_paddleocr,
|
||||
"plaintext": by_plaintext, # default
|
||||
}
|
||||
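The dispatch that consumes this registry is not shown in this hunk; a minimal sketch of the assumed lookup (the real call sites, e.g. in manual.py above, also pass pdf_cls plus mineru_llm_name/paddleocr_llm_name):

by_parser = PARSERS.get(name, by_plaintext)  # `name` is the normalized recognizer key seen elsewhere in chunk()
sections, tables, pdf_parser = by_parser(
    filename,
    binary=binary,
    from_page=from_page,
    to_page=to_page,
    lang=lang,
    callback=callback,
    layout_recognizer=layout_recognizer,
    paddleocr_llm_name=parser_model_name,
    **kwargs,
)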
|
||||
@ -191,12 +233,12 @@ class Docx(DocxParser):
|
||||
pass
|
||||
|
||||
def get_picture(self, document, paragraph):
|
||||
imgs = paragraph._element.xpath('.//pic:pic')
|
||||
imgs = paragraph._element.xpath(".//pic:pic")
|
||||
if not imgs:
|
||||
return None
|
||||
res_img = None
|
||||
for img in imgs:
|
||||
embed = img.xpath('.//a:blip/@r:embed')
|
||||
embed = img.xpath(".//a:blip/@r:embed")
|
||||
if not embed:
|
||||
continue
|
||||
embed = embed[0]
|
||||
@ -219,7 +261,7 @@ class Docx(DocxParser):
|
||||
logging.warning(f"The recognized image stream appears to be corrupted. Skipping image, exception: {e}")
|
||||
continue
|
||||
try:
|
||||
image = Image.open(BytesIO(image_blob)).convert('RGB')
|
||||
image = Image.open(BytesIO(image_blob)).convert("RGB")
|
||||
if res_img is None:
|
||||
res_img = image
|
||||
else:
|
||||
@ -251,11 +293,11 @@ class Docx(DocxParser):
|
||||
try:
|
||||
# Iterate through all paragraphs and tables in document order
|
||||
for i, block in enumerate(self.doc._element.body):
|
||||
if block.tag.endswith('p'): # Paragraph
|
||||
if block.tag.endswith("p"): # Paragraph
|
||||
p = Paragraph(block, self.doc)
|
||||
blocks.append(('p', i, p))
|
||||
elif block.tag.endswith('tbl'): # Table
|
||||
blocks.append(('t', i, None)) # Table object will be retrieved later
|
||||
blocks.append(("p", i, p))
|
||||
elif block.tag.endswith("tbl"): # Table
|
||||
blocks.append(("t", i, None)) # Table object will be retrieved later
|
||||
except Exception as e:
|
||||
logging.error(f"Error collecting blocks: {e}")
|
||||
return ""
|
||||
@ -264,7 +306,7 @@ class Docx(DocxParser):
|
||||
target_table_pos = -1
|
||||
table_count = 0
|
||||
for i, (block_type, pos, _) in enumerate(blocks):
|
||||
if block_type == 't':
|
||||
if block_type == "t":
|
||||
if table_count == table_index:
|
||||
target_table_pos = pos
|
||||
break
|
||||
@ -280,7 +322,7 @@ class Docx(DocxParser):
|
||||
if pos >= target_table_pos: # Skip blocks after the table
|
||||
continue
|
||||
|
||||
if block_type != 'p':
|
||||
if block_type != "p":
|
||||
continue
|
||||
|
||||
if block.style and block.style.name and re.search(r"Heading\s*(\d+)", block.style.name, re.I):
|
||||
@ -309,7 +351,7 @@ class Docx(DocxParser):
|
||||
if pos >= target_table_pos: # Skip blocks after the table
|
||||
continue
|
||||
|
||||
if block_type != 'p':
|
||||
if block_type != "p":
|
||||
continue
|
||||
|
||||
if block.style and re.search(r"Heading\s*(\d+)", block.style.name, re.I):
|
||||
@ -340,8 +382,7 @@ class Docx(DocxParser):
|
||||
return ""
|
||||
|
||||
def __call__(self, filename, binary=None, from_page=0, to_page=100000):
|
||||
self.doc = Document(
|
||||
filename) if not binary else Document(BytesIO(binary))
|
||||
self.doc = Document(filename) if not binary else Document(BytesIO(binary))
|
||||
pn = 0
|
||||
lines = []
|
||||
last_image = None
|
||||
@ -357,7 +398,7 @@ class Docx(DocxParser):
|
||||
if pn > to_page:
|
||||
break
|
||||
|
||||
if block.tag.endswith('p'):
|
||||
if block.tag.endswith("p"):
|
||||
p = Paragraph(block, self.doc)
|
||||
|
||||
if from_page <= pn < to_page:
|
||||
@ -417,7 +458,7 @@ class Docx(DocxParser):
|
||||
if "w:br" in xml and 'type="page"' in xml:
|
||||
pn += 1
|
||||
|
||||
elif block.tag.endswith('tbl'):
|
||||
elif block.tag.endswith("tbl"):
|
||||
if pn < from_page or pn > to_page:
|
||||
table_idx += 1
|
||||
continue
|
||||
@ -455,7 +496,6 @@ class Docx(DocxParser):
|
||||
|
||||
return new_line
|
||||
|
||||
|
||||
def to_markdown(self, filename=None, binary=None, inline_images: bool = True):
|
||||
"""
|
||||
This function uses mammoth, licensed under the BSD 2-Clause License.
|
||||
@ -486,8 +526,7 @@ class Docx(DocxParser):
|
||||
|
||||
try:
|
||||
if inline_images:
|
||||
result = mammoth.convert_to_html(docx_file,
|
||||
convert_image=mammoth.images.img_element(_convert_image_to_base64))
|
||||
result = mammoth.convert_to_html(docx_file, convert_image=mammoth.images.img_element(_convert_image_to_base64))
|
||||
else:
|
||||
result = mammoth.convert_to_html(docx_file)
|
||||
|
||||
@ -505,18 +544,11 @@ class Pdf(PdfParser):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
|
||||
def __call__(self, filename, binary=None, from_page=0,
|
||||
to_page=100000, zoomin=3, callback=None, separate_tables_figures=False):
|
||||
def __call__(self, filename, binary=None, from_page=0, to_page=100000, zoomin=3, callback=None, separate_tables_figures=False):
|
||||
start = timer()
|
||||
first_start = start
|
||||
callback(msg="OCR started")
|
||||
self.__images__(
|
||||
filename if not binary else binary,
|
||||
zoomin,
|
||||
from_page,
|
||||
to_page,
|
||||
callback
|
||||
)
|
||||
self.__images__(filename if not binary else binary, zoomin, from_page, to_page, callback)
|
||||
callback(msg="OCR finished ({:.2f}s)".format(timer() - start))
|
||||
logging.info("OCR({}~{}): {:.2f}s".format(from_page, to_page, timer() - start))
|
||||
|
||||
@ -559,13 +591,14 @@ class Markdown(MarkdownParser):
|
||||
return []
|
||||
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
html_content = markdown(text)
|
||||
soup = BeautifulSoup(html_content, 'html.parser')
|
||||
soup = BeautifulSoup(html_content, "html.parser")
|
||||
return soup
|
||||
|
||||
def get_hyperlink_urls(self, soup):
|
||||
if soup:
|
||||
return set([a.get('href') for a in soup.find_all('a') if a.get('href')])
|
||||
return set([a.get("href") for a in soup.find_all("a") if a.get("href")])
|
||||
return []
|
||||
|
||||
def extract_image_urls_with_lines(self, text):
|
||||
@ -588,10 +621,10 @@ class Markdown(MarkdownParser):
|
||||
try:
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
soup = BeautifulSoup(text, 'html.parser')
|
||||
soup = BeautifulSoup(text, "html.parser")
|
||||
newline_offsets = [m.start() for m in re.finditer(r"\n", text)] + [len(text)]
|
||||
for img_tag in soup.find_all('img'):
|
||||
src = img_tag.get('src')
|
||||
for img_tag in soup.find_all("img"):
|
||||
src = img_tag.get("src")
|
||||
if not src:
|
||||
continue
|
||||
|
||||
@ -627,14 +660,14 @@ class Markdown(MarkdownParser):
|
||||
continue
|
||||
img_obj = None
|
||||
try:
|
||||
if url.startswith(('http://', 'https://')):
|
||||
if url.startswith(("http://", "https://")):
|
||||
response = requests.get(url, stream=True, timeout=30)
|
||||
if response.status_code == 200 and response.headers.get('Content-Type', '').startswith('image/'):
|
||||
img_obj = Image.open(BytesIO(response.content)).convert('RGB')
|
||||
if response.status_code == 200 and response.headers.get("Content-Type", "").startswith("image/"):
|
||||
img_obj = Image.open(BytesIO(response.content)).convert("RGB")
|
||||
else:
|
||||
local_path = Path(url)
|
||||
if local_path.exists():
|
||||
img_obj = Image.open(url).convert('RGB')
|
||||
img_obj = Image.open(url).convert("RGB")
|
||||
else:
|
||||
logging.warning(f"Local image file not found: {url}")
|
||||
except Exception as e:
|
||||
@ -652,7 +685,7 @@ class Markdown(MarkdownParser):
|
||||
with open(filename, "r") as f:
|
||||
txt = f.read()
|
||||
|
||||
remainder, tables = self.extract_tables_and_remainder(f'{txt}\n', separate_tables=separate_tables)
|
||||
remainder, tables = self.extract_tables_and_remainder(f"{txt}\n", separate_tables=separate_tables)
|
||||
# To eliminate duplicate tables in chunking result, uncomment code below and set separate_tables to True in line 410.
|
||||
# extractor = MarkdownElementExtractor(remainder)
|
||||
extractor = MarkdownElementExtractor(txt)
|
||||
@ -678,7 +711,7 @@ class Markdown(MarkdownParser):
|
||||
|
||||
tbls = []
|
||||
for table in tables:
|
||||
tbls.append(((None, markdown(table, extensions=['markdown.extensions.tables'])), ""))
|
||||
tbls.append(((None, markdown(table, extensions=["markdown.extensions.tables"])), ""))
|
||||
if return_section_images:
|
||||
return sections, tbls, section_images
|
||||
return sections, tbls
|
||||
@ -694,7 +727,7 @@ def load_from_xml_v2(baseURI, rels_item_xml):
|
||||
if rels_item_xml is not None:
|
||||
rels_elm = parse_xml(rels_item_xml)
|
||||
for rel_elm in rels_elm.Relationship_lst:
|
||||
if rel_elm.target_ref in ('../NULL', 'NULL'):
|
||||
if rel_elm.target_ref in ("../NULL", "NULL"):
|
||||
continue
|
||||
srels._srels.append(_SerializedRelationship(baseURI, rel_elm))
|
||||
return srels
|
||||
@ -702,21 +735,18 @@ def load_from_xml_v2(baseURI, rels_item_xml):
|
||||
|
||||
def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
|
||||
"""
|
||||
Supported file formats are docx, pdf, excel, txt.
|
||||
This method apply the naive ways to chunk files.
|
||||
Successive text will be sliced into pieces using 'delimiter'.
|
||||
Next, these successive pieces are merge into chunks whose token number is no more than 'Max token number'.
|
||||
Supported file formats are docx, pdf, excel, txt.
|
||||
This method apply the naive ways to chunk files.
|
||||
Successive text will be sliced into pieces using 'delimiter'.
|
||||
Next, these successive pieces are merge into chunks whose token number is no more than 'Max token number'.
|
||||
"""
|
||||
urls = set()
|
||||
url_res = []
|
||||
|
||||
is_english = lang.lower() == "english" # is_english(cks)
|
||||
parser_config = kwargs.get(
|
||||
"parser_config", {
|
||||
"chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC", "analyze_hyperlink": True})
|
||||
parser_config = kwargs.get("parser_config", {"chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC", "analyze_hyperlink": True})
|
||||
|
||||
child_deli = (parser_config.get("children_delimiter") or "").encode('utf-8').decode('unicode_escape').encode(
|
||||
'latin1').decode('utf-8')
|
||||
child_deli = (parser_config.get("children_delimiter") or "").encode("utf-8").decode("unicode_escape").encode("latin1").decode("utf-8")
|
||||
cust_child_deli = re.findall(r"`([^`]+)`", child_deli)
|
||||
child_deli = "|".join(re.sub(r"`([^`]+)`", "", child_deli))
|
||||
if cust_child_deli:
|
||||
@ -728,10 +758,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
|
||||
table_context_size = max(0, int(parser_config.get("table_context_size", 0) or 0))
|
||||
image_context_size = max(0, int(parser_config.get("image_context_size", 0) or 0))
|
||||
|
||||
doc = {
|
||||
"docnm_kwd": filename,
|
||||
"title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
|
||||
}
|
||||
doc = {"docnm_kwd": filename, "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))}
|
||||
doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
|
||||
res = []
|
||||
pdf_parser = None
|
||||
@ -750,8 +777,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
|
||||
# Recursively chunk each embedded file and collect results
|
||||
for embed_filename, embed_bytes in embeds:
|
||||
try:
|
||||
sub_res = chunk(embed_filename, binary=embed_bytes, lang=lang, callback=callback, is_root=False,
|
||||
**kwargs) or []
|
||||
sub_res = chunk(embed_filename, binary=embed_bytes, lang=lang, callback=callback, is_root=False, **kwargs) or []
|
||||
embed_res.extend(sub_res)
|
||||
except Exception as e:
|
||||
error_msg = f"Failed to chunk embed {embed_filename}: {e}"
|
||||
@ -772,8 +798,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
|
||||
sub_url_res = chunk(url, html_bytes, callback=callback, lang=lang, is_root=False, **kwargs)
|
||||
except Exception as e:
|
||||
logging.info(f"Failed to chunk url in registered file type {url}: {e}")
|
||||
sub_url_res = chunk(f"{index}.html", html_bytes, callback=callback, lang=lang, is_root=False,
|
||||
**kwargs)
|
||||
sub_url_res = chunk(f"{index}.html", html_bytes, callback=callback, lang=lang, is_root=False, **kwargs)
|
||||
url_res.extend(sub_url_res)
|
||||
|
||||
# fix "There is no item named 'word/NULL' in the archive", referring to https://github.com/python-openxml/python-docx/issues/1105#issuecomment-1298075246
|
||||
@ -784,11 +809,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
|
||||
|
||||
# chunks list[dict]
|
||||
# images list - index of image chunk in chunks
|
||||
chunks, images = naive_merge_docx(
|
||||
sections, int(parser_config.get(
|
||||
"chunk_token_num", 128)), parser_config.get(
|
||||
"delimiter", "\n!?。;!?"), table_context_size, image_context_size)
|
||||
|
||||
chunks, images = naive_merge_docx(sections, int(parser_config.get("chunk_token_num", 128)), parser_config.get("delimiter", "\n!?。;!?"), table_context_size, image_context_size)
|
||||
|
||||
vision_figure_parser_docx_wrapper_naive(chunks=chunks, idx_lst=images, callback=callback, **kwargs)
|
||||
|
||||
callback(0.8, "Finish parsing.")
|
||||
@ -801,9 +823,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
|
||||
return res
|
||||
|
||||
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
|
||||
layout_recognizer, parser_model_name = normalize_layout_recognizer(
|
||||
parser_config.get("layout_recognize", "DeepDOC")
|
||||
)
|
||||
layout_recognizer, parser_model_name = normalize_layout_recognizer(parser_config.get("layout_recognize", "DeepDOC"))
|
||||
|
||||
if parser_config.get("analyze_hyperlink", False) and is_root:
|
||||
urls = extract_links_from_pdf(binary)
|
||||
@ -824,7 +844,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
|
||||
callback=callback,
|
||||
layout_recognizer=layout_recognizer,
|
||||
mineru_llm_name=parser_model_name,
|
||||
**kwargs
|
||||
paddleocr_llm_name=parser_model_name,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
if not sections and not tables:
|
||||
@ -833,7 +854,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
|
||||
if table_context_size or image_context_size:
|
||||
tables = append_context2table_image4pdf(sections, tables, image_context_size)
|
||||
|
||||
if name in ["tcadp", "docling", "mineru"]:
|
||||
if name in ["tcadp", "docling", "mineru", "paddleocr"]:
|
||||
parser_config["chunk_token_num"] = 0
|
||||
|
||||
res = tokenize_table(tables, doc, is_english)
|
||||
@ -847,10 +868,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
|
||||
if layout_recognizer == "TCADP Parser":
|
||||
table_result_type = parser_config.get("table_result_type", "1")
|
||||
markdown_image_response_type = parser_config.get("markdown_image_response_type", "1")
|
||||
tcadp_parser = TCADPParser(
|
||||
table_result_type=table_result_type,
|
||||
markdown_image_response_type=markdown_image_response_type
|
||||
)
|
||||
tcadp_parser = TCADPParser(table_result_type=table_result_type, markdown_image_response_type=markdown_image_response_type)
|
||||
if not tcadp_parser.check_installation():
|
||||
callback(-1, "TCADP parser not available. Please check Tencent Cloud API configuration.")
|
||||
return res
|
||||
@ -858,13 +876,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
|
||||
# Determine file type based on extension
|
||||
file_type = "XLSX" if re.search(r"\.xlsx?$", filename, re.IGNORECASE) else "CSV"
|
||||
|
||||
sections, tables = tcadp_parser.parse_pdf(
|
||||
filepath=filename,
|
||||
binary=binary,
|
||||
callback=callback,
|
||||
output_dir=os.environ.get("TCADP_OUTPUT_DIR", ""),
|
||||
file_type=file_type
|
||||
)
|
||||
sections, tables = tcadp_parser.parse_pdf(filepath=filename, binary=binary, callback=callback, output_dir=os.environ.get("TCADP_OUTPUT_DIR", ""), file_type=file_type)
|
||||
parser_config["chunk_token_num"] = 0
|
||||
res = tokenize_table(tables, doc, is_english)
|
||||
callback(0.8, "Finish parsing.")
|
||||
@ -879,9 +891,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
|
||||
|
||||
elif re.search(r"\.(txt|py|js|java|c|cpp|h|php|go|ts|sh|cs|kt|sql)$", filename, re.IGNORECASE):
|
||||
callback(0.1, "Start to parse.")
|
||||
sections = TxtParser()(filename, binary,
|
||||
parser_config.get("chunk_token_num", 128),
|
||||
parser_config.get("delimiter", "\n!?;。;!?"))
|
||||
sections = TxtParser()(filename, binary, parser_config.get("chunk_token_num", 128), parser_config.get("delimiter", "\n!?;。;!?"))
|
||||
callback(0.8, "Finish parsing.")
|
||||
|
||||
elif re.search(r"\.(md|markdown|mdx)$", filename, re.IGNORECASE):
|
||||
@ -919,11 +929,9 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
|
||||
else:
|
||||
section_images = [None] * len(sections)
|
||||
section_images[idx] = combined_image
|
||||
markdown_vision_parser = VisionFigureParser(vision_model=vision_model, figures_data=[
|
||||
((combined_image, ["markdown image"]), [(0, 0, 0, 0, 0)])], **kwargs)
|
||||
markdown_vision_parser = VisionFigureParser(vision_model=vision_model, figures_data=[((combined_image, ["markdown image"]), [(0, 0, 0, 0, 0)])], **kwargs)
|
||||
boosted_figures = markdown_vision_parser(callback=callback)
|
||||
sections[idx] = (section_text + "\n\n" + "\n\n".join([fig[0][1] for fig in boosted_figures]),
|
||||
sections[idx][1])
|
||||
sections[idx] = (section_text + "\n\n" + "\n\n".join([fig[0][1] for fig in boosted_figures]), sections[idx][1])
|
||||
|
||||
else:
|
||||
logging.warning("No visual model detected. Skipping figure parsing enhancement.")
|
||||
@ -962,8 +970,8 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
|
||||
|
||||
binary = BytesIO(binary)
|
||||
doc_parsed = tika_parser.from_buffer(binary)
|
||||
if doc_parsed.get('content', None) is not None:
|
||||
sections = doc_parsed['content'].split('\n')
|
||||
if doc_parsed.get("content", None) is not None:
|
||||
sections = doc_parsed["content"].split("\n")
|
||||
sections = [(_, "") for _ in sections if _]
|
||||
callback(0.8, "Finish parsing.")
|
||||
else:
|
||||
@ -972,8 +980,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
|
||||
logging.warning(error_msg)
|
||||
return []
|
||||
else:
|
||||
raise NotImplementedError(
|
||||
"file type not supported yet(pdf, xlsx, doc, docx, txt supported)")
|
||||
raise NotImplementedError("file type not supported yet(pdf, xlsx, doc, docx, txt supported)")
|
||||
|
||||
st = timer()
|
||||
if is_markdown:
|
||||
@ -1021,8 +1028,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
|
||||
has_images = merged_images and any(img is not None for img in merged_images)
|
||||
|
||||
if has_images:
|
||||
res.extend(tokenize_chunks_with_images(chunks, doc, is_english, merged_images,
|
||||
child_delimiters_pattern=child_deli))
|
||||
res.extend(tokenize_chunks_with_images(chunks, doc, is_english, merged_images, child_delimiters_pattern=child_deli))
|
||||
else:
|
||||
res.extend(tokenize_chunks(chunks, doc, is_english, pdf_parser, child_delimiters_pattern=child_deli))
|
||||
else:
|
||||
@ -1031,17 +1037,10 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
|
||||
section_images = None
|
||||
|
||||
if section_images:
|
||||
chunks, images = naive_merge_with_images(sections, section_images,
|
||||
int(parser_config.get(
|
||||
"chunk_token_num", 128)), parser_config.get(
|
||||
"delimiter", "\n!?。;!?"))
|
||||
res.extend(
|
||||
tokenize_chunks_with_images(chunks, doc, is_english, images, child_delimiters_pattern=child_deli))
|
||||
chunks, images = naive_merge_with_images(sections, section_images, int(parser_config.get("chunk_token_num", 128)), parser_config.get("delimiter", "\n!?。;!?"))
|
||||
res.extend(tokenize_chunks_with_images(chunks, doc, is_english, images, child_delimiters_pattern=child_deli))
|
||||
else:
|
||||
chunks = naive_merge(
|
||||
sections, int(parser_config.get(
|
||||
"chunk_token_num", 128)), parser_config.get(
|
||||
"delimiter", "\n!?。;!?"))
|
||||
chunks = naive_merge(sections, int(parser_config.get("chunk_token_num", 128)), parser_config.get("delimiter", "\n!?。;!?"))
|
||||
|
||||
res.extend(tokenize_chunks(chunks, doc, is_english, pdf_parser, child_delimiters_pattern=child_deli))
|
||||
|
||||
@ -1071,9 +1070,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", ca
|
||||
if __name__ == "__main__":
|
||||
import sys
|
||||
|
||||
|
||||
def dummy(prog=None, msg=""):
|
||||
pass
|
||||
|
||||
|
||||
chunk(sys.argv[1], from_page=0, to_page=10, callback=dummy)
|
||||
|
||||
@ -28,18 +28,12 @@ from common.parser_config_utils import normalize_layout_recognizer
|
||||
|
||||
|
||||
class Pdf(PdfParser):
|
||||
def __call__(self, filename, binary=None, from_page=0,
|
||||
to_page=100000, zoomin=3, callback=None):
|
||||
def __call__(self, filename, binary=None, from_page=0, to_page=100000, zoomin=3, callback=None):
|
||||
from timeit import default_timer as timer
|
||||
|
||||
start = timer()
|
||||
callback(msg="OCR started")
|
||||
self.__images__(
|
||||
filename if not binary else binary,
|
||||
zoomin,
|
||||
from_page,
|
||||
to_page,
|
||||
callback
|
||||
)
|
||||
self.__images__(filename if not binary else binary, zoomin, from_page, to_page, callback)
|
||||
callback(msg="OCR finished ({:.2f}s)".format(timer() - start))
|
||||
|
||||
start = timer()
|
||||
@ -57,21 +51,16 @@ class Pdf(PdfParser):
|
||||
tbls = self._extract_table_figure(True, zoomin, True, True)
|
||||
self._concat_downward()
|
||||
|
||||
sections = [(b["text"], self.get_position(b, zoomin))
|
||||
for i, b in enumerate(self.boxes)]
|
||||
return [(txt, "") for txt, _ in sorted(sections, key=lambda x: (
|
||||
x[-1][0][0], x[-1][0][3], x[-1][0][1]))], tbls
|
||||
sections = [(b["text"], self.get_position(b, zoomin)) for i, b in enumerate(self.boxes)]
|
||||
return [(txt, "") for txt, _ in sorted(sections, key=lambda x: (x[-1][0][0], x[-1][0][3], x[-1][0][1]))], tbls
|
||||
|
||||
|
||||
def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
lang="Chinese", callback=None, **kwargs):
|
||||
def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, **kwargs):
|
||||
"""
|
||||
Supported file formats are docx, pdf, excel, txt.
|
||||
One file forms a chunk which maintains original text order.
|
||||
Supported file formats are docx, pdf, excel, txt.
|
||||
One file forms a chunk which maintains original text order.
|
||||
"""
|
||||
parser_config = kwargs.get(
|
||||
"parser_config", {
|
||||
"chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC"})
|
||||
parser_config = kwargs.get("parser_config", {"chunk_token_num": 512, "delimiter": "\n!?。;!?", "layout_recognize": "DeepDOC"})
|
||||
eng = lang.lower() == "english" # is_english(cks)
|
||||
|
||||
if re.search(r"\.docx$", filename, re.IGNORECASE):
|
||||
@ -99,9 +88,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
callback(0.8, "Finish parsing.")
|
||||
|
||||
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
|
||||
layout_recognizer, parser_model_name = normalize_layout_recognizer(
|
||||
parser_config.get("layout_recognize", "DeepDOC")
|
||||
)
|
||||
layout_recognizer, parser_model_name = normalize_layout_recognizer(parser_config.get("layout_recognize", "DeepDOC"))
|
||||
|
||||
if isinstance(layout_recognizer, bool):
|
||||
layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
|
||||
@ -120,13 +107,14 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
pdf_cls=Pdf,
|
||||
layout_recognizer=layout_recognizer,
|
||||
mineru_llm_name=parser_model_name,
|
||||
**kwargs
|
||||
paddleocr_llm_name=parser_model_name,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
if not sections and not tbls:
|
||||
return []
|
||||
|
||||
if name in ["tcadp", "docling", "mineru"]:
|
||||
if name in ["tcadp", "docling", "mineru", "paddleocr"]:
|
||||
parser_config["chunk_token_num"] = 0
|
||||
|
||||
callback(0.8, "Finish parsing.")
|
||||
@ -134,8 +122,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
for (img, rows), poss in tbls:
|
||||
if not rows:
|
||||
continue
|
||||
sections.append((rows if isinstance(rows, str) else rows[0],
|
||||
[(p[0] + 1 - from_page, p[1], p[2], p[3], p[4]) for p in poss]))
|
||||
sections.append((rows if isinstance(rows, str) else rows[0], [(p[0] + 1 - from_page, p[1], p[2], p[3], p[4]) for p in poss]))
|
||||
sections = [s for s, _ in sections if s]
|
||||
|
||||
elif re.search(r"\.xlsx?$", filename, re.IGNORECASE):
|
||||
@ -167,19 +154,15 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
|
||||
binary = BytesIO(binary)
|
||||
doc_parsed = tika_parser.from_buffer(binary)
|
||||
if doc_parsed.get('content', None) is not None:
|
||||
sections = doc_parsed['content'].split('\n')
|
||||
if doc_parsed.get("content", None) is not None:
|
||||
sections = doc_parsed["content"].split("\n")
|
||||
sections = [s for s in sections if s]
|
||||
callback(0.8, "Finish parsing.")
|
||||
|
||||
else:
|
||||
raise NotImplementedError(
|
||||
"file type not supported yet(doc, docx, pdf, txt supported)")
|
||||
raise NotImplementedError("file type not supported yet(doc, docx, pdf, txt supported)")
|
||||
|
||||
doc = {
|
||||
"docnm_kwd": filename,
|
||||
"title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))
|
||||
}
|
||||
doc = {"docnm_kwd": filename, "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))}
|
||||
doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
|
||||
tokenize(doc, "\n".join(sections), eng)
|
||||
return [doc]
|
||||
@ -188,9 +171,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
if __name__ == "__main__":
|
||||
import sys
|
||||
|
||||
|
||||
def dummy(prog=None, msg=""):
|
||||
pass
|
||||
|
||||
|
||||
chunk(sys.argv[1], from_page=0, to_page=10, callback=dummy)
|
||||
|
||||
@ -36,22 +36,18 @@ class Ppt(PptParser):
|
||||
callback(0.5, "Text extraction finished.")
|
||||
import aspose.slides as slides
|
||||
import aspose.pydrawing as drawing
|
||||
|
||||
imgs = []
|
||||
with slides.Presentation(BytesIO(fnm)) as presentation:
|
||||
for i, slide in enumerate(presentation.slides[from_page: to_page]):
|
||||
for i, slide in enumerate(presentation.slides[from_page:to_page]):
|
||||
try:
|
||||
with BytesIO() as buffered:
|
||||
slide.get_thumbnail(
|
||||
0.1, 0.1).save(
|
||||
buffered, drawing.imaging.ImageFormat.jpeg)
|
||||
slide.get_thumbnail(0.1, 0.1).save(buffered, drawing.imaging.ImageFormat.jpeg)
|
||||
buffered.seek(0)
|
||||
imgs.append(Image.open(buffered).copy())
|
||||
except RuntimeError as e:
|
||||
raise RuntimeError(
|
||||
f'ppt parse error at page {i + 1}, original error: {str(e)}') from e
|
||||
assert len(imgs) == len(
|
||||
txts), "Slides text and image do not match: {} vs. {}".format(
|
||||
len(imgs), len(txts))
|
||||
raise RuntimeError(f"ppt parse error at page {i + 1}, original error: {str(e)}") from e
|
||||
assert len(imgs) == len(txts), "Slides text and image do not match: {} vs. {}".format(len(imgs), len(txts))
|
||||
callback(0.9, "Image extraction finished")
|
||||
self.is_english = is_english(txts)
|
||||
return [(txts[i], imgs[i]) for i in range(len(txts))]
|
||||
@ -61,12 +57,10 @@ class Pdf(PdfParser):
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
|
||||
def __call__(self, filename, binary=None, from_page=0,
|
||||
to_page=100000, zoomin=3, callback=None, **kwargs):
|
||||
def __call__(self, filename, binary=None, from_page=0, to_page=100000, zoomin=3, callback=None, **kwargs):
|
||||
# 1. OCR
|
||||
callback(msg="OCR started")
|
||||
self.__images__(filename if not binary else binary, zoomin, from_page,
|
||||
to_page, callback)
|
||||
self.__images__(filename if not binary else binary, zoomin, from_page, to_page, callback)
|
||||
|
||||
# 2. Layout Analysis
|
||||
callback(msg="Layout Analysis")
|
||||
@ -91,12 +85,7 @@ class Pdf(PdfParser):
|
||||
global_page_num = b["page_number"] + from_page
|
||||
if not (from_page < global_page_num <= to_page + from_page):
|
||||
continue
|
||||
page_items[global_page_num].append({
|
||||
"top": b["top"],
|
||||
"x0": b["x0"],
|
||||
"text": b["text"],
|
||||
"type": "text"
|
||||
})
|
||||
page_items[global_page_num].append({"top": b["top"], "x0": b["x0"], "text": b["text"], "type": "text"})
|
||||
|
||||
# (B) Add table and figure
|
||||
for (img, content), positions in tbls:
|
||||
@ -127,12 +116,7 @@ class Pdf(PdfParser):
|
||||
top = positions[0][3]
|
||||
left = positions[0][1]
|
||||
|
||||
page_items[current_page_num].append({
|
||||
"top": top,
|
||||
"x0": left,
|
||||
"text": final_text,
|
||||
"type": "table_or_figure"
|
||||
})
|
||||
page_items[current_page_num].append({"top": top, "x0": left, "text": final_text, "type": "table_or_figure"})
|
||||
|
||||
# 7. Generate result
|
||||
res = []
|
||||
@ -153,18 +137,16 @@ class Pdf(PdfParser):
|
||||
|
||||
|
||||
class PlainPdf(PlainParser):
|
||||
def __call__(self, filename, binary=None, from_page=0,
|
||||
to_page=100000, callback=None, **kwargs):
|
||||
def __call__(self, filename, binary=None, from_page=0, to_page=100000, callback=None, **kwargs):
|
||||
self.pdf = pdf2_read(filename if not binary else BytesIO(binary))
|
||||
page_txt = []
|
||||
for page in self.pdf.pages[from_page: to_page]:
|
||||
for page in self.pdf.pages[from_page:to_page]:
|
||||
page_txt.append(page.extract_text())
|
||||
callback(0.9, "Parsing finished")
|
||||
return [(txt, None) for txt in page_txt], []
|
||||
|
||||
|
||||
def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
lang="Chinese", callback=None, parser_config=None, **kwargs):
|
||||
def chunk(filename, binary=None, from_page=0, to_page=100000, lang="Chinese", callback=None, parser_config=None, **kwargs):
|
||||
"""
|
||||
The supported file formats are pdf, pptx.
|
||||
Every page will be treated as a chunk. And the thumbnail of every page will be stored.
|
||||
@ -173,18 +155,12 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
if parser_config is None:
|
||||
parser_config = {}
|
||||
eng = lang.lower() == "english"
|
||||
doc = {
|
||||
"docnm_kwd": filename,
|
||||
"title_tks": rag_tokenizer.tokenize(
|
||||
re.sub(r"\.[a-zA-Z]+$", "", filename))
|
||||
}
|
||||
doc = {"docnm_kwd": filename, "title_tks": rag_tokenizer.tokenize(re.sub(r"\.[a-zA-Z]+$", "", filename))}
|
||||
doc["title_sm_tks"] = rag_tokenizer.fine_grained_tokenize(doc["title_tks"])
|
||||
res = []
|
||||
if re.search(r"\.pptx?$", filename, re.IGNORECASE):
|
||||
ppt_parser = Ppt()
|
||||
for pn, (txt, img) in enumerate(ppt_parser(
|
||||
filename if not binary else binary, from_page, 1000000,
|
||||
callback)):
|
||||
for pn, (txt, img) in enumerate(ppt_parser(filename if not binary else binary, from_page, 1000000, callback)):
|
||||
d = copy.deepcopy(doc)
|
||||
pn += from_page
|
||||
d["image"] = img
|
||||
@ -196,9 +172,7 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
res.append(d)
|
||||
return res
|
||||
elif re.search(r"\.pdf$", filename, re.IGNORECASE):
|
||||
layout_recognizer, parser_model_name = normalize_layout_recognizer(
|
||||
parser_config.get("layout_recognize", "DeepDOC")
|
||||
)
|
||||
layout_recognizer, parser_model_name = normalize_layout_recognizer(parser_config.get("layout_recognize", "DeepDOC"))
|
||||
|
||||
if isinstance(layout_recognizer, bool):
|
||||
layout_recognizer = "DeepDOC" if layout_recognizer else "Plain Text"
|
||||
@ -217,13 +191,14 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
pdf_cls=Pdf,
|
||||
layout_recognizer=layout_recognizer,
|
||||
mineru_llm_name=parser_model_name,
|
||||
**kwargs
|
||||
paddleocr_llm_name=parser_model_name,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
if not sections:
|
||||
return []
|
||||
|
||||
if name in ["tcadp", "docling", "mineru"]:
|
||||
if name in ["tcadp", "docling", "mineru", "paddleocr"]:
|
||||
parser_config["chunk_token_num"] = 0
|
||||
|
||||
callback(0.8, "Finish parsing.")
|
||||
@ -236,22 +211,18 @@ def chunk(filename, binary=None, from_page=0, to_page=100000,
|
||||
d["image"] = img
|
||||
d["page_num_int"] = [pn + 1]
|
||||
d["top_int"] = [0]
|
||||
d["position_int"] = [(pn + 1, 0, img.size[0] if img else 0, 0,
|
||||
img.size[1] if img else 0)]
|
||||
d["position_int"] = [(pn + 1, 0, img.size[0] if img else 0, 0, img.size[1] if img else 0)]
|
||||
tokenize(d, txt, eng)
|
||||
res.append(d)
|
||||
return res
|
||||
|
||||
raise NotImplementedError(
|
||||
"file type not supported yet(pptx, pdf supported)")
|
||||
raise NotImplementedError("file type not supported yet(pptx, pdf supported)")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import sys
|
||||
|
||||
|
||||
def dummy(a, b):
|
||||
pass
|
||||
|
||||
|
||||
chunk(sys.argv[1], callback=dummy)
|
||||
|
||||
@ -166,7 +166,7 @@ class ParserParam(ProcessParamBase):
|
||||
pdf_parse_method = pdf_config.get("parse_method", "")
|
||||
self.check_empty(pdf_parse_method, "Parse method abnormal.")
|
||||
|
||||
if pdf_parse_method.lower() not in ["deepdoc", "plain_text", "mineru", "tcadp parser"]:
|
||||
if pdf_parse_method.lower() not in ["deepdoc", "plain_text", "mineru", "tcadp parser", "paddleocr"]:
|
||||
self.check_empty(pdf_config.get("lang", ""), "PDF VLM language")
|
||||
|
||||
pdf_output_format = pdf_config.get("output_format", "")
|
||||
@ -232,6 +232,9 @@ class Parser(ProcessBase):
|
||||
if lowered.endswith("@mineru"):
|
||||
parser_model_name = raw_parse_method.rsplit("@", 1)[0]
|
||||
parse_method = "MinerU"
|
||||
elif lowered.endswith("@paddleocr"):
|
||||
parser_model_name = raw_parse_method.rsplit("@", 1)[0]
|
||||
parse_method = "PaddleOCR"
|
||||
|
||||
if parse_method.lower() == "deepdoc":
|
||||
bboxes = RAGFlowPdfParser().parse_into_bboxes(blob, callback=self.callback)
|
||||
@ -239,6 +242,7 @@ class Parser(ProcessBase):
|
||||
lines, _ = PlainParser()(blob)
|
||||
bboxes = [{"text": t} for t, _ in lines]
|
||||
elif parse_method.lower() == "mineru":
|
||||
|
||||
def resolve_mineru_llm_name():
|
||||
configured = parser_model_name or conf.get("mineru_llm_name")
|
||||
if configured:
|
||||
@ -320,6 +324,84 @@ class Parser(ProcessBase):
|
||||
bboxes.append({"text": section})
|
||||
else:
|
||||
bboxes.append({"text": section})
|
||||
elif parse_method.lower() == "paddleocr":

    def resolve_paddleocr_llm_name():
        configured = parser_model_name or conf.get("paddleocr_llm_name")
        if configured:
            return configured

        tenant_id = self._canvas._tenant_id
        if not tenant_id:
            return None

        from api.db.services.tenant_llm_service import TenantLLMService

        env_name = TenantLLMService.ensure_paddleocr_from_env(tenant_id)
        candidates = TenantLLMService.query(tenant_id=tenant_id, llm_factory="PaddleOCR", model_type=LLMType.OCR.value)
        if candidates:
            return candidates[0].llm_name
        return env_name

    parser_model_name = resolve_paddleocr_llm_name()
    if not parser_model_name:
        raise RuntimeError("PaddleOCR model not configured. Please add PaddleOCR in Model Providers or set PADDLEOCR_* env.")

    tenant_id = self._canvas._tenant_id
    ocr_model = LLMBundle(tenant_id, LLMType.OCR, llm_name=parser_model_name)
    pdf_parser = ocr_model.mdl

    lines, _ = pdf_parser.parse_pdf(
        filepath=name,
        binary=blob,
        callback=self.callback,
        parse_method=conf.get("paddleocr_parse_method", "raw"),
    )
    bboxes = []
    for section in lines:
        # PaddleOCRParser returns sections as tuple, different formats based on parse_method:
        # - "raw": (text, position_tag)
        # - "manual": (text, label, position_tag)
        # - "paper": (text_with_tag, label)
        text = section[0]

        # Parse position tag if exists
        position_tag = ""
        if len(section) > 1:
            if len(section) == 2:  # raw format: (text, tag)
                position_tag = section[1]
            elif len(section) == 3:  # manual format: (text, label, tag)
                position_tag = section[2]
            elif "paper" in conf.get("paddleocr_parse_method", "") and len(section) == 2:
                # paper format: text may contain tag
                text_with_tag = text
                import re

                tag_match = re.search(r"(@@[0-9-]+\t[0-9.\t]+##)", text_with_tag)
                if tag_match:
                    position_tag = tag_match.group(1)
                    text = text_with_tag.replace(position_tag, "").strip()

        # Extract coordinate information from position tag
        page_number, x0, x1, top, bottom = 1, 0, 0, 0, 0
        if position_tag:
            import re

            tag_match = re.match(r"@@([0-9-]+)\t([0-9.]+)\t([0-9.]+)\t([0-9.]+)\t([0-9.]+)##", position_tag)
            if tag_match:
                pn, x0_str, x1_str, top_str, bottom_str = tag_match.groups()
                page_number = int(pn.split("-")[0])  # Take first page number
                x0, x1, top, bottom = float(x0_str), float(x1_str), float(top_str), float(bottom_str)

        box = {
            "text": text,
            "page_number": page_number,
            "x0": x0,
            "x1": x1,
            "top": top,
            "bottom": bottom,
        }
        bboxes.append(box)
|
||||
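For a quick sanity check, the coordinate extraction above applied to a tag of the shape the deepdoc-style parsers emit (illustrative numbers; the code keeps only the first page of a "3-4" style range):

import re

position_tag = "@@3-4\t56.0\t480.0\t120.5\t140.0##"  # example value
m = re.match(r"@@([0-9-]+)\t([0-9.]+)\t([0-9.]+)\t([0-9.]+)\t([0-9.]+)##", position_tag)
pn, x0, x1, top, bottom = m.groups()
print(int(pn.split("-")[0]), float(x0), float(x1), float(top), float(bottom))
# -> 3 56.0 480.0 120.5 140.0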
else:
|
||||
vision_model = LLMBundle(self._canvas._tenant_id, LLMType.IMAGE2TEXT, llm_name=conf.get("parse_method"), lang=self._param.setups["pdf"].get("lang"))
|
||||
lines, _ = VisionParser(vision_model=vision_model)(blob, callback=self.callback)
|
||||
@ -802,7 +884,7 @@ class Parser(ProcessBase):
|
||||
outs = self.output()
|
||||
tasks = []
|
||||
for d in outs.get("json", []):
|
||||
tasks.append(asyncio.create_task(image2id(d,partial(settings.STORAGE_IMPL.put, tenant_id=self._canvas._tenant_id),get_uuid())))
|
||||
tasks.append(asyncio.create_task(image2id(d, partial(settings.STORAGE_IMPL.put, tenant_id=self._canvas._tenant_id), get_uuid())))
|
||||
|
||||
try:
|
||||
await asyncio.gather(*tasks, return_exceptions=False)
|
||||
|
||||
@ -19,6 +19,7 @@ import os
|
||||
from typing import Any, Optional
|
||||
|
||||
from deepdoc.parser.mineru_parser import MinerUParser
|
||||
from deepdoc.parser.paddleocr_parser import PaddleOCRParser
|
||||
|
||||
|
||||
class Base:
|
||||
@ -60,16 +61,11 @@ class MinerUOcrModel(Base, MinerUParser):
|
||||
# Redact sensitive config keys before logging
|
||||
redacted_config = {}
|
||||
for k, v in config.items():
|
||||
if any(
|
||||
sensitive_word in k.lower()
|
||||
for sensitive_word in ("key", "password", "token", "secret")
|
||||
):
|
||||
if any(sensitive_word in k.lower() for sensitive_word in ("key", "password", "token", "secret")):
|
||||
redacted_config[k] = "[REDACTED]"
|
||||
else:
|
||||
redacted_config[k] = v
|
||||
logging.info(
|
||||
f"Parsed MinerU config (sensitive fields redacted): {redacted_config}"
|
||||
)
|
||||
logging.info(f"Parsed MinerU config (sensitive fields redacted): {redacted_config}")
|
||||
|
||||
MinerUParser.__init__(self, mineru_api=self.mineru_api, mineru_server_url=self.mineru_server_url)
|
||||
|
||||
@ -93,6 +89,60 @@ class MinerUOcrModel(Base, MinerUParser):
|
||||
server_url=self.mineru_server_url,
|
||||
delete_output=self.mineru_delete_output,
|
||||
parse_method=parse_method,
|
||||
**kwargs
|
||||
**kwargs,
|
||||
)
|
||||
return sections, tables
|
||||
|
||||
|
||||
class PaddleOCROcrModel(Base, PaddleOCRParser):
    _FACTORY_NAME = "PaddleOCR"

    def __init__(self, key: str | dict, model_name: str, **kwargs):
        Base.__init__(self, key, model_name, **kwargs)
        raw_config = {}
        if key:
            try:
                raw_config = json.loads(key)
            except Exception:
                raw_config = {}

        # nested {"api_key": {...}} from UI
        # flat {"PADDLEOCR_*": "..."} payload auto-provisioned from env vars
        config = raw_config.get("api_key", raw_config)
        if not isinstance(config, dict):
            config = {}

        def _resolve_config(key: str, env_key: str, default=""):
            # lower-case keys (UI), upper-case PADDLEOCR_* (env auto-provision), env vars
            return config.get(key, config.get(env_key, os.environ.get(env_key, default)))

        self.paddleocr_api_url = _resolve_config("paddleocr_api_url", "PADDLEOCR_API_URL", "")
        self.paddleocr_algorithm = _resolve_config("paddleocr_algorithm", "PADDLEOCR_ALGORITHM", "PaddleOCR-VL")
        self.paddleocr_access_token = _resolve_config("paddleocr_access_token", "PADDLEOCR_ACCESS_TOKEN", None)

        # Redact sensitive config keys before logging
        redacted_config = {}
        for k, v in config.items():
            if any(sensitive_word in k.lower() for sensitive_word in ("key", "password", "token", "secret")):
                redacted_config[k] = "[REDACTED]"
            else:
                redacted_config[k] = v
        logging.info(f"Parsed PaddleOCR config (sensitive fields redacted): {redacted_config}")

        PaddleOCRParser.__init__(
            self,
            api_url=self.paddleocr_api_url,
            access_token=self.paddleocr_access_token,
            algorithm=self.paddleocr_algorithm,
        )

    def check_available(self) -> tuple[bool, str]:
        return self.check_installation()

    def parse_pdf(self, filepath: str, binary=None, callback=None, parse_method: str = "raw", **kwargs):
        ok, reason = self.check_available()
        if not ok:
            raise RuntimeError(f"PaddleOCR server not accessible: {reason}")

        sections, tables = PaddleOCRParser.parse_pdf(self, filepath=filepath, binary=binary, callback=callback, parse_method=parse_method, **kwargs)
        return sections, tables
|
||||
|
||||
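To make the config resolution above concrete, a sketch of the two `key` payload shapes the constructor accepts (URL and token values are placeholders, not documented defaults):

import json

ui_payload = json.dumps({
    "api_key": {
        "paddleocr_api_url": "http://paddleocr.example:8080",  # placeholder
        "paddleocr_algorithm": "PaddleOCR-VL",
        "paddleocr_access_token": "secret-token",              # placeholder
    }
})
env_payload = json.dumps({
    "PADDLEOCR_API_URL": "http://paddleocr.example:8080",
    "PADDLEOCR_ALGORITHM": "PaddleOCR-VL",
    "PADDLEOCR_ACCESS_TOKEN": "secret-token",
})
# Either form resolves to the same paddleocr_api_url / paddleocr_algorithm /
# paddleocr_access_token fields; unset values fall back to the PADDLEOCR_* env vars.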
14 web/src/assets/svg/llm/paddleocr.svg  Normal file
@ -0,0 +1,14 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<svg version="1.1" xmlns="http://www.w3.org/2000/svg" width="204" height="204">
|
||||
<path d="M0 0 C5.28 0 10.56 0 16 0 C16.15427797 5.78542375 16.22313666 9.66447202 14 15 C13.4176812 17.42810911 12.8778743 19.86670228 12.375 22.3125 C12.11460938 23.56675781 11.85421875 24.82101563 11.5859375 26.11328125 C11.39257812 27.06589844 11.19921875 28.01851562 11 29 C10.34 29 9.68 29 9 29 C9.04125 29.86625 9.0825 30.7325 9.125 31.625 C8.99380505 35.16726367 8.15585649 37.66841364 7 41 C6.65596692 43.33177975 6.32050798 45.66487041 6 48 C5.67 48.33 5.34 48.66 5 49 C3.55833967 54.24854466 2.68835557 59.60788136 2 65 C1.34 65 0.68 65 0 65 C-0.09796875 65.63164063 -0.1959375 66.26328125 -0.296875 66.9140625 C-1.22159497 72.44643887 -2.45538005 77.74845326 -4.0625 83.125 C-5.64813561 88.477257 -6.61641909 93.43807684 -7 99 C-7.66 99 -8.32 99 -9 99 C-8.95875 100.0725 -8.9175 101.145 -8.875 102.25 C-8.99814042 105.94421261 -9.40497 106.89631346 -11 110 C-11.52482966 111.79980854 -11.9996991 113.61456629 -12.4375 115.4375 C-12.69273438 116.49453125 -12.94796875 117.5515625 -13.2109375 118.640625 C-13.47132812 119.74921875 -13.73171875 120.8578125 -14 122 C-14.26039062 123.10859375 -14.52078125 124.2171875 -14.7890625 125.359375 C-16.23108798 131.56445435 -17.62923242 137.77882404 -19 144 C-19.66 144 -20.32 144 -21 144 C-21.12375 145.19625 -21.2475 146.3925 -21.375 147.625 C-21.77380462 151.48011134 -22.70431438 154.11294313 -24 158 C-29.28 158 -34.56 158 -40 158 C-39.50048992 151.75612399 -37.93599284 146.12864787 -36.16015625 140.1328125 C-34.83094643 135.52383644 -33.76462788 130.86061164 -32.6875 126.1875 C-30.51622401 116.78617956 -27.95798491 107.51855943 -25.23510742 98.26245117 C-24.94482666 97.25802979 -24.6545459 96.2536084 -24.35546875 95.21875 C-24.09403076 94.33123047 -23.83259277 93.44371094 -23.56323242 92.52929688 C-22.81886716 89.6184652 -22.81886716 89.6184652 -23 85 C-32.9 85 -42.8 85 -53 85 C-51.89473684 75.05263158 -51.89473684 75.05263158 -50 70 C-49.76023438 69.23042969 -49.52046875 68.46085937 -49.2734375 67.66796875 C-48.89058594 66.44013672 -48.89058594 66.44013672 -48.5 65.1875 C-48.2628125 64.41792969 -48.025625 63.64835937 -47.78125 62.85546875 C-47 61 -47 61 -45 60 C-42.67583704 59.91413191 -40.34916693 59.89288957 -38.0234375 59.90234375 C-36.99556625 59.90446617 -36.99556625 59.90446617 -35.94692993 59.90663147 C-33.75624239 59.91223325 -31.5656567 59.92478594 -29.375 59.9375 C-27.89062648 59.94251478 -26.40625135 59.94707772 -24.921875 59.95117188 C-21.28121685 59.96220323 -17.64061836 59.97946901 -14 60 C-13.94376465 59.35498535 -13.8875293 58.7099707 -13.82958984 58.04541016 C-12.96693744 48.72910992 -11.24508629 39.82715762 -7.74609375 31.125 C-6.4362897 27.39445861 -5.68000873 23.5319145 -4.828125 19.67578125 C-3.34357442 13.08633739 -1.65158 6.54936895 0 0 Z " fill="#2831DF" transform="translate(53,46)"/>
|
||||
<path d="M0 0 C5.28 0 10.56 0 16 0 C15.235399 10.70441398 12.26943536 20.9810415 9.08203125 31.1875 C7.02129712 37.82748353 5.22384225 44.51569215 3.5 51.25 C3.19739258 52.42224121 2.89478516 53.59448242 2.58300781 54.80224609 C0.71701016 62.07123696 -1.04512446 69.35696753 -2.71826172 76.67236328 C-3.85473071 81.57050729 -5.31273138 86.26962192 -7 91 C-8.59119364 96.03877987 -9.44914797 100.76690573 -10 106 C-10.66 106 -11.32 106 -12 106 C-12.03738281 106.70125 -12.07476562 107.4025 -12.11328125 108.125 C-12.48264852 113.45547666 -13.1249218 117.94818256 -15.18359375 122.87109375 C-16.93149684 127.42902286 -17.86250115 132.26131996 -19 137 C-19.34588869 138.38455735 -19.69363488 139.76865163 -20.04296875 141.15234375 C-20.222229 141.86261719 -20.40148926 142.57289063 -20.58618164 143.3046875 C-20.93785233 144.69671066 -21.28988648 146.08864207 -21.64233398 147.48046875 C-22.52090977 150.97902837 -23.29193239 154.45966195 -24 158 C-29.61 158 -35.22 158 -41 158 C-40.24791147 152.73538029 -39.34974669 147.7690517 -38.125 142.625 C-37.94718994 141.87186523 -37.76937988 141.11873047 -37.58618164 140.34277344 C-36.50974786 135.8706144 -35.25275445 131.54235526 -33.7109375 127.20703125 C-32.48712712 123.40783969 -31.6904287 119.50981261 -30.8203125 115.6171875 C-29.90622587 111.58650076 -28.93409458 107.5734341 -27.9375 103.5625 C-27.77515869 102.90048584 -27.61281738 102.23847168 -27.44555664 101.55639648 C-26.70403999 98.58317455 -25.89147112 95.67598058 -24.91796875 92.76953125 C-23.84587401 89.53499862 -23.40522292 86.37685768 -23 83 C-22.34 83 -21.68 83 -21 83 C-20.896875 81.741875 -20.79375 80.48375 -20.6875 79.1875 C-20.17690392 74.48853603 -19.00603931 70.13009242 -17.69921875 65.59765625 C-16.83798598 62.39810434 -16.36546873 59.2892186 -16 56 C-15.34 56 -14.68 56 -14 56 C-13.96624268 55.33282959 -13.93248535 54.66565918 -13.89770508 53.97827148 C-13.41579319 46.91496121 -11.9250941 40.75547542 -9.6875 34.0625 C-6.29781688 23.49159445 -3.92624433 12.9233607 -2 2 C-1.34 2 -0.68 2 0 2 C0 1.34 0 0.68 0 0 Z " fill="#2831DF" transform="translate(134,46)"/>
|
||||
<path d="M0 0 C2.43757413 -0.02698422 4.87484915 -0.04683099 7.3125 -0.0625 C8.00279297 -0.07087891 8.69308594 -0.07925781 9.40429688 -0.08789062 C12.5483025 -0.10307906 14.99420019 -0.00193327 18 1 C18 1.66 18 2.32 18 3 C18.66 3 19.32 3 20 3 C21.0064275 4.66279327 22.00585407 6.32983484 23 8 C23.99 8.66 24.98 9.32 26 10 C27.08301035 12.29598193 28.07796291 14.63477442 29 17 C29.350625 17.763125 29.70125 18.52625 30.0625 19.3125 C34.88894127 33.14829831 35.10756712 51.23425219 30 65 C28.24403886 68.52667826 26.22195061 71.75208274 24 75 C23.62488281 75.63164063 23.24976563 76.26328125 22.86328125 76.9140625 C21.38081833 79.38678722 20.48665592 80.81074492 17.75 81.875 C17.1725 81.91625 16.595 81.9575 16 82 C16 82.66 16 83.32 16 84 C6.01354054 84.86352013 -3.85054488 85.12644402 -13.875 85.0625 C-15.25650876 85.05746511 -16.63801933 85.05290579 -18.01953125 85.04882812 C-21.34640246 85.03719571 -24.67317602 85.0208528 -28 85 C-25.88008565 64.88008565 -25.88008565 64.88008565 -22 61 C-14.05344286 59.6982022 -6.19562196 60.02377079 1.82421875 60.390625 C7.89338929 60.57880147 7.89338929 60.57880147 13.078125 57.671875 C19.2379721 49.10818513 18.49422308 39.18370502 17 29 C16.34 29 15.68 29 15 29 C14.4740625 27.824375 14.4740625 27.824375 13.9375 26.625 C11.4077885 23.19764893 9.50197264 22.98385012 5.4296875 22.26953125 C1.91534221 21.87967623 -1.46670023 21.87151637 -5 22 C-4.76233962 20.09871698 -4.52388673 18.19752936 -4.28125 16.296875 C-4.09907454 14.80910876 -3.93145621 13.31945809 -3.78125 11.828125 C-3.39419678 8.15111943 -2.95070301 5.16989238 -1 2 C-0.67 1.34 -0.34 0.68 0 0 Z " fill="#2932DF" transform="translate(82,46)"/>
<path d="M0 0 C15.71406556 -1.02980527 15.71406556 -1.02980527 20.5 2 C28.76609987 9.71502654 33.63280425 23.17948938 34.09179688 34.20507812 C34.57281659 49.21054692 33.72804383 61.68962202 26 75 C24.38541822 76.71717245 22.72413232 78.39285243 21 80 C20.38125 80.7425 19.7625 81.485 19.125 82.25 C14.68229769 85.90869602 8.01215129 85.1279183 2.4921875 85.1328125 C1.45942184 85.13424759 1.45942184 85.13424759 0.40579224 85.13571167 C-1.03790377 85.13638732 -2.48160293 85.13457633 -3.92529297 85.13037109 C-6.1260461 85.12499745 -8.32659672 85.13034546 -10.52734375 85.13671875 C-11.93750045 85.13605818 -13.34765702 85.13477746 -14.7578125 85.1328125 C-16.66264404 85.13112061 -16.66264404 85.13112061 -18.60595703 85.12939453 C-21.54244033 85.01744401 -24.15099128 84.68422413 -27 84 C-26.31697126 78.58609855 -25.22293698 73.36322421 -23.9375 68.0625 C-23.75123047 67.28326172 -23.56496094 66.50402344 -23.37304688 65.70117188 C-22.91818991 63.80010303 -22.4594938 61.89995343 -22 60 C-21.01604248 60.00523682 -20.03208496 60.01047363 -19.01831055 60.01586914 C-15.36744089 60.03397029 -11.71657521 60.04545075 -8.06567383 60.05493164 C-6.48558892 60.05996014 -4.90550866 60.06678378 -3.32543945 60.07543945 C-1.05418754 60.08756992 1.21700285 60.09324313 3.48828125 60.09765625 C4.54780754 60.10539818 4.54780754 60.10539818 5.6287384 60.11329651 C8.70254391 60.11364803 11.0590442 59.9803186 14 59 C15.10836423 57.11683841 15.10836423 57.11683841 16 55 C16.66 54.01 17.32 53.02 18 52 C20.37643288 44.15777151 19.63162649 35.30648664 16 28 C13.87039364 25.37894602 12.27236626 23.57037108 9.1875 22.1875 C7.08572233 22.00734763 5.04601849 21.98845702 2.9375 22 C-1 22 -1 22 -4 21 C-3.25428948 16.35197907 -2.3717461 11.7381195 -1.4375 7.125 C-1.29892578 6.43664063 -1.16035156 5.74828125 -1.01757812 5.0390625 C-0.67927173 3.35919626 -0.33976088 1.67957266 0 0 Z " fill="#2831DF" transform="translate(161,46)"/>
<path d="M0 0 C2.8125 1.0625 2.8125 1.0625 5 3 C5.8125 6.6875 5.8125 6.6875 6 10 C6.66 10 7.32 10 8 10 C7.40672298 19.64855779 6.96022527 26.87405508 0 34 C-2 35.125 -2 35.125 -4 35 C-8.45279556 32.34952645 -9.59522559 29.86268066 -11 25 C-11.66 24.67 -12.32 24.34 -13 24 C-13.05425226 21.60385874 -13.09379027 19.20896147 -13.125 16.8125 C-13.14175781 16.13896484 -13.15851563 15.46542969 -13.17578125 14.77148438 C-13.22959846 9.25521997 -12.63330947 5.67867918 -8.75 1.6875 C-5.44146305 -0.34273858 -3.8200098 -0.55708476 0 0 Z " fill="#2832DE" transform="translate(102,2)"/>
<path d="M0 0 C4.83670567 4.83670567 5.20245233 9.25686475 5.25 15.8125 C5.270625 16.51181641 5.29125 17.21113281 5.3125 17.93164062 C5.34861844 22.75796677 4.39813869 25.85170455 2 30 C1.67 30.99 1.34 31.98 1 33 C-2.3 33 -5.6 33 -9 33 C-13.20796432 26.12699161 -16.24978351 19.27600545 -15 11 C-13.68274847 6.82417685 -12.03704153 3.21584179 -9.1875 -0.125 C-5.75721453 -1.49711419 -3.50872971 -1.0024942 0 0 Z " fill="#2931DF" transform="translate(184,3)"/>
<path d="M0 0 C1.0219043 -0.00676758 1.0219043 -0.00676758 2.06445312 -0.01367188 C5.73102226 -0.00285604 9.12254501 0.25617151 12.6875 1.125 C12.6875 1.785 12.6875 2.445 12.6875 3.125 C2.4575 3.125 -7.7725 3.125 -18.3125 3.125 C-18.3125 2.465 -18.3125 1.805 -18.3125 1.125 C-12.2065942 0.30305114 -6.15338301 0.01692039 0 0 Z " fill="#2C35ED" transform="translate(159.3125,105.875)"/>
<path d="M0 0 C3.8544062 0.13989034 7.70845629 0.28798006 11.5625 0.4375 C12.64853516 0.47681641 13.73457031 0.51613281 14.85351562 0.55664062 C20.30410746 0.77201355 25.62075441 1.08159222 31 2 C31 2.33 31 2.66 31 3 C20.44 3 9.88 3 -1 3 C-0.67 2.01 -0.34 1.02 0 0 Z " fill="#2B35EB" transform="translate(8,106)"/>
<path d="M0 0 C0 0.33 0 0.66 0 1 C-10.23 1 -20.46 1 -31 1 C-31 0.67 -31 0.34 -31 0 C-20.35831508 -2.10743678 -10.627168 -2.28562253 0 0 Z " fill="#2C35EF" transform="translate(92,108)"/>
<path d="" fill="#0000FF" transform="translate(0,0)"/>
<path d="" fill="#000000" transform="translate(0,0)"/>
</svg>
After Width: | Height: | Size: 10 KiB |
@ -6,6 +6,7 @@ import { camelCase } from 'lodash';
import { ReactNode, useMemo } from 'react';
import { useFormContext } from 'react-hook-form';
import { MinerUOptionsFormField } from './mineru-options-form-field';
import { PaddleOCROptionsFormField } from './paddleocr-options-form-field';
import { SelectWithSearch } from './originui/select-with-search';
import {
  FormControl,
@ -28,12 +29,14 @@ export function LayoutRecognizeFormField({
  optionsWithoutLLM,
  label,
  showMineruOptions = true,
  showPaddleocrOptions = true,
}: {
  name?: string;
  horizontal?: boolean;
  optionsWithoutLLM?: { value: string; label: string }[];
  label?: ReactNode;
  showMineruOptions?: boolean;
  showPaddleocrOptions?: boolean;
}) {
  const form = useFormContext();

@ -113,6 +116,7 @@ export function LayoutRecognizeFormField({
        </div>
      </FormItem>
      {showMineruOptions && <MinerUOptionsFormField />}
      {showPaddleocrOptions && <PaddleOCROptionsFormField />}
    </>
  );
}
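A minimal usage sketch of the updated field (not part of the diff): the FormProvider wiring, the import path, and the default parser_config value are assumptions; only the component name and the showMineruOptions / showPaddleocrOptions props come from the hunk above.

```tsx
// Hypothetical parent form, for illustration only.
import { FormProvider, useForm } from 'react-hook-form';
import { LayoutRecognizeFormField } from '@/components/layout-recognize-form-field'; // assumed path

export function ChunkMethodFormSketch() {
  // Pre-selecting a PaddleOCR layout recognizer so the new options block renders.
  const form = useForm({
    defaultValues: { parser_config: { layout_recognize: 'PaddleOCR' } },
  });
  return (
    <FormProvider {...form}>
      <LayoutRecognizeFormField showMineruOptions showPaddleocrOptions />
    </FormProvider>
  );
}
```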
95 web/src/components/paddleocr-options-form-field.tsx Normal file
@ -0,0 +1,95 @@
import { RAGFlowFormItem } from '@/components/ragflow-form';
import { Input } from '@/components/ui/input';
import { RAGFlowSelect } from '@/components/ui/select';
import { LLMFactory } from '@/constants/llm';
import { buildOptions } from '@/utils/form';
import { useFormContext, useWatch } from 'react-hook-form';
import { useTranslation } from 'react-i18next';

const algorithmOptions = buildOptions(['PaddleOCR-VL']);

export function PaddleOCROptionsFormField({
  namePrefix = 'parser_config',
}: {
  namePrefix?: string;
}) {
  const form = useFormContext();
  const { t } = useTranslation();
  const buildName = (field: string) =>
    namePrefix ? `${namePrefix}.${field}` : field;

  const layoutRecognize = useWatch({
    control: form.control,
    name: 'parser_config.layout_recognize',
  });

  // Check if PaddleOCR is selected (the value contains 'PaddleOCR' or matches the factory name)
  const isPaddleOCRSelected =
    layoutRecognize?.includes(LLMFactory.PaddleOCR) ||
    layoutRecognize?.toLowerCase()?.includes('paddleocr');

  if (!isPaddleOCRSelected) {
    return null;
  }

  return (
    <div className="space-y-4 border-l-2 border-primary/30 pl-4 ml-2">
      <div className="text-sm font-medium text-text-secondary">
        {t('knowledgeConfiguration.paddleocrOptions', 'PaddleOCR Options')}
      </div>

      <RAGFlowFormItem
        name={buildName('paddleocr_api_url')}
        label={t('knowledgeConfiguration.paddleocrApiUrl', 'PaddleOCR API URL')}
        tooltip={t(
          'knowledgeConfiguration.paddleocrApiUrlTip',
          'The API endpoint URL for PaddleOCR service',
        )}
        horizontal={true}
      >
        {(field) => (
          <Input
            {...field}
            placeholder={t('knowledgeConfiguration.paddleocrApiUrlPlaceholder')}
          />
        )}
      </RAGFlowFormItem>

      <RAGFlowFormItem
        name={buildName('paddleocr_access_token')}
        label={t('knowledgeConfiguration.paddleocrAccessToken', 'AI Studio Access Token')}
        tooltip={t(
          'knowledgeConfiguration.paddleocrAccessTokenTip',
          'Access token for PaddleOCR API (optional)',
        )}
        horizontal={true}
      >
        {(field) => (
          <Input
            {...field}
            placeholder={t('knowledgeConfiguration.paddleocrAccessTokenPlaceholder')}
          />
        )}
      </RAGFlowFormItem>

      <RAGFlowFormItem
        name={buildName('paddleocr_algorithm')}
        label={t('knowledgeConfiguration.paddleocrAlgorithm', 'PaddleOCR Algorithm')}
        tooltip={t(
          'knowledgeConfiguration.paddleocrAlgorithmTip',
          'Algorithm to use for PaddleOCR parsing',
        )}
        horizontal={true}
      >
        {(field) => (
          <RAGFlowSelect
            value={field.value || 'PaddleOCR-VL'}
            onChange={field.onChange}
            options={algorithmOptions}
            placeholder={t('common.selectPlaceholder', 'Select value')}
          />
        )}
      </RAGFlowFormItem>
    </div>
  );
}
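For orientation, this is roughly the parser_config fragment those three fields write under the default namePrefix of 'parser_config'. The exact object shape and the sample values are assumptions; only the three paddleocr_* keys and the layout_recognize key come from the component above.

```typescript
// Sketch of a knowledge-base parser_config after filling in the PaddleOCR options (placeholder values).
const parserConfigSketch = {
  layout_recognize: 'PaddleOCR',                                    // what the useWatch check above looks for
  paddleocr_api_url: 'https://paddleocr-server.com/layout-parsing', // placeholder endpoint
  paddleocr_access_token: '',                                       // optional AI Studio token
  paddleocr_algorithm: 'PaddleOCR-VL',                              // the only entry in algorithmOptions today
};
```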
@ -105,6 +105,7 @@ export const LlmIcon = ({
    LLMFactory.Gemini,
    LLMFactory.StepFun,
    LLMFactory.MinerU,
    LLMFactory.PaddleOCR,
    // LLMFactory.DeerAPI,
  ];
  if (svgIcons.includes(name as LLMFactory)) {
@ -61,6 +61,7 @@ export enum LLMFactory {
  JiekouAI = 'Jiekou.AI',
  Builtin = 'Builtin',
  MinerU = 'MinerU',
  PaddleOCR = 'PaddleOCR',
}

// Please lowercase the file name
@ -127,6 +128,7 @@ export const IconMap = {
  [LLMFactory.JiekouAI]: 'jiekouai',
  [LLMFactory.Builtin]: 'builtin',
  [LLMFactory.MinerU]: 'mineru',
  [LLMFactory.PaddleOCR]: 'paddleocr',
};

export const APIMapUrl = {
@ -178,4 +180,5 @@ export const APIMapUrl = {
  [LLMFactory.DeerAPI]: 'https://api.deerapi.com/token',
  [LLMFactory.TokenPony]: 'https://www.tokenpony.cn/#/user/keys',
  [LLMFactory.DeepInfra]: 'https://deepinfra.com/dash/api_keys',
  [LLMFactory.PaddleOCR]: 'https://www.paddleocr.ai/latest/',
};
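A small lookup sketch (not from the diff) showing how the three new entries are meant to be consumed together; it assumes IconMap and APIMapUrl live in the same constants module as LLMFactory, and the '.svg' suffix handling follows the lowercase-file-name comment above.

```typescript
import { LLMFactory, IconMap, APIMapUrl } from '@/constants/llm';

const factory = LLMFactory.PaddleOCR;
const iconKey = IconMap[factory];   // 'paddleocr', used to locate the new paddleocr.svg icon
const docsUrl = APIMapUrl[factory]; // 'https://www.paddleocr.ai/latest/'
console.log(iconKey, docsUrl);
```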
@ -385,6 +385,17 @@ Prozedurales Gedächtnis: Erlernte Fähigkeiten, Gewohnheiten und automatisierte
|
||||
'Formelerkennung aktivieren. Hinweis: Dies funktioniert möglicherweise nicht korrekt bei kyrillischen Dokumenten.',
|
||||
mineruTableEnable: 'Tabellenerkennung',
|
||||
mineruTableEnableTip: 'Tabellenerkennung und -extraktion aktivieren.',
|
||||
paddleocrOptions: 'PaddleOCR-Optionen',
|
||||
paddleocrApiUrl: 'PaddleOCR API-URL',
|
||||
paddleocrApiUrlTip: 'API-Endpunkt-URL des PaddleOCR-Dienstes',
|
||||
paddleocrApiUrlPlaceholder: 'Zum Beispiel: https://paddleocr-server.com/layout-parsing',
|
||||
paddleocrAccessToken: 'AI Studio-Zugriffstoken',
|
||||
paddleocrAccessTokenTip: 'Zugriffstoken für die PaddleOCR-API (optional)',
|
||||
paddleocrAccessTokenPlaceholder: 'Ihr AI Studio-Token (optional)',
|
||||
paddleocrAlgorithm: 'PaddleOCR-Algorithmus',
|
||||
paddleocrAlgorithmTip: 'Algorithmus, der für die PaddleOCR-Verarbeitung verwendet wird',
|
||||
paddleocrSelectAlgorithm: 'Algorithmus auswählen',
|
||||
paddleocrModelNamePlaceholder: 'Zum Beispiel: paddleocr-umgebung-1',
|
||||
overlappedPercent: 'Überlappungsprozent(%)',
|
||||
generationScopeTip:
|
||||
'Bestimmt, ob RAPTOR für den gesamten Datensatz oder für eine einzelne Datei generiert wird.',
|
||||
@ -475,7 +486,7 @@ Prozedurales Gedächtnis: Erlernte Fähigkeiten, Gewohnheiten und automatisierte
|
||||
book: `<p>Unterstützte Dateiformate sind <b>DOCX</b>, <b>PDF</b>, <b>TXT</b>.</p><p>
|
||||
Für jedes Buch im PDF-Format stellen Sie bitte die <i>Seitenbereiche</i> ein, um unerwünschte Informationen zu entfernen und die Analysezeit zu reduzieren.</p>`,
|
||||
laws: `<p>Unterstützte Dateiformate sind <b>DOCX</b>, <b>PDF</b>, <b>TXT</b>.</p><p>
|
||||
Rechtliche Dokumente folgen in der Regel einem strengen Schreibformat. Wir verwenden Textmerkmale, um Teilungspunkte zu identifizieren.
|
||||
Rechtliche Dokumente folgen in der Regel einem strengen Schreibformat. Wir verwenden Textmerkmale, um Teilungspunkte zu identifizieren.
|
||||
</p><p>
|
||||
Der Chunk hat eine Granularität, die mit 'ARTIKEL' übereinstimmt, wobei sichergestellt wird, dass der gesamte übergeordnete Text im Chunk enthalten ist.
|
||||
</p>`,
|
||||
@ -489,7 +500,7 @@ Prozedurales Gedächtnis: Erlernte Fähigkeiten, Gewohnheiten und automatisierte
|
||||
<li>Dann werden benachbarte Segmente kombiniert, bis die Token-Anzahl den durch 'Chunk-Token-Anzahl' festgelegten Schwellenwert überschreitet, woraufhin ein Chunk erstellt wird.</li></p>`,
|
||||
paper: `<p>Nur <b>PDF</b>-Dateien werden unterstützt.</p><p>
|
||||
Papers werden nach Abschnitten wie <i>abstract, 1.1, 1.2</i> aufgeteilt. </p><p>
|
||||
Dieser Ansatz ermöglicht es dem LLM, das Paper effektiver zusammenzufassen und umfassendere, verständlichere Antworten zu liefern.
|
||||
Dieser Ansatz ermöglicht es dem LLM, das Paper effektiver zusammenzufassen und umfassendere, verständlichere Antworten zu liefern.
|
||||
Es erhöht jedoch auch den Kontext für KI-Gespräche und die Rechenkosten für das LLM. Daher sollten Sie während eines Gesprächs erwägen, den Wert von '<b>topN</b>' zu reduzieren.</p>`,
|
||||
presentation: `<p>Unterstützte Dateiformate sind <b>PDF</b>, <b>PPTX</b>.</p><p>
|
||||
Jede Seite in den Folien wird als Chunk behandelt, wobei ihr Vorschaubild gespeichert wird.</p><p>
|
||||
@ -1108,6 +1119,17 @@ Beispiel: Virtual Hosted Style`,
|
||||
modelTypeMessage: 'Bitte geben Sie Ihren Modelltyp ein!',
|
||||
addLlmBaseUrl: 'Basis-URL',
|
||||
baseUrlNameMessage: 'Bitte geben Sie Ihre Basis-URL ein!',
|
||||
paddleocr: {
|
||||
apiUrl: 'PaddleOCR API-URL',
|
||||
apiUrlPlaceholder: 'Zum Beispiel: https://paddleocr-server.com/layout-parsing',
|
||||
accessToken: 'AI Studio-Zugriffstoken',
|
||||
accessTokenPlaceholder: 'Ihr AI Studio-Token (optional)',
|
||||
algorithm: 'PaddleOCR-Algorithmus',
|
||||
selectAlgorithm: 'Algorithmus auswählen',
|
||||
modelNamePlaceholder: 'Zum Beispiel: paddleocr-from-env-1',
|
||||
modelNameRequired: 'Der Modellname ist ein Pflichtfeld',
|
||||
apiUrlRequired: 'Die PaddleOCR API-URL ist ein Pflichtfeld'
|
||||
},
|
||||
vision: 'Unterstützt es Vision?',
|
||||
ollamaLink: 'Wie integriere ich {{name}}',
|
||||
FishAudioLink: 'Wie verwende ich FishAudio',
|
||||
|
||||
@ -148,7 +148,7 @@ Procedural Memory: Learned skills, habits, and automated procedures.`,
    action: 'Action',
  },
  config: {
    memorySizeTooltip: `Accounts for each message's content + its embedding vector (≈ Content + Dimensions × 8 Bytes).
    memorySizeTooltip: `Accounts for each message's content + its embedding vector (≈ Content + Dimensions × 8 Bytes).
Example: A 1 KB message with 1024-dim embedding uses ~9 KB. The 5 MB default limit holds ~500 such messages.`,
    avatar: 'Avatar',
    description: 'Description',
@ -424,6 +424,17 @@ Example: A 1 KB message with 1024-dim embedding uses ~9 KB. The 5 MB default lim
      'Enable formula recognition. Note: This may not work correctly for Cyrillic documents.',
    mineruTableEnable: 'Table recognition',
    mineruTableEnableTip: 'Enable table recognition and extraction.',
    paddleocrOptions: 'PaddleOCR Options',
    paddleocrApiUrl: 'PaddleOCR API URL',
    paddleocrApiUrlTip: 'The API endpoint URL for PaddleOCR service',
    paddleocrApiUrlPlaceholder: 'e.g. https://paddleocr-server.com/layout-parsing',
    paddleocrAccessToken: 'AI Studio Access Token',
    paddleocrAccessTokenTip: 'Access token for PaddleOCR API (optional)',
    paddleocrAccessTokenPlaceholder: 'Your AI Studio token (optional)',
    paddleocrAlgorithm: 'PaddleOCR Algorithm',
    paddleocrAlgorithmTip: 'Algorithm to use for PaddleOCR parsing',
    paddleocrSelectAlgorithm: 'Select Algorithm',
    paddleocrModelNamePlaceholder: 'e.g. paddleocr-from-env-1',
    overlappedPercent: 'Overlapped percent(%)',
    generationScopeTip:
      'Determines whether RAPTOR is generated for the entire dataset or for a single file.',
@ -1094,6 +1105,17 @@ Example: Virtual Hosted Style`,
    modelTypeMessage: 'Please input your model type!',
    addLlmBaseUrl: 'Base url',
    baseUrlNameMessage: 'Please input your base url!',
    paddleocr: {
      apiUrl: 'PaddleOCR API URL',
      apiUrlPlaceholder: 'For example: https://paddleocr-server.com/layout-parsing',
      accessToken: 'AI Studio Access Token',
      accessTokenPlaceholder: 'Your AI Studio token (optional)',
      algorithm: 'PaddleOCR Algorithm',
      selectAlgorithm: 'Select Algorithm',
      modelNamePlaceholder: 'For example: paddleocr-from-env-1',
      modelNameRequired: 'Model name is required',
      apiUrlRequired: 'PaddleOCR API URL is required'
    },
    vision: 'Does it support Vision?',
    ollamaLink: 'How to integrate {{name}}',
    FishAudioLink: 'How to use FishAudio',
@ -159,6 +159,20 @@ export default {
|
||||
html4excelTip: `Usar junto con el método de fragmentación General. Cuando está desactivado, los archivos de hoja de cálculo (XLSX, XLS (Excel 97-2003)) se analizan línea por línea como pares clave-valor. Cuando está activado, los archivos de hoja de cálculo se convierten en tablas HTML. Si la tabla original tiene más de 12 filas, el sistema la dividirá automáticamente en varias tablas HTML cada 12 filas. Para más información, consulte https://ragflow.io/docs/dev/enable_excel2html.`,
|
||||
},
|
||||
|
||||
knowledgeConfiguration: {
|
||||
paddleocrOptions: 'Opciones de PaddleOCR',
|
||||
paddleocrApiUrl: 'URL de API de PaddleOCR',
|
||||
paddleocrApiUrlTip: 'La URL del endpoint de la API para el servicio PaddleOCR',
|
||||
paddleocrApiUrlPlaceholder: 'ej: https://servidor-paddleocr.com/api',
|
||||
paddleocrAccessToken: 'Token de acceso de AI Studio',
|
||||
paddleocrAccessTokenTip: 'Token de acceso para la API de PaddleOCR (opcional)',
|
||||
paddleocrAccessTokenPlaceholder: 'Su token de AI Studio (opcional)',
|
||||
paddleocrAlgorithm: 'Algoritmo de PaddleOCR',
|
||||
paddleocrAlgorithmTip: 'Algoritmo a utilizar para el análisis de PaddleOCR',
|
||||
paddleocrSelectAlgorithm: 'Seleccionar algoritmo',
|
||||
paddleocrModelNamePlaceholder: 'ej: paddleocr-desde-env-1',
|
||||
},
|
||||
|
||||
// Otros bloques de traducción
|
||||
// Continua con la misma estructura
|
||||
chat: {
|
||||
@ -379,6 +393,17 @@ export default {
|
||||
modelTypeMessage: '¡Por favor ingresa el tipo de tu modelo!',
|
||||
addLlmBaseUrl: 'URL base',
|
||||
baseUrlNameMessage: '¡Por favor ingresa tu URL base!',
|
||||
paddleocr: {
|
||||
apiUrl: 'URL de la API de PaddleOCR',
|
||||
apiUrlPlaceholder: 'Por ejemplo: https://paddleocr-server.com/layout-parsing',
|
||||
accessToken: 'Token de acceso de AI Studio',
|
||||
accessTokenPlaceholder: 'Su token de AI Studio (opcional)',
|
||||
algorithm: 'Algoritmo de PaddleOCR',
|
||||
selectAlgorithm: 'Seleccionar algoritmo',
|
||||
modelNamePlaceholder: 'Por ejemplo: paddleocr-from-env-1',
|
||||
modelNameRequired: 'El nombre del modelo es obligatorio',
|
||||
apiUrlRequired: 'La URL de la API de PaddleOCR es obligatoria'
|
||||
},
|
||||
vision: '¿Soporta visión?',
|
||||
ollamaLink: 'Cómo integrar {{name}}',
|
||||
FishAudioLink: 'Cómo usar FishAudio',
|
||||
|
||||
@ -293,6 +293,17 @@ export default {
|
||||
communityTip: `Un "community" est un groupe d’entités liées. Le LLM peut générer un résumé pour chaque groupe. Voir plus ici : https: //www.microsoft.com/en-us/research/blog/graphrag-improving-global-search-via-dynamic-community-selection/`,
|
||||
theDocumentBeingParsedCannotBeDeleted:
|
||||
'Le document en cours d’analyse ne peut pas être supprimé',
|
||||
paddleocrOptions: 'Options PaddleOCR',
|
||||
paddleocrApiUrl: 'URL de l’API PaddleOCR',
|
||||
paddleocrApiUrlTip: 'URL du point de terminaison de l’API du service PaddleOCR',
|
||||
paddleocrApiUrlPlaceholder: 'Par exemple : https://paddleocr-server.com/layout-parsing',
|
||||
paddleocrAccessToken: 'Jeton d’accès AI Studio',
|
||||
paddleocrAccessTokenTip: 'Jeton d’accès à l’API PaddleOCR (optionnel)',
|
||||
paddleocrAccessTokenPlaceholder: 'Votre jeton AI Studio (optionnel)',
|
||||
paddleocrAlgorithm: 'Algorithme PaddleOCR',
|
||||
paddleocrAlgorithmTip: 'Algorithme utilisé pour l’analyse PaddleOCR',
|
||||
paddleocrSelectAlgorithm: 'Sélectionner un algorithme',
|
||||
paddleocrModelNamePlaceholder: 'Par exemple : paddleocr-environnement-1',
|
||||
},
|
||||
chunk: {
|
||||
chunk: 'Segment',
|
||||
@ -566,6 +577,17 @@ export default {
|
||||
modelTypeMessage: 'Veuillez saisir le type de votre modèle !',
|
||||
addLlmBaseUrl: 'URL de base',
|
||||
baseUrlNameMessage: 'Veuillez saisir votre URL de base !',
|
||||
paddleocr: {
|
||||
apiUrl: 'URL de l’API PaddleOCR',
|
||||
apiUrlPlaceholder: 'Par exemple : https://paddleocr-server.com/layout-parsing',
|
||||
accessToken: 'Jeton d’accès AI Studio',
|
||||
accessTokenPlaceholder: 'Votre jeton AI Studio (optionnel)',
|
||||
algorithm: 'Algorithme PaddleOCR',
|
||||
selectAlgorithm: 'Sélectionner un algorithme',
|
||||
modelNamePlaceholder: 'Par exemple : paddleocr-from-env-1',
|
||||
modelNameRequired: 'Le nom du modèle est obligatoire',
|
||||
apiUrlRequired: 'L’URL de l’API PaddleOCR est obligatoire'
|
||||
},
|
||||
vision: 'Supporte-t-il la vision ?',
|
||||
ollamaLink: 'Comment intégrer {{name}}',
|
||||
FishAudioLink: 'Comment utiliser FishAudio',
|
||||
|
||||
@ -316,6 +316,17 @@ export default {
|
||||
randomSeed: 'Benih acak',
|
||||
randomSeedMessage: 'Benih acak diperlukan',
|
||||
entityTypes: 'Jenis entitas',
|
||||
paddleocrOptions: 'Opsi PaddleOCR',
|
||||
paddleocrApiUrl: 'URL API PaddleOCR',
|
||||
paddleocrApiUrlTip: 'URL endpoint API layanan PaddleOCR',
|
||||
paddleocrApiUrlPlaceholder: 'Contoh: https://paddleocr-server.com/layout-parsing',
|
||||
paddleocrAccessToken: 'Token Akses AI Studio',
|
||||
paddleocrAccessTokenTip: 'Token akses untuk API PaddleOCR (opsional)',
|
||||
paddleocrAccessTokenPlaceholder: 'Token AI Studio Anda (opsional)',
|
||||
paddleocrAlgorithm: 'Algoritma PaddleOCR',
|
||||
paddleocrAlgorithmTip: 'Algoritma yang digunakan untuk pemrosesan PaddleOCR',
|
||||
paddleocrSelectAlgorithm: 'Pilih algoritma',
|
||||
paddleocrModelNamePlaceholder: 'Contoh: paddleocr-lingkungan-1',
|
||||
},
|
||||
chunk: {
|
||||
chunk: 'Potongan',
|
||||
@ -553,6 +564,17 @@ export default {
|
||||
modelTypeMessage: 'Silakan masukkan jenis model Anda!',
|
||||
addLlmBaseUrl: 'Base url',
|
||||
baseUrlNameMessage: 'Silakan masukkan base url Anda!',
|
||||
paddleocr: {
|
||||
apiUrl: 'URL API PaddleOCR',
|
||||
apiUrlPlaceholder: 'Contoh: https://paddleocr-server.com/layout-parsing',
|
||||
accessToken: 'Token Akses AI Studio',
|
||||
accessTokenPlaceholder: 'Token AI Studio Anda (opsional)',
|
||||
algorithm: 'Algoritma PaddleOCR',
|
||||
selectAlgorithm: 'Pilih algoritma',
|
||||
modelNamePlaceholder: 'Contoh: paddleocr-from-env-1',
|
||||
modelNameRequired: 'Nama model wajib diisi',
|
||||
apiUrlRequired: 'URL API PaddleOCR wajib diisi'
|
||||
},
|
||||
vision: 'Apakah mendukung Vision?',
|
||||
ollamaLink: 'Cara mengintegrasikan {{name}}',
|
||||
FishAudioLink: 'Cara menggunakan FishAudio',
|
||||
|
||||
@ -488,6 +488,17 @@ Quanto sopra è il contenuto che devi riassumere.`,
|
||||
'In un grafo della conoscenza, una comunità è un cluster di entità collegate da relazioni. Puoi far generare al LLM un abstract per ogni comunità, noto come report comunità.',
|
||||
theDocumentBeingParsedCannotBeDeleted:
|
||||
'Il documento in fase di analisi non può essere eliminato',
|
||||
paddleocrOptions: 'Opzioni PaddleOCR',
|
||||
paddleocrApiUrl: 'URL API di PaddleOCR',
|
||||
paddleocrApiUrlTip: 'URL dell’endpoint API del servizio PaddleOCR',
|
||||
paddleocrApiUrlPlaceholder: 'Ad esempio: https://paddleocr-server.com/layout-parsing',
|
||||
paddleocrAccessToken: 'Token di accesso AI Studio',
|
||||
paddleocrAccessTokenTip: 'Token di accesso per l’API PaddleOCR (facoltativo)',
|
||||
paddleocrAccessTokenPlaceholder: 'Il tuo token AI Studio (facoltativo)',
|
||||
paddleocrAlgorithm: 'Algoritmo PaddleOCR',
|
||||
paddleocrAlgorithmTip: 'Algoritmo utilizzato per l’elaborazione PaddleOCR',
|
||||
paddleocrSelectAlgorithm: 'Seleziona algoritmo',
|
||||
paddleocrModelNamePlaceholder: 'Ad esempio: paddleocr-ambiente-1',
|
||||
},
|
||||
chunk: {
|
||||
chunk: 'Chunk',
|
||||
@ -785,6 +796,17 @@ Quanto sopra è il contenuto che devi riassumere.`,
|
||||
modelTypeMessage: 'Inserisci il tuo tipo di modello!',
|
||||
addLlmBaseUrl: 'URL base',
|
||||
baseUrlNameMessage: 'Inserisci il tuo URL base!',
|
||||
paddleocr: {
|
||||
apiUrl: 'URL API di PaddleOCR',
|
||||
apiUrlPlaceholder: 'Ad esempio: https://paddleocr-server.com/layout-parsing',
|
||||
accessToken: 'Token di accesso AI Studio',
|
||||
accessTokenPlaceholder: 'Il tuo token AI Studio (facoltativo)',
|
||||
algorithm: 'Algoritmo PaddleOCR',
|
||||
selectAlgorithm: 'Seleziona algoritmo',
|
||||
modelNamePlaceholder: 'Ad esempio: paddleocr-from-env-1',
|
||||
modelNameRequired: 'Il nome del modello è obbligatorio',
|
||||
apiUrlRequired: 'L’URL API di PaddleOCR è obbligatorio'
|
||||
},
|
||||
vision: 'Supporta Vision?',
|
||||
ollamaLink: 'Come integrare {{name}}',
|
||||
FishAudioLink: 'Come usare FishAudio',
|
||||
|
||||
@ -240,7 +240,7 @@ export default {
|
||||
<b>XLSX</b>形式のファイルには、ヘッダーのない2つの
|
||||
列が必要です: 1つは質問の列でもう1つは回答の列です
|
||||
(質問列が先行)。複数のシートも可能です。
|
||||
|
||||
|
||||
</li>
|
||||
<li>
|
||||
<b>CSV/TXT</b>形式のファイルは、TABで区切られたUTF-8エンコードである必要があります。
|
||||
@ -285,7 +285,7 @@ export default {
|
||||
LLMがその量のコンテキスト長を処理できる場合に、ドキュメント全体を要約する必要があるときに適用されます。
|
||||
</p>`,
|
||||
knowledgeGraph: `<p>対応ファイル形式は<b>DOCX, EXCEL, PPT, IMAGE, PDF, TXT, MD, JSON, EML</b>です。
|
||||
|
||||
|
||||
<p>このアプローチでは、ファイルを'ナイーブ'/'一般'メソッドを使用してチャンクに分割します。ドキュメントをセグメントに分割し、隣接するセグメントを結合してトークン数が'チャンクトークン数'で指定されたしきい値を超えるまで続け、その時点でチャンクが作成されます。</p>
|
||||
<p>その後、チャンクはLLMに入力され、ナレッジグラフとマインドマップのエンティティと関係を抽出します。</p>
|
||||
<p><b>エンティティタイプ</b>を設定することを忘れないでください。</p>`,
|
||||
@ -314,6 +314,17 @@ export default {
|
||||
entityTypes: 'エンティティタイプ',
|
||||
pageRank: 'ページランク',
|
||||
pageRankTip: `検索時に特定の知識ベースにより高いPageRankスコアを割り当てることができます。対応するスコアは、これらの知識ベースから取得されたチャンクのハイブリッド類似度スコアに加算され、ランキングが向上します。詳細については、https://ragflow.io/docs/dev/set_page_rank を参照してください。`,
|
||||
paddleocrOptions: 'PaddleOCRオプション',
|
||||
paddleocrApiUrl: 'PaddleOCR API URL',
|
||||
paddleocrApiUrlTip: 'PaddleOCRサービスのAPIエンドポイントURL',
|
||||
paddleocrApiUrlPlaceholder: '例: https://paddleocr-server.com/api',
|
||||
paddleocrAccessToken: 'AI Studioアクセストークン',
|
||||
paddleocrAccessTokenTip: 'PaddleOCR APIのアクセストークン(オプション)',
|
||||
paddleocrAccessTokenPlaceholder: 'AI Studioトークン(オプション)',
|
||||
paddleocrAlgorithm: 'PaddleOCRアルゴリズム',
|
||||
paddleocrAlgorithmTip: 'PaddleOCR解析に使用するアルゴリズム',
|
||||
paddleocrSelectAlgorithm: 'アルゴリズムを選択',
|
||||
paddleocrModelNamePlaceholder: '例: paddleocr-from-env-1',
|
||||
},
|
||||
chunk: {
|
||||
chunk: 'チャンク',
|
||||
@ -596,6 +607,17 @@ export default {
|
||||
modelTypeMessage: 'モデルタイプを入力してください!',
|
||||
addLlmBaseUrl: 'ベースURL',
|
||||
baseUrlNameMessage: 'ベースURLを入力してください!',
|
||||
paddleocr: {
|
||||
apiUrl: 'PaddleOCR API URL',
|
||||
apiUrlPlaceholder: '例:https://paddleocr-server.com/layout-parsing',
|
||||
accessToken: 'AI Studio アクセストークン',
|
||||
accessTokenPlaceholder: 'AI Studio のトークン(任意)',
|
||||
algorithm: 'PaddleOCR アルゴリズム',
|
||||
selectAlgorithm: 'アルゴリズムを選択',
|
||||
modelNamePlaceholder: '例:paddleocr-from-env-1',
|
||||
modelNameRequired: 'モデル名は必須です',
|
||||
apiUrlRequired: 'PaddleOCR API URL は必須です'
|
||||
},
|
||||
vision: 'ビジョンをサポートしていますか?',
|
||||
ollamaLink: '{{name}}を統合する方法',
|
||||
FishAudioLink: 'FishAudioの使用方法',
|
||||
|
||||
@ -310,6 +310,17 @@ export default {
|
||||
topnTags: 'Top-N Etiquetas',
|
||||
tags: 'Etiquetas',
|
||||
addTag: 'Adicionar etiqueta',
|
||||
paddleocrOptions: 'Opções do PaddleOCR',
|
||||
paddleocrApiUrl: 'URL da API do PaddleOCR',
|
||||
paddleocrApiUrlTip: 'A URL do endpoint da API para o serviço PaddleOCR',
|
||||
paddleocrApiUrlPlaceholder: 'ex: https://servidor-paddleocr.com/api',
|
||||
paddleocrAccessToken: 'Token de Acesso do AI Studio',
|
||||
paddleocrAccessTokenTip: 'Token de acesso para a API do PaddleOCR (opcional)',
|
||||
paddleocrAccessTokenPlaceholder: 'Seu token do AI Studio (opcional)',
|
||||
paddleocrAlgorithm: 'Algoritmo do PaddleOCR',
|
||||
paddleocrAlgorithmTip: 'Algoritmo a ser usado para a análise do PaddleOCR',
|
||||
paddleocrSelectAlgorithm: 'Selecionar algoritmo',
|
||||
paddleocrModelNamePlaceholder: 'ex: paddleocr-do-ambiente-1',
|
||||
},
|
||||
chunk: {
|
||||
chunk: 'Fragmento',
|
||||
@ -546,6 +557,17 @@ export default {
|
||||
modelTypeMessage: 'Por favor, insira o tipo do seu modelo!',
|
||||
addLlmBaseUrl: 'URL base',
|
||||
baseUrlNameMessage: 'Por favor, insira sua URL base!',
|
||||
paddleocr: {
|
||||
apiUrl: 'URL da API do PaddleOCR',
|
||||
apiUrlPlaceholder: 'Por exemplo: https://paddleocr-server.com/layout-parsing',
|
||||
accessToken: 'Token de acesso do AI Studio',
|
||||
accessTokenPlaceholder: 'Seu token do AI Studio (opcional)',
|
||||
algorithm: 'Algoritmo do PaddleOCR',
|
||||
selectAlgorithm: 'Selecionar algoritmo',
|
||||
modelNamePlaceholder: 'Por exemplo: paddleocr-from-env-1',
|
||||
modelNameRequired: 'O nome do modelo é obrigatório',
|
||||
apiUrlRequired: 'A URL da API do PaddleOCR é obrigatória'
|
||||
},
|
||||
vision: 'Suporta visão?',
|
||||
ollamaLink: 'Como integrar {{name}}',
|
||||
FishAudioLink: 'Como usar FishAudio',
|
||||
|
||||
@ -510,6 +510,17 @@ export default {
|
||||
'В графе знаний сообщество - это кластер сущностей, связанных отношениями. Вы можете поручить LLM генерировать аннотацию для каждого сообщества, известную как отчет сообщества. Более подробная информация здесь: https://www.microsoft.com/en-us/research/blog/graphrag-improving-global-search-via-dynamic-community-selection/',
|
||||
theDocumentBeingParsedCannotBeDeleted:
|
||||
'Документ, который в данный момент парсится, не может быть удален',
|
||||
paddleocrOptions: 'Параметры PaddleOCR',
|
||||
paddleocrApiUrl: 'URL API PaddleOCR',
|
||||
paddleocrApiUrlTip: 'URL конечной точки API сервиса PaddleOCR',
|
||||
paddleocrApiUrlPlaceholder: 'Например: https://paddleocr-server.com/layout-parsing',
|
||||
paddleocrAccessToken: 'Токен доступа AI Studio',
|
||||
paddleocrAccessTokenTip: 'Токен доступа к API PaddleOCR (необязательно)',
|
||||
paddleocrAccessTokenPlaceholder: 'Ваш токен AI Studio (необязательно)',
|
||||
paddleocrAlgorithm: 'Алгоритм PaddleOCR',
|
||||
paddleocrAlgorithmTip: 'Алгоритм, используемый для обработки PaddleOCR',
|
||||
paddleocrSelectAlgorithm: 'Выбрать алгоритм',
|
||||
paddleocrModelNamePlaceholder: 'Например: paddleocr-среда-1',
|
||||
},
|
||||
chunk: {
|
||||
chunk: 'Чанк',
|
||||
@ -716,7 +727,7 @@ export default {
|
||||
'Базовый URL вашего экземпляра Confluence (например, https://your-domain.atlassian.net/wiki)',
|
||||
confluenceSpaceKeyTip:
|
||||
'Необязательно: Укажите ключ пространства для синхронизации только определенного пространства. Оставьте пустым для синхронизации всех доступных пространств. Для нескольких пространств разделите запятыми (например, DEV,DOCS,HR)',
|
||||
s3PrefixTip: `Укажите путь к папке в вашем S3 бакете для получения файлов.
|
||||
s3PrefixTip: `Укажите путь к папке в вашем S3 бакете для получения файлов.
|
||||
Пример: general/v2/`,
|
||||
S3CompatibleEndpointUrlTip: `Требуется для S3 совместимого Storage Box. Укажите URL конечной точки, совместимой с S3.
|
||||
Пример: https://fsn1.your-objectstorage.com`,
|
||||
@ -1034,6 +1045,17 @@ export default {
|
||||
modelsToBeAddedTooltip:
|
||||
'Если ваш провайдер моделей не указан, но заявляет о "совместимости с OpenAI-API", выберите карточку OpenAI-API-compatible, чтобы добавить соответствующие модели. ',
|
||||
mcp: 'MCP',
|
||||
paddleocr: {
|
||||
apiUrl: 'URL API PaddleOCR',
|
||||
apiUrlPlaceholder: 'Например: https://paddleocr-server.com/layout-parsing',
|
||||
accessToken: 'Токен доступа AI Studio',
|
||||
accessTokenPlaceholder: 'Ваш токен AI Studio (необязательно)',
|
||||
algorithm: 'Алгоритм PaddleOCR',
|
||||
selectAlgorithm: 'Выбрать алгоритм',
|
||||
modelNamePlaceholder: 'Например: paddleocr-from-env-1',
|
||||
modelNameRequired: 'Имя модели является обязательным',
|
||||
apiUrlRequired: 'URL API PaddleOCR является обязательным'
|
||||
},
|
||||
},
|
||||
message: {
|
||||
registered: 'Зарегистрирован!',
|
||||
|
||||
@ -354,6 +354,17 @@ export default {
|
||||
community: 'Xây dựng mối quan hệ cộng đồng',
|
||||
communityTip:
|
||||
'Các liên kết được nhóm lại thành các cộng đồng phân cấp, với các thực thể và mối quan hệ kết nối từng phân đoạn lên các cấp độ trừu tượng cao hơn. Sau đó, chúng tôi sử dụng một LLM để tạo ra bản tóm tắt cho mỗi cộng đồng, được gọi là báo cáo cộng đồng. Xem thêm: https://www.microsoft.com/en-us/research/blog/graphrag-improving-global-search-via-dynamic-community-selection/',
|
||||
paddleocrOptions: 'Tùy chọn PaddleOCR',
|
||||
paddleocrApiUrl: 'URL API PaddleOCR',
|
||||
paddleocrApiUrlTip: 'URL điểm cuối API của dịch vụ PaddleOCR',
|
||||
paddleocrApiUrlPlaceholder: 'Ví dụ: https://paddleocr-server.com/layout-parsing',
|
||||
paddleocrAccessToken: 'Token truy cập AI Studio',
|
||||
paddleocrAccessTokenTip: 'Token truy cập cho API PaddleOCR (tùy chọn)',
|
||||
paddleocrAccessTokenPlaceholder: 'Token AI Studio của bạn (tùy chọn)',
|
||||
paddleocrAlgorithm: 'Thuật toán PaddleOCR',
|
||||
paddleocrAlgorithmTip: 'Thuật toán được sử dụng để xử lý PaddleOCR',
|
||||
paddleocrSelectAlgorithm: 'Chọn thuật toán',
|
||||
paddleocrModelNamePlaceholder: 'Ví dụ: paddleocr-môi-trường-1',
|
||||
},
|
||||
chunk: {
|
||||
chunk: 'Khối',
|
||||
@ -595,6 +606,17 @@ export default {
|
||||
modelTypeMessage: 'Vui lòng nhập loại mô hình của bạn!',
|
||||
addLlmBaseUrl: 'URL cơ sở',
|
||||
baseUrlNameMessage: 'Vui lòng nhập URL cơ sở của bạn!',
|
||||
paddleocr: {
|
||||
apiUrl: 'URL API PaddleOCR',
|
||||
apiUrlPlaceholder: 'Ví dụ: https://paddleocr-server.com/layout-parsing',
|
||||
accessToken: 'Token truy cập AI Studio',
|
||||
accessTokenPlaceholder: 'Token AI Studio của bạn (tùy chọn)',
|
||||
algorithm: 'Thuật toán PaddleOCR',
|
||||
selectAlgorithm: 'Chọn thuật toán',
|
||||
modelNamePlaceholder: 'Ví dụ: paddleocr-from-env-1',
|
||||
modelNameRequired: 'Tên mô hình là bắt buộc',
|
||||
apiUrlRequired: 'URL API PaddleOCR là bắt buộc'
|
||||
},
|
||||
vision: 'Có hỗ trợ Tầm nhìn không?',
|
||||
ollamaLink: 'Cách tích hợp {{name}}',
|
||||
FishAudioLink: 'Cách sử dụng FishAudio',
|
||||
|
||||
@ -367,6 +367,17 @@ export default {
|
||||
`,
|
||||
tags: '標籤',
|
||||
addTag: '增加標籤',
|
||||
paddleocrOptions: 'PaddleOCR 選項',
|
||||
paddleocrApiUrl: 'PaddleOCR API URL',
|
||||
paddleocrApiUrlTip: 'PaddleOCR 服務的 API 端點 URL',
|
||||
paddleocrApiUrlPlaceholder: '例如:https://paddleocr-server.com/layout-parsing',
|
||||
paddleocrAccessToken: 'AI Studio 訪問令牌',
|
||||
paddleocrAccessTokenTip: 'PaddleOCR API 的訪問令牌(可選)',
|
||||
paddleocrAccessTokenPlaceholder: '您的 AI Studio 令牌(可選)',
|
||||
paddleocrAlgorithm: 'PaddleOCR 算法',
|
||||
paddleocrAlgorithmTip: '用於 PaddleOCR 解析的算法',
|
||||
paddleocrSelectAlgorithm: '選擇算法',
|
||||
paddleocrModelNamePlaceholder: '例如:paddleocr-環境-1',
|
||||
useGraphRag: '提取知識圖譜',
|
||||
useGraphRagTip:
|
||||
'基於知識庫內所有切好的文本塊構建知識圖譜,用以提升多跳和複雜問題回答的正確率。請注意:構建知識圖譜將消耗大量 token 和時間。詳見 https://ragflow.io/docs/dev/construct_knowledge_graph。',
|
||||
@ -644,6 +655,17 @@ export default {
|
||||
modelNameMessage: '請輸入模型名稱!',
|
||||
modelTypeMessage: '請輸入模型類型!',
|
||||
baseUrlNameMessage: '請輸入基礎 Url!',
|
||||
paddleocr: {
|
||||
apiUrl: 'PaddleOCR API URL',
|
||||
apiUrlPlaceholder: '例如:https://paddleocr-server.com/layout-parsing',
|
||||
accessToken: 'AI Studio 存取權杖',
|
||||
accessTokenPlaceholder: '您的 AI Studio 權杖(選填)',
|
||||
algorithm: 'PaddleOCR 演算法',
|
||||
selectAlgorithm: '選擇演算法',
|
||||
modelNamePlaceholder: '例如:paddleocr-from-env-1',
|
||||
modelNameRequired: '模型名稱為必填項目',
|
||||
apiUrlRequired: 'PaddleOCR API URL 為必填項目'
|
||||
},
|
||||
ollamaLink: '如何集成 {{name}}',
|
||||
FishAudioLink: '如何使用Fish Audio',
|
||||
TencentCloudLink: '如何使用騰訊雲語音識別',
|
||||
|
||||
@ -390,6 +390,17 @@ export default {
|
||||
'启用公式识别。注意:对于西里尔文档可能无法正常工作。',
|
||||
mineruTableEnable: '表格识别',
|
||||
mineruTableEnableTip: '启用表格识别和提取。',
|
||||
paddleocrOptions: 'PaddleOCR 选项',
|
||||
paddleocrApiUrl: 'PaddleOCR API URL',
|
||||
paddleocrApiUrlTip: 'PaddleOCR 服务的 API 端点 URL',
|
||||
paddleocrApiUrlPlaceholder: '例如:https://paddleocr-server.com/layout-parsing',
|
||||
paddleocrAccessToken: 'AI Studio 访问令牌',
|
||||
paddleocrAccessTokenTip: 'PaddleOCR API 的访问令牌(可选)',
|
||||
paddleocrAccessTokenPlaceholder: '您的 AI Studio 令牌(可选)',
|
||||
paddleocrAlgorithm: 'PaddleOCR 算法',
|
||||
paddleocrAlgorithmTip: '用于 PaddleOCR 解析的算法',
|
||||
paddleocrSelectAlgorithm: '选择算法',
|
||||
paddleocrModelNamePlaceholder: '例如:paddleocr-环境-1',
|
||||
generationScopeTip: '选择 RAPTOR 的生成范围:整个知识库或单个文件。',
|
||||
generationScope: '生成范围',
|
||||
scopeSingleFile: '单文件',
|
||||
@ -1113,6 +1124,17 @@ General:实体和关系提取提示来自 GitHub - microsoft/graphrag:基于
|
||||
vlmLmdeployEngine: '基于LMDeploy引擎的视觉语言模型(实验性)',
|
||||
},
|
||||
},
|
||||
paddleocr: {
|
||||
apiUrl: 'PaddleOCR API URL',
|
||||
apiUrlPlaceholder: '例如:https://paddleocr-server.com/layout-parsing',
|
||||
accessToken: 'AI Studio访问令牌',
|
||||
accessTokenPlaceholder: '您的 AI Studio 令牌(可选)',
|
||||
algorithm: 'PaddleOCR算法',
|
||||
selectAlgorithm: '选择算法',
|
||||
modelNamePlaceholder: '例如:paddleocr-from-env-1',
|
||||
modelNameRequired: '模型名称为必填项',
|
||||
apiUrlRequired: 'PaddleOCR API URL 为必填项'
|
||||
},
|
||||
},
|
||||
message: {
|
||||
registered: '注册成功',
|
||||
|
||||
@ -504,3 +504,43 @@ export const useSubmitMinerU = () => {
    mineruLoading: loading,
  };
};

export const useSubmitPaddleOCR = () => {
  const { addLlm, loading } = useAddLlm();
  const {
    visible: paddleocrVisible,
    hideModal: hidePaddleOCRModal,
    showModal: showPaddleOCRModal,
  } = useSetModalState();

  const onPaddleOCROk = useCallback(
    async (payload: any) => {
      const cfg: any = {
        ...payload,
      };
      const req: IAddLlmRequestBody = {
        llm_factory: LLMFactory.PaddleOCR,
        llm_name: payload.llm_name,
        model_type: 'ocr',
        api_key: cfg,
        api_base: '',
        max_tokens: 0,
      };
      const ret = await addLlm(req);
      if (ret === 0) {
        hidePaddleOCRModal();
        return true;
      }
      return false;
    },
    [addLlm, hidePaddleOCRModal],
  );

  return {
    paddleocrVisible,
    hidePaddleOCRModal,
    showPaddleOCRModal,
    onPaddleOCROk,
    paddleocrLoading: loading,
  };
};
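For clarity, here is roughly the add-LLM request that onPaddleOCROk assembles: the entire modal payload is spread into cfg and submitted as api_key, so the PaddleOCR connection settings travel as one JSON config object. The concrete values below are placeholders, not defaults taken from the code.

```typescript
// Shape sketch of the IAddLlmRequestBody sent by the hook above (placeholder values).
const examplePaddleOCRRequest = {
  llm_factory: 'PaddleOCR',
  llm_name: 'paddleocr-from-env-1',
  model_type: 'ocr',
  api_key: {
    // cfg = { ...payload }: everything the modal collected rides along here
    llm_name: 'paddleocr-from-env-1',
    paddleocr_api_url: 'https://paddleocr-server.com/layout-parsing',
    paddleocr_access_token: '',
    paddleocr_algorithm: 'PaddleOCR-VL',
  },
  api_base: '',
  max_tokens: 0,
};
```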
@ -15,6 +15,7 @@ import {
  useSubmitHunyuan,
  useSubmitMinerU,
  useSubmitOllama,
  useSubmitPaddleOCR,
  useSubmitSpark,
  useSubmitSystemModelSetting,
  useSubmitTencentCloud,
@ -28,6 +29,7 @@ import FishAudioModal from './modal/fish-audio-modal';
import GoogleModal from './modal/google-modal';
import HunyuanModal from './modal/hunyuan-modal';
import MinerUModal from './modal/mineru-modal';
import PaddleOCRModal from './modal/paddleocr-modal';
import TencentCloudModal from './modal/next-tencent-modal';
import OllamaModal from './modal/ollama-modal';
import SparkModal from './modal/spark-modal';
@ -138,6 +140,14 @@ const ModelProviders = () => {
    mineruLoading,
  } = useSubmitMinerU();

  const {
    paddleocrVisible,
    hidePaddleOCRModal,
    showPaddleOCRModal,
    onPaddleOCROk,
    paddleocrLoading,
  } = useSubmitPaddleOCR();

  const ModalMap = useMemo(
    () => ({
      [LLMFactory.Bedrock]: showBedrockAddingModal,
@ -150,6 +160,7 @@ const ModelProviders = () => {
      [LLMFactory.GoogleCloud]: showGoogleAddingModal,
      [LLMFactory.AzureOpenAI]: showAzureAddingModal,
      [LLMFactory.MinerU]: showMineruModal,
      [LLMFactory.PaddleOCR]: showPaddleOCRModal,
    }),
    [
      showBedrockAddingModal,
@ -162,6 +173,7 @@ const ModelProviders = () => {
      showGoogleAddingModal,
      showAzureAddingModal,
      showMineruModal,
      showPaddleOCRModal,
    ],
  );

@ -309,6 +321,12 @@ const ModelProviders = () => {
        onOk={onMineruOk}
        loading={mineruLoading}
      ></MinerUModal>
      <PaddleOCRModal
        visible={paddleocrVisible}
        hideModal={hidePaddleOCRModal}
        onOk={onPaddleOCROk}
        loading={paddleocrLoading}
      ></PaddleOCRModal>
    </div>
  );
};
@ -0,0 +1,135 @@
import { useForm } from 'react-hook-form';
import { useTranslation } from 'react-i18next';
import { z } from 'zod';
import { zodResolver } from '@hookform/resolvers/zod';
import { t } from 'i18next';
import {
  Dialog,
  DialogContent,
  DialogHeader,
  DialogTitle,
} from '@/components/ui/dialog';
import { RAGFlowFormItem } from '@/components/ragflow-form';
import { RAGFlowSelect, RAGFlowSelectOptionType } from '@/components/ui/select';
import { Input } from '@/components/ui/input';
import { Form } from '@/components/ui/form';
import { LLMHeader } from '../../components/llm-header';
import { LLMFactory } from '@/constants/llm';

const FormSchema = z.object({
  llm_name: z.string().min(1, {
    message: t('setting.paddleocr.modelNameRequired'),
  }),
  paddleocr_api_url: z.string().min(1, {
    message: t('setting.paddleocr.apiUrlRequired'),
  }),
  paddleocr_access_token: z.string().optional(),
  paddleocr_algorithm: z.string().default('PaddleOCR-VL'),
});

export type PaddleOCRFormValues = z.infer<typeof FormSchema>;

export interface IModalProps<T> {
  visible: boolean;
  hideModal: () => void;
  onOk?: (data: T) => Promise<boolean>;
  loading?: boolean;
}

const algorithmOptions: RAGFlowSelectOptionType[] = [
  { label: 'PaddleOCR-VL', value: 'PaddleOCR-VL' },
];

const PaddleOCRModal = ({
  visible,
  hideModal,
  onOk,
  loading,
}: IModalProps<PaddleOCRFormValues>) => {
  const { t } = useTranslation();

  const form = useForm<PaddleOCRFormValues>({
    resolver: zodResolver(FormSchema),
    defaultValues: {
      paddleocr_algorithm: 'PaddleOCR-VL',
    },
  });

  const handleOk = async (values: PaddleOCRFormValues) => {
    const ret = await onOk?.(values as any);
    if (ret) {
      hideModal?.();
    }
  };

  return (
    <Dialog open={visible} onOpenChange={hideModal}>
      <DialogContent>
        <DialogHeader>
          <DialogTitle>
            <LLMHeader name={LLMFactory.PaddleOCR} />
          </DialogTitle>
        </DialogHeader>
        <Form {...form}>
          <form
            onSubmit={form.handleSubmit(handleOk)}
            className="space-y-6"
            id="paddleocr-form"
          >
            <RAGFlowFormItem
              name="llm_name"
              label={t('setting.modelName')}
              required
            >
              <Input placeholder={t('setting.paddleocr.modelNamePlaceholder')} />
            </RAGFlowFormItem>
            <RAGFlowFormItem
              name="paddleocr_api_url"
              label={t('setting.paddleocr.apiUrl')}
              required
            >
              <Input placeholder={t('setting.paddleocr.apiUrlPlaceholder')} />
            </RAGFlowFormItem>
            <RAGFlowFormItem
              name="paddleocr_access_token"
              label={t('setting.paddleocr.accessToken')}
            >
              <Input placeholder={t('setting.paddleocr.accessTokenPlaceholder')} />
            </RAGFlowFormItem>
            <RAGFlowFormItem
              name="paddleocr_algorithm"
              label={t('setting.paddleocr.algorithm')}
            >
              {(field) => (
                <RAGFlowSelect
                  value={field.value}
                  onChange={field.onChange}
                  options={algorithmOptions}
                  placeholder={t('setting.paddleocr.selectAlgorithm')}
                />
              )}
            </RAGFlowFormItem>
            <div className="flex justify-end space-x-2">
              <button
                type="button"
                onClick={hideModal}
                className="btn btn-secondary"
              >
                {t('common.cancel')}
              </button>
              <button
                type="submit"
                disabled={loading}
                className="btn btn-primary"
              >
                {loading ? t('common.adding') : t('common.add')}
              </button>
            </div>
          </form>
        </Form>
      </DialogContent>
    </Dialog>
  );
};

export default PaddleOCRModal;
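As a quick sanity check on the form schema above (FormSchema is module-private, so treat it as exported for the sake of this sketch): llm_name and paddleocr_api_url are required, the access token is optional, and the algorithm falls back to 'PaddleOCR-VL'.

```typescript
// Assuming FormSchema were exported from paddleocr-modal for this check.
const parsed = FormSchema.safeParse({
  llm_name: 'paddleocr-from-env-1',
  paddleocr_api_url: 'https://paddleocr-server.com/layout-parsing',
});
if (parsed.success) {
  console.log(parsed.data.paddleocr_algorithm); // 'PaddleOCR-VL', the zod default is applied
}
```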