feat: add paddleocr parser (#12513)

### What problem does this PR solve?

Add PaddleOCR as a new PDF parser. PaddleOCR is wired in as an LLM factory with model type OCR: `add_llm` accepts its `api_key`/`provider_order` settings the same way as MinerU, and `TenantLLMService` gains helpers that bootstrap a tenant-level PaddleOCR model entry from environment variables (`ensure_paddleocr_from_env`).

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
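
For orientation, here is a minimal sketch of how the env-driven bootstrap added below is meant to be exercised. The real variable names are defined by `PADDLEOCR_ENV_KEYS` in `common/constants.py` and are not visible in this diff, so `PADDLEOCR_API_KEY` is a hypothetical placeholder; the service's module path is likewise assumed:

```python
# Hypothetical usage of the env-driven PaddleOCR registration added in this PR.
import os

# Placeholder variable name; the actual keys come from PADDLEOCR_ENV_KEYS.
os.environ["PADDLEOCR_API_KEY"] = "sk-..."

from api.db.services.tenant_llm_service import TenantLLMService  # module path assumed

# Creates "paddleocr-from-env-1" on first call; later calls with an identical
# env config return the existing entry instead of creating a duplicate.
llm_name = TenantLLMService.ensure_paddleocr_from_env(tenant_id="some-tenant-id")
```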
Author: Lin Manhui (committed by GitHub)
Date: 2026-01-09 17:48:45 +08:00
Parent: 6abf55c048
Commit: 2e09db02f3
34 changed files with 1510 additions and 453 deletions


@@ -195,6 +195,9 @@ async def add_llm():
     elif factory == "MinerU":
         api_key = apikey_json(["api_key", "provider_order"])
+    elif factory == "PaddleOCR":
+        api_key = apikey_json(["api_key", "provider_order"])
     llm = {
         "tenant_id": current_user.id,
         "llm_factory": factory,
@@ -230,8 +233,7 @@ async def add_llm():
         **extra,
     )
     try:
-        m, tc = await mdl.async_chat(None, [{"role": "user", "content": "Hello! How are you doing!"}],
-                                     {"temperature": 0.9})
+        m, tc = await mdl.async_chat(None, [{"role": "user", "content": "Hello! How are you doing!"}], {"temperature": 0.9})
         if not tc and m.find("**ERROR**:") >= 0:
             raise Exception(m)
     except Exception as e:
@@ -381,7 +383,7 @@ def list_app():
     facts = set([o.to_dict()["llm_factory"] for o in objs if o.api_key and o.status == StatusEnum.VALID.value])
     status = {(o.llm_name + "@" + o.llm_factory) for o in objs if o.status == StatusEnum.VALID.value}
     llms = LLMService.get_all()
-    llms = [m.to_dict() for m in llms if m.status == StatusEnum.VALID.value and m.fid not in weighted and (m.fid == 'Builtin' or (m.llm_name + "@" + m.fid) in status)]
+    llms = [m.to_dict() for m in llms if m.status == StatusEnum.VALID.value and m.fid not in weighted and (m.fid == "Builtin" or (m.llm_name + "@" + m.fid) in status)]
     for m in llms:
         m["available"] = m["fid"] in facts or m["llm_name"].lower() == "flag-embedding" or m["fid"] in self_deployed
         if "tei-" in os.getenv("COMPOSE_PROFILES", "") and m["model_type"] == LLMType.EMBEDDING and m["fid"] == "Builtin" and m["llm_name"] == os.getenv("TEI_MODEL", ""):


@@ -19,7 +19,7 @@ import logging
 from peewee import IntegrityError
 from langfuse import Langfuse
 from common import settings
-from common.constants import MINERU_DEFAULT_CONFIG, MINERU_ENV_KEYS, LLMType
+from common.constants import MINERU_DEFAULT_CONFIG, MINERU_ENV_KEYS, PADDLEOCR_DEFAULT_CONFIG, PADDLEOCR_ENV_KEYS, LLMType
 from api.db.db_models import DB, LLMFactories, TenantLLM
 from api.db.services.common_service import CommonService
 from api.db.services.langfuse_service import TenantLangfuseService
@@ -60,10 +60,8 @@ class TenantLLMService(CommonService):
     @classmethod
     @DB.connection_context()
     def get_my_llms(cls, tenant_id):
-        fields = [cls.model.llm_factory, LLMFactories.logo, LLMFactories.tags, cls.model.model_type, cls.model.llm_name,
-                  cls.model.used_tokens, cls.model.status]
-        objs = cls.model.select(*fields).join(LLMFactories, on=(cls.model.llm_factory == LLMFactories.name)).where(
-            cls.model.tenant_id == tenant_id, ~cls.model.api_key.is_null()).dicts()
+        fields = [cls.model.llm_factory, LLMFactories.logo, LLMFactories.tags, cls.model.model_type, cls.model.llm_name, cls.model.used_tokens, cls.model.status]
+        objs = cls.model.select(*fields).join(LLMFactories, on=(cls.model.llm_factory == LLMFactories.name)).where(cls.model.tenant_id == tenant_id, ~cls.model.api_key.is_null()).dicts()
         return list(objs)
@@ -90,6 +88,7 @@ class TenantLLMService(CommonService):
     @DB.connection_context()
     def get_model_config(cls, tenant_id, llm_type, llm_name=None):
+        from api.db.services.llm_service import LLMService  # local import to avoid a circular import at module load
         e, tenant = TenantService.get_by_id(tenant_id)
         if not e:
             raise LookupError("Tenant not found")
@@ -119,9 +118,9 @@ class TenantLLMService(CommonService):
         model_config = cls.get_api_key(tenant_id, mdlnm)
         if model_config:
             model_config = model_config.to_dict()
-        elif llm_type == LLMType.EMBEDDING and fid == 'Builtin' and "tei-" in os.getenv("COMPOSE_PROFILES", "") and mdlnm == os.getenv('TEI_MODEL', ''):
+        elif llm_type == LLMType.EMBEDDING and fid == "Builtin" and "tei-" in os.getenv("COMPOSE_PROFILES", "") and mdlnm == os.getenv("TEI_MODEL", ""):
             embedding_cfg = settings.EMBEDDING_CFG
-            model_config = {"llm_factory": 'Builtin', "api_key": embedding_cfg["api_key"], "llm_name": mdlnm, "api_base": embedding_cfg["base_url"]}
+            model_config = {"llm_factory": "Builtin", "api_key": embedding_cfg["api_key"], "llm_name": mdlnm, "api_base": embedding_cfg["base_url"]}
         else:
             raise LookupError(f"Model({mdlnm}@{fid}) not authorized")
@@ -140,33 +139,27 @@ class TenantLLMService(CommonService):
         if llm_type == LLMType.EMBEDDING.value:
             if model_config["llm_factory"] not in EmbeddingModel:
                 return None
-            return EmbeddingModel[model_config["llm_factory"]](model_config["api_key"], model_config["llm_name"],
-                                                               base_url=model_config["api_base"])
+            return EmbeddingModel[model_config["llm_factory"]](model_config["api_key"], model_config["llm_name"], base_url=model_config["api_base"])
         elif llm_type == LLMType.RERANK:
             if model_config["llm_factory"] not in RerankModel:
                 return None
-            return RerankModel[model_config["llm_factory"]](model_config["api_key"], model_config["llm_name"],
-                                                            base_url=model_config["api_base"])
+            return RerankModel[model_config["llm_factory"]](model_config["api_key"], model_config["llm_name"], base_url=model_config["api_base"])
         elif llm_type == LLMType.IMAGE2TEXT.value:
             if model_config["llm_factory"] not in CvModel:
                 return None
-            return CvModel[model_config["llm_factory"]](model_config["api_key"], model_config["llm_name"], lang,
-                                                        base_url=model_config["api_base"], **kwargs)
+            return CvModel[model_config["llm_factory"]](model_config["api_key"], model_config["llm_name"], lang, base_url=model_config["api_base"], **kwargs)
         elif llm_type == LLMType.CHAT.value:
             if model_config["llm_factory"] not in ChatModel:
                 return None
-            return ChatModel[model_config["llm_factory"]](model_config["api_key"], model_config["llm_name"],
-                                                          base_url=model_config["api_base"], **kwargs)
+            return ChatModel[model_config["llm_factory"]](model_config["api_key"], model_config["llm_name"], base_url=model_config["api_base"], **kwargs)
         elif llm_type == LLMType.SPEECH2TEXT:
             if model_config["llm_factory"] not in Seq2txtModel:
                 return None
-            return Seq2txtModel[model_config["llm_factory"]](key=model_config["api_key"],
-                                                             model_name=model_config["llm_name"], lang=lang,
-                                                             base_url=model_config["api_base"])
+            return Seq2txtModel[model_config["llm_factory"]](key=model_config["api_key"], model_name=model_config["llm_name"], lang=lang, base_url=model_config["api_base"])
         elif llm_type == LLMType.TTS:
             if model_config["llm_factory"] not in TTSModel:
                 return None
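
The hunk above is a registry dispatch: `EmbeddingModel`, `RerankModel`, `CvModel`, `ChatModel`, and friends are dicts mapping a factory name to a client class, and the factory recorded in the tenant's model config selects the class (unknown factories yield `None`). A self-contained toy of the same pattern, with hypothetical names throughout:

```python
# Minimal sketch of the registry-dispatch pattern used above.
# "EchoChat" and the registry contents are hypothetical stand-ins.
class EchoChat:
    def __init__(self, api_key: str, model_name: str, base_url: str = ""):
        self.model_name = model_name

    def chat(self, prompt: str) -> str:
        return f"[{self.model_name}] {prompt}"


ChatModel = {"EchoFactory": EchoChat}  # factory name -> client class


def model_instance(config: dict):
    # Unknown factories yield None rather than raising, mirroring the hunk above.
    if config["llm_factory"] not in ChatModel:
        return None
    return ChatModel[config["llm_factory"]](config["api_key"], config["llm_name"], base_url=config["api_base"])


mdl = model_instance({"llm_factory": "EchoFactory", "api_key": "k", "llm_name": "echo-1", "api_base": ""})
print(mdl.chat("hello"))  # prints: [echo-1] hello
```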
@@ -216,14 +209,11 @@ class TenantLLMService(CommonService):
         try:
             num = (
                 cls.model.update(used_tokens=cls.model.used_tokens + used_tokens)
-                .where(cls.model.tenant_id == tenant_id, cls.model.llm_name == llm_name,
-                       cls.model.llm_factory == llm_factory if llm_factory else True)
+                .where(cls.model.tenant_id == tenant_id, cls.model.llm_name == llm_name, cls.model.llm_factory == llm_factory if llm_factory else True)
                 .execute()
             )
         except Exception:
-            logging.exception(
-                "TenantLLMService.increase_usage got exception,Failed to update used_tokens for tenant_id=%s, llm_name=%s",
-                tenant_id, llm_name)
+            logging.exception("TenantLLMService.increase_usage failed to update used_tokens for tenant_id=%s, llm_name=%s", tenant_id, llm_name)
             return 0
         return num
@@ -231,9 +221,7 @@ class TenantLLMService(CommonService):
     @classmethod
     @DB.connection_context()
     def get_openai_models(cls):
-        objs = cls.model.select().where((cls.model.llm_factory == "OpenAI"),
-                                        ~(cls.model.llm_name == "text-embedding-3-small"),
-                                        ~(cls.model.llm_name == "text-embedding-3-large")).dicts()
+        objs = cls.model.select().where((cls.model.llm_factory == "OpenAI"), ~(cls.model.llm_name == "text-embedding-3-small"), ~(cls.model.llm_name == "text-embedding-3-large")).dicts()
         return list(objs)

     @classmethod
@@ -298,6 +286,68 @@ class TenantLLMService(CommonService):
                 idx += 1
                 continue

+    @classmethod
+    def _collect_paddleocr_env_config(cls) -> dict | None:
+        # Copy the defaults so env overrides never mutate the shared module-level dict.
+        cfg = dict(PADDLEOCR_DEFAULT_CONFIG)
+        found = False
+        for key in PADDLEOCR_ENV_KEYS:
+            val = os.environ.get(key)
+            if val:
+                found = True
+                cfg[key] = val
+        return cfg if found else None
+
+    @classmethod
+    @DB.connection_context()
+    def ensure_paddleocr_from_env(cls, tenant_id: str) -> str | None:
+        """
+        Ensure a PaddleOCR model exists for the tenant if env variables are present.
+        Return the existing or newly created llm_name, or None if env is not set.
+        """
+        cfg = cls._collect_paddleocr_env_config()
+        if not cfg:
+            return None
+
+        saved_paddleocr_models = cls.query(tenant_id=tenant_id, llm_factory="PaddleOCR", model_type=LLMType.OCR.value)
+
+        def _parse_api_key(raw: str) -> dict:
+            try:
+                return json.loads(raw or "{}")
+            except Exception:
+                return {}
+
+        # Reuse an existing entry whose stored config matches the env-derived one.
+        for item in saved_paddleocr_models:
+            api_cfg = _parse_api_key(item.api_key)
+            normalized = {k: api_cfg.get(k, PADDLEOCR_DEFAULT_CONFIG.get(k)) for k in PADDLEOCR_ENV_KEYS}
+            if normalized == cfg:
+                return item.llm_name
+
+        # Otherwise create a new entry under the first free "paddleocr-from-env-<n>" name.
+        used_names = {item.llm_name for item in saved_paddleocr_models}
+        idx = 1
+        base_name = "paddleocr-from-env"
+        while True:
+            candidate = f"{base_name}-{idx}"
+            if candidate in used_names:
+                idx += 1
+                continue
+            try:
+                cls.save(
+                    tenant_id=tenant_id,
+                    llm_factory="PaddleOCR",
+                    llm_name=candidate,
+                    model_type=LLMType.OCR.value,
+                    api_key=json.dumps(cfg),
+                    api_base="",
+                    max_tokens=0,
+                )
+                return candidate
+            except IntegrityError:
+                logging.warning("PaddleOCR env model %s already exists for tenant %s, retrying with the next name", candidate, tenant_id)
+                used_names.add(candidate)
+                idx += 1
+                continue
+
     @classmethod
     @DB.connection_context()
     def delete_by_tenant_id(cls, tenant_id):
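
Worth noting is the dedup rule in `ensure_paddleocr_from_env`: a stored entry matches the current environment iff, after filling missing keys from `PADDLEOCR_DEFAULT_CONFIG`, all `PADDLEOCR_ENV_KEYS` values agree. A runnable sketch of just that comparison, with hypothetical key names standing in for the real constants:

```python
import json

# Toy stand-ins for PADDLEOCR_DEFAULT_CONFIG / PADDLEOCR_ENV_KEYS; the real key
# names live in common/constants.py and are not shown in this diff.
DEFAULTS = {"PADDLEOCR_API_KEY": "", "PADDLEOCR_ENDPOINT": "http://localhost:8080"}
ENV_KEYS = list(DEFAULTS)


def matches(stored_api_key: str, env_cfg: dict) -> bool:
    # Same normalization as above: missing stored keys fall back to the defaults.
    try:
        stored = json.loads(stored_api_key or "{}")
    except Exception:
        stored = {}
    normalized = {k: stored.get(k, DEFAULTS.get(k)) for k in ENV_KEYS}
    return normalized == env_cfg


env_cfg = dict(DEFAULTS, PADDLEOCR_API_KEY="sk-123")
print(matches(json.dumps({"PADDLEOCR_API_KEY": "sk-123"}), env_cfg))  # True: endpoint falls back to the default
print(matches(json.dumps({"PADDLEOCR_API_KEY": "sk-999"}), env_cfg))  # False: key differs
```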
@@ -306,6 +356,7 @@ class TenantLLMService(CommonService):
     @staticmethod
     def llm_id2llm_type(llm_id: str) -> str | None:
+        from api.db.services.llm_service import LLMService  # local import to avoid a circular import at module load
         llm_id, *_ = TenantLLMService.split_model_name_and_factory(llm_id)
         llm_factories = settings.FACTORY_LLM_INFOS
         for llm_factory in llm_factories:
@@ -340,8 +391,7 @@ class LLM4Tenant:
         langfuse_keys = TenantLangfuseService.filter_by_tenant(tenant_id=tenant_id)
         self.langfuse = None
         if langfuse_keys:
-            langfuse = Langfuse(public_key=langfuse_keys.public_key, secret_key=langfuse_keys.secret_key,
-                                host=langfuse_keys.host)
+            langfuse = Langfuse(public_key=langfuse_keys.public_key, secret_key=langfuse_keys.secret_key, host=langfuse_keys.host)
             if langfuse.auth_check():
                 self.langfuse = langfuse
                 trace_id = self.langfuse.create_trace_id()