From 5d022d83e8b2caa10cc55846435bc7e4eeea20fd Mon Sep 17 00:00:00 2001
From: Haruko386
Date: Mon, 25 May 2026 12:12:57 +0800
Subject: [PATCH] Go: implement provider: PaddleOCR_Local (#15158)

### What problem does this PR solve?

Go: implement provider: PaddleOCR_Local

**Verified from CLI**

```
RAGFlow(user)> ocr with 'PaddleOCR-VL@test@paddleocr_local' file './internal/test1.jpg'
+----------------------+
| text                 |
+----------------------+
| ## Parallel to these |
+----------------------+
```

### Type of change

- [X] Bug Fix (non-breaking change which fixes an issue)
- [X] New Feature (non-breaking change which adds functionality)
- [X] Refactoring
---
 conf/models/gitee.json                    |   2 +-
 conf/models/mineru_local.json             |   2 +-
 conf/models/paddleocr_local.json          |   7 +
 conf/models/togetherai.json               |   3 +-
 conf/models/zhipu-ai.json                 |   9 +-
 internal/entity/models/factory.go         |   2 +
 internal/entity/models/paddleocr_local.go | 256 ++++++++++++++++++++++
 7 files changed, 277 insertions(+), 4 deletions(-)
 create mode 100644 conf/models/paddleocr_local.json
 create mode 100644 internal/entity/models/paddleocr_local.go

diff --git a/conf/models/gitee.json b/conf/models/gitee.json
index 6b1a0732e..9b9a16f80 100644
--- a/conf/models/gitee.json
+++ b/conf/models/gitee.json
@@ -8,7 +8,7 @@
     "models": "models",
     "status": "",
     "balance": "tokens/packages/balance",
-    "embedding": "embedding",
+    "embedding": "embeddings",
     "rerank": "rerank",
     "ocr": "images/ocr",
     "doc_parse": "async/documents/parse",
diff --git a/conf/models/mineru_local.json b/conf/models/mineru_local.json
index 54bd46e39..a51a75766 100644
--- a/conf/models/mineru_local.json
+++ b/conf/models/mineru_local.json
@@ -1,5 +1,5 @@
 {
-  "name": "mineru_local",
+  "name": "MinerU_local",
   "url_suffix": {
     "doc_parse": "file_parse",
     "task": "tasks"
diff --git a/conf/models/paddleocr_local.json b/conf/models/paddleocr_local.json
new file mode 100644
index 000000000..176fff430
--- /dev/null
+++ b/conf/models/paddleocr_local.json
@@ -0,0 +1,7 @@
+{
+  "name": "PaddleOCR_Local",
+  "url_suffix": {
+    "ocr": "layout-parsing"
+  },
+  "class": "local"
+}
\ No newline at end of file
diff --git a/conf/models/togetherai.json b/conf/models/togetherai.json
index 4907b8db4..2ec898376 100644
--- a/conf/models/togetherai.json
+++ b/conf/models/togetherai.json
@@ -57,7 +57,7 @@
     },
     {
       "name": "mixedbread-ai/mxbai-rerank-large-v2",
-      "max_tokens": "16384",
+      "max_tokens": 16384,
       "model_types": [
         "rerank"
       ]
@@ -76,3 +76,4 @@
     }
   ]
 }
+
diff --git a/conf/models/zhipu-ai.json b/conf/models/zhipu-ai.json
index 2587b82f1..1c95fa417 100644
--- a/conf/models/zhipu-ai.json
+++ b/conf/models/zhipu-ai.json
@@ -12,7 +12,8 @@
     "asr": "audio/transcriptions",
     "tts": "audio/speech",
     "files": "files",
-    "models": "models"
+    "models": "models",
+    "ocr": "layout_parsing"
   },
   "class": "glm",
   "models": [
@@ -268,6 +269,12 @@
       "model_types": [
         "rerank"
       ]
+    },
+    {
+      "name": "glm-ocr",
+      "model_types": [
+        "ocr"
+      ]
     }
   ]
 }
diff --git a/internal/entity/models/factory.go b/internal/entity/models/factory.go
index 66c024706..515d55482 100644
--- a/internal/entity/models/factory.go
+++ b/internal/entity/models/factory.go
@@ -135,6 +135,8 @@ func (f *ModelFactory) CreateModelDriver(providerName string, baseURL map[string
 		return NewGPUStackModel(baseURL, urlSuffix), nil
 	case "n1n":
 		return NewN1NModel(baseURL, urlSuffix), nil
+	case "paddleocr_local":
+		return NewPaddleOCRLocalModel(baseURL, urlSuffix), nil
 	default:
 		return NewDummyModel(baseURL, urlSuffix), nil
 	}
diff --git a/internal/entity/models/paddleocr_local.go b/internal/entity/models/paddleocr_local.go
new file mode 100644
index 000000000..5213c0eff
--- /dev/null
+++ b/internal/entity/models/paddleocr_local.go
@@ -0,0 +1,256 @@
+package models
+
+import (
+	"bytes"
+	"encoding/base64"
+	"encoding/json"
+	"fmt"
+	"io"
+	"net/http"
+	"strings"
+	"time"
+)
+
+// File type codes understood by the PaddleOCR layout-parsing endpoint.
+const (
+	paddleFileTypePDF   = 0
+	paddleFileTypeImage = 1
+)
+
+// PaddleOCRLocalModel is the driver for a self-hosted PaddleOCR server.
+// Only OCRFile is implemented; every other ModelDriver method reports that
+// it is unsupported.
+type PaddleOCRLocalModel struct {
+	BaseURL    map[string]string
+	URLSuffix  URLSuffix
+	httpClient *http.Client
+}
+
+// NewPaddleOCRLocalModel builds a driver that reaches the OCR endpoint at
+// baseURL[region] + "/" + urlSuffix.OCR.
+func NewPaddleOCRLocalModel(baseURL map[string]string, urlSuffix URLSuffix) *PaddleOCRLocalModel {
+	return &PaddleOCRLocalModel{
+		BaseURL:    baseURL,
+		URLSuffix:  urlSuffix,
+		httpClient: newPaddleOCRLocalHTTPClient(),
+	}
+}
+
+// newPaddleOCRLocalHTTPClient returns the HTTP client used by one driver
+// instance. MaxIdleConns caps the idle pool across all hosts, so the
+// per-host limit is kept at the same value instead of exceeding it.
+func newPaddleOCRLocalHTTPClient() *http.Client {
+	return &http.Client{
+		Timeout: 120 * time.Second,
+		Transport: &http.Transport{
+			MaxIdleConns:        10,
+			MaxIdleConnsPerHost: 10,
+			IdleConnTimeout:     90 * time.Second,
+		},
+	}
+}
+
+// NewInstance returns a new driver bound to baseURL that reuses the URL
+// suffixes of p.
+func (p *PaddleOCRLocalModel) NewInstance(baseURL map[string]string) ModelDriver {
+	return NewPaddleOCRLocalModel(baseURL, p.URLSuffix)
+}
+
+// Name returns the provider key of this driver.
+func (p *PaddleOCRLocalModel) Name() string {
+	return "paddleocr_local"
+}
+
+// errUnsupported builds the error returned by every ModelDriver method that
+// this driver does not implement.
+func (p *PaddleOCRLocalModel) errUnsupported() error {
+	return fmt.Errorf("%s no such method", p.Name())
+}
+
+// ChatWithMessages is not supported by local PaddleOCR.
+func (p *PaddleOCRLocalModel) ChatWithMessages(modelName string, messages []Message, apiConfig *APIConfig, chatModelConfig *ChatConfig) (*ChatResponse, error) {
+	return nil, p.errUnsupported()
+}
+
+// ChatStreamlyWithSender is not supported by local PaddleOCR.
+func (p *PaddleOCRLocalModel) ChatStreamlyWithSender(modelName string, messages []Message, apiConfig *APIConfig, modelConfig *ChatConfig, sender func(*string, *string) error) error {
+	return p.errUnsupported()
+}
+
+// Embed is not supported by local PaddleOCR.
+func (p *PaddleOCRLocalModel) Embed(modelName *string, texts []string, apiConfig *APIConfig, embeddingConfig *EmbeddingConfig) ([]EmbeddingData, error) {
+	return nil, p.errUnsupported()
+}
+
+// Rerank is not supported by local PaddleOCR.
+func (p *PaddleOCRLocalModel) Rerank(modelName *string, query string, documents []string, apiConfig *APIConfig, rerankConfig *RerankConfig) (*RerankResponse, error) {
+	return nil, p.errUnsupported()
+}
+
+// TranscribeAudio is not supported by local PaddleOCR.
+func (p *PaddleOCRLocalModel) TranscribeAudio(modelName *string, file *string, apiConfig *APIConfig, asrConfig *ASRConfig) (*ASRResponse, error) {
+	return nil, p.errUnsupported()
+}
+
+// TranscribeAudioWithSender is not supported by local PaddleOCR.
+func (p *PaddleOCRLocalModel) TranscribeAudioWithSender(modelName *string, file *string, apiConfig *APIConfig, asrConfig *ASRConfig, sender func(*string, *string) error) error {
+	return p.errUnsupported()
+}
+
+// AudioSpeech is not supported by local PaddleOCR.
+func (p *PaddleOCRLocalModel) AudioSpeech(modelName *string, audioContent *string, apiConfig *APIConfig, ttsConfig *TTSConfig) (*TTSResponse, error) {
+	return nil, p.errUnsupported()
+}
+
+// AudioSpeechWithSender is not supported by local PaddleOCR.
+func (p *PaddleOCRLocalModel) AudioSpeechWithSender(modelName *string, audioContent *string, apiConfig *APIConfig, ttsConfig *TTSConfig, sender func(*string, *string) error) error {
+	return p.errUnsupported()
+}
+
+// paddleLocalOCRRequest is the JSON body posted to the OCR endpoint: File is
+// the base64-encoded document and FileType is a paddleFileType* constant.
+type paddleLocalOCRRequest struct {
+	File     string `json:"file"`
+	FileType int    `json:"fileType"`
+}
+
+// paddleLocalOCRResponse holds the fields of the OCR reply that this driver
+// reads.
+//
+// PaddleOCR serves each pipeline under its own path, e.g. PaddleOCR-VL at
+// /layout-parsing and PP-OCRv5 at /ocr. This driver targets PaddleOCR-VL.
+type paddleLocalOCRResponse struct {
+	LogID     string `json:"logId"`
+	ErrorCode int    `json:"errorCode"`
+	ErrorMsg  string `json:"errorMsg"`
+	Result    struct {
+		LayoutParsingResults []struct {
+			Markdown struct {
+				Text string `json:"text"`
+			} `json:"markdown"`
+		} `json:"layoutParsingResults"`
+	} `json:"result"`
+}
+
+// OCRFile posts content to the local PaddleOCR endpoint and returns the
+// recognized text as Markdown.
+//
+// content must be non-empty. fileURL is only a file-name hint, consulted when
+// content does not start with the PDF signature. apiConfig.Region selects the
+// base URL and defaults to "default". modelName and ocrConfig are unused.
+// An error is returned when no base URL is configured for the region, the
+// request fails, the server replies with a non-200 status or a non-zero
+// errorCode, or the reply cannot be decoded.
+func (p *PaddleOCRLocalModel) OCRFile(modelName *string, content []byte, fileURL *string, apiConfig *APIConfig, ocrConfig *OCRConfig) (*OCRFileResponse, error) {
+	if len(content) == 0 {
+		return nil, fmt.Errorf("local PaddleOCR requires file content, but content is empty")
+	}
+
+	region := "default"
+	if apiConfig != nil && apiConfig.Region != nil && *apiConfig.Region != "" {
+		region = *apiConfig.Region
+	}
+
+	// Fail early rather than posting to a host-less URL such as "/layout-parsing".
+	baseURL := strings.TrimRight(p.BaseURL[region], "/")
+	if baseURL == "" {
+		return nil, fmt.Errorf("local PaddleOCR base URL is not configured for region %q", region)
+	}
+	endpoint := baseURL + "/" + strings.TrimLeft(p.URLSuffix.OCR, "/")
+
+	payload, err := json.Marshal(paddleLocalOCRRequest{
+		File:     base64.StdEncoding.EncodeToString(content),
+		FileType: paddleLocalFileType(content, fileURL),
+	})
+	if err != nil {
+		return nil, fmt.Errorf("failed to marshal local PaddleOCR request: %w", err)
+	}
+
+	req, err := http.NewRequest(http.MethodPost, endpoint, bytes.NewReader(payload))
+	if err != nil {
+		return nil, fmt.Errorf("failed to create request: %w", err)
+	}
+	req.Header.Set("Content-Type", "application/json")
+
+	resp, err := p.httpClient.Do(req)
+	if err != nil {
+		return nil, fmt.Errorf("failed to send request to local PaddleOCR: %w", err)
+	}
+	defer resp.Body.Close()
+
+	respBody, err := io.ReadAll(resp.Body)
+	if err != nil {
+		return nil, fmt.Errorf("failed to read response body: %w", err)
+	}
+
+	if resp.StatusCode != http.StatusOK {
+		return nil, fmt.Errorf("local PaddleOCR failed with status %d: %s", resp.StatusCode, string(respBody))
+	}
+
+	var ocrResp paddleLocalOCRResponse
+	if err := json.Unmarshal(respBody, &ocrResp); err != nil {
+		return nil, fmt.Errorf("failed to parse local PaddleOCR response: %w, raw: %s", err, string(respBody))
+	}
+
+	if ocrResp.ErrorCode != 0 {
+		return nil, fmt.Errorf("local PaddleOCR task failed: %s (errorCode: %d)", ocrResp.ErrorMsg, ocrResp.ErrorCode)
+	}
+
+	// Concatenate the Markdown of every non-empty layout parsing result.
+	var fullMarkdown strings.Builder
+	for _, layoutRes := range ocrResp.Result.LayoutParsingResults {
+		if layoutRes.Markdown.Text != "" {
+			fullMarkdown.WriteString(layoutRes.Markdown.Text)
+			fullMarkdown.WriteString("\n\n")
+		}
+	}
+
+	extractedText := strings.TrimSpace(fullMarkdown.String())
+
+	return &OCRFileResponse{
+		Text: &extractedText,
+	}, nil
+}
+
+// paddleLocalFileType picks the fileType of the request: PDF when content
+// starts with the "%PDF" signature or fileURL ends in ".pdf" (ignoring
+// case), image otherwise.
+func paddleLocalFileType(content []byte, fileURL *string) int {
+	if bytes.HasPrefix(content, []byte("%PDF")) {
+		return paddleFileTypePDF
+	}
+	if fileURL != nil && strings.HasSuffix(strings.ToLower(*fileURL), ".pdf") {
+		return paddleFileTypePDF
+	}
+	return paddleFileTypeImage
+}
+
+// ParseFile is not supported by local PaddleOCR.
+func (p *PaddleOCRLocalModel) ParseFile(modelName *string, content []byte, url *string, apiConfig *APIConfig, parseFileConfig *ParseFileConfig) (*ParseFileResponse, error) {
+	return nil, p.errUnsupported()
+}
+
+// ListModels is not supported by local PaddleOCR.
+func (p *PaddleOCRLocalModel) ListModels(apiConfig *APIConfig) ([]string, error) {
+	return nil, p.errUnsupported()
+}
+
+// Balance is not supported by local PaddleOCR.
+func (p *PaddleOCRLocalModel) Balance(apiConfig *APIConfig) (map[string]interface{}, error) {
+	return nil, p.errUnsupported()
+}
+
+// CheckConnection is not supported by local PaddleOCR.
+func (p *PaddleOCRLocalModel) CheckConnection(apiConfig *APIConfig) error {
+	return p.errUnsupported()
+}
+
+// ListTasks is not supported by local PaddleOCR.
+func (p *PaddleOCRLocalModel) ListTasks(apiConfig *APIConfig) ([]ListTaskStatus, error) {
+	return nil, p.errUnsupported()
+}
+
+// ShowTask is not supported by local PaddleOCR.
+func (p *PaddleOCRLocalModel) ShowTask(taskID string, apiConfig *APIConfig) (*TaskResponse, error) {
+	return nil, p.errUnsupported()
+}