From ef46005ef1358bf5b50ee9a72d3014c98ffd6535 Mon Sep 17 00:00:00 2001 From: Haruko386 Date: Thu, 14 May 2026 13:19:31 +0800 Subject: [PATCH] Go: implement TTS for MiniMax provider and CLI testing for TTS (#14911) ### What problem does this PR solve? This PR implements TTS for the MiniMax provider and adds CLI support for testing TTS. **The following functionalities are now supported:** **MiniMax:** - [x] Chat / Stream Chat - [x] Embedding - [x] Rerank - [x] Model listing - [x] Provider connection checking - [x] Text To Speech - [ ] OCRFile - [ ] ~~Audio To Text~~ - [ ] ~~Balance~~ **Verified examples from the CLI:** ```plaintext RAGFlow(user)> tts with 'speech-2.8-hd@test@minimax' text 'He who desires but acts not, breeds pestilence.' play format 'wav' save './internal' param '{"voice_setting": {"voice_id": "English_radiant_girl", "speed": 1, "vol": 1, "pitch": 0}, "audio_setting": {"sample_rate": 32000, "bitrate": 128000, "format": "wav", "channel": 1}, "output_format": "hex"}' Saved to directory: /home/infiniflow/Documents/development/ragflow/internal/speech-2.8-hd_output.wav SUCCESS RAGFlow(user)> stream tts with 'speech-2.8-hd@test@minimax' text 'He who desires but acts not, breeds pestilence.' 
play format 'wav' save './internal' param '{"voice_setting": {"voice_id": "English_radiant_girl", "speed": 1, "vol": 1, "pitch": 0}, "audio_setting": {"sample_rate": 32000, "bitrate": 128000, "format": "wav", "channel": 1}, "output_format": "hex"}' Saved to directory: /home/infiniflow/Documents/development/ragflow/internal/speech-2.8-hd_output.wav SUCCESS ``` Add `play` to play the audio from the CLI. Add `save 'PATH_TO_SAVE'` to save the file into that directory. Add `format` to choose the saved file's format (wav or mp3). Pass `param` a JSON string that follows the provider's official request body. ### Type of change - [x] New Feature (non-breaking change which adds functionality) --- conf/models/minimax.json | 7 ++ internal/cli/lexer.go | 8 ++ internal/cli/parser.go | 1 + internal/cli/types.go | 4 + internal/cli/user_command.go | 141 ++++++++++++++++++++-- internal/cli/user_parser.go | 67 ++++++++--- internal/development.md | 8 ++ internal/entity/models/minimax.go | 188 +++++++++++++++++++++++++++++- internal/entity/models/types.go | 3 + internal/handler/providers.go | 17 +-- 10 files changed, 405 insertions(+), 39 deletions(-) diff --git a/conf/models/minimax.json b/conf/models/minimax.json index 31760ac25..49aa6700a 100644 --- a/conf/models/minimax.json +++ b/conf/models/minimax.json @@ -99,6 +99,13 @@ "default_value": true, "clear_thinking": true } + }, + { + "name": "speech-2.8-hd", + "max_tokens": 8192, + "model_types": [ + "tts" + ] } ] } \ No newline at end of file diff --git a/internal/cli/lexer.go b/internal/cli/lexer.go index 5f2aadea1..6a0d1b0ff 100644 --- a/internal/cli/lexer.go +++ b/internal/cli/lexer.go @@ -455,6 +455,14 @@ func (l *Lexer) lookupIdent(ident string) Token { return Token{Type: TokenFatal, Value: ident} case "PANIC": return Token{Type: TokenPanic, Value: ident} + case "PARAM": + return Token{Type: TokenParam, Value: ident} + case "PLAY": + return Token{Type: TokenPlay, Value: ident} + case "FORMAT": + return Token{Type: TokenFormat, Value: ident} + case "SAVE": + return Token{Type: TokenSave, Value: ident} default: return 
Token{Type: TokenIdentifier, Value: ident} } diff --git a/internal/cli/parser.go b/internal/cli/parser.go index 0bba27847..035d6b12e 100644 --- a/internal/cli/parser.go +++ b/internal/cli/parser.go @@ -219,6 +219,7 @@ func (p *Parser) parseUserCommand() (*Command, error) { return p.parseUpdateCommand() case TokenRemove: return p.parseRemoveCommand() + default: return nil, fmt.Errorf("unknown command: %s", p.curToken.Value) } diff --git a/internal/cli/types.go b/internal/cli/types.go index a30f26c6a..9dd32f55c 100644 --- a/internal/cli/types.go +++ b/internal/cli/types.go @@ -105,6 +105,10 @@ const ( TokenEmbed TokenText TokenQuery + TokenFormat + TokenParam + TokenPlay + TokenSave TokenTop TokenDimension TokenAsync diff --git a/internal/cli/user_command.go b/internal/cli/user_command.go index f0f025cee..7a2b27596 100644 --- a/internal/cli/user_command.go +++ b/internal/cli/user_command.go @@ -27,6 +27,7 @@ import ( "net" netUrl "net/url" "os" + "os/exec" "path/filepath" ce "ragflow/internal/cli/filesystem" "strings" @@ -1973,6 +1974,52 @@ func (c *RAGFlowClient) TTSUserCommand(cmd *Command) (ResponseIf, error) { "text": text, } + ttsConfigPayload := make(map[string]interface{}) + + explicitFormat, hasExplicitFormat := cmd.Params["format"].(string) + + if paramStr, ok := cmd.Params["param_str"].(string); ok && paramStr != "" { + var dynamicParams map[string]interface{} + if err := json.Unmarshal([]byte(paramStr), &dynamicParams); err != nil { + return nil, fmt.Errorf("param string must be valid JSON. 
Error: %w", err) + } + + ttsConfigPayload["params"] = dynamicParams + + if !hasExplicitFormat { + var findFormat func(map[string]interface{}) string + findFormat = func(m map[string]interface{}) string { + if val, ok := m["format"]; ok { + return fmt.Sprintf("%v", val) + } + if val, ok := m["response_format"]; ok { + return fmt.Sprintf("%v", val) + } + for _, v := range m { + if subMap, ok := v.(map[string]interface{}); ok { + if res := findFormat(subMap); res != "" { + return res + } + } + } + return "" + } + if ext := findFormat(dynamicParams); ext != "" { + explicitFormat = ext + } + } + } + + if explicitFormat != "" { + ttsConfigPayload["format"] = explicitFormat + } else { + explicitFormat = "mp3" + } + + if len(ttsConfigPayload) > 0 { + payload["tts_config"] = ttsConfigPayload + } + url := "/audio/speech" resp, err := c.HTTPClient.Request("POST", url, "web", nil, payload) @@ -1982,21 +2029,91 @@ func (c *RAGFlowClient) TTSUserCommand(cmd *Command) (ResponseIf, error) { if resp.StatusCode != 200 { return nil, fmt.Errorf("failed to TTS document: HTTP %d, body: %s", resp.StatusCode, string(resp.Body)) } - var result CommonResponse - if err = json.Unmarshal(resp.Body, &result); err != nil { + + var ttsResult struct { + Code int `json:"code"` + Message string `json:"message"` + Data struct { + Audio string `json:"audio"` + } `json:"data"` + } + + if err = json.Unmarshal(resp.Body, &ttsResult); err != nil { return nil, fmt.Errorf("TTS document failed: invalid JSON (%w)", err) } - if result.Code != 0 { - return nil, fmt.Errorf("%s", result.Message) - } - result.Duration = resp.Duration - // save file - //err = os.WriteFile(fileToSave, resp.Body, 0644) - //if err != nil { - // result.Message += fmt.Sprintf("failed to save file: %s", err.Error()) - // result.Code = 1 - //} + if ttsResult.Code != 0 { + return nil, fmt.Errorf("%s", ttsResult.Message) + } + + // Convert Base64 back to the original audio byte stream + audioBytes, err := 
base64.StdEncoding.DecodeString(ttsResult.Data.Audio) + if err != nil { + return nil, fmt.Errorf("failed to decode audio base64: %w", err) + } + + shouldPlay, _ := cmd.Params["play"].(bool) + shouldSave, _ := cmd.Params["save"].(bool) + saveDir, _ := cmd.Params["save_path"].(string) + + + fileName := fmt.Sprintf("%s_output.%s", modelName, explicitFormat) + + cwd, err := os.Getwd() + if err != nil { + cwd = "." + } + localPath := filepath.Join(cwd, fileName) + + if err := os.WriteFile(localPath, audioBytes, 0644); err != nil { + return nil, fmt.Errorf("failed to write local audio file: %w", err) + } + + if shouldPlay { + cmdExec := exec.Command("aplay", localPath) + if err := cmdExec.Run(); err != nil { + fmt.Printf("Play error: %v (Hint: did you use 'format: wav' in your params?)\n", err) + } + } + + var finalMessage string + if shouldSave { + if saveDir == "" { + saveDir = cwd + } else { + absSaveDir, err := filepath.Abs(saveDir) + if err == nil { + saveDir = absSaveDir + } + + if err := os.MkdirAll(saveDir, 0755); err != nil { + return nil, fmt.Errorf("failed to create save directory: %w", err) + } + + finalPath := filepath.Join(saveDir, fileName) + if err := os.WriteFile(finalPath, audioBytes, 0644); err != nil { + return nil, fmt.Errorf("failed to save file to target directory: %w", err) + } + + if saveDir != cwd { + os.Remove(localPath) + } + + finalMessage = fmt.Sprintf("Saved to directory: %s", finalPath) + } + } else { + defer os.Remove(localPath) + finalMessage = "TTS Task Completed (Audio not saved)" + } + + if finalMessage != "" && shouldSave { + fmt.Println(finalMessage) + } + + var result SimpleResponse + result.Code = 0 + result.Message = "SUCCESS" + result.Duration = resp.Duration return &result, nil } diff --git a/internal/cli/user_parser.go b/internal/cli/user_parser.go index b1e2e2ed5..04ebc7e87 100644 --- a/internal/cli/user_parser.go +++ b/internal/cli/user_parser.go @@ -2773,38 +2773,71 @@ func (p *Parser) parseASRCommand() (*Command, error) { 
} func (p *Parser) parseTTSCommand() (*Command, error) { - p.nextToken() // consume TTS + p.nextToken() + + cmd := NewCommand("tts_user_command") if p.curToken.Type != TokenWith { - return nil, fmt.Errorf("expected WITH after TTS") + return nil, fmt.Errorf("expect 'with' after tts") } - p.nextToken() // consume WITH + p.nextToken() - compositeModelName, err := p.parseQuotedString() - if err != nil { - return nil, err + if p.curToken.Type != TokenQuotedString && p.curToken.Type != TokenIdentifier { + return nil, fmt.Errorf("expect model name after 'with'") } + cmd.Params["composite_model_name"] = strings.Trim(p.curToken.Value, "\"'") p.nextToken() if p.curToken.Type != TokenText { - return nil, fmt.Errorf("expected TEXT to TTS") - } - p.nextToken() // consume FILE - - text, err := p.parseQuotedString() - if err != nil { - return nil, err + return nil, fmt.Errorf("expect 'text' parameter") } p.nextToken() - // Semicolon is optional for UNSET TOKEN + if p.curToken.Type != TokenQuotedString { + return nil, fmt.Errorf("expect quoted string after 'text'") + } + cmd.Params["text"] = strings.Trim(p.curToken.Value, "\"'") + p.nextToken() + + for p.curToken.Type != TokenEOF && p.curToken.Type != TokenSemicolon { + switch p.curToken.Type { + case TokenPlay: + p.nextToken() + cmd.Params["play"] = true + case TokenParam: + p.nextToken() + if p.curToken.Type != TokenQuotedString { + return nil, fmt.Errorf("expect quoted string after 'param'") + } + cmd.Params["param_str"] = strings.Trim(p.curToken.Value, "\"'") + p.nextToken() + p.nextToken() + case TokenSave: + p.nextToken() + + if p.curToken.Type != TokenQuotedString && p.curToken.Type != TokenIdentifier { + return nil, fmt.Errorf("expect directory path after 'save'") + } + + cmd.Params["save"] = true + cmd.Params["save_path"] = strings.Trim(p.curToken.Value, "\"'") + p.nextToken() + case TokenFormat: + p.nextToken() + if p.curToken.Type != TokenQuotedString && p.curToken.Type != TokenIdentifier { + return nil, 
fmt.Errorf("expect format string (e.g. 'wav') after 'format'") + } + cmd.Params["format"] = strings.Trim(p.curToken.Value, "\"'") + p.nextToken() + default: + return nil, fmt.Errorf("unexpected token: %s", p.curToken.Value) + } + } + if p.curToken.Type == TokenSemicolon { p.nextToken() } - cmd := NewCommand("tts_user_command") - cmd.Params["composite_model_name"] = compositeModelName - cmd.Params["text"] = text return cmd, nil } diff --git a/internal/development.md b/internal/development.md index 41ff7013a..c477e7a73 100644 --- a/internal/development.md +++ b/internal/development.md @@ -356,3 +356,11 @@ RAGFlow(user)> list datasets; | 0 | naive | 1 | embedding-2@ZHIPU-AI | 0abe79f9423311f1ad8d38a74640adcc | English | ccc | aaa | me | 2ba4881420fa11f19e9c38a74640adcc | 0 | 1777375201933 | +-------------+--------------+----------------+----------------------+----------------------------------+----------+------+----------+------------+----------------------------------+-----------+---------------+ ``` + +### 6.23 Text to Speech + +``` +RAGFlow(user)> tts with 'speech-2.8-hd@test@minimax' text 'He who desires but acts not, breeds pestilence.' 
play format 'wav' save './internal' param '{"voice_setting": {"voice_id": "English_radiant_girl", "speed": 1, "vol": 1, "pitch": 0}, "audio_setting": {"sample_rate": 32000, "bitrate": 128000, "format": "wav", "channel": 1}, "output_format": "hex"}' +Saved to directory: /home/infiniflow/Documents/development/ragflow/internal/speech-2.8-hd_output.wav +SUCCESS +``` diff --git a/internal/entity/models/minimax.go b/internal/entity/models/minimax.go index e761a2286..1f8afe3b9 100644 --- a/internal/entity/models/minimax.go +++ b/internal/entity/models/minimax.go @@ -19,6 +19,7 @@ package models import ( "bufio" "bytes" + "encoding/hex" "encoding/json" "fmt" "io" @@ -464,11 +465,194 @@ func (z *MinimaxModel) TranscribeAudioWithSender(modelName *string, file *string // AudioSpeech convert audio to text func (z *MinimaxModel) AudioSpeech(modelName *string, audioContent *string, apiConfig *APIConfig, asrConfig *TTSConfig) (*TTSResponse, error) { - return nil, fmt.Errorf("%s, no such method", z.Name()) + if apiConfig == nil || apiConfig.ApiKey == nil || *apiConfig.ApiKey == "" { + return nil, fmt.Errorf("MiniMax API key is missing") + } + if audioContent == nil || *audioContent == "" { + return nil, fmt.Errorf("text content is empty") + } + + var region = "default" + if apiConfig.Region != nil && *apiConfig.Region != "" { + region = *apiConfig.Region + } + + url := fmt.Sprintf("%s/%s", z.BaseURL[region], z.URLSuffix.TTS) + + reqBody := map[string]interface{}{ + "model": modelName, + "text": audioContent, + } + if asrConfig != nil && asrConfig.Params != nil { + for key, value := range asrConfig.Params { + reqBody[key] = value + } + } + reqBody["stream"] = false + + jsonData, err := json.Marshal(reqBody) + if err != nil { + return nil, fmt.Errorf("failed to marshal request: %w", err) + } + + req, err := http.NewRequest("POST", url, bytes.NewBuffer(jsonData)) + if err != nil { + return nil, fmt.Errorf("failed to create request: %w", err) + } + + req.Header.Set("Content-Type", 
"application/json") + req.Header.Set("Authorization", fmt.Sprintf("Bearer %s", strings.TrimSpace(*apiConfig.ApiKey))) + + resp, err := z.httpClient.Do(req) + if err != nil { + return nil, fmt.Errorf("failed to send request: %w", err) + } + defer resp.Body.Close() + + body, err := io.ReadAll(resp.Body) + if err != nil { + return nil, fmt.Errorf("failed to read response body: %w", err) + } + + if resp.StatusCode != http.StatusOK { + return nil, fmt.Errorf("MiniMax TTS API error: status %d, body: %s", resp.StatusCode, string(body)) + } + + var result struct { + BaseResp struct { + StatusCode int `json:"status_code"` + StatusMsg string `json:"status_msg"` + } `json:"base_resp"` + Data struct { + Audio string `json:"audio"` // HEX + } `json:"data"` + } + + if err := json.Unmarshal(body, &result); err != nil { + return nil, fmt.Errorf("failed to parse response: %w", err) + } + + if result.BaseResp.StatusCode != 0 { + return nil, fmt.Errorf("MiniMax TTS returned error: %d - %s", result.BaseResp.StatusCode, result.BaseResp.StatusMsg) + } + + // format HEX + audioBytes, err := hex.DecodeString(result.Data.Audio) + if err != nil { + return nil, fmt.Errorf("failed to decode MiniMax hex audio: %w", err) + } + + return &TTSResponse{ + Audio: audioBytes, + }, nil } +// tts with 'speech-2.8-hd@test@minimax' text 'If that day, out position was switched, would our fate, be different?' 
voice 'English_expressive_narrator' param '{"voice_setting": {"voice_id": "English_expressive_narrator", "speed": 1, "vol": 1, "pitch": 0}, "audio_setting": {"sample_rate": 32000, "bitrate": 128000, "format": "wav", "channel": 1}, "output_format": "hex"}' func (z *MinimaxModel) AudioSpeechWithSender(modelName *string, audioContent *string, apiConfig *APIConfig, ttsConfig *TTSConfig, sender func(*string, *string) error) error { - return fmt.Errorf("%s, no such method", z.Name()) + if apiConfig == nil || apiConfig.ApiKey == nil || *apiConfig.ApiKey == "" { + return fmt.Errorf("MiniMax API key is missing") + } + if audioContent == nil || *audioContent == "" { + return fmt.Errorf("text content is empty") + } + + var region = "default" + if apiConfig.Region != nil && *apiConfig.Region != "" { + region = *apiConfig.Region + } + + baseURL := strings.TrimSuffix(z.BaseURL[region], "/") + if baseURL == "" { + baseURL = strings.TrimSuffix(z.BaseURL["default"], "/") + } + suffix := strings.TrimPrefix(z.URLSuffix.TTS, "/") + if suffix == "" { + suffix = "v1/t2a_v2" + } + url := fmt.Sprintf("%s/%s", baseURL, suffix) + + reqBody := map[string]interface{}{ + "model": modelName, + "text": audioContent, + } + if ttsConfig != nil && ttsConfig.Params != nil { + for key, value := range ttsConfig.Params { + reqBody[key] = value + } + } + reqBody["stream"] = false + reqBody["stream"] = true + + jsonData, err := json.Marshal(reqBody) + if err != nil { + return fmt.Errorf("failed to marshal request: %w", err) + } + + req, err := http.NewRequest("POST", url, bytes.NewBuffer(jsonData)) + if err != nil { + return fmt.Errorf("failed to create request: %w", err) + } + + req.Header.Set("Content-Type", "application/json") + req.Header.Set("Authorization", fmt.Sprintf("Bearer %s", strings.TrimSpace(*apiConfig.ApiKey))) + + resp, err := z.httpClient.Do(req) + if err != nil { + return fmt.Errorf("failed to send request: %w", err) + } + defer resp.Body.Close() + + if resp.StatusCode != http.StatusOK 
{ + body, _ := io.ReadAll(resp.Body) + return fmt.Errorf("MiniMax stream TTS API error: %d, body: %s", resp.StatusCode, string(body)) + } + + scanner := bufio.NewScanner(resp.Body) + scanner.Buffer(make([]byte, 64*1024), 2*1024*1024) + + for scanner.Scan() { + line := scanner.Text() + + if !strings.HasPrefix(line, "data:") { + continue + } + + dataStr := strings.TrimSpace(line[5:]) + if dataStr == "" { + continue + } + + var event struct { + Data struct { + Audio string `json:"audio"` + Status int `json:"status"` + } `json:"data"` + } + + if err := json.Unmarshal([]byte(dataStr), &event); err != nil { + continue + } + + if event.Data.Audio != "" { + audioBytes, err := hex.DecodeString(event.Data.Audio) + if err == nil && len(audioBytes) > 0 { + chunk := string(audioBytes) + if errSend := sender(&chunk, nil); errSend != nil { + return errSend + } + } + } + + if event.Data.Status == 2 { + break + } + } + + if err := scanner.Err(); err != nil { + return fmt.Errorf("error reading minimax stream: %w", err) + } + + return nil } // OCRFile OCR file diff --git a/internal/entity/models/types.go b/internal/entity/models/types.go index d4e6e0502..3de7ac515 100644 --- a/internal/entity/models/types.go +++ b/internal/entity/models/types.go @@ -65,6 +65,7 @@ type ASRResponse struct { } type TTSResponse struct { + Audio []byte `json:"audio"` } type OCRResponse struct { @@ -83,6 +84,7 @@ type URLSuffix struct { Balance string `json:"balance"` Files string `json:"files"` Status string `json:"status"` + TTS string `json:"tts"` } type ChatConfig struct { @@ -116,6 +118,7 @@ type ASRConfig struct { } type TTSConfig struct { + Params map[string]interface{} } type OCRConfig struct { diff --git a/internal/handler/providers.go b/internal/handler/providers.go index 58dba8352..3b060a509 100644 --- a/internal/handler/providers.go +++ b/internal/handler/providers.go @@ -1166,14 +1166,12 @@ func (h *ProviderHandler) TranscribeAudio(c *gin.Context) { } type AudioSpeechRequest struct { - 
ProviderName *string `json:"provider_name"` - InstanceName *string `json:"instance_name"` - ModelName *string `json:"model_name"` - Text *string `json:"text"` - Language []string `json:"language"` - Voice int `json:"voice"` - Stream bool `json:"stream"` - Volume bool `json:"volume"` + ProviderName *string `json:"provider_name"` + InstanceName *string `json:"instance_name"` + ModelName *string `json:"model_name"` + Text *string `json:"text"` + Stream bool `json:"stream"` + TTSConfig *models.TTSConfig `json:"tts_config"` } func (h *ProviderHandler) AudioSpeech(c *gin.Context) { @@ -1219,6 +1217,9 @@ func (h *ProviderHandler) AudioSpeech(c *gin.Context) { } ttsConfig := models.TTSConfig{} + if req.TTSConfig != nil { + ttsConfig = *req.TTSConfig + } // Check if it's a stream request if req.Stream {