Go: implement TTS for MiniMax provider and CLI testing for TTS (#14911)

### What problem does this PR solve?

This PR implements TTS for the MiniMax provider and adds CLI support for testing TTS.

**The following functionalities are now supported:**

**MiniMax:**
- [x] Chat / Stream Chat 
- [x] Embedding
- [x] Rerank
- [x] Model listing
- [x] Provider connection checking
- [x] Text To Speech
- [ ] OCRFile
- [ ] ~~Audio To Text~~
- [ ] ~~Balance~~

**Verified examples from the CLI:**

```plaintext
RAGFlow(user)> tts with 'speech-2.8-hd@test@minimax' text 'He who desires but acts not, breeds pestilence.' play format 'wav' save './internal' param '{"voice_setting": {"voice_id": "English_radiant_girl", "speed": 1, "vol": 1, "pitch": 0}, "audio_setting": {"sample_rate": 32000, "bitrate": 128000, "format": "wav", "channel": 1}, "output_format": "hex"}'
Saved to directory: /home/infiniflow/Documents/development/ragflow/internal/speech-2.8-hd_output.wav
SUCCESS

RAGFlow(user)> stream tts with 'speech-2.8-hd@test@minimax' text 'He who desires but acts not, breeds pestilence.' play format 'wav' save './internal' param '{"voice_setting": {"voice_id": "English_radiant_girl", "speed": 1, "vol": 1, "pitch": 0}, "audio_setting": {"sample_rate": 32000, "bitrate": 128000, "format": "wav", "channel": 1}, "output_format": "hex"}'
Saved to directory: /home/infiniflow/Documents/development/ragflow/internal/speech-2.8-hd_output.wav
SUCCESS
```
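- Add `play` to play the generated audio from the CLI.
- Add `save 'PATH_TO_SAVE'` to save the audio file into that directory.
- Add `format 'wav'` or `format 'mp3'` to choose the format of the saved file.
- Add `param '<JSON>'` to pass extra fields; the JSON should follow the provider's official request body.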

### Type of change

- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
Haruko386
2026-05-14 13:19:31 +08:00
committed by GitHub
parent d46bbd30f7
commit ef46005ef1
10 changed files with 405 additions and 39 deletions

View File

@ -99,6 +99,13 @@
"default_value": true,
"clear_thinking": true
}
},
{
"name": "speech-2.8-hd",
"max_tokens": 8192,
"model_types": [
"tts"
]
}
]
}

View File

@ -455,6 +455,14 @@ func (l *Lexer) lookupIdent(ident string) Token {
return Token{Type: TokenFatal, Value: ident}
case "PANIC":
return Token{Type: TokenPanic, Value: ident}
case "PARAM":
return Token{Type: TokenParam, Value: ident}
case "PLAY":
return Token{Type: TokenPlay, Value: ident}
case "FORMAT":
return Token{Type: TokenFormat, Value: ident}
case "SAVE":
return Token{Type: TokenSave, Value: ident}
default:
return Token{Type: TokenIdentifier, Value: ident}
}

View File

@ -219,6 +219,7 @@ func (p *Parser) parseUserCommand() (*Command, error) {
return p.parseUpdateCommand()
case TokenRemove:
return p.parseRemoveCommand()
default:
return nil, fmt.Errorf("unknown command: %s", p.curToken.Value)
}

View File

@ -105,6 +105,10 @@ const (
TokenEmbed
TokenText
TokenQuery
TokenFormat
TokenParam
TokenPlay
TokenSave
TokenTop
TokenDimension
TokenAsync

View File

@ -27,6 +27,7 @@ import (
"net"
netUrl "net/url"
"os"
"os/exec"
"path/filepath"
ce "ragflow/internal/cli/filesystem"
"strings"
@ -1973,6 +1974,52 @@ func (c *RAGFlowClient) TTSUserCommand(cmd *Command) (ResponseIf, error) {
"text": text,
}
ttsConfigPayload := make(map[string]interface{})
explicitFormat, hasExplicitFormat := cmd.Params["format"].(string)
if paramStr, ok := cmd.Params["param_str"].(string); ok && paramStr != "" {
var dynamicParams map[string]interface{}
if err := json.Unmarshal([]byte(paramStr), &dynamicParams); err != nil {
return nil, fmt.Errorf("param string must be valid JSON. Error: %w", err)
}
ttsConfigPayload["params"] = dynamicParams
if !hasExplicitFormat {
var findFormat func(map[string]interface{}) string
findFormat = func(m map[string]interface{}) string {
if val, ok := m["format"]; ok {
return fmt.Sprintf("%v", val)
}
if val, ok := m["response_format"]; ok {
return fmt.Sprintf("%v", val)
}
for _, v := range m {
if subMap, ok := v.(map[string]interface{}); ok {
if res := findFormat(subMap); res != "" {
return res
}
}
}
return ""
}
if ext := findFormat(dynamicParams); ext != "" {
explicitFormat = ext
}
}
}
if explicitFormat != "" {
ttsConfigPayload["format"] = explicitFormat
} else {
explicitFormat = "mp3"
}
if len(ttsConfigPayload) > 0 {
payload["tts_config"] = ttsConfigPayload
}
url := "/audio/speech"
resp, err := c.HTTPClient.Request("POST", url, "web", nil, payload)
@ -1982,21 +2029,91 @@ func (c *RAGFlowClient) TTSUserCommand(cmd *Command) (ResponseIf, error) {
if resp.StatusCode != 200 {
return nil, fmt.Errorf("failed to TTS document: HTTP %d, body: %s", resp.StatusCode, string(resp.Body))
}
var result CommonResponse
if err = json.Unmarshal(resp.Body, &result); err != nil {
var ttsResult struct {
Code int `json:"code"`
Message string `json:"message"`
Data struct {
Audio string `json:"audio"`
} `json:"data"`
}
if err = json.Unmarshal(resp.Body, &ttsResult); err != nil {
return nil, fmt.Errorf("TTS document failed: invalid JSON (%w)", err)
}
if result.Code != 0 {
return nil, fmt.Errorf("%s", result.Message)
}
result.Duration = resp.Duration
// save file
//err = os.WriteFile(fileToSave, resp.Body, 0644)
//if err != nil {
// result.Message += fmt.Sprintf("failed to save file: %s", err.Error())
// result.Code = 1
//}
if ttsResult.Code != 0 {
return nil, fmt.Errorf("%s", ttsResult.Message)
}
// Convert Base64 back to the original audio byte stream
audioBytes, err := base64.StdEncoding.DecodeString(ttsResult.Data.Audio)
if err != nil {
return nil, fmt.Errorf("failed to decode audio base64: %w", err)
}
shouldPlay, _ := cmd.Params["play"].(bool)
shouldSave, _ := cmd.Params["save"].(bool)
saveDir, _ := cmd.Params["save_path"].(string)
fileName := fmt.Sprintf("%s_output.%s", modelName, explicitFormat)
cwd, err := os.Getwd()
if err != nil {
cwd = "."
}
localPath := filepath.Join(cwd, fileName)
if err := os.WriteFile(localPath, audioBytes, 0644); err != nil {
return nil, fmt.Errorf("failed to write local audio file: %w", err)
}
if shouldPlay {
cmdExec := exec.Command("aplay", localPath)
if err := cmdExec.Run(); err != nil {
fmt.Printf("Play error: %v (Hint: did you use 'format: wav' in your params?)\n", err)
}
}
var finalMessage string
if shouldSave {
if saveDir == "" {
saveDir = cwd
} else {
absSaveDir, err := filepath.Abs(saveDir)
if err == nil {
saveDir = absSaveDir
}
if err := os.MkdirAll(saveDir, 0755); err != nil {
return nil, fmt.Errorf("failed to create save directory: %w", err)
}
finalPath := filepath.Join(saveDir, fileName)
if err := os.WriteFile(finalPath, audioBytes, 0644); err != nil {
return nil, fmt.Errorf("failed to save file to target directory: %w", err)
}
if saveDir != cwd {
os.Remove(localPath)
}
finalMessage = fmt.Sprintf("Saved to directory: %s", finalPath)
}
} else {
defer os.Remove(localPath)
finalMessage = "TTS Task Completed (Audio not saved)"
}
if finalMessage != "" && shouldSave {
fmt.Println(finalMessage)
}
var result SimpleResponse
result.Code = 0
result.Message = "SUCCESS"
result.Duration = resp.Duration
return &result, nil
}

View File

@ -2773,38 +2773,71 @@ func (p *Parser) parseASRCommand() (*Command, error) {
}
func (p *Parser) parseTTSCommand() (*Command, error) {
p.nextToken() // consume TTS
p.nextToken()
cmd := NewCommand("tts_user_command")
if p.curToken.Type != TokenWith {
return nil, fmt.Errorf("expected WITH after TTS")
return nil, fmt.Errorf("expect 'with' after tts")
}
p.nextToken() // consume WITH
p.nextToken()
compositeModelName, err := p.parseQuotedString()
if err != nil {
return nil, err
if p.curToken.Type != TokenQuotedString && p.curToken.Type != TokenIdentifier {
return nil, fmt.Errorf("expect model name after 'with'")
}
cmd.Params["composite_model_name"] = strings.Trim(p.curToken.Value, "\"'")
p.nextToken()
if p.curToken.Type != TokenText {
return nil, fmt.Errorf("expected TEXT to TTS")
}
p.nextToken() // consume FILE
text, err := p.parseQuotedString()
if err != nil {
return nil, err
return nil, fmt.Errorf("expect 'text' parameter")
}
p.nextToken()
// Semicolon is optional for UNSET TOKEN
if p.curToken.Type != TokenQuotedString {
return nil, fmt.Errorf("expect quoted string after 'text'")
}
cmd.Params["text"] = strings.Trim(p.curToken.Value, "\"'")
p.nextToken()
for p.curToken.Type != TokenEOF && p.curToken.Type != TokenSemicolon {
switch p.curToken.Type {
case TokenPlay:
p.nextToken()
cmd.Params["play"] = true
case TokenParam:
p.nextToken()
if p.curToken.Type != TokenQuotedString {
return nil, fmt.Errorf("expect quoted string after 'param'")
}
cmd.Params["param_str"] = strings.Trim(p.curToken.Value, "\"'")
p.nextToken()
p.nextToken()
case TokenSave:
p.nextToken()
if p.curToken.Type != TokenQuotedString && p.curToken.Type != TokenIdentifier {
return nil, fmt.Errorf("expect directory path after 'save'")
}
cmd.Params["save"] = true
cmd.Params["save_path"] = strings.Trim(p.curToken.Value, "\"'")
p.nextToken()
case TokenFormat:
p.nextToken()
if p.curToken.Type != TokenQuotedString && p.curToken.Type != TokenIdentifier {
return nil, fmt.Errorf("expect format string (e.g. 'wav') after 'format'")
}
cmd.Params["format"] = strings.Trim(p.curToken.Value, "\"'")
p.nextToken()
default:
return nil, fmt.Errorf("unexpected token: %s", p.curToken.Value)
}
}
if p.curToken.Type == TokenSemicolon {
p.nextToken()
}
cmd := NewCommand("tts_user_command")
cmd.Params["composite_model_name"] = compositeModelName
cmd.Params["text"] = text
return cmd, nil
}

View File

@ -356,3 +356,11 @@ RAGFlow(user)> list datasets;
| 0 | naive | 1 | embedding-2@ZHIPU-AI | 0abe79f9423311f1ad8d38a74640adcc | English | ccc | aaa | me | 2ba4881420fa11f19e9c38a74640adcc | 0 | 1777375201933 |
+-------------+--------------+----------------+----------------------+----------------------------------+----------+------+----------+------------+----------------------------------+-----------+---------------+
```
### 6.23 Text to Speech
```
RAGFlow(user)> tts with 'speech-2.8-hd@test@minimax' text 'He who desires but acts not, breeds pestilence.' play format 'wav' save './internal' param '{"voice_setting": {"voice_id": "English_radiant_girl", "speed": 1, "vol": 1, "pitch": 0}, "audio_setting": {"sample_rate": 32000, "bitrate": 128000, "format": "wav", "channel": 1}, "output_format": "hex"}'
Saved to directory: /home/infiniflow/Documents/development/ragflow/internal/speech-2.8-hd_output.wav
SUCCESS
```

View File

@ -19,6 +19,7 @@ package models
import (
"bufio"
"bytes"
"encoding/hex"
"encoding/json"
"fmt"
"io"
@ -464,11 +465,194 @@ func (z *MinimaxModel) TranscribeAudioWithSender(modelName *string, file *string
// AudioSpeech converts text to speech with a single, non-streaming request to
// the MiniMax TTS endpoint and returns the decoded audio bytes.
//
// Parameters:
//   - modelName: the TTS model to use; must be non-nil and non-empty.
//   - audioContent: the text to synthesize (the name is kept for interface
//     compatibility); must be non-nil and non-empty.
//   - apiConfig: supplies the API key and, optionally, the region used to
//     select the base URL.
//   - asrConfig: optional TTS settings. Every entry of Params is copied
//     verbatim into the request body after "model" and "text" are set, so it
//     can override them; "stream" is always forced to false afterwards.
//
// The response field data.audio is decoded as hex, so callers should request
// hex output in Params. An error is returned for a missing key/model/text, a
// transport failure, a non-200 status, a non-zero base_resp.status_code, or
// audio that is not valid hex.
func (z *MinimaxModel) AudioSpeech(modelName *string, audioContent *string, apiConfig *APIConfig, asrConfig *TTSConfig) (*TTSResponse, error) {
	if apiConfig == nil || apiConfig.ApiKey == nil || *apiConfig.ApiKey == "" {
		return nil, fmt.Errorf("MiniMax API key is missing")
	}
	if modelName == nil || *modelName == "" {
		return nil, fmt.Errorf("model name is empty")
	}
	if audioContent == nil || *audioContent == "" {
		return nil, fmt.Errorf("text content is empty")
	}

	region := "default"
	if apiConfig.Region != nil && *apiConfig.Region != "" {
		region = *apiConfig.Region
	}

	// Build the endpoint the same way the streaming variant does: tolerate a
	// trailing/leading slash and fall back to the default region and suffix
	// instead of producing a URL such as "/v1/t2a_v2" with no host.
	baseURL := strings.TrimSuffix(z.BaseURL[region], "/")
	if baseURL == "" {
		baseURL = strings.TrimSuffix(z.BaseURL["default"], "/")
	}
	suffix := strings.TrimPrefix(z.URLSuffix.TTS, "/")
	if suffix == "" {
		suffix = "v1/t2a_v2"
	}
	url := fmt.Sprintf("%s/%s", baseURL, suffix)

	reqBody := map[string]interface{}{
		"model": *modelName,
		"text":  *audioContent,
	}
	if asrConfig != nil {
		for key, value := range asrConfig.Params {
			reqBody[key] = value
		}
	}
	// Set last so caller-supplied params cannot switch this call to streaming.
	reqBody["stream"] = false

	jsonData, err := json.Marshal(reqBody)
	if err != nil {
		return nil, fmt.Errorf("failed to marshal request: %w", err)
	}

	req, err := http.NewRequest("POST", url, bytes.NewReader(jsonData))
	if err != nil {
		return nil, fmt.Errorf("failed to create request: %w", err)
	}
	req.Header.Set("Content-Type", "application/json")
	req.Header.Set("Authorization", fmt.Sprintf("Bearer %s", strings.TrimSpace(*apiConfig.ApiKey)))

	resp, err := z.httpClient.Do(req)
	if err != nil {
		return nil, fmt.Errorf("failed to send request: %w", err)
	}
	defer resp.Body.Close()

	body, err := io.ReadAll(resp.Body)
	if err != nil {
		return nil, fmt.Errorf("failed to read response body: %w", err)
	}
	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("MiniMax TTS API error: status %d, body: %s", resp.StatusCode, string(body))
	}

	var result struct {
		BaseResp struct {
			StatusCode int    `json:"status_code"`
			StatusMsg  string `json:"status_msg"`
		} `json:"base_resp"`
		Data struct {
			Audio string `json:"audio"` // hex-encoded audio
		} `json:"data"`
	}
	if err := json.Unmarshal(body, &result); err != nil {
		return nil, fmt.Errorf("failed to parse response: %w", err)
	}
	// The API reports application-level failures in base_resp even on HTTP 200.
	if result.BaseResp.StatusCode != 0 {
		return nil, fmt.Errorf("MiniMax TTS returned error: %d - %s", result.BaseResp.StatusCode, result.BaseResp.StatusMsg)
	}

	audioBytes, err := hex.DecodeString(result.Data.Audio)
	if err != nil {
		return nil, fmt.Errorf("failed to decode MiniMax hex audio: %w", err)
	}

	return &TTSResponse{
		Audio: audioBytes,
	}, nil
}
// AudioSpeechWithSender converts text to speech using the MiniMax TTS
// endpoint in streaming mode and forwards each decoded audio chunk to sender.
//
// Parameters:
//   - modelName: the TTS model to use; must be non-nil and non-empty.
//   - audioContent: the text to synthesize; must be non-nil and non-empty.
//   - apiConfig: supplies the API key and, optionally, the region used to
//     select the base URL.
//   - ttsConfig: optional TTS settings. Every entry of Params is copied
//     verbatim into the request body; "stream" is always forced to true.
//   - sender: called once per non-empty audio chunk with the raw audio bytes
//     held in a string and a nil second argument. A non-nil error from sender
//     aborts the stream and is returned unchanged.
//
// The response is read line by line; only lines starting with "data:" are
// considered, and their data.audio field is decoded as hex. Reading stops
// after an event whose data.status is 2.
func (z *MinimaxModel) AudioSpeechWithSender(modelName *string, audioContent *string, apiConfig *APIConfig, ttsConfig *TTSConfig, sender func(*string, *string) error) error {
	if apiConfig == nil || apiConfig.ApiKey == nil || *apiConfig.ApiKey == "" {
		return fmt.Errorf("MiniMax API key is missing")
	}
	if modelName == nil || *modelName == "" {
		return fmt.Errorf("model name is empty")
	}
	if audioContent == nil || *audioContent == "" {
		return fmt.Errorf("text content is empty")
	}
	if sender == nil {
		return fmt.Errorf("sender is nil")
	}

	region := "default"
	if apiConfig.Region != nil && *apiConfig.Region != "" {
		region = *apiConfig.Region
	}

	// Tolerate a trailing/leading slash and fall back to the default region
	// and suffix when the configuration leaves them empty.
	baseURL := strings.TrimSuffix(z.BaseURL[region], "/")
	if baseURL == "" {
		baseURL = strings.TrimSuffix(z.BaseURL["default"], "/")
	}
	suffix := strings.TrimPrefix(z.URLSuffix.TTS, "/")
	if suffix == "" {
		suffix = "v1/t2a_v2"
	}
	url := fmt.Sprintf("%s/%s", baseURL, suffix)

	reqBody := map[string]interface{}{
		"model": *modelName,
		"text":  *audioContent,
	}
	if ttsConfig != nil {
		for key, value := range ttsConfig.Params {
			reqBody[key] = value
		}
	}
	// Set last so caller-supplied params cannot disable streaming.
	reqBody["stream"] = true

	jsonData, err := json.Marshal(reqBody)
	if err != nil {
		return fmt.Errorf("failed to marshal request: %w", err)
	}

	req, err := http.NewRequest("POST", url, bytes.NewReader(jsonData))
	if err != nil {
		return fmt.Errorf("failed to create request: %w", err)
	}
	req.Header.Set("Content-Type", "application/json")
	req.Header.Set("Authorization", fmt.Sprintf("Bearer %s", strings.TrimSpace(*apiConfig.ApiKey)))

	resp, err := z.httpClient.Do(req)
	if err != nil {
		return fmt.Errorf("failed to send request: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		// Best effort: the body is only used to enrich the error message.
		body, _ := io.ReadAll(resp.Body)
		return fmt.Errorf("MiniMax stream TTS API error: %d, body: %s", resp.StatusCode, string(body))
	}

	scanner := bufio.NewScanner(resp.Body)
	// A single event carries a hex-encoded audio chunk, which can exceed the
	// scanner's default line limit; allow lines up to 2 MiB.
	scanner.Buffer(make([]byte, 64*1024), 2*1024*1024)

	for scanner.Scan() {
		line := scanner.Text()
		if !strings.HasPrefix(line, "data:") {
			continue
		}
		dataStr := strings.TrimSpace(line[len("data:"):])
		if dataStr == "" {
			continue
		}

		var event struct {
			BaseResp struct {
				StatusCode int    `json:"status_code"`
				StatusMsg  string `json:"status_msg"`
			} `json:"base_resp"`
			Data struct {
				Audio  string `json:"audio"`
				Status int    `json:"status"`
			} `json:"data"`
		}
		if err := json.Unmarshal([]byte(dataStr), &event); err != nil {
			// Deliberately tolerant: skip payloads that are not JSON events.
			continue
		}
		// Surface application-level failures instead of finishing
		// "successfully" without having produced any audio.
		if event.BaseResp.StatusCode != 0 {
			return fmt.Errorf("MiniMax stream TTS returned error: %d - %s", event.BaseResp.StatusCode, event.BaseResp.StatusMsg)
		}

		// NOTE(review): confirm against the MiniMax streaming docs whether the
		// final event (status 2) repeats the complete audio; if it does, it
		// should not be forwarded in addition to the earlier chunks.
		if event.Data.Audio != "" {
			audioBytes, err := hex.DecodeString(event.Data.Audio)
			if err != nil {
				// Dropping a chunk would hand the caller corrupted audio.
				return fmt.Errorf("failed to decode MiniMax hex audio chunk: %w", err)
			}
			if len(audioBytes) > 0 {
				chunk := string(audioBytes)
				if errSend := sender(&chunk, nil); errSend != nil {
					return errSend
				}
			}
		}

		if event.Data.Status == 2 {
			break
		}
	}

	if err := scanner.Err(); err != nil {
		return fmt.Errorf("error reading minimax stream: %w", err)
	}
	return nil
}
// OCRFile OCR file

View File

@ -65,6 +65,7 @@ type ASRResponse struct {
}
// TTSResponse is the result of a text-to-speech call.
type TTSResponse struct {
// Audio holds the raw audio bytes. encoding/json marshals a []byte as a
// base64 string, which is the form the CLI decodes on its side.
Audio []byte `json:"audio"`
}
type OCRResponse struct {
@ -83,6 +84,7 @@ type URLSuffix struct {
Balance string `json:"balance"`
Files string `json:"files"`
Status string `json:"status"`
TTS string `json:"tts"`
}
type ChatConfig struct {
@ -116,6 +118,7 @@ type ASRConfig struct {
}
// TTSConfig carries optional, provider-specific text-to-speech settings.
type TTSConfig struct {
// Params is a free-form set of request fields; the MiniMax provider copies
// each entry verbatim into the body it sends to the TTS endpoint.
// NOTE(review): there is no json tag, so decoding the CLI's lowercase
// "params" key relies on encoding/json's case-insensitive field matching.
Params map[string]interface{}
}
type OCRConfig struct {

View File

@ -1166,14 +1166,12 @@ func (h *ProviderHandler) TranscribeAudio(c *gin.Context) {
}
type AudioSpeechRequest struct {
ProviderName *string `json:"provider_name"`
InstanceName *string `json:"instance_name"`
ModelName *string `json:"model_name"`
Text *string `json:"text"`
Language []string `json:"language"`
Voice int `json:"voice"`
Stream bool `json:"stream"`
Volume bool `json:"volume"`
ProviderName *string `json:"provider_name"`
InstanceName *string `json:"instance_name"`
ModelName *string `json:"model_name"`
Text *string `json:"text"`
Stream bool `json:"stream"`
TTSConfig *models.TTSConfig `json:"tts_config"`
}
func (h *ProviderHandler) AudioSpeech(c *gin.Context) {
@ -1219,6 +1217,9 @@ func (h *ProviderHandler) AudioSpeech(c *gin.Context) {
}
ttsConfig := models.TTSConfig{}
if req.TTSConfig != nil {
ttsConfig = *req.TTSConfig
}
// Check if it's a stream request
if req.Stream {