mirror of
https://github.com/infiniflow/ragflow.git
synced 2026-05-30 04:27:30 +08:00
Go: implement TTS for MiniMax provider and CLI testing for TTS (#14911)
### What problem does this PR solve?
This PR implements TTS (text to speech) for the MiniMax provider and adds CLI support for testing TTS.
**The following functionalities are now supported:**
**MiniMax:**
- [x] Chat / Stream Chat
- [x] Embedding
- [x] Rerank
- [x] Model listing
- [x] Provider connection checking
- [x] Text To Speech
- [ ] OCRFile
- [ ] ~~Audio To Text~~
- [ ] ~~Balance~~
**Verified examples from the CLI:**
```plaintext
RAGFlow(user)> tts with 'speech-2.8-hd@test@minimax' text 'He who desires but acts not, breeds pestilence.' play format 'wav' save './internal' param '{"voice_setting": {"voice_id": "English_radiant_girl", "speed": 1, "vol": 1, "pitch": 0}, "audio_setting": {"sample_rate": 32000, "bitrate": 128000, "format": "wav", "channel": 1}, "output_format": "hex"}'
Saved to directory: /home/infiniflow/Documents/development/ragflow/internal/speech-2.8-hd_output.wav
SUCCESS
RAGFlow(user)> stream tts with 'speech-2.8-hd@test@minimax' text 'He who desires but acts not, breeds pestilence.' play format 'wav' save './internal' param '{"voice_setting": {"voice_id": "English_radiant_girl", "speed": 1, "vol": 1, "pitch": 0}, "audio_setting": {"sample_rate": 32000, "bitrate": 128000, "format": "wav", "channel": 1}, "output_format": "hex"}'
Saved to directory: /home/infiniflow/Documents/development/ragflow/internal/speech-2.8-hd_output.wav
SUCCESS
```
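Add `play` to play the generated audio from the CLI.
Add `save 'PATH_TO_SAVE'` to save the audio file into that directory.
Add `format 'wav'` or `format 'mp3'` to choose the format of the saved file.
Add `param '<JSON>'` to pass extra options; the JSON object should follow the provider's official request body.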
### Type of change
- [x] New Feature (non-breaking change which adds functionality)
This commit is contained in:
@ -99,6 +99,13 @@
|
||||
"default_value": true,
|
||||
"clear_thinking": true
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "speech-2.8-hd",
|
||||
"max_tokens": 8192,
|
||||
"model_types": [
|
||||
"tts"
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
@ -455,6 +455,14 @@ func (l *Lexer) lookupIdent(ident string) Token {
|
||||
return Token{Type: TokenFatal, Value: ident}
|
||||
case "PANIC":
|
||||
return Token{Type: TokenPanic, Value: ident}
|
||||
case "PARAM":
|
||||
return Token{Type: TokenParam, Value: ident}
|
||||
case "PLAY":
|
||||
return Token{Type: TokenPlay, Value: ident}
|
||||
case "FORMAT":
|
||||
return Token{Type: TokenFormat, Value: ident}
|
||||
case "SAVE":
|
||||
return Token{Type: TokenSave, Value: ident}
|
||||
default:
|
||||
return Token{Type: TokenIdentifier, Value: ident}
|
||||
}
|
||||
|
||||
@ -219,6 +219,7 @@ func (p *Parser) parseUserCommand() (*Command, error) {
|
||||
return p.parseUpdateCommand()
|
||||
case TokenRemove:
|
||||
return p.parseRemoveCommand()
|
||||
|
||||
default:
|
||||
return nil, fmt.Errorf("unknown command: %s", p.curToken.Value)
|
||||
}
|
||||
|
||||
@ -105,6 +105,10 @@ const (
|
||||
TokenEmbed
|
||||
TokenText
|
||||
TokenQuery
|
||||
TokenFormat
|
||||
TokenParam
|
||||
TokenPlay
|
||||
TokenSave
|
||||
TokenTop
|
||||
TokenDimension
|
||||
TokenAsync
|
||||
|
||||
@ -27,6 +27,7 @@ import (
|
||||
"net"
|
||||
netUrl "net/url"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
ce "ragflow/internal/cli/filesystem"
|
||||
"strings"
|
||||
@ -1973,6 +1974,52 @@ func (c *RAGFlowClient) TTSUserCommand(cmd *Command) (ResponseIf, error) {
|
||||
"text": text,
|
||||
}
|
||||
|
||||
ttsConfigPayload := make(map[string]interface{})
|
||||
|
||||
explicitFormat, hasExplicitFormat := cmd.Params["format"].(string)
|
||||
|
||||
if paramStr, ok := cmd.Params["param_str"].(string); ok && paramStr != "" {
|
||||
var dynamicParams map[string]interface{}
|
||||
if err := json.Unmarshal([]byte(paramStr), &dynamicParams); err != nil {
|
||||
return nil, fmt.Errorf("param string must be valid JSON. Error: %w", err)
|
||||
}
|
||||
|
||||
ttsConfigPayload["params"] = dynamicParams
|
||||
|
||||
if !hasExplicitFormat {
|
||||
var findFormat func(map[string]interface{}) string
|
||||
findFormat = func(m map[string]interface{}) string {
|
||||
if val, ok := m["format"]; ok {
|
||||
return fmt.Sprintf("%v", val)
|
||||
}
|
||||
if val, ok := m["response_format"]; ok {
|
||||
return fmt.Sprintf("%v", val)
|
||||
}
|
||||
for _, v := range m {
|
||||
if subMap, ok := v.(map[string]interface{}); ok {
|
||||
if res := findFormat(subMap); res != "" {
|
||||
return res
|
||||
}
|
||||
}
|
||||
}
|
||||
return ""
|
||||
}
|
||||
if ext := findFormat(dynamicParams); ext != "" {
|
||||
explicitFormat = ext
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if explicitFormat != "" {
|
||||
ttsConfigPayload["format"] = explicitFormat
|
||||
} else {
|
||||
explicitFormat = "mp3"
|
||||
}
|
||||
|
||||
if len(ttsConfigPayload) > 0 {
|
||||
payload["tts_config"] = ttsConfigPayload
|
||||
}
|
||||
|
||||
url := "/audio/speech"
|
||||
|
||||
resp, err := c.HTTPClient.Request("POST", url, "web", nil, payload)
|
||||
@ -1982,21 +2029,91 @@ func (c *RAGFlowClient) TTSUserCommand(cmd *Command) (ResponseIf, error) {
|
||||
if resp.StatusCode != 200 {
|
||||
return nil, fmt.Errorf("failed to TTS document: HTTP %d, body: %s", resp.StatusCode, string(resp.Body))
|
||||
}
|
||||
var result CommonResponse
|
||||
if err = json.Unmarshal(resp.Body, &result); err != nil {
|
||||
|
||||
var ttsResult struct {
|
||||
Code int `json:"code"`
|
||||
Message string `json:"message"`
|
||||
Data struct {
|
||||
Audio string `json:"audio"`
|
||||
} `json:"data"`
|
||||
}
|
||||
|
||||
if err = json.Unmarshal(resp.Body, &ttsResult); err != nil {
|
||||
return nil, fmt.Errorf("TTS document failed: invalid JSON (%w)", err)
|
||||
}
|
||||
if result.Code != 0 {
|
||||
return nil, fmt.Errorf("%s", result.Message)
|
||||
}
|
||||
result.Duration = resp.Duration
|
||||
|
||||
// save file
|
||||
//err = os.WriteFile(fileToSave, resp.Body, 0644)
|
||||
//if err != nil {
|
||||
// result.Message += fmt.Sprintf("failed to save file: %s", err.Error())
|
||||
// result.Code = 1
|
||||
//}
|
||||
if ttsResult.Code != 0 {
|
||||
return nil, fmt.Errorf("%s", ttsResult.Message)
|
||||
}
|
||||
|
||||
// Convert Base64 back to the original audio byte stream
|
||||
audioBytes, err := base64.StdEncoding.DecodeString(ttsResult.Data.Audio)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to decode audio base64: %w", err)
|
||||
}
|
||||
|
||||
shouldPlay, _ := cmd.Params["play"].(bool)
|
||||
shouldSave, _ := cmd.Params["save"].(bool)
|
||||
saveDir, _ := cmd.Params["save_path"].(string)
|
||||
|
||||
|
||||
fileName := fmt.Sprintf("%s_output.%s", modelName, explicitFormat)
|
||||
|
||||
cwd, err := os.Getwd()
|
||||
if err != nil {
|
||||
cwd = "."
|
||||
}
|
||||
localPath := filepath.Join(cwd, fileName)
|
||||
|
||||
if err := os.WriteFile(localPath, audioBytes, 0644); err != nil {
|
||||
return nil, fmt.Errorf("failed to write local audio file: %w", err)
|
||||
}
|
||||
|
||||
if shouldPlay {
|
||||
cmdExec := exec.Command("aplay", localPath)
|
||||
if err := cmdExec.Run(); err != nil {
|
||||
fmt.Printf("Play error: %v (Hint: did you use 'format: wav' in your params?)\n", err)
|
||||
}
|
||||
}
|
||||
|
||||
var finalMessage string
|
||||
if shouldSave {
|
||||
if saveDir == "" {
|
||||
saveDir = cwd
|
||||
} else {
|
||||
absSaveDir, err := filepath.Abs(saveDir)
|
||||
if err == nil {
|
||||
saveDir = absSaveDir
|
||||
}
|
||||
|
||||
if err := os.MkdirAll(saveDir, 0755); err != nil {
|
||||
return nil, fmt.Errorf("failed to create save directory: %w", err)
|
||||
}
|
||||
|
||||
finalPath := filepath.Join(saveDir, fileName)
|
||||
if err := os.WriteFile(finalPath, audioBytes, 0644); err != nil {
|
||||
return nil, fmt.Errorf("failed to save file to target directory: %w", err)
|
||||
}
|
||||
|
||||
if saveDir != cwd {
|
||||
os.Remove(localPath)
|
||||
}
|
||||
|
||||
finalMessage = fmt.Sprintf("Saved to directory: %s", finalPath)
|
||||
}
|
||||
} else {
|
||||
defer os.Remove(localPath)
|
||||
finalMessage = "TTS Task Completed (Audio not saved)"
|
||||
}
|
||||
|
||||
if finalMessage != "" && shouldSave {
|
||||
fmt.Println(finalMessage)
|
||||
}
|
||||
|
||||
var result SimpleResponse
|
||||
result.Code = 0
|
||||
result.Message = "SUCCESS"
|
||||
result.Duration = resp.Duration
|
||||
|
||||
return &result, nil
|
||||
}
|
||||
|
||||
@ -2773,38 +2773,71 @@ func (p *Parser) parseASRCommand() (*Command, error) {
|
||||
}
|
||||
|
||||
func (p *Parser) parseTTSCommand() (*Command, error) {
|
||||
p.nextToken() // consume TTS
|
||||
p.nextToken()
|
||||
|
||||
cmd := NewCommand("tts_user_command")
|
||||
|
||||
if p.curToken.Type != TokenWith {
|
||||
return nil, fmt.Errorf("expected WITH after TTS")
|
||||
return nil, fmt.Errorf("expect 'with' after tts")
|
||||
}
|
||||
p.nextToken() // consume WITH
|
||||
p.nextToken()
|
||||
|
||||
compositeModelName, err := p.parseQuotedString()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
if p.curToken.Type != TokenQuotedString && p.curToken.Type != TokenIdentifier {
|
||||
return nil, fmt.Errorf("expect model name after 'with'")
|
||||
}
|
||||
cmd.Params["composite_model_name"] = strings.Trim(p.curToken.Value, "\"'")
|
||||
p.nextToken()
|
||||
|
||||
if p.curToken.Type != TokenText {
|
||||
return nil, fmt.Errorf("expected TEXT to TTS")
|
||||
}
|
||||
p.nextToken() // consume FILE
|
||||
|
||||
text, err := p.parseQuotedString()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
return nil, fmt.Errorf("expect 'text' parameter")
|
||||
}
|
||||
p.nextToken()
|
||||
|
||||
// Semicolon is optional for UNSET TOKEN
|
||||
if p.curToken.Type != TokenQuotedString {
|
||||
return nil, fmt.Errorf("expect quoted string after 'text'")
|
||||
}
|
||||
cmd.Params["text"] = strings.Trim(p.curToken.Value, "\"'")
|
||||
p.nextToken()
|
||||
|
||||
for p.curToken.Type != TokenEOF && p.curToken.Type != TokenSemicolon {
|
||||
switch p.curToken.Type {
|
||||
case TokenPlay:
|
||||
p.nextToken()
|
||||
cmd.Params["play"] = true
|
||||
case TokenParam:
|
||||
p.nextToken()
|
||||
if p.curToken.Type != TokenQuotedString {
|
||||
return nil, fmt.Errorf("expect quoted string after 'param'")
|
||||
}
|
||||
cmd.Params["param_str"] = strings.Trim(p.curToken.Value, "\"'")
|
||||
p.nextToken()
|
||||
p.nextToken()
|
||||
case TokenSave:
|
||||
p.nextToken()
|
||||
|
||||
if p.curToken.Type != TokenQuotedString && p.curToken.Type != TokenIdentifier {
|
||||
return nil, fmt.Errorf("expect directory path after 'save'")
|
||||
}
|
||||
|
||||
cmd.Params["save"] = true
|
||||
cmd.Params["save_path"] = strings.Trim(p.curToken.Value, "\"'")
|
||||
p.nextToken()
|
||||
case TokenFormat:
|
||||
p.nextToken()
|
||||
if p.curToken.Type != TokenQuotedString && p.curToken.Type != TokenIdentifier {
|
||||
return nil, fmt.Errorf("expect format string (e.g. 'wav') after 'format'")
|
||||
}
|
||||
cmd.Params["format"] = strings.Trim(p.curToken.Value, "\"'")
|
||||
p.nextToken()
|
||||
default:
|
||||
return nil, fmt.Errorf("unexpected token: %s", p.curToken.Value)
|
||||
}
|
||||
}
|
||||
|
||||
if p.curToken.Type == TokenSemicolon {
|
||||
p.nextToken()
|
||||
}
|
||||
|
||||
cmd := NewCommand("tts_user_command")
|
||||
cmd.Params["composite_model_name"] = compositeModelName
|
||||
cmd.Params["text"] = text
|
||||
return cmd, nil
|
||||
}
|
||||
|
||||
|
||||
@ -356,3 +356,11 @@ RAGFlow(user)> list datasets;
|
||||
| 0 | naive | 1 | embedding-2@ZHIPU-AI | 0abe79f9423311f1ad8d38a74640adcc | English | ccc | aaa | me | 2ba4881420fa11f19e9c38a74640adcc | 0 | 1777375201933 |
|
||||
+-------------+--------------+----------------+----------------------+----------------------------------+----------+------+----------+------------+----------------------------------+-----------+---------------+
|
||||
```
|
||||
|
||||
### 6.23 Text to Speech
|
||||
|
||||
```
|
||||
RAGFlow(user)> tts with 'speech-2.8-hd@test@minimax' text 'He who desires but acts not, breeds pestilence.' play format 'wav' save './internal' param '{"voice_setting": {"voice_id": "English_radiant_girl", "speed": 1, "vol": 1, "pitch": 0}, "audio_setting": {"sample_rate": 32000, "bitrate": 128000, "format": "wav", "channel": 1}, "output_format": "hex"}'
|
||||
Saved to directory: /home/infiniflow/Documents/development/ragflow/internal/speech-2.8-hd_output.wav
|
||||
SUCCESS
|
||||
```
|
||||
|
||||
@ -19,6 +19,7 @@ package models
|
||||
import (
|
||||
"bufio"
|
||||
"bytes"
|
||||
"encoding/hex"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
@ -464,11 +465,194 @@ func (z *MinimaxModel) TranscribeAudioWithSender(modelName *string, file *string
|
||||
|
||||
// AudioSpeech converts text to audio (text to speech)
|
||||
func (z *MinimaxModel) AudioSpeech(modelName *string, audioContent *string, apiConfig *APIConfig, asrConfig *TTSConfig) (*TTSResponse, error) {
|
||||
return nil, fmt.Errorf("%s, no such method", z.Name())
|
||||
if apiConfig == nil || apiConfig.ApiKey == nil || *apiConfig.ApiKey == "" {
|
||||
return nil, fmt.Errorf("MiniMax API key is missing")
|
||||
}
|
||||
if audioContent == nil || *audioContent == "" {
|
||||
return nil, fmt.Errorf("text content is empty")
|
||||
}
|
||||
|
||||
var region = "default"
|
||||
if apiConfig.Region != nil && *apiConfig.Region != "" {
|
||||
region = *apiConfig.Region
|
||||
}
|
||||
|
||||
url := fmt.Sprintf("%s/%s", z.BaseURL[region], z.URLSuffix.TTS)
|
||||
|
||||
reqBody := map[string]interface{}{
|
||||
"model": modelName,
|
||||
"text": audioContent,
|
||||
}
|
||||
if asrConfig != nil && asrConfig.Params != nil {
|
||||
for key, value := range asrConfig.Params {
|
||||
reqBody[key] = value
|
||||
}
|
||||
}
|
||||
reqBody["stream"] = false
|
||||
|
||||
jsonData, err := json.Marshal(reqBody)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to marshal request: %w", err)
|
||||
}
|
||||
|
||||
req, err := http.NewRequest("POST", url, bytes.NewBuffer(jsonData))
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to create request: %w", err)
|
||||
}
|
||||
|
||||
req.Header.Set("Content-Type", "application/json")
|
||||
req.Header.Set("Authorization", fmt.Sprintf("Bearer %s", strings.TrimSpace(*apiConfig.ApiKey)))
|
||||
|
||||
resp, err := z.httpClient.Do(req)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to send request: %w", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to read response body: %w", err)
|
||||
}
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return nil, fmt.Errorf("MiniMax TTS API error: status %d, body: %s", resp.StatusCode, string(body))
|
||||
}
|
||||
|
||||
var result struct {
|
||||
BaseResp struct {
|
||||
StatusCode int `json:"status_code"`
|
||||
StatusMsg string `json:"status_msg"`
|
||||
} `json:"base_resp"`
|
||||
Data struct {
|
||||
Audio string `json:"audio"` // HEX
|
||||
} `json:"data"`
|
||||
}
|
||||
|
||||
if err := json.Unmarshal(body, &result); err != nil {
|
||||
return nil, fmt.Errorf("failed to parse response: %w", err)
|
||||
}
|
||||
|
||||
if result.BaseResp.StatusCode != 0 {
|
||||
return nil, fmt.Errorf("MiniMax TTS returned error: %d - %s", result.BaseResp.StatusCode, result.BaseResp.StatusMsg)
|
||||
}
|
||||
|
||||
// format HEX
|
||||
audioBytes, err := hex.DecodeString(result.Data.Audio)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("failed to decode MiniMax hex audio: %w", err)
|
||||
}
|
||||
|
||||
return &TTSResponse{
|
||||
Audio: audioBytes,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// tts with 'speech-2.8-hd@test@minimax' text 'If that day, out position was switched, would our fate, be different?' voice 'English_expressive_narrator' param '{"voice_setting": {"voice_id": "English_expressive_narrator", "speed": 1, "vol": 1, "pitch": 0}, "audio_setting": {"sample_rate": 32000, "bitrate": 128000, "format": "wav", "channel": 1}, "output_format": "hex"}'
|
||||
func (z *MinimaxModel) AudioSpeechWithSender(modelName *string, audioContent *string, apiConfig *APIConfig, ttsConfig *TTSConfig, sender func(*string, *string) error) error {
|
||||
return fmt.Errorf("%s, no such method", z.Name())
|
||||
if apiConfig == nil || apiConfig.ApiKey == nil || *apiConfig.ApiKey == "" {
|
||||
return fmt.Errorf("MiniMax API key is missing")
|
||||
}
|
||||
if audioContent == nil || *audioContent == "" {
|
||||
return fmt.Errorf("text content is empty")
|
||||
}
|
||||
|
||||
var region = "default"
|
||||
if apiConfig.Region != nil && *apiConfig.Region != "" {
|
||||
region = *apiConfig.Region
|
||||
}
|
||||
|
||||
baseURL := strings.TrimSuffix(z.BaseURL[region], "/")
|
||||
if baseURL == "" {
|
||||
baseURL = strings.TrimSuffix(z.BaseURL["default"], "/")
|
||||
}
|
||||
suffix := strings.TrimPrefix(z.URLSuffix.TTS, "/")
|
||||
if suffix == "" {
|
||||
suffix = "v1/t2a_v2"
|
||||
}
|
||||
url := fmt.Sprintf("%s/%s", baseURL, suffix)
|
||||
|
||||
reqBody := map[string]interface{}{
|
||||
"model": modelName,
|
||||
"text": audioContent,
|
||||
}
|
||||
if ttsConfig != nil && ttsConfig.Params != nil {
|
||||
for key, value := range ttsConfig.Params {
|
||||
reqBody[key] = value
|
||||
}
|
||||
}
|
||||
reqBody["stream"] = false
|
||||
reqBody["stream"] = true
|
||||
|
||||
jsonData, err := json.Marshal(reqBody)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to marshal request: %w", err)
|
||||
}
|
||||
|
||||
req, err := http.NewRequest("POST", url, bytes.NewBuffer(jsonData))
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to create request: %w", err)
|
||||
}
|
||||
|
||||
req.Header.Set("Content-Type", "application/json")
|
||||
req.Header.Set("Authorization", fmt.Sprintf("Bearer %s", strings.TrimSpace(*apiConfig.ApiKey)))
|
||||
|
||||
resp, err := z.httpClient.Do(req)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to send request: %w", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
body, _ := io.ReadAll(resp.Body)
|
||||
return fmt.Errorf("MiniMax stream TTS API error: %d, body: %s", resp.StatusCode, string(body))
|
||||
}
|
||||
|
||||
scanner := bufio.NewScanner(resp.Body)
|
||||
scanner.Buffer(make([]byte, 64*1024), 2*1024*1024)
|
||||
|
||||
for scanner.Scan() {
|
||||
line := scanner.Text()
|
||||
|
||||
if !strings.HasPrefix(line, "data:") {
|
||||
continue
|
||||
}
|
||||
|
||||
dataStr := strings.TrimSpace(line[5:])
|
||||
if dataStr == "" {
|
||||
continue
|
||||
}
|
||||
|
||||
var event struct {
|
||||
Data struct {
|
||||
Audio string `json:"audio"`
|
||||
Status int `json:"status"`
|
||||
} `json:"data"`
|
||||
}
|
||||
|
||||
if err := json.Unmarshal([]byte(dataStr), &event); err != nil {
|
||||
continue
|
||||
}
|
||||
|
||||
if event.Data.Audio != "" {
|
||||
audioBytes, err := hex.DecodeString(event.Data.Audio)
|
||||
if err == nil && len(audioBytes) > 0 {
|
||||
chunk := string(audioBytes)
|
||||
if errSend := sender(&chunk, nil); errSend != nil {
|
||||
return errSend
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if event.Data.Status == 2 {
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
if err := scanner.Err(); err != nil {
|
||||
return fmt.Errorf("error reading minimax stream: %w", err)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// OCRFile OCR file
|
||||
|
||||
@ -65,6 +65,7 @@ type ASRResponse struct {
|
||||
}
|
||||
|
||||
type TTSResponse struct {
|
||||
Audio []byte `json:"audio"`
|
||||
}
|
||||
|
||||
type OCRResponse struct {
|
||||
@ -83,6 +84,7 @@ type URLSuffix struct {
|
||||
Balance string `json:"balance"`
|
||||
Files string `json:"files"`
|
||||
Status string `json:"status"`
|
||||
TTS string `json:"tts"`
|
||||
}
|
||||
|
||||
type ChatConfig struct {
|
||||
@ -116,6 +118,7 @@ type ASRConfig struct {
|
||||
}
|
||||
|
||||
type TTSConfig struct {
|
||||
Params map[string]interface{}
|
||||
}
|
||||
|
||||
type OCRConfig struct {
|
||||
|
||||
@ -1166,14 +1166,12 @@ func (h *ProviderHandler) TranscribeAudio(c *gin.Context) {
|
||||
}
|
||||
|
||||
type AudioSpeechRequest struct {
|
||||
ProviderName *string `json:"provider_name"`
|
||||
InstanceName *string `json:"instance_name"`
|
||||
ModelName *string `json:"model_name"`
|
||||
Text *string `json:"text"`
|
||||
Language []string `json:"language"`
|
||||
Voice int `json:"voice"`
|
||||
Stream bool `json:"stream"`
|
||||
Volume bool `json:"volume"`
|
||||
ProviderName *string `json:"provider_name"`
|
||||
InstanceName *string `json:"instance_name"`
|
||||
ModelName *string `json:"model_name"`
|
||||
Text *string `json:"text"`
|
||||
Stream bool `json:"stream"`
|
||||
TTSConfig *models.TTSConfig `json:"tts_config"`
|
||||
}
|
||||
|
||||
func (h *ProviderHandler) AudioSpeech(c *gin.Context) {
|
||||
@ -1219,6 +1217,9 @@ func (h *ProviderHandler) AudioSpeech(c *gin.Context) {
|
||||
}
|
||||
|
||||
ttsConfig := models.TTSConfig{}
|
||||
if req.TTSConfig != nil {
|
||||
ttsConfig = *req.TTSConfig
|
||||
}
|
||||
|
||||
// Check if it's a stream request
|
||||
if req.Stream {
|
||||
|
||||
Reference in New Issue
Block a user