tts add voice choose (#2391)

Co-authored-by: luowei <glpat-EjySCyNjWiLqAED-YmwM>
Co-authored-by: crazywoola <427733928@qq.com>
Co-authored-by: crazywoola <100913391+crazywoola@users.noreply.github.com>
This commit is contained in:
Charlie.Wei
2024-02-15 22:41:18 +08:00
committed by GitHub
parent e47b5b43b8
commit 300d9892a5
35 changed files with 746 additions and 92 deletions

View File

@ -1,7 +1,7 @@
import logging
from flask import request
from flask_restful import Resource
from flask_restful import Resource, reqparse
from werkzeug.exceptions import InternalServerError
import services
@ -23,6 +23,7 @@ from controllers.console.wraps import account_initialization_required
from core.errors.error import ModelCurrentlyNotSupportError, ProviderTokenNotInitError, QuotaExceededError
from core.model_runtime.errors.invoke import InvokeError
from libs.login import login_required
from models.model import AppModelConfig
from services.audio_service import AudioService
from services.errors.audio import (
AudioTooLargeServiceError,
@ -45,7 +46,9 @@ class ChatMessageAudioApi(Resource):
try:
response = AudioService.transcript_asr(
tenant_id=app_model.tenant_id,
file=file
file=file,
end_user=None,
prompt=app_model.app_model_config.pre_prompt
)
return response
@ -71,7 +74,7 @@ class ChatMessageAudioApi(Resource):
except ValueError as e:
raise e
except Exception as e:
logging.exception("internal server error.")
logging.exception(f"internal server error, {str(e)}.")
raise InternalServerError()
@ -82,10 +85,17 @@ class ChatMessageTextApi(Resource):
def post(self, app_id):
app_id = str(app_id)
app_model = _get_app(app_id, None)
app_model_config: AppModelConfig = app_model.app_model_config
if not app_model_config.text_to_speech_dict['enabled']:
raise AppUnavailableError()
try:
response = AudioService.transcript_tts(
tenant_id=app_model.tenant_id,
text=request.form['text'],
voice=app_model.app_model_config.text_to_speech_dict.get('voice'),
streaming=False
)
@ -112,9 +122,54 @@ class ChatMessageTextApi(Resource):
except ValueError as e:
raise e
except Exception as e:
logging.exception("internal server error.")
logging.exception(f"internal server error, {str(e)}.")
raise InternalServerError()
class TextModesApi(Resource):
def get(self, app_id: str):
app_model = _get_app(str(app_id))
app_model_config: AppModelConfig = app_model.app_model_config
if not app_model_config.text_to_speech_dict['enabled']:
raise AppUnavailableError()
try:
parser = reqparse.RequestParser()
parser.add_argument('language', type=str, required=True, location='args')
args = parser.parse_args()
response = AudioService.transcript_tts_voices(
tenant_id=app_model.tenant_id,
language=args['language'],
)
return response
except services.errors.audio.ProviderNotSupportTextToSpeechLanguageServiceError:
raise AppUnavailableError("Text-to-audio voices language parameter is missing.")
except NoAudioUploadedServiceError:
raise NoAudioUploadedError()
except AudioTooLargeServiceError as e:
raise AudioTooLargeError(str(e))
except UnsupportedAudioTypeServiceError:
raise UnsupportedAudioTypeError()
except ProviderNotSupportSpeechToTextServiceError:
raise ProviderNotSupportSpeechToTextError()
except ProviderTokenNotInitError as ex:
raise ProviderNotInitializeError(ex.description)
except QuotaExceededError:
raise ProviderQuotaExceededError()
except ModelCurrentlyNotSupportError:
raise ProviderModelCurrentlyNotSupportError()
except InvokeError as e:
raise CompletionRequestError(e.description)
except ValueError as e:
raise e
except Exception as e:
logging.exception(f"internal server error, {str(e)}.")
raise InternalServerError()
api.add_resource(ChatMessageAudioApi, '/apps/<uuid:app_id>/audio-to-text')
api.add_resource(ChatMessageTextApi, '/apps/<uuid:app_id>/text-to-audio')
api.add_resource(TextModesApi, '/apps/<uuid:app_id>/text-to-audio/voices')
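For reference, the new voices endpoint can be exercised as below. This is a minimal sketch, not part of the commit: the `/console/api` prefix, base URL, and bearer token are assumptions, and the response shape follows get_tts_model_voices (a list of {'name', 'value'} dicts).

import requests

# Hypothetical values; the console API requires an authenticated session in practice.
BASE_URL = "http://localhost:5001/console/api"
APP_ID = "your-app-uuid"

resp = requests.get(
    f"{BASE_URL}/apps/{APP_ID}/text-to-audio/voices",
    params={"language": "en-US"},  # required query argument, enforced by reqparse above
    headers={"Authorization": "Bearer <access-token>"},
)
resp.raise_for_status()
print(resp.json())  # e.g. [{"name": "Alloy", "value": "alloy"}, ...]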

View File

@ -85,6 +85,7 @@ class ChatTextApi(InstalledAppResource):
response = AudioService.transcript_tts(
tenant_id=app_model.tenant_id,
text=request.form['text'],
voice=app_model.app_model_config.text_to_speech_dict.get('voice'),
streaming=False
)
return {'data': response.data.decode('latin1')}

View File

@ -86,6 +86,7 @@ class TextApi(AppApiResource):
tenant_id=app_model.tenant_id,
text=args['text'],
end_user=args['user'],
voice=app_model.app_model_config.text_to_speech_dict.get('voice'),
streaming=args['streaming']
)

View File

@ -68,17 +68,23 @@ class AudioApi(WebApiResource):
except ValueError as e:
raise e
except Exception as e:
logging.exception("internal server error.")
logging.exception(f"internal server error: {str(e)}")
raise InternalServerError()
class TextApi(WebApiResource):
def post(self, app_model: App, end_user):
app_model_config: AppModelConfig = app_model.app_model_config
if not app_model_config.text_to_speech_dict['enabled']:
raise AppUnavailableError()
try:
response = AudioService.transcript_tts(
tenant_id=app_model.tenant_id,
text=request.form['text'],
end_user=end_user.external_user_id,
voice=app_model.app_model_config.text_to_speech_dict.get('voice'),
streaming=False
)
@ -105,7 +111,7 @@ class TextApi(WebApiResource):
except ValueError as e:
raise e
except Exception as e:
logging.exception("internal server error.")
logging.exception(f"internal server error: {str(e)}")
raise InternalServerError()
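On the web API side the voice is no longer client-supplied; it is read from the app's text_to_speech config. A hypothetical request, assuming a local deployment (the route prefix and app token are placeholders, not confirmed by this diff):

import requests

resp = requests.post(
    "http://localhost:5001/api/text-to-audio",  # assumed web API route for TextApi
    headers={"Authorization": "Bearer app-<token>"},
    data={"text": "Hello Dify!"},  # no 'voice' field: it comes from app_model_config
)
with open("speech.mp3", "wb") as f:
    f.write(resp.content)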

View File

@ -28,6 +28,7 @@ from core.entities.application_entities import (
ModelConfigEntity,
PromptTemplateEntity,
SensitiveWordAvoidanceEntity,
TextToSpeechEntity,
)
from core.entities.model_entities import ModelStatus
from core.errors.error import ModelCurrentlyNotSupportError, ProviderTokenNotInitError, QuotaExceededError
@ -572,7 +573,11 @@ class ApplicationManager:
text_to_speech_dict = copy_app_model_config_dict.get('text_to_speech')
if text_to_speech_dict:
if 'enabled' in text_to_speech_dict and text_to_speech_dict['enabled']:
properties['text_to_speech'] = True
properties['text_to_speech'] = TextToSpeechEntity(
enabled=text_to_speech_dict.get('enabled'),
voice=text_to_speech_dict.get('voice'),
language=text_to_speech_dict.get('language'),
)
# sensitive word avoidance
sensitive_word_avoidance_dict = copy_app_model_config_dict.get('sensitive_word_avoidance')

View File

@ -42,6 +42,7 @@ class AdvancedCompletionPromptTemplateEntity(BaseModel):
"""
Advanced Completion Prompt Template Entity.
"""
class RolePrefixEntity(BaseModel):
"""
Role Prefix Entity.
@ -57,6 +58,7 @@ class PromptTemplateEntity(BaseModel):
"""
Prompt Template Entity.
"""
class PromptType(Enum):
"""
Prompt Type.
@ -97,6 +99,7 @@ class DatasetRetrieveConfigEntity(BaseModel):
"""
Dataset Retrieve Config Entity.
"""
class RetrieveStrategy(Enum):
"""
Dataset Retrieve Strategy.
@ -143,6 +146,15 @@ class SensitiveWordAvoidanceEntity(BaseModel):
config: dict[str, Any] = {}
class TextToSpeechEntity(BaseModel):
"""
Text To Speech Entity.
"""
enabled: bool
voice: Optional[str] = None
language: Optional[str] = None
class FileUploadEntity(BaseModel):
"""
File Upload Entity.
@ -159,6 +171,7 @@ class AgentToolEntity(BaseModel):
tool_name: str
tool_parameters: dict[str, Any] = {}
class AgentPromptEntity(BaseModel):
"""
Agent Prompt Entity.
@ -166,6 +179,7 @@ class AgentPromptEntity(BaseModel):
first_prompt: str
next_iteration: str
class AgentScratchpadUnit(BaseModel):
"""
Agent First Prompt Entity.
@ -182,12 +196,14 @@ class AgentScratchpadUnit(BaseModel):
thought: Optional[str] = None
action_str: Optional[str] = None
observation: Optional[str] = None
action: Optional[Action] = None
action: Optional[Action] = None
class AgentEntity(BaseModel):
"""
Agent Entity.
"""
class Strategy(Enum):
"""
Agent Strategy.
@ -202,6 +218,7 @@ class AgentEntity(BaseModel):
tools: list[AgentToolEntity] = None
max_iteration: int = 5
class AppOrchestrationConfigEntity(BaseModel):
"""
App Orchestration Config Entity.
@ -219,7 +236,7 @@ class AppOrchestrationConfigEntity(BaseModel):
show_retrieve_source: bool = False
more_like_this: bool = False
speech_to_text: bool = False
text_to_speech: bool = False
text_to_speech: dict = {}
sensitive_word_avoidance: Optional[SensitiveWordAvoidanceEntity] = None
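Since voice and language are Optional with None defaults, older app configs that only stored enabled still validate. A standalone sketch of that behavior, restating the same field definitions:

from typing import Optional

from pydantic import BaseModel

class TextToSpeechEntity(BaseModel):
    enabled: bool
    voice: Optional[str] = None
    language: Optional[str] = None

# A legacy config without voice/language still parses; both fields default to None.
entity = TextToSpeechEntity(**{"enabled": True})
assert entity.voice is None and entity.language is None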

View File

@ -99,7 +99,8 @@ class ModelInstance:
user=user
)
def invoke_rerank(self, query: str, docs: list[str], score_threshold: Optional[float] = None, top_n: Optional[int] = None,
def invoke_rerank(self, query: str, docs: list[str], score_threshold: Optional[float] = None,
top_n: Optional[int] = None,
user: Optional[str] = None) \
-> RerankResult:
"""
@ -166,13 +167,15 @@ class ModelInstance:
user=user
)
def invoke_tts(self, content_text: str, streaming: bool, user: Optional[str] = None) \
def invoke_tts(self, content_text: str, tenant_id: str, voice: str, streaming: bool, user: Optional[str] = None) \
-> str:
"""
Invoke large language model
Invoke text-to-speech model
:param content_text: text content to be translated
:param tenant_id: user tenant id
:param user: unique user id
:param voice: model timbre
:param streaming: output is streaming
:return: text for given audio file
"""
@ -185,9 +188,28 @@ class ModelInstance:
credentials=self.credentials,
content_text=content_text,
user=user,
tenant_id=tenant_id,
voice=voice,
streaming=streaming
)
def get_tts_voices(self, language: str) -> list:
"""
Get available voices for the tts model
:param language: tts language
:return: tts model voices
"""
if not isinstance(self.model_type_instance, TTSModel):
raise Exception("Model type instance is not TTSModel")
self.model_type_instance = cast(TTSModel, self.model_type_instance)
return self.model_type_instance.get_tts_model_voices(
model=self.model,
credentials=self.credentials,
language=language
)
class ModelManager:
def __init__(self) -> None:
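Callers are expected to resolve a TTS model instance first and then ask it for voices, mirroring what services/audio_service.py does later in this commit (import paths below are assumed from their usage elsewhere):

from core.model_runtime.entities.model_entities import ModelType  # assumed import path
from core.model_runtime.model_manager import ModelManager  # assumed import path

model_manager = ModelManager()
model_instance = model_manager.get_default_model_instance(
    tenant_id="tenant-uuid",  # placeholder
    model_type=ModelType.TTS,
)
voices = model_instance.get_tts_voices(language="en-US")
# -> [{"name": "Alloy", "value": "alloy"}, ...]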

View File

@ -48,6 +48,10 @@
- `file_upload_limit` (int) Maximum file upload limit, in MB (available for model type `speech2text`)
- `supported_file_extensions` (string) Supported file extension formats, e.g., mp3, mp4 (available for model type `speech2text`)
- `default_voice` (string) Default voice, e.g. alloy, echo, fable, onyx, nova, shimmer (available for model type `tts`)
- `voices` (list) List of available voices (available for model type `tts`)
  - `mode` (string) Voice model (available for model type `tts`)
  - `name` (string) Voice model display name (available for model type `tts`)
  - `language` (string) Languages supported by the voice model (available for model type `tts`)
- `word_limit` (int) Single conversion word limit, split by paragraph by default (available for model type `tts`)
- `audio_type` (string) Supported audio file extension formats, e.g. mp3, wav (available for model type `tts`)
- `max_workers` (int) Number of concurrent workers for text-to-audio conversion (available for model type `tts`)
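Reading these properties back is what get_tts_model_voices does further down in this commit; the filtering rule can be sketched in isolation like this (the dict mirrors the tts-1 YAML added below):

model_properties = {
    "default_voice": "alloy",
    "voices": [
        {"mode": "alloy", "name": "Alloy", "language": ["zh-CN", "en-US"]},
        {"mode": "echo", "name": "Echo", "language": ["zh-CN", "en-US"]},
    ],
    "word_limit": 120,
    "audio_type": "mp3",
    "max_workers": 5,
}

language = "en-US"
# Same shape as TTSModel.get_tts_model_voices: keep voices whose language list contains the requested language.
voices = [
    {"name": v["name"], "value": v["mode"]}
    for v in model_properties["voices"]
    if language in v["language"]
]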

View File

@ -48,7 +48,11 @@
- `max_chunks` (int) Maximum number of chunks (available for model types `text-embedding` and `moderation`)
- `file_upload_limit` (int) Maximum file upload limit, in MB (available for model type `speech2text`)
- `supported_file_extensions` (string) Supported file extension formats, e.g. mp3, mp4 (available for model type `speech2text`)
- `default_voice` (string) Default voice, e.g. alloy, echo, fable, onyx, nova, shimmer (available for model type `tts`)
- `default_voice` (string) Default voice, e.g. alloy, echo, fable, onyx, nova, shimmer (available for model type `tts`)
- `voices` (list) List of selectable voices (available for model type `tts`)
  - `mode` (string) Voice model (available for model type `tts`)
  - `name` (string) Voice model display name (available for model type `tts`)
  - `language` (string) Languages supported by the voice model (available for model type `tts`)
- `word_limit` (int) Single conversion word limit, split by paragraph by default (available for model type `tts`)
- `audio_type` (string) Supported audio file extension formats, e.g. mp3, wav (available for model type `tts`)
- `max_workers` (int) Number of concurrent workers for text-to-audio conversion (available for model type `tts`)

View File

@ -127,6 +127,7 @@ class ModelPropertyKey(Enum):
SUPPORTED_FILE_EXTENSIONS = "supported_file_extensions"
MAX_CHARACTERS_PER_CHUNK = "max_characters_per_chunk"
DEFAULT_VOICE = "default_voice"
VOICES = "voices"
WORD_LIMIT = "word_limit"
AUDOI_TYPE = "audio_type"
MAX_WORKERS = "max_workers"

View File

@ -15,29 +15,37 @@ class TTSModel(AIModel):
"""
model_type: ModelType = ModelType.TTS
def invoke(self, model: str, credentials: dict, content_text: str, streaming: bool, user: Optional[str] = None):
def invoke(self, model: str, tenant_id: str, credentials: dict, content_text: str, voice: str, streaming: bool,
user: Optional[str] = None):
"""
Invoke text-to-speech model
:param model: model name
:param tenant_id: user tenant id
:param credentials: model credentials
:param voice: model timbre
:param content_text: text content to be translated
:param streaming: output is streaming
:param user: unique user id
:return: translated audio file
"""
try:
return self._invoke(model=model, credentials=credentials, user=user, streaming=streaming, content_text=content_text)
self._is_ffmpeg_installed()
return self._invoke(model=model, credentials=credentials, user=user, streaming=streaming,
content_text=content_text, voice=voice, tenant_id=tenant_id)
except Exception as e:
raise self._transform_invoke_error(e)
@abstractmethod
def _invoke(self, model: str, credentials: dict, content_text: str, streaming: bool, user: Optional[str] = None):
def _invoke(self, model: str, tenant_id: str, credentials: dict, content_text: str, voice: str, streaming: bool,
user: Optional[str] = None):
"""
Invoke text-to-speech model
:param model: model name
:param tenant_id: user tenant id
:param credentials: model credentials
:param voice: model timbre
:param content_text: text content to be translated
:param streaming: output is streaming
:param user: unique user id
@ -45,7 +53,22 @@ class TTSModel(AIModel):
"""
raise NotImplementedError
def _get_model_voice(self, model: str, credentials: dict) -> any:
def get_tts_model_voices(self, model: str, credentials: dict, language: str) -> list:
"""
Get voices for the given tts model
:param language: tts language
:param model: model name
:param credentials: model credentials
:return: list of voices
"""
model_schema = self.get_model_schema(model, credentials)
if model_schema and ModelPropertyKey.VOICES in model_schema.model_properties:
voices = model_schema.model_properties[ModelPropertyKey.VOICES]
return [{'name': d['name'], 'value': d['mode']} for d in voices if language and language in d.get('language')]
def _get_model_default_voice(self, model: str, credentials: dict) -> any:
"""
Get the default voice for the given tts model
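Note that invoke() now runs _is_ffmpeg_installed() once before dispatching to the provider. The actual helper is not shown in this diff; a dependency check of this kind typically reduces to something like the following sketch (name and message are hypothetical):

import shutil

def _is_ffmpeg_installed_sketch() -> None:
    # pydub needs ffmpeg on PATH to decode and concatenate the per-sentence audio chunks.
    if shutil.which("ffmpeg") is None:
        raise RuntimeError("ffmpeg is required for TTS audio processing but was not found on PATH")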

View File

@ -1,7 +1,31 @@
model: tts-1-hd
model: tts-1
model_type: tts
model_properties:
default_voice: 'alloy'
voices:
- mode: 'alloy'
name: 'Alloy'
language: ['zh-CN', 'en-US']
- mode: 'echo'
name: 'Echo'
language: ['zh-CN', 'en-US']
- mode: 'fable'
name: 'Fable'
language: ['zh-CN', 'en-US']
- mode: 'onyx'
name: 'Onyx'
language: ['zh-CN', 'en-US']
- mode: 'nova'
name: 'Nova'
language: ['zh-CN', 'en-US']
- mode: 'shimmer'
name: 'Shimmer'
language: ['zh-CN', 'en-US']
word_limit: 120
audio_type: 'mp3'
max_workers: 5
pricing:
input: '0.03'
output: '0'
unit: '0.001'
currency: USD
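Assuming Dify's usual pricing convention (characters × input × unit), input '0.03' with unit '0.001' works out to $0.03 per 1,000 characters, so a 500-character synthesis request would cost 500 × 0.03 × 0.001 = $0.015.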

View File

@ -2,6 +2,30 @@ model: tts-1
model_type: tts
model_properties:
default_voice: 'alloy'
voices:
- mode: 'alloy'
name: 'Alloy'
language: ['zh-CN', 'en-US']
- mode: 'echo'
name: 'Echo'
language: ['zh-CN', 'en-US']
- mode: 'fable'
name: 'Fable'
language: ['zh-CN', 'en-US']
- mode: 'onyx'
name: 'Onyx'
language: ['zh-CN', 'en-US']
- mode: 'nova'
name: 'Nova'
language: ['zh-CN', 'en-US']
- mode: 'shimmer'
name: 'Shimmer'
language: ['zh-CN', 'en-US']
word_limit: 120
audio_type: 'mp3'
max_workers: 5
pricing:
input: '0.015'
output: '0'
unit: '0.001'
currency: USD

View File

@ -11,33 +11,40 @@ from core.model_runtime.errors.invoke import InvokeBadRequestError
from core.model_runtime.errors.validate import CredentialsValidateFailedError
from core.model_runtime.model_providers.__base.tts_model import TTSModel
from core.model_runtime.model_providers.openai._common import _CommonOpenAI
from extensions.ext_storage import storage
class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel):
"""
Model class for OpenAI text2speech model.
"""
def _invoke(self, model: str, credentials: dict, content_text: str, streaming: bool, user: Optional[str] = None) -> any:
def _invoke(self, model: str, tenant_id: str, credentials: dict,
content_text: str, voice: str, streaming: bool, user: Optional[str] = None) -> any:
"""
_invoke text2speech model
:param model: model name
:param tenant_id: user tenant id
:param credentials: model credentials
:param content_text: text content to be translated
:param voice: model timbre
:param streaming: output is streaming
:param user: unique user id
:return: text translated to audio file
"""
self._is_ffmpeg_installed()
audio_type = self._get_model_audio_type(model, credentials)
if not voice:
voice = self._get_model_default_voice(model, credentials)
if streaming:
return Response(stream_with_context(self._tts_invoke_streaming(model=model,
credentials=credentials,
content_text=content_text,
user=user)),
tenant_id=tenant_id,
voice=voice)),
status=200, mimetype=f'audio/{audio_type}')
else:
return self._tts_invoke(model=model, credentials=credentials, content_text=content_text, user=user)
return self._tts_invoke(model=model, credentials=credentials, content_text=content_text, voice=voice)
def validate_credentials(self, model: str, credentials: dict, user: Optional[str] = None) -> None:
"""
@ -52,91 +59,96 @@ class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel):
self._tts_invoke(
model=model,
credentials=credentials,
content_text='Hello world!',
user=user
content_text='Hello Dify!',
voice=self._get_model_default_voice(model, credentials),
)
except Exception as ex:
raise CredentialsValidateFailedError(str(ex))
def _tts_invoke(self, model: str, credentials: dict, content_text: str, user: Optional[str] = None) -> Response:
def _tts_invoke(self, model: str, credentials: dict, content_text: str, voice: str) -> Response:
"""
_tts_invoke text2speech model
:param model: model name
:param credentials: model credentials
:param content_text: text content to be translated
:param user: unique user id
:param voice: model timbre
:return: text translated to audio file
"""
audio_type = self._get_model_audio_type(model, credentials)
word_limit = self._get_model_word_limit(model, credentials)
max_workers = self._get_model_workers_limit(model, credentials)
try:
sentences = list(self._split_text_into_sentences(text=content_text, limit=word_limit))
audio_bytes_list = list()
# Create a thread pool and map the function to the list of sentences
with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
futures = [executor.submit(self._process_sentence, sentence, model, credentials) for sentence
in sentences]
futures = [executor.submit(self._process_sentence, sentence=sentence, model=model, voice=voice,
credentials=credentials) for sentence in sentences]
for future in futures:
try:
audio_bytes_list.append(future.result())
if future.result():
audio_bytes_list.append(future.result())
except Exception as ex:
raise InvokeBadRequestError(str(ex))
audio_segments = [AudioSegment.from_file(BytesIO(audio_bytes), format=audio_type) for audio_bytes in
audio_bytes_list if audio_bytes]
combined_segment = reduce(lambda x, y: x + y, audio_segments)
buffer: BytesIO = BytesIO()
combined_segment.export(buffer, format=audio_type)
buffer.seek(0)
return Response(buffer.read(), status=200, mimetype=f"audio/{audio_type}")
if len(audio_bytes_list) > 0:
audio_segments = [AudioSegment.from_file(BytesIO(audio_bytes), format=audio_type) for audio_bytes in
audio_bytes_list if audio_bytes]
combined_segment = reduce(lambda x, y: x + y, audio_segments)
buffer: BytesIO = BytesIO()
combined_segment.export(buffer, format=audio_type)
buffer.seek(0)
return Response(buffer.read(), status=200, mimetype=f"audio/{audio_type}")
except Exception as ex:
raise InvokeBadRequestError(str(ex))
# TODO: improve the streaming function
def _tts_invoke_streaming(self, model: str, credentials: dict, content_text: str, user: Optional[str] = None) -> any:
def _tts_invoke_streaming(self, model: str, tenant_id: str, credentials: dict, content_text: str,
voice: str) -> any:
"""
_tts_invoke_streaming text2speech model
:param model: model name
:param tenant_id: user tenant id
:param credentials: model credentials
:param content_text: text content to be translated
:param user: unique user id
:param voice: model timbre
:return: text translated to audio file
"""
# transform credentials to kwargs for model instance
credentials_kwargs = self._to_credential_kwargs(credentials)
voice_name = self._get_model_voice(model, credentials)
if not voice:
voice = self._get_model_default_voice(model, credentials)
word_limit = self._get_model_word_limit(model, credentials)
audio_type = self._get_model_audio_type(model, credentials)
tts_file_id = self._get_file_name(content_text)
file_path = f'storage/generate_files/{audio_type}/{tts_file_id}.{audio_type}'
file_path = f'generate_files/audio/{tenant_id}/{tts_file_id}.{audio_type}'
try:
client = OpenAI(**credentials_kwargs)
sentences = list(self._split_text_into_sentences(text=content_text, limit=word_limit))
for sentence in sentences:
response = client.audio.speech.create(model=model, voice=voice_name, input=sentence.strip())
response.stream_to_file(file_path)
response = client.audio.speech.create(model=model, voice=voice, input=sentence.strip())
# response.stream_to_file(file_path)
storage.save(file_path, response.read())
except Exception as ex:
raise InvokeBadRequestError(str(ex))
def _process_sentence(self, sentence: str, model: str, credentials: dict):
def _process_sentence(self, sentence: str, model: str,
voice, credentials: dict):
"""
_tts_invoke openai text2speech model api
:param model: model name
:param credentials: model credentials
:param voice: model timbre
:param sentence: text content to be translated
:return: text translated to audio file
"""
# transform credentials to kwargs for model instance
credentials_kwargs = self._to_credential_kwargs(credentials)
voice_name = self._get_model_voice(model, credentials)
client = OpenAI(**credentials_kwargs)
response = client.audio.speech.create(model=model, voice=voice_name, input=sentence.strip())
response = client.audio.speech.create(model=model, voice=voice, input=sentence.strip())
if isinstance(response.read(), bytes):
return response.read()
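The non-streaming path above splits text into sentences, synthesizes them concurrently, then stitches the chunks together with pydub. The concatenation step in isolation, as a self-contained sketch (assumes ffmpeg is installed):

from functools import reduce
from io import BytesIO

from pydub import AudioSegment

def concat_audio(chunks: list[bytes], audio_type: str = "mp3") -> bytes:
    # Decode each chunk, concatenate (pydub overloads + for AudioSegment), export once.
    segments = [AudioSegment.from_file(BytesIO(b), format=audio_type) for b in chunks if b]
    combined = reduce(lambda x, y: x + y, segments)
    buffer = BytesIO()
    combined.export(buffer, format=audio_type)
    return buffer.getvalue()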

View File

@ -1,7 +1,134 @@
model: tts-1
model_type: tts
model_properties:
default_voice: 'sambert-zhiru-v1'  # See https://help.aliyun.com/zh/dashscope/model-list for available voices
default_voice: 'sambert-zhiru-v1'
voices:
- mode: "sambert-zhinan-v1"
name: "知楠(广告男声)"
language: [ "zh-CN", "en-US" ]
- mode: "sambert-zhiqi-v1"
name: "知琪(温柔女声)"
language: [ "zh-CN", "en-US" ]
- mode: "sambert-zhichu-v1"
name: "知厨(新闻播报)"
language: [ "zh-CN", "en-US" ]
- mode: "sambert-zhide-v1"
name: "知德(新闻男声)"
language: [ "zh-CN", "en-US" ]
- mode: "sambert-zhijia-v1"
name: "知佳(标准女声)"
language: [ "zh-CN", "en-US" ]
- mode: "sambert-zhiru-v1"
name: "知茹(新闻女声)"
language: [ "zh-CN", "en-US" ]
- mode: "sambert-zhiqian-v1"
name: "知倩(配音解说、新闻播报)"
language: [ "zh-CN", "en-US" ]
- mode: "sambert-zhixiang-v1"
name: "知祥(配音解说)"
language: [ "zh-CN", "en-US" ]
- mode: "sambert-zhiwei-v1"
name: "知薇(萝莉女声)"
language: [ "zh-CN", "en-US" ]
- mode: "sambert-zhihao-v1"
name: "知浩(咨询男声)"
language: [ "zh-CN", "en-US" ]
- mode: "sambert-zhijing-v1"
name: "知婧(严厉女声)"
language: [ "zh-CN", "en-US" ]
- mode: "sambert-zhiming-v1"
name: "知茗(诙谐男声)"
language: [ "zh-CN", "en-US" ]
- mode: "sambert-zhimo-v1"
name: "知墨(情感男声)"
language: [ "zh-CN", "en-US" ]
- mode: "sambert-zhina-v1"
name: "知娜(浙普女声)"
language: [ "zh-CN", "en-US" ]
- mode: "sambert-zhishu-v1"
name: "知树(资讯男声)"
language: [ "zh-CN", "en-US" ]
- mode: "sambert-zhistella-v1"
name: "知莎(知性女声)"
language: [ "zh-CN", "en-US" ]
- mode: "sambert-zhiting-v1"
name: "知婷(电台女声)"
language: [ "zh-CN", "en-US" ]
- mode: "sambert-zhixiao-v1"
name: "知笑(资讯女声)"
language: [ "zh-CN", "en-US" ]
- mode: "sambert-zhiya-v1"
name: "知雅(严厉女声)"
language: [ "zh-CN", "en-US" ]
- mode: "sambert-zhiye-v1"
name: "知晔(青年男声)"
language: [ "zh-CN", "en-US" ]
- mode: "sambert-zhiying-v1"
name: "知颖(软萌童声)"
language: [ "zh-CN", "en-US" ]
- mode: "sambert-zhiyuan-v1"
name: "知媛(知心姐姐)"
language: [ "zh-CN", "en-US" ]
- mode: "sambert-zhigui-v1"
name: "知柜(直播女声)"
language: [ "zh-CN", "en-US" ]
- mode: "sambert-zhishuo-v1"
name: "知硕(自然男声)"
language: [ "zh-CN", "en-US" ]
- mode: "sambert-zhimiao-emo-v1"
name: "知妙(多种情感女声)"
language: [ "zh-CN", "en-US" ]
- mode: "sambert-zhimao-v1"
name: "知猫(直播女声)"
language: [ "zh-CN", "en-US" ]
- mode: "sambert-zhilun-v1"
name: "知伦(悬疑解说)"
language: [ "zh-CN", "en-US" ]
- mode: "sambert-zhifei-v1"
name: "知飞(激昂解说)"
language: [ "zh-CN", "en-US" ]
- mode: "sambert-zhida-v1"
name: "知达(标准男声)"
language: [ "zh-CN", "en-US" ]
- mode: "sambert-camila-v1"
name: "Camila西班牙语女声"
language: [ "es-ES" ]
- mode: "sambert-perla-v1"
name: "Perla意大利语女声"
language: [ "it-IT" ]
- mode: "sambert-indah-v1"
name: "Indah印尼语女声"
language: [ "id-ID" ]
- mode: "sambert-clara-v1"
name: "Clara法语女声"
language: [ "fr-FR" ]
- mode: "sambert-hanna-v1"
name: "Hanna德语女声"
language: [ "de-DE" ]
- mode: "sambert-beth-v1"
name: "Beth咨询女声"
language: [ "en-US" ]
- mode: "sambert-betty-v1"
name: "Betty客服女声"
language: [ "en-US" ]
- mode: "sambert-cally-v1"
name: "Cally自然女声"
language: [ "en-US" ]
- mode: "sambert-cindy-v1"
name: "Cindy对话女声"
language: [ "en-US" ]
- mode: "sambert-eva-v1"
name: "Eva陪伴女声"
language: [ "en-US" ]
- mode: "sambert-donna-v1"
name: "Donna教育女声"
language: [ "en-US" ]
- mode: "sambert-brian-v1"
name: "Brian客服男声"
language: [ "en-US" ]
- mode: "sambert-waan-v1"
name: "Waan泰语女声"
language: [ "th-TH" ]
word_limit: 120
audio_type: 'mp3'
max_workers: 5
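As a worked example of the language filter in get_tts_model_voices: requesting language='es-ES' against this list returns only [{'name': 'Camila西班牙语女声', 'value': 'sambert-camila-v1'}], since every other voice declares a different language list.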

View File

@ -11,33 +11,40 @@ from core.model_runtime.errors.invoke import InvokeBadRequestError
from core.model_runtime.errors.validate import CredentialsValidateFailedError
from core.model_runtime.model_providers.__base.tts_model import TTSModel
from core.model_runtime.model_providers.tongyi._common import _CommonTongyi
from extensions.ext_storage import storage
class TongyiText2SpeechModel(_CommonTongyi, TTSModel):
"""
Model class for Tongyi text2speech model.
"""
def _invoke(self, model: str, credentials: dict, content_text: str, streaming: bool, user: Optional[str] = None) -> any:
def _invoke(self, model: str, tenant_id: str, credentials: dict, content_text: str, voice: str, streaming: bool,
user: Optional[str] = None) -> any:
"""
_invoke text2speech model
:param model: model name
:param tenant_id: user tenant id
:param credentials: model credentials
:param voice: model timbre
:param content_text: text content to be translated
:param streaming: output is streaming
:param user: unique user id
:return: text translated to audio file
"""
self._is_ffmpeg_installed()
audio_type = self._get_model_audio_type(model, credentials)
if not voice:
voice = self._get_model_default_voice(model, credentials)
if streaming:
return Response(stream_with_context(self._tts_invoke_streaming(model=model,
credentials=credentials,
content_text=content_text,
user=user)),
voice=voice,
tenant_id=tenant_id)),
status=200, mimetype=f'audio/{audio_type}')
else:
return self._tts_invoke(model=model, credentials=credentials, content_text=content_text, user=user)
return self._tts_invoke(model=model, credentials=credentials, content_text=content_text, voice=voice)
def validate_credentials(self, model: str, credentials: dict, user: Optional[str] = None) -> None:
"""
@ -52,91 +59,96 @@ class TongyiText2SpeechModel(_CommonTongyi, TTSModel):
self._tts_invoke(
model=model,
credentials=credentials,
content_text='Hello world!',
user=user
content_text='Hello Dify!',
voice=self._get_model_default_voice(model, credentials),
)
except Exception as ex:
raise CredentialsValidateFailedError(str(ex))
def _tts_invoke(self, model: str, credentials: dict, content_text: str, user: Optional[str] = None) -> Response:
def _tts_invoke(self, model: str, credentials: dict, content_text: str, voice: str) -> Response:
"""
_tts_invoke text2speech model
:param model: model name
:param credentials: model credentials
:param voice: model timbre
:param content_text: text content to be translated
:param user: unique user id
:return: text translated to audio file
"""
audio_type = self._get_model_audio_type(model, credentials)
word_limit = self._get_model_word_limit(model, credentials)
max_workers = self._get_model_workers_limit(model, credentials)
try:
sentences = list(self._split_text_into_sentences(text=content_text, limit=word_limit))
audio_bytes_list = list()
# Create a thread pool and map the function to the list of sentences
with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
futures = [executor.submit(self._process_sentence, model=model, sentence=sentence,
credentials=credentials, audio_type=audio_type) for sentence in sentences]
futures = [executor.submit(self._process_sentence, sentence=sentence,
credentials=credentials, voice=voice, audio_type=audio_type) for sentence in
sentences]
for future in futures:
try:
audio_bytes_list.append(future.result())
if future.result():
audio_bytes_list.append(future.result())
except Exception as ex:
raise InvokeBadRequestError(str(ex))
audio_segments = [AudioSegment.from_file(BytesIO(audio_bytes), format=audio_type) for audio_bytes in
audio_bytes_list if audio_bytes]
combined_segment = reduce(lambda x, y: x + y, audio_segments)
buffer: BytesIO = BytesIO()
combined_segment.export(buffer, format=audio_type)
buffer.seek(0)
return Response(buffer.read(), status=200, mimetype=f"audio/{audio_type}")
if len(audio_bytes_list) > 0:
audio_segments = [AudioSegment.from_file(BytesIO(audio_bytes), format=audio_type) for audio_bytes in
audio_bytes_list if audio_bytes]
combined_segment = reduce(lambda x, y: x + y, audio_segments)
buffer: BytesIO = BytesIO()
combined_segment.export(buffer, format=audio_type)
buffer.seek(0)
return Response(buffer.read(), status=200, mimetype=f"audio/{audio_type}")
except Exception as ex:
raise InvokeBadRequestError(str(ex))
# TODO: improve the streaming function
def _tts_invoke_streaming(self, model: str, credentials: dict, content_text: str, user: Optional[str] = None) -> any:
def _tts_invoke_streaming(self, model: str, tenant_id: str, credentials: dict, content_text: str,
voice: str) -> any:
"""
_tts_invoke_streaming text2speech model
:param model: model name
:param tenant_id: user tenant id
:param credentials: model credentials
:param voice: model timbre
:param content_text: text content to be translated
:param user: unique user id
:return: text translated to audio file
"""
# transform credentials to kwargs for model instance
dashscope.api_key = credentials.get('dashscope_api_key')
voice_name = self._get_model_voice(model, credentials)
word_limit = self._get_model_word_limit(model, credentials)
audio_type = self._get_model_audio_type(model, credentials)
tts_file_id = self._get_file_name(content_text)
file_path = f'generate_files/audio/{tenant_id}/{tts_file_id}.{audio_type}'
try:
sentences = list(self._split_text_into_sentences(text=content_text, limit=word_limit))
for sentence in sentences:
response = dashscope.audio.tts.SpeechSynthesizer.call(model=voice_name, sample_rate=48000, text=sentence.strip(),
response = dashscope.audio.tts.SpeechSynthesizer.call(model=voice, sample_rate=48000,
text=sentence.strip(),
format=audio_type, word_timestamp_enabled=True,
phoneme_timestamp_enabled=True)
if isinstance(response.get_audio_data(), bytes):
return response.get_audio_data()
storage.save(file_path, response.get_audio_data())
except Exception as ex:
raise InvokeBadRequestError(str(ex))
def _process_sentence(self, sentence: str, model: str, credentials: dict, audio_type: str):
@staticmethod
def _process_sentence(sentence: str, credentials: dict, voice: str, audio_type: str):
"""
_tts_invoke Tongyi text2speech model api
:param model: model name
:param credentials: model credentials
:param sentence: text content to be translated
:param voice: model timbre
:param audio_type: audio file type
:return: text translated to audio file
"""
# transform credentials to kwargs for model instance
dashscope.api_key = credentials.get('dashscope_api_key')
voice_name = self._get_model_voice(model, credentials)
response = dashscope.audio.tts.SpeechSynthesizer.call(model=voice_name, sample_rate=48000, text=sentence.strip(), format=audio_type)
response = dashscope.audio.tts.SpeechSynthesizer.call(model=voice, sample_rate=48000,
text=sentence.strip(),
format=audio_type)
if isinstance(response.get_audio_data(), bytes):
return response.get_audio_data()

View File

@ -98,7 +98,9 @@ class AppModelConfigService:
# text_to_speech
if 'text_to_speech' not in config or not config["text_to_speech"]:
config["text_to_speech"] = {
"enabled": False
"enabled": False,
"voice": "",
"language": ""
}
if not isinstance(config["text_to_speech"], dict):
@ -106,6 +108,8 @@ class AppModelConfigService:
if "enabled" not in config["text_to_speech"] or not config["text_to_speech"]["enabled"]:
config["text_to_speech"]["enabled"] = False
config["text_to_speech"]["voice"] = ""
config["text_to_speech"]["language"] = ""
if not isinstance(config["text_to_speech"]["enabled"], bool):
raise ValueError("enabled in text_to_speech must be of boolean type")

View File

@ -13,14 +13,14 @@ from services.errors.audio import (
UnsupportedAudioTypeServiceError,
)
FILE_SIZE = 15
FILE_SIZE = 30
FILE_SIZE_LIMIT = FILE_SIZE * 1024 * 1024
ALLOWED_EXTENSIONS = ['mp3', 'mp4', 'mpeg', 'mpga', 'm4a', 'wav', 'webm', 'amr']
class AudioService:
@classmethod
def transcript_asr(cls, tenant_id: str, file: FileStorage, end_user: Optional[str] = None):
def transcript_asr(cls, tenant_id: str, file: FileStorage, prompt: str, end_user: Optional[str] = None):
if file is None:
raise NoAudioUploadedServiceError()
@ -49,7 +49,7 @@ class AudioService:
return {"text": model_instance.invoke_speech2text(file=buffer, user=end_user)}
@classmethod
def transcript_tts(cls, tenant_id: str, text: str, streaming: bool, end_user: Optional[str] = None):
def transcript_tts(cls, tenant_id: str, text: str, voice: str, streaming: bool, end_user: Optional[str] = None):
model_manager = ModelManager()
model_instance = model_manager.get_default_model_instance(
tenant_id=tenant_id,
@ -59,6 +59,21 @@ class AudioService:
raise ProviderNotSupportTextToSpeechServiceError()
try:
return model_instance.invoke_tts(content_text=text.strip(), user=end_user, streaming=streaming)
return model_instance.invoke_tts(content_text=text.strip(), user=end_user, streaming=streaming, tenant_id=tenant_id, voice=voice)
except Exception as e:
raise e
@classmethod
def transcript_tts_voices(cls, tenant_id: str, language: str):
model_manager = ModelManager()
model_instance = model_manager.get_default_model_instance(
tenant_id=tenant_id,
model_type=ModelType.TTS
)
if model_instance is None:
raise ProviderNotSupportTextToSpeechServiceError()
try:
return model_instance.get_tts_voices(language)
except Exception as e:
raise e
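Putting the service layer together, a caller in the API layer now invokes it roughly as follows (placeholder values; the voice ordinarily comes from app_model_config.text_to_speech_dict):

from services.audio_service import AudioService

audio = AudioService.transcript_tts(
    tenant_id="tenant-uuid",  # placeholder
    text="Hello Dify!",
    voice="alloy",  # falls back to the model's default_voice when empty
    streaming=False,
)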

View File

@ -16,3 +16,7 @@ class ProviderNotSupportSpeechToTextServiceError(Exception):
class ProviderNotSupportTextToSpeechServiceError(Exception):
pass
class ProviderNotSupportTextToSpeechLanguageServiceError(Exception):
pass