feat:add tts-streaming config and future (#5492)

This commit is contained in:
chenxu9741
2024-07-09 11:33:58 +08:00
committed by GitHub
parent b29a36f461
commit 6ef401a9f0
44 changed files with 1280 additions and 358 deletions

View File

@ -1,4 +1,6 @@
import hashlib
import logging
import re
import subprocess
import uuid
from abc import abstractmethod
@ -10,7 +12,7 @@ from core.model_runtime.entities.model_entities import ModelPropertyKey, ModelTy
from core.model_runtime.errors.invoke import InvokeBadRequestError
from core.model_runtime.model_providers.__base.ai_model import AIModel
logger = logging.getLogger(__name__)
class TTSModel(AIModel):
"""
Model class for ttstext model.
@ -20,7 +22,7 @@ class TTSModel(AIModel):
# pydantic configs
model_config = ConfigDict(protected_namespaces=())
def invoke(self, model: str, tenant_id: str, credentials: dict, content_text: str, voice: str, streaming: bool,
def invoke(self, model: str, tenant_id: str, credentials: dict, content_text: str, voice: str,
user: Optional[str] = None):
"""
Invoke large language model
@ -35,14 +37,15 @@ class TTSModel(AIModel):
:return: translated audio file
"""
try:
logger.info(f"Invoke TTS model: {model} , invoke content : {content_text}")
self._is_ffmpeg_installed()
return self._invoke(model=model, credentials=credentials, user=user, streaming=streaming,
return self._invoke(model=model, credentials=credentials, user=user,
content_text=content_text, voice=voice, tenant_id=tenant_id)
except Exception as e:
raise self._transform_invoke_error(e)
@abstractmethod
def _invoke(self, model: str, tenant_id: str, credentials: dict, content_text: str, voice: str, streaming: bool,
def _invoke(self, model: str, tenant_id: str, credentials: dict, content_text: str, voice: str,
user: Optional[str] = None):
"""
Invoke large language model
@ -123,26 +126,26 @@ class TTSModel(AIModel):
return model_schema.model_properties[ModelPropertyKey.MAX_WORKERS]
@staticmethod
def _split_text_into_sentences(text: str, limit: int, delimiters=None):
if delimiters is None:
delimiters = set('。!?;\n')
buf = []
word_count = 0
for char in text:
buf.append(char)
if char in delimiters:
if word_count >= limit:
yield ''.join(buf)
buf = []
word_count = 0
else:
word_count += 1
else:
word_count += 1
if buf:
yield ''.join(buf)
def _split_text_into_sentences(org_text, max_length=2000, pattern=r'[。.!?]'):
match = re.compile(pattern)
tx = match.finditer(org_text)
start = 0
result = []
one_sentence = ''
for i in tx:
end = i.regs[0][1]
tmp = org_text[start:end]
if len(one_sentence + tmp) > max_length:
result.append(one_sentence)
one_sentence = ''
one_sentence += tmp
start = end
last_sens = org_text[start:]
if last_sens:
one_sentence += last_sens
if one_sentence != '':
result.append(one_sentence)
return result
@staticmethod
def _is_ffmpeg_installed():

View File

@ -4,7 +4,7 @@ from functools import reduce
from io import BytesIO
from typing import Optional
from flask import Response, stream_with_context
from flask import Response
from openai import AzureOpenAI
from pydub import AudioSegment
@ -14,7 +14,6 @@ from core.model_runtime.errors.validate import CredentialsValidateFailedError
from core.model_runtime.model_providers.__base.tts_model import TTSModel
from core.model_runtime.model_providers.azure_openai._common import _CommonAzureOpenAI
from core.model_runtime.model_providers.azure_openai._constant import TTS_BASE_MODELS, AzureBaseModel
from extensions.ext_storage import storage
class AzureOpenAIText2SpeechModel(_CommonAzureOpenAI, TTSModel):
@ -23,7 +22,7 @@ class AzureOpenAIText2SpeechModel(_CommonAzureOpenAI, TTSModel):
"""
def _invoke(self, model: str, tenant_id: str, credentials: dict,
content_text: str, voice: str, streaming: bool, user: Optional[str] = None) -> any:
content_text: str, voice: str, user: Optional[str] = None) -> any:
"""
_invoke text2speech model
@ -32,30 +31,23 @@ class AzureOpenAIText2SpeechModel(_CommonAzureOpenAI, TTSModel):
:param credentials: model credentials
:param content_text: text content to be translated
:param voice: model timbre
:param streaming: output is streaming
:param user: unique user id
:return: text translated to audio file
"""
audio_type = self._get_model_audio_type(model, credentials)
if not voice or voice not in [d['value'] for d in self.get_tts_model_voices(model=model, credentials=credentials)]:
voice = self._get_model_default_voice(model, credentials)
if streaming:
return Response(stream_with_context(self._tts_invoke_streaming(model=model,
credentials=credentials,
content_text=content_text,
tenant_id=tenant_id,
voice=voice)),
status=200, mimetype=f'audio/{audio_type}')
else:
return self._tts_invoke(model=model, credentials=credentials, content_text=content_text, voice=voice)
def validate_credentials(self, model: str, credentials: dict, user: Optional[str] = None) -> None:
return self._tts_invoke_streaming(model=model,
credentials=credentials,
content_text=content_text,
voice=voice)
def validate_credentials(self, model: str, credentials: dict) -> None:
"""
validate credentials text2speech model
:param model: model name
:param credentials: model credentials
:param user: unique user id
:return: text translated to audio file
"""
try:
@ -82,7 +74,7 @@ class AzureOpenAIText2SpeechModel(_CommonAzureOpenAI, TTSModel):
word_limit = self._get_model_word_limit(model, credentials)
max_workers = self._get_model_workers_limit(model, credentials)
try:
sentences = list(self._split_text_into_sentences(text=content_text, limit=word_limit))
sentences = list(self._split_text_into_sentences(org_text=content_text, max_length=word_limit))
audio_bytes_list = []
# Create a thread pool and map the function to the list of sentences
@ -107,34 +99,37 @@ class AzureOpenAIText2SpeechModel(_CommonAzureOpenAI, TTSModel):
except Exception as ex:
raise InvokeBadRequestError(str(ex))
# Todo: To improve the streaming function
# NOTE(review): this span comes from a rendered diff hunk whose +/- markers
# and indentation are missing, so lines of two revisions are interleaved:
# two `def` headers follow each other, and both a loop that saves audio via
# storage.save(...) and a branch that yields audio bytes are present.
def _tts_invoke_streaming(self, model: str, tenant_id: str, credentials: dict, content_text: str,
def _tts_invoke_streaming(self, model: str, credentials: dict, content_text: str,
voice: str) -> any:
"""
_tts_invoke_streaming text2speech model
:param model: model name
:param tenant_id: user tenant id
:param credentials: model credentials
:param content_text: text content to be translated
:param voice: model timbre
:return: text translated to audio file
"""
# transform credentials to kwargs for model instance
credentials_kwargs = self._to_credential_kwargs(credentials)
# NOTE(review): voice is tested against the list returned by
# get_tts_model_voices itself, while _invoke in this file tests it against
# each entry's 'value' (d['value']) - confirm which comparison is intended.
if not voice or voice not in self.get_tts_model_voices(model=model, credentials=credentials):
voice = self._get_model_default_voice(model, credentials)
word_limit = self._get_model_word_limit(model, credentials)
audio_type = self._get_model_audio_type(model, credentials)
tts_file_id = self._get_file_name(content_text)
file_path = f'generate_files/audio/{tenant_id}/{tts_file_id}.{audio_type}'
try:
# doc: https://platform.openai.com/docs/guides/text-to-speech
credentials_kwargs = self._to_credential_kwargs(credentials)
client = AzureOpenAI(**credentials_kwargs)
sentences = list(self._split_text_into_sentences(text=content_text, limit=word_limit))
# Every sentence is written to the same file_path.
for sentence in sentences:
response = client.audio.speech.create(model=model, voice=voice, input=sentence.strip())
# response.stream_to_file(file_path)
storage.save(file_path, response.read())
# max font is 4096,there is 3500 limit for each request
max_length = 3500
# Text longer than max_length is split and synthesized per chunk; shorter
# text goes out as a single request in the else branch.
if len(content_text) > max_length:
sentences = self._split_text_into_sentences(content_text, max_length=max_length)
# One create() call per chunk is submitted up front, on at most 3 workers.
# NOTE(review): no shutdown() or with-block for this executor is visible.
executor = concurrent.futures.ThreadPoolExecutor(max_workers=min(3, len(sentences)))
futures = [executor.submit(client.audio.speech.with_streaming_response.create, model=model,
response_format="mp3",
input=sentences[i], voice=voice) for i in range(len(sentences))]
# Futures are consumed in submission order, so audio is yielded in chunk
# order, 1024 bytes at a time. `index` is not used.
# NOTE(review): __enter__() is called directly and no matching __exit__()
# is visible, so closing of the streamed response is not shown here.
for index, future in enumerate(futures):
yield from future.result().__enter__().iter_bytes(1024)
else:
response = client.audio.speech.with_streaming_response.create(model=model, voice=voice,
response_format="mp3",
input=content_text.strip())
# NOTE(review): same direct __enter__() call without a visible __exit__().
yield from response.__enter__().iter_bytes(1024)
# Any failure is re-raised as InvokeBadRequestError carrying str(ex).
except Exception as ex:
raise InvokeBadRequestError(str(ex))
@ -162,7 +157,7 @@ class AzureOpenAIText2SpeechModel(_CommonAzureOpenAI, TTSModel):
@staticmethod
def _get_ai_model_entity(base_model_name: str, model: str) -> AzureBaseModel:
def _get_ai_model_entity(base_model_name: str, model: str) -> AzureBaseModel | None:
for ai_model_entity in TTS_BASE_MODELS:
if ai_model_entity.base_model_name == base_model_name:
ai_model_entity_copy = copy.deepcopy(ai_model_entity)
@ -170,5 +165,4 @@ class AzureOpenAIText2SpeechModel(_CommonAzureOpenAI, TTSModel):
ai_model_entity_copy.entity.label.en_US = model
ai_model_entity_copy.entity.label.zh_Hans = model
return ai_model_entity_copy
return None

View File

@ -21,7 +21,7 @@ model_properties:
- mode: 'shimmer'
name: 'Shimmer'
language: [ 'zh-Hans', 'en-US', 'de-DE', 'fr-FR', 'es-ES', 'it-IT', 'th-TH', 'id-ID' ]
word_limit: 120
word_limit: 3500
audio_type: 'mp3'
max_workers: 5
pricing:

View File

@ -21,7 +21,7 @@ model_properties:
- mode: 'shimmer'
name: 'Shimmer'
language: ['zh-Hans', 'en-US', 'de-DE', 'fr-FR', 'es-ES', 'it-IT', 'th-TH', 'id-ID']
word_limit: 120
word_limit: 3500
audio_type: 'mp3'
max_workers: 5
pricing:

View File

@ -3,7 +3,7 @@ from functools import reduce
from io import BytesIO
from typing import Optional
from flask import Response, stream_with_context
from flask import Response
from openai import OpenAI
from pydub import AudioSegment
@ -11,7 +11,6 @@ from core.model_runtime.errors.invoke import InvokeBadRequestError
from core.model_runtime.errors.validate import CredentialsValidateFailedError
from core.model_runtime.model_providers.__base.tts_model import TTSModel
from core.model_runtime.model_providers.openai._common import _CommonOpenAI
from extensions.ext_storage import storage
class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel):
@ -20,7 +19,7 @@ class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel):
"""
def _invoke(self, model: str, tenant_id: str, credentials: dict,
content_text: str, voice: str, streaming: bool, user: Optional[str] = None) -> any:
content_text: str, voice: str, user: Optional[str] = None) -> any:
"""
_invoke text2speech model
@ -29,22 +28,17 @@ class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel):
:param credentials: model credentials
:param content_text: text content to be translated
:param voice: model timbre
:param streaming: output is streaming
:param user: unique user id
:return: text translated to audio file
"""
audio_type = self._get_model_audio_type(model, credentials)
if not voice or voice not in [d['value'] for d in self.get_tts_model_voices(model=model, credentials=credentials)]:
voice = self._get_model_default_voice(model, credentials)
if streaming:
return Response(stream_with_context(self._tts_invoke_streaming(model=model,
credentials=credentials,
content_text=content_text,
tenant_id=tenant_id,
voice=voice)),
status=200, mimetype=f'audio/{audio_type}')
else:
return self._tts_invoke(model=model, credentials=credentials, content_text=content_text, voice=voice)
# if streaming:
return self._tts_invoke_streaming(model=model,
credentials=credentials,
content_text=content_text,
voice=voice)
def validate_credentials(self, model: str, credentials: dict, user: Optional[str] = None) -> None:
"""
@ -79,7 +73,7 @@ class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel):
word_limit = self._get_model_word_limit(model, credentials)
max_workers = self._get_model_workers_limit(model, credentials)
try:
sentences = list(self._split_text_into_sentences(text=content_text, limit=word_limit))
sentences = list(self._split_text_into_sentences(org_text=content_text, max_length=word_limit))
audio_bytes_list = []
# Create a thread pool and map the function to the list of sentences
@ -104,34 +98,40 @@ class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel):
except Exception as ex:
raise InvokeBadRequestError(str(ex))
# Todo: To improve the streaming function
# NOTE(review): this span comes from a rendered diff hunk whose +/- markers
# and indentation are missing, so lines of two revisions are interleaved:
# two `def` headers follow each other, and both a loop that saves audio via
# storage.save(...) and a branch that yields audio bytes are present.
def _tts_invoke_streaming(self, model: str, tenant_id: str, credentials: dict, content_text: str,
def _tts_invoke_streaming(self, model: str, credentials: dict, content_text: str,
voice: str) -> any:
"""
_tts_invoke_streaming text2speech model
:param model: model name
:param tenant_id: user tenant id
:param credentials: model credentials
:param content_text: text content to be translated
:param voice: model timbre
:return: text translated to audio file
"""
# transform credentials to kwargs for model instance
credentials_kwargs = self._to_credential_kwargs(credentials)
if not voice or voice not in self.get_tts_model_voices(model=model, credentials=credentials):
voice = self._get_model_default_voice(model, credentials)
word_limit = self._get_model_word_limit(model, credentials)
audio_type = self._get_model_audio_type(model, credentials)
tts_file_id = self._get_file_name(content_text)
file_path = f'generate_files/audio/{tenant_id}/{tts_file_id}.{audio_type}'
try:
# doc: https://platform.openai.com/docs/guides/text-to-speech
credentials_kwargs = self._to_credential_kwargs(credentials)
client = OpenAI(**credentials_kwargs)
sentences = list(self._split_text_into_sentences(text=content_text, limit=word_limit))
# Every sentence is written to the same file_path.
for sentence in sentences:
response = client.audio.speech.create(model=model, voice=voice, input=sentence.strip())
# response.stream_to_file(file_path)
storage.save(file_path, response.read())
# NOTE(review): this check (also present a few lines above) tests voice
# against the list returned by get_tts_model_voices itself, while _invoke in
# this file tests it against each entry's 'value' (d['value']). If the
# entries are mappings, a plain voice string would never be found and the
# default voice would always be used - confirm.
if not voice or voice not in self.get_tts_model_voices(model=model, credentials=credentials):
voice = self._get_model_default_voice(model, credentials)
word_limit = self._get_model_word_limit(model, credentials)
# Text longer than word_limit is split and synthesized per chunk; shorter
# text goes out as a single request in the else branch.
if len(content_text) > word_limit:
sentences = self._split_text_into_sentences(content_text, max_length=word_limit)
# One create() call per chunk is submitted up front, on at most 3 workers.
# NOTE(review): no shutdown() or with-block for this executor is visible.
executor = concurrent.futures.ThreadPoolExecutor(max_workers=min(3, len(sentences)))
futures = [executor.submit(client.audio.speech.with_streaming_response.create, model=model,
response_format="mp3",
input=sentences[i], voice=voice) for i in range(len(sentences))]
# Futures are consumed in submission order, so audio is yielded in chunk
# order, 1024 bytes at a time. `index` is not used.
# NOTE(review): __enter__() is called directly and no matching __exit__()
# is visible, so closing of the streamed response is not shown here.
for index, future in enumerate(futures):
yield from future.result().__enter__().iter_bytes(1024)
else:
response = client.audio.speech.with_streaming_response.create(model=model, voice=voice,
response_format="mp3",
input=content_text.strip())
# NOTE(review): same direct __enter__() call without a visible __exit__().
yield from response.__enter__().iter_bytes(1024)
# Any failure is re-raised as InvokeBadRequestError carrying str(ex).
except Exception as ex:
raise InvokeBadRequestError(str(ex))

View File

@ -129,7 +129,7 @@ model_properties:
- mode: "sambert-waan-v1"
name: "Waan泰语女声"
language: [ "th-TH" ]
word_limit: 120
word_limit: 7000
audio_type: 'mp3'
max_workers: 5
pricing:

View File

@ -1,17 +1,21 @@
import concurrent.futures
import threading
from functools import reduce
from io import BytesIO
from queue import Queue
from typing import Optional
import dashscope
from flask import Response, stream_with_context
from dashscope import SpeechSynthesizer
from dashscope.api_entities.dashscope_response import SpeechSynthesisResponse
from dashscope.audio.tts import ResultCallback, SpeechSynthesisResult
from flask import Response
from pydub import AudioSegment
from core.model_runtime.errors.invoke import InvokeBadRequestError
from core.model_runtime.errors.validate import CredentialsValidateFailedError
from core.model_runtime.model_providers.__base.tts_model import TTSModel
from core.model_runtime.model_providers.tongyi._common import _CommonTongyi
from extensions.ext_storage import storage
class TongyiText2SpeechModel(_CommonTongyi, TTSModel):
@ -19,7 +23,7 @@ class TongyiText2SpeechModel(_CommonTongyi, TTSModel):
Model class for Tongyi Speech to text model.
"""
def _invoke(self, model: str, tenant_id: str, credentials: dict, content_text: str, voice: str, streaming: bool,
def _invoke(self, model: str, tenant_id: str, credentials: dict, content_text: str, voice: str,
user: Optional[str] = None) -> any:
"""
_invoke text2speech model
@ -29,22 +33,17 @@ class TongyiText2SpeechModel(_CommonTongyi, TTSModel):
:param credentials: model credentials
:param voice: model timbre
:param content_text: text content to be translated
:param streaming: output is streaming
:param user: unique user id
:return: text translated to audio file
"""
audio_type = self._get_model_audio_type(model, credentials)
if not voice or voice not in [d['value'] for d in self.get_tts_model_voices(model=model, credentials=credentials)]:
if not voice or voice not in [d['value'] for d in
self.get_tts_model_voices(model=model, credentials=credentials)]:
voice = self._get_model_default_voice(model, credentials)
if streaming:
return Response(stream_with_context(self._tts_invoke_streaming(model=model,
credentials=credentials,
content_text=content_text,
voice=voice,
tenant_id=tenant_id)),
status=200, mimetype=f'audio/{audio_type}')
else:
return self._tts_invoke(model=model, credentials=credentials, content_text=content_text, voice=voice)
return self._tts_invoke_streaming(model=model,
credentials=credentials,
content_text=content_text,
voice=voice)
def validate_credentials(self, model: str, credentials: dict, user: Optional[str] = None) -> None:
"""
@ -79,7 +78,7 @@ class TongyiText2SpeechModel(_CommonTongyi, TTSModel):
word_limit = self._get_model_word_limit(model, credentials)
max_workers = self._get_model_workers_limit(model, credentials)
try:
sentences = list(self._split_text_into_sentences(text=content_text, limit=word_limit))
sentences = list(self._split_text_into_sentences(org_text=content_text, max_length=word_limit))
audio_bytes_list = []
# Create a thread pool and map the function to the list of sentences
@ -105,14 +104,12 @@ class TongyiText2SpeechModel(_CommonTongyi, TTSModel):
except Exception as ex:
raise InvokeBadRequestError(str(ex))
# Todo: To improve the streaming function
def _tts_invoke_streaming(self, model: str, tenant_id: str, credentials: dict, content_text: str,
def _tts_invoke_streaming(self, model: str, credentials: dict, content_text: str,
voice: str) -> any:
"""
_tts_invoke_streaming text2speech model
:param model: model name
:param tenant_id: user tenant id
:param credentials: model credentials
:param voice: model timbre
:param content_text: text content to be translated
@ -120,18 +117,32 @@ class TongyiText2SpeechModel(_CommonTongyi, TTSModel):
"""
word_limit = self._get_model_word_limit(model, credentials)
audio_type = self._get_model_audio_type(model, credentials)
tts_file_id = self._get_file_name(content_text)
file_path = f'generate_files/audio/{tenant_id}/{tts_file_id}.{audio_type}'
try:
sentences = list(self._split_text_into_sentences(text=content_text, limit=word_limit))
for sentence in sentences:
response = dashscope.audio.tts.SpeechSynthesizer.call(model=voice, sample_rate=48000,
api_key=credentials.get('dashscope_api_key'),
text=sentence.strip(),
format=audio_type, word_timestamp_enabled=True,
phoneme_timestamp_enabled=True)
if isinstance(response.get_audio_data(), bytes):
storage.save(file_path, response.get_audio_data())
audio_queue: Queue = Queue()
callback = Callback(queue=audio_queue)
def invoke_remote(content, v, api_key, cb, at, wl):
if len(content) < word_limit:
sentences = [content]
else:
sentences = list(self._split_text_into_sentences(org_text=content, max_length=wl))
for sentence in sentences:
SpeechSynthesizer.call(model=v, sample_rate=16000,
api_key=api_key,
text=sentence.strip(),
callback=cb,
format=at, word_timestamp_enabled=True,
phoneme_timestamp_enabled=True)
threading.Thread(target=invoke_remote, args=(
content_text, voice, credentials.get('dashscope_api_key'), callback, audio_type, word_limit)).start()
while True:
audio = audio_queue.get()
if audio is None:
break
yield audio
except Exception as ex:
raise InvokeBadRequestError(str(ex))
@ -152,3 +163,29 @@ class TongyiText2SpeechModel(_CommonTongyi, TTSModel):
format=audio_type)
if isinstance(response.get_audio_data(), bytes):
return response.get_audio_data()
class Callback(ResultCallback):
    """
    Forward synthesizer callbacks to a queue.

    Audio frames received in ``on_event`` are put on the queue. ``None`` is
    put as an end-of-stream marker whenever ``on_complete``, ``on_error`` or
    ``on_close`` is invoked.

    NOTE(review): if more than one of these terminal callbacks fires, or the
    same instance is reused for several synthesis calls, the queue receives
    several ``None`` markers; a reader that stops at the first one will not
    see frames queued after it - confirm against the consumer.
    """

    def __init__(self, queue: Queue):
        """
        :param queue: queue that receives audio frames and end markers
        """
        self._queue = queue

    def _signal_end(self) -> None:
        # Only the marker is enqueued. Queue.task_done() belongs to the
        # consumer (one call per get()); calling it from the producer would
        # mark items as processed before anyone has read them.
        self._queue.put(None)

    def on_open(self):
        pass

    def on_complete(self):
        self._signal_end()

    def on_error(self, response: SpeechSynthesisResponse):
        # Function-scope import: this module does not import logging itself.
        import logging

        # Record the failure instead of ending the stream silently.
        logging.getLogger(__name__).error('Tongyi TTS synthesis failed: %s', response)
        self._signal_end()

    def on_close(self):
        self._signal_end()

    def on_event(self, result: SpeechSynthesisResult):
        frame = result.get_audio_frame()
        # Events without audio data are ignored.
        if frame:
            self._queue.put(frame)