feat:add tts-streaming config and future (#5492)

2026-05-04 01:18:05 +08:00 · 2024-07-09 11:33:58 +08:00
parent b29a36f461
commit 6ef401a9f0
44 changed files with 1280 additions and 358 deletions
--- a/api/core/model_runtime/model_providers/__base/tts_model.py
+++ b/api/core/model_runtime/model_providers/__base/tts_model.py
@ -1,4 +1,6 @@
 import hashlib
+import logging
+import re
 import subprocess
 import uuid
 from abc import abstractmethod
@ -10,7 +12,7 @@ from core.model_runtime.entities.model_entities import ModelPropertyKey, ModelTy
 from core.model_runtime.errors.invoke import InvokeBadRequestError
 from core.model_runtime.model_providers.__base.ai_model import AIModel

-
+logger = logging.getLogger(__name__)
 class TTSModel(AIModel):
    """
    Model class for ttstext model.
@ -20,7 +22,7 @@ class TTSModel(AIModel):
    # pydantic configs
    model_config = ConfigDict(protected_namespaces=())

-    def invoke(self, model: str, tenant_id: str, credentials: dict, content_text: str, voice: str, streaming: bool,
+    def invoke(self, model: str, tenant_id: str, credentials: dict, content_text: str, voice: str,
               user: Optional[str] = None):
        """
        Invoke large language model
@ -35,14 +37,15 @@ class TTSModel(AIModel):
        :return: translated audio file
        """
        try:
+            logger.info(f"Invoke TTS model: {model} , invoke content : {content_text}")
            self._is_ffmpeg_installed()
-            return self._invoke(model=model, credentials=credentials, user=user, streaming=streaming,
+            return self._invoke(model=model, credentials=credentials, user=user,
                                content_text=content_text, voice=voice, tenant_id=tenant_id)
        except Exception as e:
            raise self._transform_invoke_error(e)

    @abstractmethod
-    def _invoke(self, model: str, tenant_id: str, credentials: dict, content_text: str, voice: str, streaming: bool,
+    def _invoke(self, model: str, tenant_id: str, credentials: dict, content_text: str, voice: str,
                user: Optional[str] = None):
        """
        Invoke large language model
@ -123,26 +126,26 @@ class TTSModel(AIModel):
            return model_schema.model_properties[ModelPropertyKey.MAX_WORKERS]

    @staticmethod
-    def _split_text_into_sentences(text: str, limit: int, delimiters=None):
-        if delimiters is None:
-            delimiters = set('。！？；\n')
-
-        buf = []
-        word_count = 0
-        for char in text:
-            buf.append(char)
-            if char in delimiters:
-                if word_count >= limit:
-                    yield ''.join(buf)
-                    buf = []
-                    word_count = 0
-                else:
-                    word_count += 1
-            else:
-                word_count += 1
-
-        if buf:
-            yield ''.join(buf)
+    def _split_text_into_sentences(org_text, max_length=2000, pattern=r'[。.!?]'):
+        """
+        Split text into chunks that end on sentence boundaries
+
+        A segment ends right after every match of ``pattern``; text after the last match
+        forms the final segment. Consecutive segments are joined while the chunk stays
+        within ``max_length`` characters. A single segment longer than ``max_length`` is
+        kept whole, so callers with a hard limit must still handle oversized chunks.
+
+        :param org_text: text to split
+        :param max_length: preferred maximum number of characters per chunk
+        :param pattern: regular expression matching sentence terminators
+        :return: list of non-empty chunks whose concatenation equals ``org_text``
+        """
+        # Cut right after every terminator; whatever follows the last one is the tail segment.
+        segments = []
+        start = 0
+        for found in re.finditer(pattern, org_text):
+            end = found.end()
+            segments.append(org_text[start:end])
+            start = end
+        if start < len(org_text):
+            segments.append(org_text[start:])
+
+        result = []
+        current = ''
+        for segment in segments:
+            # Flush only a non-empty chunk, so an oversized first segment cannot emit ''.
+            if current and len(current) + len(segment) > max_length:
+                result.append(current)
+                current = ''
+            current += segment
+        if current:
+            result.append(current)
+        return result

    @staticmethod
    def _is_ffmpeg_installed():
--- a/api/core/model_runtime/model_providers/azure_openai/tts/tts.py
+++ b/api/core/model_runtime/model_providers/azure_openai/tts/tts.py
@ -4,7 +4,7 @@ from functools import reduce
 from io import BytesIO
 from typing import Optional

-from flask import Response, stream_with_context
+from flask import Response
 from openai import AzureOpenAI
 from pydub import AudioSegment

@ -14,7 +14,6 @@ from core.model_runtime.errors.validate import CredentialsValidateFailedError
 from core.model_runtime.model_providers.__base.tts_model import TTSModel
 from core.model_runtime.model_providers.azure_openai._common import _CommonAzureOpenAI
 from core.model_runtime.model_providers.azure_openai._constant import TTS_BASE_MODELS, AzureBaseModel
-from extensions.ext_storage import storage


 class AzureOpenAIText2SpeechModel(_CommonAzureOpenAI, TTSModel):
@ -23,7 +22,7 @@ class AzureOpenAIText2SpeechModel(_CommonAzureOpenAI, TTSModel):
    """

    def _invoke(self, model: str, tenant_id: str, credentials: dict,
-                content_text: str, voice: str, streaming: bool, user: Optional[str] = None) -> any:
+                content_text: str, voice: str, user: Optional[str] = None) -> any:
        """
        _invoke text2speech model

@ -32,30 +31,23 @@ class AzureOpenAIText2SpeechModel(_CommonAzureOpenAI, TTSModel):
        :param credentials: model credentials
        :param content_text: text content to be translated
        :param voice: model timbre
-        :param streaming: output is streaming
        :param user: unique user id
        :return: text translated to audio file
        """
-        audio_type = self._get_model_audio_type(model, credentials)
        if not voice or voice not in [d['value'] for d in self.get_tts_model_voices(model=model, credentials=credentials)]:
            voice = self._get_model_default_voice(model, credentials)
-        if streaming:
-            return Response(stream_with_context(self._tts_invoke_streaming(model=model,
-                                                                           credentials=credentials,
-                                                                           content_text=content_text,
-                                                                           tenant_id=tenant_id,
-                                                                           voice=voice)),
-                            status=200, mimetype=f'audio/{audio_type}')
-        else:
-            return self._tts_invoke(model=model, credentials=credentials, content_text=content_text, voice=voice)

-    def validate_credentials(self, model: str, credentials: dict, user: Optional[str] = None) -> None:
+        return self._tts_invoke_streaming(model=model,
+                                          credentials=credentials,
+                                          content_text=content_text,
+                                          voice=voice)
+
+    def validate_credentials(self, model: str, credentials: dict) -> None:
        """
        validate credentials text2speech model

        :param model: model name
        :param credentials: model credentials
-        :param user: unique user id
        :return: text translated to audio file
        """
        try:
@ -82,7 +74,7 @@ class AzureOpenAIText2SpeechModel(_CommonAzureOpenAI, TTSModel):
        word_limit = self._get_model_word_limit(model, credentials)
        max_workers = self._get_model_workers_limit(model, credentials)
        try:
-            sentences = list(self._split_text_into_sentences(text=content_text, limit=word_limit))
+            sentences = list(self._split_text_into_sentences(org_text=content_text, max_length=word_limit))
            audio_bytes_list = []

            # Create a thread pool and map the function to the list of sentences
@ -107,34 +99,37 @@ class AzureOpenAIText2SpeechModel(_CommonAzureOpenAI, TTSModel):
        except Exception as ex:
            raise InvokeBadRequestError(str(ex))

-    # Todo: To improve the streaming function
-    def _tts_invoke_streaming(self, model: str, tenant_id: str, credentials: dict, content_text: str,
+    def _tts_invoke_streaming(self, model: str,  credentials: dict, content_text: str,
                              voice: str) -> any:
        """
        _tts_invoke_streaming text2speech model
-
+
+        Text longer than 3500 characters is split at sentence boundaries and synthesized
+        one request at a time; the audio of each request is yielded in order, read from
+        the response with ``iter_bytes(1024)``.
+
        :param model: model name
-        :param tenant_id: user tenant id
        :param credentials: model credentials
        :param content_text: text content to be translated
        :param voice: model timbre
-        :return: text translated to audio file
+        :return: generator yielding mp3 audio bytes
+        :raises InvokeBadRequestError: when building the client or any request fails
        """
-        # transform credentials to kwargs for model instance
-        credentials_kwargs = self._to_credential_kwargs(credentials)
-        if not voice or voice not in self.get_tts_model_voices(model=model, credentials=credentials):
-            voice = self._get_model_default_voice(model, credentials)
-        word_limit = self._get_model_word_limit(model, credentials)
-        audio_type = self._get_model_audio_type(model, credentials)
-        tts_file_id = self._get_file_name(content_text)
-        file_path = f'generate_files/audio/{tenant_id}/{tts_file_id}.{audio_type}'
        try:
+            # doc: https://platform.openai.com/docs/guides/text-to-speech
+            credentials_kwargs = self._to_credential_kwargs(credentials)
            client = AzureOpenAI(**credentials_kwargs)
-            sentences = list(self._split_text_into_sentences(text=content_text, limit=word_limit))
-            for sentence in sentences:
-                response = client.audio.speech.create(model=model, voice=voice, input=sentence.strip())
-                # response.stream_to_file(file_path)
-                storage.save(file_path, response.read())
+            # The API accepts at most 4096 characters per request; 3500 leaves headroom.
+            max_length = 3500
+            if len(content_text) > max_length:
+                sentences = self._split_text_into_sentences(content_text, max_length=max_length)
+            else:
+                sentences = [content_text.strip()]
+            for sentence in sentences:
+                # The request is sent when the context manager is entered, so a thread pool
+                # around create() adds nothing. The with-block closes the HTTP response
+                # even when the consumer stops iterating early.
+                with client.audio.speech.with_streaming_response.create(model=model, voice=voice,
+                                                                         response_format="mp3",
+                                                                         input=sentence) as response:
+                    yield from response.iter_bytes(1024)
        except Exception as ex:
            raise InvokeBadRequestError(str(ex))

@ -162,7 +157,7 @@ class AzureOpenAIText2SpeechModel(_CommonAzureOpenAI, TTSModel):


    @staticmethod
-    def _get_ai_model_entity(base_model_name: str, model: str) -> AzureBaseModel:
+    def _get_ai_model_entity(base_model_name: str, model: str) -> AzureBaseModel | None:
        for ai_model_entity in TTS_BASE_MODELS:
            if ai_model_entity.base_model_name == base_model_name:
                ai_model_entity_copy = copy.deepcopy(ai_model_entity)
@ -170,5 +165,4 @@ class AzureOpenAIText2SpeechModel(_CommonAzureOpenAI, TTSModel):
                ai_model_entity_copy.entity.label.en_US = model
                ai_model_entity_copy.entity.label.zh_Hans = model
                return ai_model_entity_copy
-
        return None
--- a/api/core/model_runtime/model_providers/openai/tts/tts-1-hd.yaml
+++ b/api/core/model_runtime/model_providers/openai/tts/tts-1-hd.yaml
@ -21,7 +21,7 @@ model_properties:
    - mode: 'shimmer'
      name: 'Shimmer'
      language: [ 'zh-Hans', 'en-US', 'de-DE', 'fr-FR', 'es-ES', 'it-IT', 'th-TH', 'id-ID' ]
-  word_limit: 120
+  word_limit: 3500
  audio_type: 'mp3'
  max_workers: 5
 pricing:
--- a/api/core/model_runtime/model_providers/openai/tts/tts-1.yaml
+++ b/api/core/model_runtime/model_providers/openai/tts/tts-1.yaml
@ -21,7 +21,7 @@ model_properties:
    - mode: 'shimmer'
      name: 'Shimmer'
      language: ['zh-Hans', 'en-US', 'de-DE', 'fr-FR', 'es-ES', 'it-IT', 'th-TH', 'id-ID']
-  word_limit: 120
+  word_limit: 3500
  audio_type: 'mp3'
  max_workers: 5
 pricing:
--- a/api/core/model_runtime/model_providers/openai/tts/tts.py
+++ b/api/core/model_runtime/model_providers/openai/tts/tts.py
@ -3,7 +3,7 @@ from functools import reduce
 from io import BytesIO
 from typing import Optional

-from flask import Response, stream_with_context
+from flask import Response
 from openai import OpenAI
 from pydub import AudioSegment

@ -11,7 +11,6 @@ from core.model_runtime.errors.invoke import InvokeBadRequestError
 from core.model_runtime.errors.validate import CredentialsValidateFailedError
 from core.model_runtime.model_providers.__base.tts_model import TTSModel
 from core.model_runtime.model_providers.openai._common import _CommonOpenAI
-from extensions.ext_storage import storage


 class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel):
@ -20,7 +19,7 @@ class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel):
    """

    def _invoke(self, model: str, tenant_id: str, credentials: dict,
-                content_text: str, voice: str, streaming: bool, user: Optional[str] = None) -> any:
+                content_text: str, voice: str, user: Optional[str] = None) -> any:
        """
        _invoke text2speech model

@ -29,22 +28,17 @@ class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel):
        :param credentials: model credentials
        :param content_text: text content to be translated
        :param voice: model timbre
-        :param streaming: output is streaming
        :param user: unique user id
        :return: text translated to audio file
        """
-        audio_type = self._get_model_audio_type(model, credentials)
+
        if not voice or voice not in [d['value'] for d in self.get_tts_model_voices(model=model, credentials=credentials)]:
            voice = self._get_model_default_voice(model, credentials)
-        if streaming:
-            return Response(stream_with_context(self._tts_invoke_streaming(model=model,
-                                                                           credentials=credentials,
-                                                                           content_text=content_text,
-                                                                           tenant_id=tenant_id,
-                                                                           voice=voice)),
-                            status=200, mimetype=f'audio/{audio_type}')
-        else:
-            return self._tts_invoke(model=model, credentials=credentials, content_text=content_text, voice=voice)
+        # if streaming:
+        return self._tts_invoke_streaming(model=model,
+                                          credentials=credentials,
+                                          content_text=content_text,
+                                          voice=voice)

    def validate_credentials(self, model: str, credentials: dict, user: Optional[str] = None) -> None:
        """
@ -79,7 +73,7 @@ class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel):
        word_limit = self._get_model_word_limit(model, credentials)
        max_workers = self._get_model_workers_limit(model, credentials)
        try:
-            sentences = list(self._split_text_into_sentences(text=content_text, limit=word_limit))
+            sentences = list(self._split_text_into_sentences(org_text=content_text, max_length=word_limit))
            audio_bytes_list = []

            # Create a thread pool and map the function to the list of sentences
@ -104,34 +98,40 @@ class OpenAIText2SpeechModel(_CommonOpenAI, TTSModel):
        except Exception as ex:
            raise InvokeBadRequestError(str(ex))

-    # Todo: To improve the streaming function
-    def _tts_invoke_streaming(self, model: str, tenant_id: str, credentials: dict, content_text: str,
+
+    def _tts_invoke_streaming(self, model: str, credentials: dict, content_text: str,
                              voice: str) -> any:
        """
        _tts_invoke_streaming text2speech model

+        Text longer than the model's word limit is split at sentence boundaries and
+        synthesized one request at a time; the audio of each request is yielded in order,
+        read from the response with ``iter_bytes(1024)``.
+
        :param model: model name
-        :param tenant_id: user tenant id
        :param credentials: model credentials
        :param content_text: text content to be translated
        :param voice: model timbre
-        :return: text translated to audio file
+        :return: generator yielding mp3 audio bytes
+        :raises InvokeBadRequestError: when building the client or any request fails
        """
-        # transform credentials to kwargs for model instance
-        credentials_kwargs = self._to_credential_kwargs(credentials)
-        if not voice or voice not in self.get_tts_model_voices(model=model, credentials=credentials):
-            voice = self._get_model_default_voice(model, credentials)
-        word_limit = self._get_model_word_limit(model, credentials)
-        audio_type = self._get_model_audio_type(model, credentials)
-        tts_file_id = self._get_file_name(content_text)
-        file_path = f'generate_files/audio/{tenant_id}/{tts_file_id}.{audio_type}'
        try:
+            # doc: https://platform.openai.com/docs/guides/text-to-speech
+            credentials_kwargs = self._to_credential_kwargs(credentials)
            client = OpenAI(**credentials_kwargs)
-            sentences = list(self._split_text_into_sentences(text=content_text, limit=word_limit))
-            for sentence in sentences:
-                response = client.audio.speech.create(model=model, voice=voice, input=sentence.strip())
-                # response.stream_to_file(file_path)
-                storage.save(file_path, response.read())
+            # get_tts_model_voices returns dicts; compare against their 'value' field,
+            # otherwise every requested voice looks unknown and is replaced by the default.
+            voices = [d['value'] for d in self.get_tts_model_voices(model=model, credentials=credentials)]
+            if not voice or voice not in voices:
+                voice = self._get_model_default_voice(model, credentials)
+            word_limit = self._get_model_word_limit(model, credentials)
+            if len(content_text) > word_limit:
+                sentences = self._split_text_into_sentences(content_text, max_length=word_limit)
+            else:
+                sentences = [content_text.strip()]
+            for sentence in sentences:
+                # The request is sent when the context manager is entered, so a thread pool
+                # around create() adds nothing. The with-block closes the HTTP response
+                # even when the consumer stops iterating early.
+                with client.audio.speech.with_streaming_response.create(model=model, voice=voice,
+                                                                         response_format="mp3",
+                                                                         input=sentence) as response:
+                    yield from response.iter_bytes(1024)
        except Exception as ex:
            raise InvokeBadRequestError(str(ex))

--- a/api/core/model_runtime/model_providers/tongyi/tts/tts-1.yaml
+++ b/api/core/model_runtime/model_providers/tongyi/tts/tts-1.yaml
@ -129,7 +129,7 @@ model_properties:
    - mode: "sambert-waan-v1"
      name: "Waan（泰语女声）"
      language: [ "th-TH" ]
-  word_limit: 120
+  word_limit: 7000
  audio_type: 'mp3'
  max_workers: 5
 pricing:
--- a/api/core/model_runtime/model_providers/tongyi/tts/tts.py
+++ b/api/core/model_runtime/model_providers/tongyi/tts/tts.py
@ -1,17 +1,21 @@
 import concurrent.futures
+import threading
 from functools import reduce
 from io import BytesIO
+from queue import Queue
 from typing import Optional

 import dashscope
-from flask import Response, stream_with_context
+from dashscope import SpeechSynthesizer
+from dashscope.api_entities.dashscope_response import SpeechSynthesisResponse
+from dashscope.audio.tts import ResultCallback, SpeechSynthesisResult
+from flask import Response
 from pydub import AudioSegment

 from core.model_runtime.errors.invoke import InvokeBadRequestError
 from core.model_runtime.errors.validate import CredentialsValidateFailedError
 from core.model_runtime.model_providers.__base.tts_model import TTSModel
 from core.model_runtime.model_providers.tongyi._common import _CommonTongyi
-from extensions.ext_storage import storage


 class TongyiText2SpeechModel(_CommonTongyi, TTSModel):
@ -19,7 +23,7 @@ class TongyiText2SpeechModel(_CommonTongyi, TTSModel):
    Model class for Tongyi Speech to text model.
    """

-    def _invoke(self, model: str, tenant_id: str, credentials: dict, content_text: str, voice: str, streaming: bool,
+    def _invoke(self, model: str, tenant_id: str, credentials: dict, content_text: str, voice: str,
                user: Optional[str] = None) -> any:
        """
        _invoke text2speech model
@ -29,22 +33,17 @@ class TongyiText2SpeechModel(_CommonTongyi, TTSModel):
        :param credentials: model credentials
        :param voice: model timbre
        :param content_text: text content to be translated
-        :param streaming: output is streaming
        :param user: unique user id
        :return: text translated to audio file
        """
-        audio_type = self._get_model_audio_type(model, credentials)
-        if not voice or voice not in [d['value'] for d in self.get_tts_model_voices(model=model, credentials=credentials)]:
+        if not voice or voice not in [d['value'] for d in
+                                      self.get_tts_model_voices(model=model, credentials=credentials)]:
            voice = self._get_model_default_voice(model, credentials)
-        if streaming:
-            return Response(stream_with_context(self._tts_invoke_streaming(model=model,
-                                                                           credentials=credentials,
-                                                                           content_text=content_text,
-                                                                           voice=voice,
-                                                                           tenant_id=tenant_id)),
-                            status=200, mimetype=f'audio/{audio_type}')
-        else:
-            return self._tts_invoke(model=model, credentials=credentials, content_text=content_text, voice=voice)
+
+        return self._tts_invoke_streaming(model=model,
+                                          credentials=credentials,
+                                          content_text=content_text,
+                                          voice=voice)

    def validate_credentials(self, model: str, credentials: dict, user: Optional[str] = None) -> None:
        """
@ -79,7 +78,7 @@ class TongyiText2SpeechModel(_CommonTongyi, TTSModel):
        word_limit = self._get_model_word_limit(model, credentials)
        max_workers = self._get_model_workers_limit(model, credentials)
        try:
-            sentences = list(self._split_text_into_sentences(text=content_text, limit=word_limit))
+            sentences = list(self._split_text_into_sentences(org_text=content_text, max_length=word_limit))
            audio_bytes_list = []

            # Create a thread pool and map the function to the list of sentences
@ -105,14 +104,12 @@ class TongyiText2SpeechModel(_CommonTongyi, TTSModel):
        except Exception as ex:
            raise InvokeBadRequestError(str(ex))

-    # Todo: To improve the streaming function
-    def _tts_invoke_streaming(self, model: str, tenant_id: str, credentials: dict, content_text: str,
+    def _tts_invoke_streaming(self, model: str, credentials: dict, content_text: str,
                              voice: str) -> any:
        """
        _tts_invoke_streaming text2speech model

        :param model: model name
-        :param tenant_id: user tenant id
        :param credentials: model credentials
        :param voice: model timbre
        :param content_text: text content to be translated
@ -120,18 +117,32 @@ class TongyiText2SpeechModel(_CommonTongyi, TTSModel):
        """
        word_limit = self._get_model_word_limit(model, credentials)
        audio_type = self._get_model_audio_type(model, credentials)
-        tts_file_id = self._get_file_name(content_text)
-        file_path = f'generate_files/audio/{tenant_id}/{tts_file_id}.{audio_type}'
        try:
-            sentences = list(self._split_text_into_sentences(text=content_text, limit=word_limit))
-            for sentence in sentences:
-                response = dashscope.audio.tts.SpeechSynthesizer.call(model=voice, sample_rate=48000,
-                                                                      api_key=credentials.get('dashscope_api_key'),
-                                                                      text=sentence.strip(),
-                                                                      format=audio_type, word_timestamp_enabled=True,
-                                                                      phoneme_timestamp_enabled=True)
-                if isinstance(response.get_audio_data(), bytes):
-                    storage.save(file_path, response.get_audio_data())
+            audio_queue: Queue = Queue()
+            callback = Callback(queue=audio_queue)
+
+            def invoke_remote(content, v, api_key, cb, at, wl):
+                if len(content) < word_limit:
+                    sentences = [content]
+                else:
+                    sentences = list(self._split_text_into_sentences(org_text=content, max_length=wl))
+                for sentence in sentences:
+                    SpeechSynthesizer.call(model=v, sample_rate=16000,
+                                           api_key=api_key,
+                                           text=sentence.strip(),
+                                           callback=cb,
+                                           format=at, word_timestamp_enabled=True,
+                                           phoneme_timestamp_enabled=True)
+
+            threading.Thread(target=invoke_remote, args=(
+                content_text, voice, credentials.get('dashscope_api_key'), callback, audio_type, word_limit)).start()
+
+            while True:
+                audio = audio_queue.get()
+                if audio is None:
+                    break
+                yield audio
+
        except Exception as ex:
            raise InvokeBadRequestError(str(ex))

@ -152,3 +163,29 @@ class TongyiText2SpeechModel(_CommonTongyi, TTSModel):
                                                              format=audio_type)
        if isinstance(response.get_audio_data(), bytes):
            return response.get_audio_data()
+
+
+class Callback(ResultCallback):
+    """
+    Forward synthesizer callback events to a queue
+
+    Audio frames are put on the queue as they arrive. ``None`` is put as an end-of-stream
+    marker on completion, on error and on close.
+
+    NOTE(review): the streaming consumer in this file stops at the first ``None``, while
+    one callback instance is shared by every sentence of a request. Presumably
+    ``on_complete`` fires once per synthesizer call, which would end a multi-sentence
+    stream after the first sentence - confirm against the dashscope callback contract.
+    """
+
+    def __init__(self, queue: Queue):
+        """
+        :param queue: queue that receives audio frames and the ``None`` end marker
+        """
+        self._queue = queue
+
+    def on_open(self):
+        pass
+
+    def on_complete(self):
+        self._end_stream()
+
+    def on_error(self, response: SpeechSynthesisResponse):
+        # NOTE(review): the error response is dropped; the consumer only sees the stream end.
+        self._end_stream()
+
+    def on_close(self):
+        self._end_stream()
+
+    def on_event(self, result: SpeechSynthesisResult):
+        ad = result.get_audio_frame()
+        # Events without an audio frame are skipped.
+        if ad:
+            self._queue.put(ad)
+
+    def _end_stream(self):
+        # Queue.task_done() belongs to the consumer side (after get()), so the producer
+        # only puts the end marker.
+        self._queue.put(None)