ragflow/rag/utils/tts_cache.py

#
#  Copyright 2024 The InfiniFlow Authors. All Rights Reserved.
#
#  Licensed under the Apache License, Version 2.0 (the "License");
#  you may not use this file except in compliance with the License.
#  You may obtain a copy of the License at
#
#      http://www.apache.org/licenses/LICENSE-2.0
#
#  Unless required by applicable law or agreed to in writing, software
#  distributed under the License is distributed on an "AS IS" BASIS,
#  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#  See the License for the specific language governing permissions and
#  limitations under the License.
#
import binascii
import hashlib
import logging
import os
from typing import Any, Optional

from rag.utils.redis_conn import REDIS_CONN

_DEFAULT_TTL_SECONDS = 7 * 24 * 60 * 60
_KEY_PREFIX = "tts:cache:"


def _ttl_seconds() -> int:
    raw = os.environ.get("RAGFLOW_TTS_CACHE_TTL_SECONDS")
    if not raw:
        return _DEFAULT_TTL_SECONDS
    try:
        v = int(raw)
        return v if v > 0 else 0
    except ValueError:
        logging.warning("Invalid RAGFLOW_TTS_CACHE_TTL_SECONDS=%r, using default", raw)
        return _DEFAULT_TTL_SECONDS


def _model_id(tts_mdl: Any) -> Optional[str]:
    cfg = getattr(tts_mdl, "model_config", None)
    if isinstance(cfg, dict):
        mid = cfg.get("id")
        if mid is not None:
            return str(mid)
        name = cfg.get("llm_name") or cfg.get("model_name")
        if name:
            return str(name)
    return None


def _build_key(tts_mdl: Any, text: str) -> Optional[str]:
    mid = _model_id(tts_mdl)
    if not mid:
        return None
    digest = hashlib.sha256(text.encode("utf-8", "ignore")).hexdigest()
    return f"{_KEY_PREFIX}{mid}:{digest}"


def _to_hex_string(value: Any) -> Optional[str]:
    if value is None:
        return None
    if isinstance(value, bytes):
        try:
            return value.decode("utf-8")
        except Exception:
            return None
    if isinstance(value, str):
        return value
    return None


def synthesize_with_cache(tts_mdl: Any, cleaned_text: str) -> Optional[str]:
    """
    Synthesize ``cleaned_text`` through ``tts_mdl`` and return a hex-encoded
    audio blob, reusing a Redis-cached result when available.

    The cache key is derived from the TTS model identifier and a SHA-256 of the
    text, so different models keep separate caches and the same text on the
    same model resolves to the same key regardless of call site. Returns
    ``None`` on synthesis failure; callers should treat that as a no-op the
    same way they do today.
    """
    if not tts_mdl or not cleaned_text:
        return None

    key = _build_key(tts_mdl, cleaned_text)

    if key:
        try:
            cached = REDIS_CONN.get(key)
        except Exception as e:
            logging.warning("TTS cache lookup failed: %s", e)
            cached = None
        hex_cached = _to_hex_string(cached)
        if hex_cached:
            return hex_cached

    buf = b""
    try:
        for chunk in tts_mdl.tts(cleaned_text):
            if isinstance(chunk, (bytes, bytearray)):
                buf += bytes(chunk)
    except Exception as e:
        logging.error("TTS failed: %s (text length=%d)", e, len(cleaned_text))
        return None

    if not buf:
        return None

    hex_value = binascii.hexlify(buf).decode("utf-8")

    ttl = _ttl_seconds()
    if key and ttl > 0:
        try:
            REDIS_CONN.set(key, hex_value, exp=ttl)
        except Exception as e:
            logging.warning("TTS cache store failed: %s", e)

    return hex_value