diff --git a/astrbot/core/config/default.py b/astrbot/core/config/default.py
index 82a4571a0..ffe2b9175 100644
--- a/astrbot/core/config/default.py
+++ b/astrbot/core/config/default.py
@@ -1041,6 +1041,7 @@
         "timeout": "20",
     },
     "阿里云百炼 TTS(API)": {
+        "hint": "API Key 从 https://bailian.console.aliyun.com/?tab=model#/api-key 获取。模型和音色的选择请参考阿里云百炼语音合成文档: https://help.aliyun.com/zh/model-studio/speech-synthesis-and-speech-recognition",
         "id": "dashscope_tts",
         "provider": "dashscope",
         "type": "dashscope_tts",
@@ -1421,9 +1422,8 @@
         "hint": "Azure_TTS 服务的订阅密钥(注意不是令牌)",
     },
     "dashscope_tts_voice": {
-        "description": "语音合成模型",
-        "type": "string",
-        "hint": "阿里云百炼语音合成模型名称。具体可参考 https://help.aliyun.com/zh/model-studio/developer-reference/cosyvoice-python-api 等内容",
+        "description": "音色",
+        "type": "string"
     },
     "gm_resp_image_modal": {
         "description": "启用图片模态",
diff --git a/astrbot/core/provider/sources/dashscope_tts.py b/astrbot/core/provider/sources/dashscope_tts.py
index 29c988d76..efda31ca9 100644
--- a/astrbot/core/provider/sources/dashscope_tts.py
+++ b/astrbot/core/provider/sources/dashscope_tts.py
@@ -1,10 +1,22 @@
+import asyncio
+import base64
+import logging
 import os
-import dashscope
 import uuid
-import asyncio
-from dashscope.audio.tts_v2 import *
-from ..provider import TTSProvider
+from typing import Optional, Tuple
 
+import aiohttp
+import dashscope
+from dashscope.audio.tts_v2 import AudioFormat, SpeechSynthesizer
+
+try:
+    from dashscope.aigc.multimodal_conversation import MultiModalConversation
+except (
+    ImportError
+):  # pragma: no cover - older dashscope versions without Qwen TTS support
+    MultiModalConversation = None
+
 from ..entities import ProviderType
+from ..provider import TTSProvider
 from ..register import register_provider_adapter
 from astrbot.core.utils.astrbot_path import get_astrbot_data_path
@@ -26,16 +38,112 @@
         dashscope.api_key = self.chosen_api_key
 
     async def get_audio(self, text: str) -> str:
+        model = self.get_model()
+        if not model:
+            raise RuntimeError("Dashscope TTS model is not configured.")
+
         temp_dir = os.path.join(get_astrbot_data_path(), "temp")
-        path = os.path.join(temp_dir, f"dashscope_tts_{uuid.uuid4()}.wav")
-        self.synthesizer = SpeechSynthesizer(
-            model=self.get_model(),
+        os.makedirs(temp_dir, exist_ok=True)
+
+        if self._is_qwen_tts_model(model):
+            audio_bytes, ext = await self._synthesize_with_qwen_tts(model, text)
+        else:
+            audio_bytes, ext = await self._synthesize_with_cosyvoice(model, text)
+
+        if not audio_bytes:
+            raise RuntimeError(
+                "Audio synthesis failed, returned empty content. The model may not be supported or the service is unavailable."
+            )
+
+        path = os.path.join(temp_dir, f"dashscope_tts_{uuid.uuid4()}{ext}")
+        with open(path, "wb") as f:
+            f.write(audio_bytes)
+        return path
+
+    def _call_qwen_tts(self, model: str, text: str):
+        if MultiModalConversation is None:
+            raise RuntimeError(
+                "dashscope SDK missing MultiModalConversation. Please upgrade the dashscope package to use Qwen TTS models."
+            )
+
+        kwargs = {
+            "model": model,
+            "text": text,
+            "api_key": self.chosen_api_key,
+            "voice": self.voice or "Cherry",
+        }
+        if not self.voice:
+            logging.warning(
+                "No voice specified for Qwen TTS model, using default 'Cherry'."
+            )
+        return MultiModalConversation.call(**kwargs)
+
+    async def _synthesize_with_qwen_tts(
+        self, model: str, text: str
+    ) -> Tuple[Optional[bytes], str]:
+        loop = asyncio.get_event_loop()
+        response = await loop.run_in_executor(None, self._call_qwen_tts, model, text)
+        audio_bytes = await self._extract_audio_from_response(response)
+        if not audio_bytes:
+            raise RuntimeError(
+                f"Audio synthesis failed for model '{model}'. {response}"
+            )
+        ext = ".wav"
+        return audio_bytes, ext
+
+    async def _extract_audio_from_response(self, response) -> Optional[bytes]:
+        output = getattr(response, "output", None)
+        audio_obj = getattr(output, "audio", None) if output is not None else None
+        if not audio_obj:
+            return None
+
+        data_b64 = getattr(audio_obj, "data", None)
+        if data_b64:
+            try:
+                return base64.b64decode(data_b64)
+            except (ValueError, TypeError):
+                logging.error("Failed to decode base64 audio data.")
+                return None
+
+        url = getattr(audio_obj, "url", None)
+        if url:
+            return await self._download_audio_from_url(url)
+        return None
+
+    async def _download_audio_from_url(self, url: str) -> Optional[bytes]:
+        if not url:
+            return None
+        timeout = max(self.timeout_ms / 1000, 1) if self.timeout_ms else 20
+        try:
+            async with aiohttp.ClientSession() as session:
+                async with session.get(
+                    url, timeout=aiohttp.ClientTimeout(total=timeout)
+                ) as response:
+                    return await response.read()
+        except (aiohttp.ClientError, asyncio.TimeoutError, OSError) as e:
+            logging.error(f"Failed to download audio from URL {url}: {e}")
+            return None
+
+    async def _synthesize_with_cosyvoice(
+        self, model: str, text: str
+    ) -> Tuple[Optional[bytes], str]:
+        synthesizer = SpeechSynthesizer(
+            model=model,
             voice=self.voice,
             format=AudioFormat.WAV_24000HZ_MONO_16BIT,
         )
-        audio = await asyncio.get_event_loop().run_in_executor(
-            None, self.synthesizer.call, text, self.timeout_ms
+        loop = asyncio.get_event_loop()
+        audio_bytes = await loop.run_in_executor(
+            None, synthesizer.call, text, self.timeout_ms
         )
-        with open(path, "wb") as f:
-            f.write(audio)
-        return path
+        if not audio_bytes:
+            resp = synthesizer.get_response()
+            if resp and isinstance(resp, dict):
+                raise RuntimeError(
+                    f"Audio synthesis failed for model '{model}'. {resp}".strip()
+                )
+        return audio_bytes, ".wav"
+
+    def _is_qwen_tts_model(self, model: str) -> bool:
+        model_lower = model.lower()
+        return "tts" in model_lower and model_lower.startswith("qwen")
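
Note: the routing between the two synthesis paths hinges entirely on the _is_qwen_tts_model predicate. Below is a minimal standalone sketch of that check for quick verification; the model names are illustrative examples, not an exhaustive list of what DashScope serves.

    # Mirrors _is_qwen_tts_model from the patch: a model takes the Qwen TTS
    # (MultiModalConversation) path only if its lowercased name both starts
    # with "qwen" and contains "tts"; everything else stays on CosyVoice.
    def is_qwen_tts_model(model: str) -> bool:
        model_lower = model.lower()
        return "tts" in model_lower and model_lower.startswith("qwen")

    for name in ("qwen-tts", "qwen-tts-latest", "cosyvoice-v1", "sambert-zhichu-v1"):
        if is_qwen_tts_model(name):
            print(f"{name} -> Qwen TTS (MultiModalConversation)")
        else:
            print(f"{name} -> CosyVoice (SpeechSynthesizer)")

Under this logic, qwen-tts and qwen-tts-latest route to the new MultiModalConversation path, while CosyVoice and Sambert models keep using the existing SpeechSynthesizer path unchanged.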