Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions astrbot/core/config/default.py
Original file line number Diff line number Diff line change
Expand Up @@ -1041,6 +1041,7 @@
"timeout": "20",
},
"阿里云百炼 TTS(API)": {
"hint": "API Key 从 https://bailian.console.aliyun.com/?tab=model#/api-key 获取。模型和音色的选择文档请参考: 阿里云百炼语音合成音色名称。具体可参考 https://help.aliyun.com/zh/model-studio/speech-synthesis-and-speech-recognition",
"id": "dashscope_tts",
"provider": "dashscope",
"type": "dashscope_tts",
Expand Down Expand Up @@ -1421,9 +1422,8 @@
"hint": "Azure_TTS 服务的订阅密钥(注意不是令牌)",
},
"dashscope_tts_voice": {
"description": "语音合成模型",
"type": "string",
"hint": "阿里云百炼语音合成模型名称。具体可参考 https://help.aliyun.com/zh/model-studio/developer-reference/cosyvoice-python-api 等内容",
"description": "音色",
"type": "string"
},
"gm_resp_image_modal": {
"description": "启用图片模态",
Expand Down
132 changes: 120 additions & 12 deletions astrbot/core/provider/sources/dashscope_tts.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,22 @@
import asyncio
import base64
import logging
import os
import dashscope
import uuid
import asyncio
from dashscope.audio.tts_v2 import *
from ..provider import TTSProvider
from typing import Optional, Tuple
import aiohttp
import dashscope
from dashscope.audio.tts_v2 import AudioFormat, SpeechSynthesizer

try:
from dashscope.aigc.multimodal_conversation import MultiModalConversation
except (
ImportError
): # pragma: no cover - older dashscope versions without Qwen TTS support
MultiModalConversation = None

from ..entities import ProviderType
from ..provider import TTSProvider
from ..register import register_provider_adapter
from astrbot.core.utils.astrbot_path import get_astrbot_data_path

Expand All @@ -26,16 +38,112 @@ def __init__(
dashscope.api_key = self.chosen_api_key

async def get_audio(self, text: str) -> str:
    """Synthesize *text* to speech and return the path of the written file.

    Dispatches to the Qwen TTS backend for qwen-* TTS models and to the
    CosyVoice backend otherwise. The audio is written into the AstrBot
    temp directory under a unique file name.

    Raises:
        RuntimeError: if no model is configured or synthesis produced
            no audio bytes.
    """
    # NOTE(review): the pasted diff interleaved pre-change lines here
    # (old single-synthesizer path); this is the reconstructed new body.
    model = self.get_model()
    if not model:
        raise RuntimeError("Dashscope TTS model is not configured.")

    temp_dir = os.path.join(get_astrbot_data_path(), "temp")
    os.makedirs(temp_dir, exist_ok=True)

    if self._is_qwen_tts_model(model):
        audio_bytes, ext = await self._synthesize_with_qwen_tts(model, text)
    else:
        audio_bytes, ext = await self._synthesize_with_cosyvoice(model, text)

    if not audio_bytes:
        raise RuntimeError(
            "Audio synthesis failed, returned empty content. The model may not be supported or the service is unavailable."
        )

    # uuid4 keeps concurrent synthesis calls from clobbering each other.
    path = os.path.join(temp_dir, f"dashscope_tts_{uuid.uuid4()}{ext}")
    with open(path, "wb") as f:
        f.write(audio_bytes)
    return path

def _call_qwen_tts(self, model: str, text: str):
    """Invoke the Qwen TTS endpoint synchronously via the dashscope SDK.

    Falls back to the 'Cherry' voice (with a warning) when no voice is
    configured.

    Raises:
        RuntimeError: if the installed dashscope package predates
            MultiModalConversation (no Qwen TTS support).
    """
    if MultiModalConversation is None:
        raise RuntimeError(
            "dashscope SDK missing MultiModalConversation. Please upgrade the dashscope package to use Qwen TTS models."
        )

    if not self.voice:
        logging.warning(
            "No voice specified for Qwen TTS model, using default 'Cherry'."
        )
    return MultiModalConversation.call(
        model=model,
        text=text,
        api_key=self.chosen_api_key,
        voice=self.voice or "Cherry",
    )

async def _synthesize_with_qwen_tts(
self, model: str, text: str
) -> Tuple[Optional[bytes], str]:
loop = asyncio.get_event_loop()
response = await loop.run_in_executor(None, self._call_qwen_tts, model, text)
audio_bytes = await self._extract_audio_from_response(response)
if not audio_bytes:
raise RuntimeError(
f"Audio synthesis failed for model '{model}'. {response}"
)
ext = ".wav"
return audio_bytes, ext

async def _extract_audio_from_response(self, response) -> Optional[bytes]:
output = getattr(response, "output", None)
audio_obj = getattr(output, "audio", None) if output is not None else None
if not audio_obj:
return None

data_b64 = getattr(audio_obj, "data", None)
if data_b64:
try:
return base64.b64decode(data_b64)
except (ValueError, TypeError):
logging.error("Failed to decode base64 audio data.")
return None

url = getattr(audio_obj, "url", None)
if url:
return await self._download_audio_from_url(url)
return None

async def _download_audio_from_url(self, url: str) -> Optional[bytes]:
if not url:
return None
timeout = max(self.timeout_ms / 1000, 1) if self.timeout_ms else 20
try:
async with aiohttp.ClientSession() as session:
async with session.get(
url, timeout=aiohttp.ClientTimeout(total=timeout)
) as response:
return await response.read()
except (aiohttp.ClientError, asyncio.TimeoutError, OSError) as e:
logging.error(f"Failed to download audio from URL {url}: {e}")
return None

async def _synthesize_with_cosyvoice(
    self, model: str, text: str
) -> Tuple[Optional[bytes], str]:
    """Synthesize *text* with a CosyVoice model via SpeechSynthesizer.

    A fresh synthesizer is built per call (they are not reusable across
    requests), and the blocking .call() runs in the default executor.

    Returns:
        Tuple of (audio bytes, file extension ".wav"). audio_bytes may
        be falsy if synthesis failed without a diagnosable response.

    Raises:
        RuntimeError: if synthesis returned nothing and the SDK exposed
            a dict response describing the failure.
    """
    # NOTE(review): the pasted diff interleaved pre-change lines here
    # (old self.synthesizer/file-write path); this is the reconstructed
    # new body, with get_running_loop() replacing the coroutine-
    # deprecated get_event_loop().
    synthesizer = SpeechSynthesizer(
        model=model,
        voice=self.voice,
        format=AudioFormat.WAV_24000HZ_MONO_16BIT,
    )
    loop = asyncio.get_running_loop()
    audio_bytes = await loop.run_in_executor(
        None, synthesizer.call, text, self.timeout_ms
    )
    if not audio_bytes:
        resp = synthesizer.get_response()
        if resp and isinstance(resp, dict):
            raise RuntimeError(
                f"Audio synthesis failed for model '{model}'. {resp}".strip()
            )
    return audio_bytes, ".wav"

def _is_qwen_tts_model(self, model: str) -> bool:
model_lower = model.lower()
return "tts" in model_lower and model_lower.startswith("qwen")