From 52371a9a427cb3ee48419afbec6bca189f409480 Mon Sep 17 00:00:00 2001
From: Yaron
Date: Sun, 5 Oct 2025 14:37:15 -0400
Subject: [PATCH 1/4] fix: Fixed compatibility problems with CosyVoice V2 and
 Qwen TTS generation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
 .../core/provider/sources/dashscope_tts.py | 139 ++++++++++++++++--
 1 file changed, 127 insertions(+), 12 deletions(-)

diff --git a/astrbot/core/provider/sources/dashscope_tts.py b/astrbot/core/provider/sources/dashscope_tts.py
index 29c988d76..4f6b303ef 100644
--- a/astrbot/core/provider/sources/dashscope_tts.py
+++ b/astrbot/core/provider/sources/dashscope_tts.py
@@ -1,10 +1,21 @@
+import asyncio
+import base64
 import os
-import dashscope
 import uuid
-import asyncio
-from dashscope.audio.tts_v2 import *
-from ..provider import TTSProvider
+from typing import Optional, Tuple
+from urllib.error import URLError
+from urllib.request import urlopen
+
+import dashscope
+from dashscope.audio.tts_v2 import AudioFormat, SpeechSynthesizer
+
+try:
+    from dashscope.aigc.multimodal_conversation import MultiModalConversation
+except ImportError:  # pragma: no cover - older dashscope versions without Qwen TTS support
+    MultiModalConversation = None
+
 from ..entities import ProviderType
+from ..provider import TTSProvider
 from ..register import register_provider_adapter
 from astrbot.core.utils.astrbot_path import get_astrbot_data_path
 
@@ -26,16 +37,120 @@ def __init__(
         dashscope.api_key = self.chosen_api_key
 
     async def get_audio(self, text: str) -> str:
+        model = self.get_model()
+        if not model:
+            raise RuntimeError("Dashscope TTS model is not configured.")
+
         temp_dir = os.path.join(get_astrbot_data_path(), "temp")
-        path = os.path.join(temp_dir, f"dashscope_tts_{uuid.uuid4()}.wav")
-        self.synthesizer = SpeechSynthesizer(
-            model=self.get_model(),
+        os.makedirs(temp_dir, exist_ok=True)
+
+        if self._is_qwen_tts_model(model):
+            audio_bytes, ext = await self._synthesize_with_qwen_tts(model, text)
+        else:
+            audio_bytes, ext = await self._synthesize_with_cosyvoice(model, text)
+
+        if not audio_bytes:
+            raise RuntimeError(
+                "Audio synthesis failed, returned empty content. The model may not be supported or the service is unavailable."
+            )
+
+        path = os.path.join(temp_dir, f"dashscope_tts_{uuid.uuid4()}{ext}")
+        with open(path, "wb") as f:
+            f.write(audio_bytes)
+        return path
+
+    def _call_qwen_tts(self, model: str, text: str):
+        if MultiModalConversation is None:
+            raise RuntimeError(
+                "dashscope SDK missing MultiModalConversation. Please upgrade the dashscope package to use Qwen TTS models."
+            )
+
+        kwargs = {
+            "model": model,
+            "text": text,
+            "api_key": self.chosen_api_key,
+        }
+        if self.voice:
+            kwargs["voice"] = self.voice
+        return MultiModalConversation.call(**kwargs)
+
+    async def _synthesize_with_qwen_tts(self, model: str, text: str) -> Tuple[Optional[bytes], str]:
+        loop = asyncio.get_event_loop()
+        response = await loop.run_in_executor(None, self._call_qwen_tts, model, text)
+
+        audio_bytes = self._extract_audio_from_response(response)
+        if not audio_bytes:
+            error_details = self._format_dashscope_error(response)
+            raise RuntimeError(
+                f"Audio synthesis failed for model '{model}'. {error_details}"
+            )
+        ext = ".wav"
+        return audio_bytes, ext
+
+    def _extract_audio_from_response(self, response) -> Optional[bytes]:
+        output = getattr(response, "output", None)
+        audio_obj = getattr(output, "audio", None) if output is not None else None
+        if not audio_obj:
+            return None
+
+        data_b64 = getattr(audio_obj, "data", None)
+        if data_b64:
+            try:
+                return base64.b64decode(data_b64)
+            except (ValueError, TypeError):
+                return None
+
+        url = getattr(audio_obj, "url", None)
+        if url:
+            return self._download_audio_from_url(url)
+        return None
+
+    def _download_audio_from_url(self, url: str) -> Optional[bytes]:
+        if not url:
+            return None
+        timeout = max(self.timeout_ms / 1000, 1) if self.timeout_ms else 20
+        try:
+            with urlopen(url, timeout=timeout) as response:
+                return response.read()
+        except (URLError, TimeoutError, OSError):
+            return None
+
+    async def _synthesize_with_cosyvoice(self, model: str, text: str) -> Tuple[Optional[bytes], str]:
+        synthesizer = SpeechSynthesizer(
+            model=model,
             voice=self.voice,
             format=AudioFormat.WAV_24000HZ_MONO_16BIT,
         )
-        audio = await asyncio.get_event_loop().run_in_executor(
-            None, self.synthesizer.call, text, self.timeout_ms
+        loop = asyncio.get_event_loop()
+        audio_bytes = await loop.run_in_executor(
+            None, synthesizer.call, text, self.timeout_ms
         )
-        with open(path, "wb") as f:
-            f.write(audio)
-        return path
+        if not audio_bytes:
+            response = getattr(synthesizer, "get_response", None)
+            detail = ""
+            if callable(response):
+                resp = response()
+                detail = self._format_dashscope_error(resp)
+            raise RuntimeError(
+                f"Audio synthesis failed for model '{model}'. {detail}".strip()
+            )
+        return audio_bytes, ".wav"
+
+    def _is_qwen_tts_model(self, model: str) -> bool:
+        model_lower = model.lower()
+        return "tts" in model_lower and model_lower.startswith("qwen")
+
+    def _format_dashscope_error(self, response) -> str:
+        status_code = getattr(response, "status_code", None)
+        code = getattr(response, "code", None)
+        message = getattr(response, "message", None)
+        parts = []
+        if status_code is not None:
+            parts.append(f"status_code={status_code}")
+        if code:
+            parts.append(f"code={code}")
+        if message:
+            parts.append(f"message={message}")
+        if not parts:
+            return ""
+        return " ".join(parts)

From 1031d2e8d63464d12015751c5444d950317607e2 Mon Sep 17 00:00:00 2001
From: Yaron
Date: Mon, 6 Oct 2025 10:25:00 -0400
Subject: [PATCH 2/4] fix: Replace the synchronous urlopen request with an
 asynchronous aiohttp request for downloading audio
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
 .../core/provider/sources/dashscope_tts.py | 27 ++++++++++---------
 1 file changed, 15 insertions(+), 12 deletions(-)

diff --git a/astrbot/core/provider/sources/dashscope_tts.py b/astrbot/core/provider/sources/dashscope_tts.py
index 4f6b303ef..e273cd7cb 100644
--- a/astrbot/core/provider/sources/dashscope_tts.py
+++ b/astrbot/core/provider/sources/dashscope_tts.py
@@ -1,11 +1,10 @@
 import asyncio
 import base64
+import logging
 import os
 import uuid
 from typing import Optional, Tuple
-from urllib.error import URLError
-from urllib.request import urlopen
-
+import aiohttp
 import dashscope
 from dashscope.audio.tts_v2 import AudioFormat, SpeechSynthesizer
 
@@ -69,16 +68,17 @@ def _call_qwen_tts(self, model: str, text: str):
             "model": model,
             "text": text,
             "api_key": self.chosen_api_key,
+            "voice": self.voice or "Cherry",
         }
-        if self.voice:
-            kwargs["voice"] = self.voice
+        if not self.voice:
+            logging.warning("No voice specified for Qwen TTS model, using default 'Cherry'.")
         return MultiModalConversation.call(**kwargs)
 
     async def _synthesize_with_qwen_tts(self, model: str, text: str) -> Tuple[Optional[bytes], str]:
         loop = asyncio.get_event_loop()
         response = await loop.run_in_executor(None, self._call_qwen_tts, model, text)
 
-        audio_bytes = self._extract_audio_from_response(response)
+        audio_bytes = await self._extract_audio_from_response(response)
         if not audio_bytes:
             error_details = self._format_dashscope_error(response)
             raise RuntimeError(
@@ -87,7 +87,7 @@ async def _synthesize_with_qwen_tts(self, model: str, text: str) -> Tuple[Option
         ext = ".wav"
         return audio_bytes, ext
 
-    def _extract_audio_from_response(self, response) -> Optional[bytes]:
+    async def _extract_audio_from_response(self, response) -> Optional[bytes]:
         output = getattr(response, "output", None)
         audio_obj = getattr(output, "audio", None) if output is not None else None
         if not audio_obj:
@@ -98,21 +98,24 @@ def _extract_audio_from_response(self, response) -> Optional[bytes]:
             try:
                 return base64.b64decode(data_b64)
             except (ValueError, TypeError):
+                logging.error("Failed to decode base64 audio data.")
                 return None
 
         url = getattr(audio_obj, "url", None)
         if url:
-            return self._download_audio_from_url(url)
+            return await self._download_audio_from_url(url)
         return None
 
-    def _download_audio_from_url(self, url: str) -> Optional[bytes]:
+    async def _download_audio_from_url(self, url: str) -> Optional[bytes]:
         if not url:
             return None
         timeout = max(self.timeout_ms / 1000, 1) if self.timeout_ms else 20
         try:
-            with urlopen(url, timeout=timeout) as response:
-                return response.read()
-        except (URLError, TimeoutError, OSError):
+            async with aiohttp.ClientSession() as session:
+                async with session.get(url, timeout=aiohttp.ClientTimeout(total=timeout)) as response:
+                    return await response.read()
+        except (aiohttp.ClientError, asyncio.TimeoutError, OSError) as e:
+            logging.error(f"Failed to download audio from URL {url}: {e}")
             return None
 
     async def _synthesize_with_cosyvoice(self, model: str, text: str) -> Tuple[Optional[bytes], str]:

From 3c4ce263c67ae8cc194b46cda9e11ca91d094ca0 Mon Sep 17 00:00:00 2001
From: Soulter <905617992@qq.com>
Date: Sun, 12 Oct 2025 00:57:14 +0800
Subject: [PATCH 3/4] fix: CosyVoice error message display
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
 .../core/provider/sources/dashscope_tts.py | 52 ++++++++----------
 1 file changed, 21 insertions(+), 31 deletions(-)

diff --git a/astrbot/core/provider/sources/dashscope_tts.py b/astrbot/core/provider/sources/dashscope_tts.py
index e273cd7cb..efda31ca9 100644
--- a/astrbot/core/provider/sources/dashscope_tts.py
+++ b/astrbot/core/provider/sources/dashscope_tts.py
@@ -10,7 +10,9 @@
 
 try:
     from dashscope.aigc.multimodal_conversation import MultiModalConversation
-except ImportError:  # pragma: no cover - older dashscope versions without Qwen TTS support
+except (
+    ImportError
+):  # pragma: no cover - older dashscope versions without Qwen TTS support
     MultiModalConversation = None
 
 from ..entities import ProviderType
@@ -71,18 +73,20 @@ def _call_qwen_tts(self, model: str, text: str):
             "voice": self.voice or "Cherry",
         }
         if not self.voice:
-            logging.warning("No voice specified for Qwen TTS model, using default 'Cherry'.")
+            logging.warning(
+                "No voice specified for Qwen TTS model, using default 'Cherry'."
+            )
         return MultiModalConversation.call(**kwargs)
 
-    async def _synthesize_with_qwen_tts(self, model: str, text: str) -> Tuple[Optional[bytes], str]:
+    async def _synthesize_with_qwen_tts(
+        self, model: str, text: str
+    ) -> Tuple[Optional[bytes], str]:
         loop = asyncio.get_event_loop()
         response = await loop.run_in_executor(None, self._call_qwen_tts, model, text)
-
         audio_bytes = await self._extract_audio_from_response(response)
         if not audio_bytes:
-            error_details = self._format_dashscope_error(response)
             raise RuntimeError(
-                f"Audio synthesis failed for model '{model}'. {error_details}"
+                f"Audio synthesis failed for model '{model}'. {response}"
             )
         ext = ".wav"
         return audio_bytes, ext
@@ -112,13 +116,17 @@ async def _download_audio_from_url(self, url: str) -> Optional[bytes]:
         timeout = max(self.timeout_ms / 1000, 1) if self.timeout_ms else 20
         try:
             async with aiohttp.ClientSession() as session:
-                async with session.get(url, timeout=aiohttp.ClientTimeout(total=timeout)) as response:
+                async with session.get(
+                    url, timeout=aiohttp.ClientTimeout(total=timeout)
+                ) as response:
                     return await response.read()
         except (aiohttp.ClientError, asyncio.TimeoutError, OSError) as e:
             logging.error(f"Failed to download audio from URL {url}: {e}")
             return None
 
-    async def _synthesize_with_cosyvoice(self, model: str, text: str) -> Tuple[Optional[bytes], str]:
+    async def _synthesize_with_cosyvoice(
+        self, model: str, text: str
+    ) -> Tuple[Optional[bytes], str]:
         synthesizer = SpeechSynthesizer(
             model=model,
             voice=self.voice,
@@ -129,31 +137,13 @@ async def _synthesize_with_cosyvoice(self, model: str, text: str) -> Tuple[Optio
             None, synthesizer.call, text, self.timeout_ms
         )
         if not audio_bytes:
-            response = getattr(synthesizer, "get_response", None)
-            detail = ""
-            if callable(response):
-                resp = response()
-                detail = self._format_dashscope_error(resp)
-            raise RuntimeError(
-                f"Audio synthesis failed for model '{model}'. {detail}".strip()
-            )
+            resp = synthesizer.get_response()
+            if resp and isinstance(resp, dict):
+                raise RuntimeError(
+                    f"Audio synthesis failed for model '{model}'. {resp}".strip()
+                )
         return audio_bytes, ".wav"
 
     def _is_qwen_tts_model(self, model: str) -> bool:
         model_lower = model.lower()
         return "tts" in model_lower and model_lower.startswith("qwen")
-
-    def _format_dashscope_error(self, response) -> str:
-        status_code = getattr(response, "status_code", None)
-        code = getattr(response, "code", None)
-        message = getattr(response, "message", None)
-        parts = []
-        if status_code is not None:
-            parts.append(f"status_code={status_code}")
-        if code:
-            parts.append(f"code={code}")
-        if message:
-            parts.append(f"message={message}")
-        if not parts:
-            return ""
-        return " ".join(parts)

From 6c11c5254b5f0d004f02843e568cf9e0a2c17fc5 Mon Sep 17 00:00:00 2001
From: Soulter <905617992@qq.com>
Date: Sun, 12 Oct 2025 01:02:05 +0800
Subject: [PATCH 4/4] fix: Add a hint for obtaining the 阿里云百炼 TTS API Key
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
---
 astrbot/core/config/default.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/astrbot/core/config/default.py b/astrbot/core/config/default.py
index 82a4571a0..ffe2b9175 100644
--- a/astrbot/core/config/default.py
+++ b/astrbot/core/config/default.py
@@ -1041,6 +1041,7 @@
         "timeout": "20",
     },
     "阿里云百炼 TTS(API)": {
+        "hint": "API Key 从 https://bailian.console.aliyun.com/?tab=model#/api-key 获取。模型和音色的选择文档请参考: 阿里云百炼语音合成音色名称。具体可参考 https://help.aliyun.com/zh/model-studio/speech-synthesis-and-speech-recognition",
         "id": "dashscope_tts",
         "provider": "dashscope",
         "type": "dashscope_tts",
@@ -1421,9 +1422,8 @@
         "hint": "Azure_TTS 服务的订阅密钥(注意不是令牌)",
     },
     "dashscope_tts_voice": {
-        "description": "语音合成模型",
-        "type": "string",
-        "hint": "阿里云百炼语音合成模型名称。具体可参考 https://help.aliyun.com/zh/model-studio/developer-reference/cosyvoice-python-api 等内容",
+        "description": "音色",
+        "type": "string"
     },
     "gm_resp_image_modal": {
         "description": "启用图片模态",