import numpy as np
from rich.console import Console
import torch
+from .STV.speech_to_visemes import SpeechToVisemes

logging.basicConfig(
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
@@ -22,6 +23,7 @@ def setup(
        gen_kwargs={},  # Unused
        stream=True,
        chunk_size=512,
+        viseme_flag=True,
    ):
        self.should_listen = should_listen
        self.device = device
@@ -33,6 +35,9 @@ def setup(
        self.params_infer_code = ChatTTS.Chat.InferCodeParams(
            spk_emb=rnd_spk_emb,
        )
+        self.viseme_flag = viseme_flag
+        if self.viseme_flag:
+            self.speech_to_visemes = SpeechToVisemes()
        self.warmup()

    def warmup(self):
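Note: the SpeechToVisemes class imported above lives in .STV.speech_to_visemes, which is not part of this diff. From the way the handler calls it below (constructed with no arguments, process() invoked on an int16 mono 16 kHz chunk, result iterated as dicts carrying 'viseme' and 'timestamp'), the assumed interface is roughly the sketch here; the body is a placeholder, not the real implementation.

class SpeechToVisemes:
    """Hypothetical stand-in for .STV.speech_to_visemes.SpeechToVisemes."""

    def process(self, audio_chunk):
        # audio_chunk: int16 mono PCM at 16 kHz, as prepared by the handler.
        # Assumed to return a list of dicts, each with a 'viseme' label and a
        # 'timestamp', e.g. [{"viseme": "aa", "timestamp": 0.12}, ...].
        raise NotImplementedError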
@@ -61,22 +66,65 @@ def process(self, llm_sentence):
                if gen[0] is None or len(gen[0]) == 0:
                    self.should_listen.set()
                    return
+
+                # Resample the audio to 16000 Hz
                audio_chunk = librosa.resample(gen[0], orig_sr=24000, target_sr=16000)
-                audio_chunk = (audio_chunk * 32768).astype(np.int16)[0]
-                while len(audio_chunk) > self.chunk_size:
-                    yield audio_chunk[: self.chunk_size]  # yield the first chunk_size samples
-                    audio_chunk = audio_chunk[self.chunk_size :]  # drop the samples already yielded
-                yield np.pad(audio_chunk, (0, self.chunk_size - len(audio_chunk)))
+                # Ensure the audio is converted to mono (single channel)
+                if len(audio_chunk.shape) > 1:
+                    audio_chunk = librosa.to_mono(audio_chunk)
+                audio_chunk = (audio_chunk * 32768).astype(np.int16)
+
+                # Process visemes if viseme_flag is set
+                if self.viseme_flag:
+                    visemes = self.speech_to_visemes.process(audio_chunk)
+                    for viseme in visemes:
+                        console.print(f"[blue]ASSISTANT_MOUTH_SHAPE: {viseme['viseme']} -- {viseme['timestamp']}")
+                else:
+                    visemes = None
+
+                # Loop over the audio in chunk_size pieces, yielding a dict for each chunk
+                for i in range(0, len(audio_chunk), self.chunk_size):
+                    chunk_data = {
+                        "audio": np.pad(
+                            audio_chunk[i : i + self.chunk_size],
+                            (0, self.chunk_size - len(audio_chunk[i : i + self.chunk_size])),
+                        )
+                    }
+                    # Include text and visemes only with the first chunk
+                    if i == 0:
+                        chunk_data["text"] = llm_sentence  # llm_sentence is the sentence passed to process()
+                        chunk_data["visemes"] = visemes
+
+                    yield chunk_data
        else:
            wavs = wavs_gen
            if len(wavs[0]) == 0:
                self.should_listen.set()
                return
            audio_chunk = librosa.resample(wavs[0], orig_sr=24000, target_sr=16000)
+            # Ensure the audio is converted to mono (single channel)
+            if len(audio_chunk.shape) > 1:
+                audio_chunk = librosa.to_mono(audio_chunk)
            audio_chunk = (audio_chunk * 32768).astype(np.int16)
+
+            if self.viseme_flag:
+                visemes = self.speech_to_visemes.process(audio_chunk)
+                for viseme in visemes:
+                    console.print(f"[blue]ASSISTANT_MOUTH_SHAPE: {viseme['viseme']} -- {viseme['timestamp']}")
+            else:
+                visemes = None
+
            for i in range(0, len(audio_chunk), self.chunk_size):
-                yield np.pad(
-                    audio_chunk[i : i + self.chunk_size],
-                    (0, self.chunk_size - len(audio_chunk[i : i + self.chunk_size])),
-                )
+                chunk_data = {
+                    "audio": np.pad(
+                        audio_chunk[i : i + self.chunk_size],
+                        (0, self.chunk_size - len(audio_chunk[i : i + self.chunk_size])),
+                    )
+                }
+                # For the first chunk, include text and visemes
+                if i == 0:
+                    chunk_data["text"] = llm_sentence
+                    chunk_data["visemes"] = visemes
+                yield chunk_data
+
        self.should_listen.set()
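Because each yielded item is now a dict rather than a bare int16 array, whatever consumes this generator has to unpack it. A minimal consumer sketch, assuming placeholder functions play_audio, display_subtitle, and animate_mouth (not part of this repository):

def consume(tts_chunks):
    # tts_chunks: the generator returned by the handler's process() call
    for chunk in tts_chunks:
        play_audio(chunk["audio"])  # fixed-length int16 mono PCM at 16 kHz
        if "text" in chunk:  # only present on the first chunk of a sentence
            display_subtitle(chunk["text"])
        if chunk.get("visemes"):  # list of {'viseme', 'timestamp'} dicts, or None
            for v in chunk["visemes"]:
                animate_mouth(v["viseme"], v["timestamp"])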