
Commit f6a0e46

Added s2s files
1 parent 90c38d4 commit f6a0e46

12 files changed: +488 −10 lines changed

TTS/parler_handler.py

Lines changed: 10 additions & 4 deletions
@@ -36,7 +36,9 @@ class ParlerTTSHandler(BaseHandler):
     def setup(
         self,
         should_listen,
-        model_name="ylacombe/parler-tts-mini-jenny-30H",
+        # model_name="ylacombe/parler-tts-mini-jenny-30H",
+        model_name="ylacombe/parler_tts_mini_v0.1",
+        # model_name="parler-tts/parler_tts_mini_v0.1",
         device="cuda",
         torch_dtype="float16",
         compile_mode=None,
@@ -82,7 +84,9 @@ def setup(

         self.viseme_flag = viseme_flag
         if self.viseme_flag:
-            self.speech_to_visemes = SpeechToVisemes()
+            self.speech_to_visemes = SpeechToVisemes(
+                device=self.device
+            )

         self.warmup()

@@ -100,13 +104,15 @@ def prepare_model_inputs(
             self.description, return_tensors="pt"
         )
         input_ids = tokenized_description.input_ids.to(self.device)
-        attention_mask = tokenized_description.attention_mask.to(self.device)
+        # attention_mask = tokenized_description.attention_mask.to(self.device)
+        attention_mask = None

         tokenized_prompt = self.prompt_tokenizer(
             prompt, return_tensors="pt", **pad_args_prompt
         )
         prompt_input_ids = tokenized_prompt.input_ids.to(self.device)
-        prompt_attention_mask = tokenized_prompt.attention_mask.to(self.device)
+        # prompt_attention_mask = tokenized_prompt.attention_mask.to(self.device)
+        prompt_attention_mask = None

         gen_kwargs = {
             "input_ids": input_ids,

arguments_classes/parler_tts_arguments.py

Lines changed: 2 additions & 1 deletion
@@ -4,7 +4,8 @@
 @dataclass
 class ParlerTTSHandlerArguments:
     tts_model_name: str = field(
-        default="ylacombe/parler-tts-mini-jenny-30H",
+        # default="ylacombe/parler-tts-mini-jenny-30H",
+        default="ylacombe/parler_tts_mini_v0.1",
         metadata={
             "help": "The pretrained TTS model to use. Default is 'ylacombe/parler-tts-mini-jenny-30H'."
         },

config.json

Lines changed: 5 additions & 0 deletions
@@ -0,0 +1,5 @@
+{
+    "device": "cpu",
+    "stt": "whisper",
+    "stt_model_name": "openai/whisper-tiny"
+}
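run.py (added further down in this commit) launches the pipeline as `python s2s_pipeline.py config.json`, so these JSON keys are presumably mapped onto the pipeline's argument dataclasses. Below is a minimal sketch of that mapping, assuming a hypothetical PipelineArguments dataclass standing in for the repo's real argument classes; it is illustrative only and not part of the commit.

# Minimal sketch (not part of the commit): mapping a JSON config like
# config.json onto dataclass fields with transformers.HfArgumentParser.
from dataclasses import dataclass, field

from transformers import HfArgumentParser


@dataclass
class PipelineArguments:
    # Hypothetical subset of the pipeline's arguments, for illustration only.
    device: str = field(default="cpu")
    stt: str = field(default="whisper")
    stt_model_name: str = field(default="openai/whisper-tiny")


if __name__ == "__main__":
    parser = HfArgumentParser((PipelineArguments,))
    # parse_json_file reads the JSON keys into the dataclass fields.
    (pipeline_kwargs,) = parser.parse_json_file("config.json")
    print(pipeline_kwargs)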

listen_and_dont_play.py

Lines changed: 138 additions & 0 deletions
@@ -0,0 +1,138 @@
+import socket
+import threading
+from queue import Queue
+from dataclasses import dataclass, field
+import soundfile as sf
+import numpy as np
+import struct
+import pickle
+
+@dataclass
+class ListenAndPlayArguments:
+    send_rate: int = field(default=16000, metadata={"help": "In Hz. Default is 16000."})
+    recv_rate: int = field(default=16000, metadata={"help": "In Hz. Default is 16000."})
+    list_play_chunk_size: int = field(
+        default=512,
+        metadata={"help": "The size of data chunks (in bytes). Default is 512."},
+    )
+    host: str = field(
+        default="localhost",
+        metadata={
+            "help": "The hostname or IP address for listening and playing. Default is 'localhost'."
+        },
+    )
+    send_port: int = field(
+        default=12345,
+        metadata={"help": "The network port for sending data. Default is 12345."},
+    )
+    recv_port: int = field(
+        default=12346,
+        metadata={"help": "The network port for receiving data. Default is 12346."},
+    )
+    input_audio_file: str = field(
+        default="sample_audio.wav",
+        metadata={"help": "Path to the audio file to use as input."},
+    )
+
+
+def listen_and_dont_play(
+    send_rate=16000,
+    recv_rate=16000,
+    list_play_chunk_size=512,
+    host="localhost",
+    send_port=12345,
+    recv_port=12346,
+    input_audio_file="sample_audio.wav",
+):
+    send_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+    send_socket.connect((host, send_port))
+
+    recv_socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
+    recv_socket.connect((host, recv_port))
+
+    print(f"Simulating recording and streaming using {input_audio_file}...")
+
+    stop_event = threading.Event()
+    recv_queue = Queue()
+    send_queue = Queue()
+
+    def load_audio_chunks(file_path, chunk_size, sample_rate, append_silence_secs=5):
+        """Load audio file, append silence, and yield chunks of audio data."""
+        # Read audio file
+        audio_data, audio_sample_rate = sf.read(file_path, dtype='int16')
+        if audio_sample_rate != sample_rate:
+            raise ValueError(f"Expected sample rate of {sample_rate}, but got {audio_sample_rate}")
+
+        # Calculate and append 5 seconds of silence
+        silence = np.zeros(int(sample_rate * append_silence_secs), dtype='int16')
+        combined_audio = np.concatenate([audio_data, silence])
+
+        # Break audio into chunks
+        for i in range(0, len(combined_audio), chunk_size):
+            yield combined_audio[i:i + chunk_size].tobytes()
+
+    def send(stop_event, send_queue):
+        for chunk in load_audio_chunks(input_audio_file, list_play_chunk_size, send_rate):
+            if stop_event.is_set():
+                break
+            send_queue.put(chunk)
+
+        send_queue.put(b"END")
+
+    def recv(stop_event, recv_queue):
+        def receive_full_chunk(conn, chunk_size):
+            data = b""
+            while len(data) < chunk_size:
+                packet = conn.recv(chunk_size - len(data))
+                if not packet:
+                    return None  # Connection has been closed
+                data += packet
+            return data
+
+        while not stop_event.is_set():
+            # Step 1: Receive the first 4 bytes to get the packet length
+            length_data = receive_full_chunk(recv_socket, 4)
+            if not length_data:
+                continue  # Handle disconnection or data not available
+
+            # Step 2: Unpack the length (4 bytes)
+            packet_length = struct.unpack('!I', length_data)[0]
+
+            # Step 3: Receive the full packet based on the length
+            serialized_packet = receive_full_chunk(recv_socket, packet_length)
+            if serialized_packet:
+                # Step 4: Deserialize the packet using pickle
+                packet = pickle.loads(serialized_packet)
+                # Step 5: Extract the packet contents (text, visemes, audio)
+                if 'text' in packet:
+                    print(f"Transcribed Text: {packet['text']}")
+                if 'visemes' in packet:
+                    print(f"Visemes: {packet['visemes']}")
+                # We're no longer playing audio, but you could process it if needed
+                if 'audio' in packet:
+                    recv_queue.put(packet['audio'])
+
+    try:
+        send_thread = threading.Thread(target=send, args=(stop_event, send_queue))
+        send_thread.start()
+        recv_thread = threading.Thread(target=recv, args=(stop_event, recv_queue))
+        recv_thread.start()
+
+        input("Press Enter to stop...")
+
+    except KeyboardInterrupt:
+        print("Finished streaming.")
+
+    finally:
+        stop_event.set()
+        recv_thread.join()
+        send_thread.join()
+        send_socket.close()
+        recv_socket.close()
+        print("Connection closed.")
+
+
+if __name__ == "__main__":
+    parser = HfArgumentParser((ListenAndPlayArguments,))
+    (listen_and_play_kwargs,) = parser.parse_args_into_dataclasses()
+    listen_and_dont_play(**vars(listen_and_play_kwargs))
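The recv loop above expects each packet from the pipeline as a 4-byte big-endian length prefix followed by a pickled dict that may carry 'text', 'visemes', and 'audio' keys (note that the `__main__` block uses HfArgumentParser, which this file does not import; `from transformers import HfArgumentParser` would be needed to run it as a script). Below is a minimal sketch of the matching sender-side framing, assuming an already-connected socket; it is illustrative only and not part of the commit.

# Sketch of the framing the recv() handler above expects: a 4-byte
# big-endian ('!I') length prefix, then a pickled dict with optional
# 'text', 'visemes', and 'audio' fields.
import pickle
import struct


def send_packet(sock, text=None, visemes=None, audio=None):
    fields = {"text": text, "visemes": visemes, "audio": audio}
    # Keep only the fields that are actually present in this packet.
    packet = {key: value for key, value in fields.items() if value is not None}
    payload = pickle.dumps(packet)
    sock.sendall(struct.pack("!I", len(payload)) + payload)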

listen_and_play.py

Lines changed: 4 additions & 4 deletions
@@ -95,11 +95,11 @@ def receive_full_chunk(conn, chunk_size):
                 packet = pickle.loads(serialized_packet)
                 # Step 5: Extract the packet contents
                 if 'text' in packet:
-                    pass
-                    # print(packet['text'])
+                    # pass
+                    print(packet['text'])
                 if 'visemes' in packet:
-                    pass
-                    # print(packet['visemes'])
+                    # pass
+                    print(packet['visemes'])

                 # Step 6: Put the packet audio data into the queue for sending
                 recv_queue.put(packet['audio'].tobytes())

resampler.py

Lines changed: 20 additions & 0 deletions
@@ -0,0 +1,20 @@
+import librosa
+import soundfile as sf
+
+def resample_audio(input_audio_file, target_sample_rate=16000):
+    # Load the audio file with librosa
+    audio_data, original_sample_rate = librosa.load(input_audio_file, sr=None)
+
+    # Resample the audio to the target sample rate
+    if original_sample_rate != target_sample_rate:
+        audio_data = librosa.resample(audio_data, orig_sr=original_sample_rate, target_sr=target_sample_rate)
+
+    # Save the resampled audio
+    resampled_audio_file = "sample_audio.wav"
+    sf.write(resampled_audio_file, audio_data, target_sample_rate)
+
+    return resampled_audio_file
+
+# Usage
+input_audio_file = "unsampled_audio.wav"
+resampled_audio_file = resample_audio(input_audio_file)
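The resampled output is written to sample_audio.wav at 16 kHz, which matches the defaults of listen_and_dont_play.py above (input_audio_file="sample_audio.wav", send_rate=16000), so the two scripts can be chained directly. Note that the usage at module level runs on import and expects unsampled_audio.wav to exist in the working directory.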

run.py

Lines changed: 24 additions & 0 deletions
@@ -0,0 +1,24 @@
+import subprocess
+import sys
+
+def run_pipeline():
+    try:
+        # Run the command and stream the output in real-time
+        process = subprocess.Popen(['python', 's2s_pipeline.py', 'config.json'],
+                                   stdout=subprocess.PIPE,
+                                   stderr=subprocess.STDOUT,  # Combine stdout and stderr
+                                   text=True,
+                                   bufsize=1)  # Line-buffered output
+
+        # Stream the stdout as it comes
+        for line in process.stdout:
+            sys.stdout.write(line)  # Write directly to sys.stdout for real-time output
+            sys.stdout.flush()  # Ensure each line is printed immediately
+
+        process.wait()  # Wait for the process to complete
+
+    except subprocess.CalledProcessError as e:
+        print(f"Error occurred: {e.output}")
+
+if __name__ == "__main__":
+    run_pipeline()

s2s_pipeline.py

Lines changed: 2 additions & 1 deletion
@@ -119,7 +119,8 @@ def main():
     # 1. Handle logger
     global logger
     logging.basicConfig(
-        level=module_kwargs.log_level.upper(),
+        # level=module_kwargs.log_level.upper(),
+        level=logging.DEBUG,
         format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
     )
     logger = logging.getLogger(__name__)

simplified_pipeline.py

Lines changed: 79 additions & 0 deletions
@@ -0,0 +1,79 @@
+import queue
+import threading
+import logging
+import numpy as np
+import torch
+import torchaudio
+from VAD.vad_handler import VADHandler
+from STT.whisper_stt_handler import WhisperSTTHandler  # Import your Whisper handler
+
+# Configure logger
+logging.basicConfig(level=logging.DEBUG)
+logger = logging.getLogger(__name__)
+
+# Function to run the local pipeline with VAD and STT
+def run_local_pipeline():
+    # Initialize events and queues
+    stop_event = threading.Event()
+    should_listen = threading.Event()
+    recv_audio_chunks_queue = queue.Queue()
+    spoken_prompt_queue = queue.Queue()  # Queue for audio with detected speech
+    text_prompt_queue = queue.Queue()  # Queue for converted text (output of STT)
+
+    # Initialize the VAD handler
+    vad = VADHandler(
+        stop_event,
+        queue_in=recv_audio_chunks_queue,
+        queue_out=spoken_prompt_queue,
+        setup_args=(should_listen,),
+        setup_kwargs={
+            'thresh': 0.3,
+            'sample_rate': 16000,
+            'audio_enhancement': False  # Set to True if you want enhancement
+        }
+    )
+
+    print("Setup VAD")
+
+    # Initialize the Whisper STT handler
+    stt = WhisperSTTHandler(
+        stop_event,
+        queue_in=spoken_prompt_queue,  # Speech detected audio chunks go here
+        queue_out=text_prompt_queue,  # The output text from STT goes here
+        setup_kwargs={
+            'device': 'cpu',  # Assuming you're using CPU. Change to 'cuda' for GPU.
+            'model_name': 'openai/whisper-tiny',  # Whisper model being used
+            'language': 'en',  # Set to English to avoid language detection
+            'compile_mode': None,
+        }
+    )
+
+
+    print("Setup STT")
+
+    # Simulate receiving audio chunks and processing them with VAD and STT
+    try:
+        print("Running simplified pipeline locally with VAD and STT...")
+
+        # Simulate processing 5 chunks of audio (each 1 second long, silence)
+        for _ in range(5):
+            dummy_audio_chunk = b'\x00' * 16000  # Simulate a 1-second silent audio chunk
+            vad.process(dummy_audio_chunk)
+
+            # After VAD, check if any speech was detected, if so, send to STT
+            while not spoken_prompt_queue.empty():
+                audio_chunk = spoken_prompt_queue.get()  # Get the detected speech audio
+                stt.process(audio_chunk)  # Convert the audio to text
+
+                # Retrieve the transcribed text from the STT
+                if not text_prompt_queue.empty():
+                    transcribed_text = text_prompt_queue.get()
+                    print(f"STT Result: {transcribed_text}")
+
+        print("Pipeline completed.")
+    except KeyboardInterrupt:
+        print("Pipeline stopped.")
+
+# Run the local pipeline
+if __name__ == "__main__":
+    run_local_pipeline()

0 commit comments
