修改tts录制文件

2024-09-26 20:28:49 +08:00 · 2024-09-26 20:28:49 +08:00 · bef51d5c47
commit bef51d5c47
parent 2127982650
5 changed files with 225 additions and 38 deletions
--- a/Human.py
+++ b/Human.py
@ -9,6 +9,7 @@ import time


 import numpy as np
+import pyaudio

 import audio
 import face_detection
@ -291,14 +292,16 @@ class Human:
        self._output_queue = mp.Queue()
        self._res_frame_queue = mp.Queue(self._batch_size * 2)

-        # self._chunk_2_mal = Chunk2Mal(self)
-        # self._tts = TTSBase(self)
+        self._chunk_2_mal = Chunk2Mal(self)
+        self._tts = TTSBase(self)

        self.mel_chunks_queue_ = Queue()
+        self.audio_chunks_queue_ = Queue()
        self._test_image_queue = Queue()

        self._thread = None
        # self.test()
+        # self.play_pcm()

        # face_images_path = r'./face/'
        # self._face_image_paths = utils.read_files_path(face_images_path)
@ -309,6 +312,19 @@ class Human:
        #                                    )).start()
        # self.render_event.set()

+    # def play_pcm(self):
+    #     p = pyaudio.PyAudio()
+    #     stream = p.open(format=p.get_format_from_width(2), channels=1, rate=16000, output=True)
+    #     file1 = r'./audio/en_weather.pcm'
+    #
+    #     # 将 pcm 数据直接写入 PyAudio 的数据流
+    #     with open(file1, "rb") as f:
+    #         stream.write(f.read())
+    #
+    #     stream.stop_stream()
+    #     stream.close()
+    #     p.terminate()
+
    def test(self):
        wav = audio.load_wav(r'./audio/audio1.wav', 16000)
        mel = audio.melspectrogram(wav)
@ -346,8 +362,8 @@ class Human:
        print("Model loaded")

        frame_h, frame_w = face_list_cycle[0].shape[:-1]
-        out = cv2.VideoWriter('temp/resul_tttt.avi',
-                              cv2.VideoWriter_fourcc(*'DIVX'), 25, (frame_w, frame_h))
+        # out = cv2.VideoWriter('temp/resul_tttt.avi',
+        #                       cv2.VideoWriter_fourcc(*'DIVX'), 25, (frame_w, frame_h))

        face_det_results = face_detect(face_list_cycle)

@ -374,12 +390,12 @@ class Human:
                # j = j + 1
                p = cv2.cvtColor(f, cv2.COLOR_BGR2RGB)
                self._test_image_queue.put(p)
-                out.write(f)
-
-        out.release()
-        command = 'ffmpeg -y -i {} -i {} -strict -2 -q:v 1 {}'.format('./audio/audio1.wav', 'temp/resul_tttt.avi',
-                                                                      'temp/resul_tttt.mp4')
-        subprocess.call(command, shell=platform.system() != 'Windows')
+                # out.write(f)
+        #
+        # out.release()
+        # command = 'ffmpeg -y -i {} -i {} -strict -2 -q:v 1 {}'.format('./audio/audio1.wav', 'temp/resul_tttt.avi',
+        #                                                               'temp/resul_tttt.mp4')
+        # subprocess.call(command, shell=platform.system() != 'Windows')


        # gen = datagen(face_list_cycle, self.mel_chunks_queue_)
@ -407,18 +423,18 @@ class Human:
        logging.info('human destroy')

    def read(self, txt):
-        # if self._tts is None:
-        #     logging.warning('tts is none')
-        #     return
-
-        if self._thread is None:
-            self._thread = threading.Thread(target=self.test)
-            self._thread.start()
-            # self._tts.push_txt(txt)
+        if self._tts is None:
+            logging.warning('tts is none')
+            return
+        self._tts.push_txt(txt)

    def push_audio_chunk(self, audio_chunk):
        """Forward one audio chunk to the chunk-to-mel converter in ``self._chunk_2_mal``."""
        converter = self._chunk_2_mal
        converter.push_chunk(audio_chunk)

+    def push_mel_chunks_queue(self, mel_chunk):
+        self.mel_chunks_queue_.put(mel_chunk)
+        # self.audio_chunks_queue_.put(audio_chunk)
+
    def push_feat_queue(self, mel_chunks):
        print("push_feat_queue")
        self._feat_queue.put(mel_chunks)
--- a/edge_tts_test.py
+++ b/edge_tts_test.py
@ -0,0 +1,102 @@
+#encoding = utf8
+
+import edge_tts
+import asyncio
+import pyaudio
+from pydub import AudioSegment
+from io import BytesIO
+
+# When used inside a Jupyter Notebook, lift the event-loop restriction
+try:
+    import nest_asyncio
+    nest_asyncio.apply()
+except ImportError:
+    pass
+
+def play_audio(data: bytes, stream: pyaudio.Stream) -> None:
+  stream.write(AudioSegment.from_mp3(BytesIO(data)).raw_data)
+
+CHUNK_SIZE = 20 * 1024
+async def play_tts(text, voice):
+    communicate = edge_tts.Communicate(text, voice)
+
+    # 设置 PyAudio
+    audio = pyaudio.PyAudio()
+    stream = audio.open(format=pyaudio.paInt16, channels=1, rate=24000, output=True)
+
+    # async for chunk in communicate.stream():  # 使用 stream 方法
+    #     if chunk['type'] == 'audio':  # 确保 chunk 是字节流
+    #         stream.write(chunk['data'])
+
+    total_data = b''
+    for chunk in communicate.stream_sync():
+      if chunk["type"] == "audio" and chunk["data"]:
+        total_data += chunk["data"]
+        if len(total_data) >= CHUNK_SIZE:
+          # print(f"Time elapsed: {time.time() - start_time:.2f} seconds")  # Print time
+          stream.write(AudioSegment.from_mp3(BytesIO(total_data[:CHUNK_SIZE])).raw_data)
+          # play_audio(total_data[:CHUNK_SIZE], stream)  # Play first CHUNK_SIZE bytes
+          total_data = total_data[CHUNK_SIZE:]  # Remove played data
+    # play_audio(total_data, stream)
+    # 停止和关闭音频流
+    stream.stop_stream()
+    stream.close()
+    audio.terminate()
+
+
+async def save_to_file(text, voice, filename):
+    communicate = edge_tts.Communicate(text, voice)
+
+    with open(filename, "wb") as f:
+        async for chunk in communicate.stream():
+            if chunk['type'] == 'audio':
+                f.write(chunk['data'])
+
+if __name__ == "__main__":
+    text = "Hello, this is a test of the Edge TTS service."
+    voice = "en-US-JessaNeural"
+
+    # Run the async function with asyncio.run()
+    asyncio.run(play_tts(text, voice))
+    # asyncio.run(save_to_file(text, voice, "output.wav"))
+
+#
+# import edge_tts
+# import pyaudio
+# from io import BytesIO
+# from pydub import AudioSegment
+# import time
+#
+# TEXT = 'Hello World! How are you guys doing? I hope great, cause I am having fun and honestly it has been a blast'
+# VOICE = "en-US-AndrewMultilingualNeural"
+# CHUNK_SIZE = 20 * 1024  # Assuming around 1024 bytes per chunk (adjust based on format)
+#
+# def main() -> None:
+#   start_time = time.time()
+#   communicator = edge_tts.Communicate(TEXT, VOICE)
+#
+#   pyaudio_instance = pyaudio.PyAudio()
+#   audio_stream = pyaudio_instance.open(format=pyaudio.paInt16, channels=1, rate=16000, output=True)
+#
+#   total_data = b''  # Store audio data instead of chunks
+#
+#   for chunk in communicator.stream_sync():
+#     if chunk["type"] == "audio" and chunk["data"]:
+#       total_data += chunk["data"]
+#       if len(total_data) >= CHUNK_SIZE:
+#         print(f"Time elapsed: {time.time() - start_time:.2f} seconds")  # Print time
+#         play_audio(total_data[:CHUNK_SIZE], audio_stream)  # Play first CHUNK_SIZE bytes
+#         total_data = total_data[CHUNK_SIZE:]  # Remove played data
+#
+#   # Play remaining audio
+#   play_audio(total_data, audio_stream)
+#
+#   audio_stream.stop_stream()
+#   audio_stream.close()
+#   pyaudio_instance.terminate()
+#
+# def play_audio(data: bytes, stream: pyaudio.Stream) -> None:
+#   stream.write(AudioSegment.from_mp3(BytesIO(data)).raw_data)
+#
+# if __name__ == "__main__":
+#   main()
--- a/tts/Chunk2Mal.py
+++ b/tts/Chunk2Mal.py
@ -36,19 +36,35 @@ class Chunk2Mal:
                # print('Chunk2Mal queue.Empty')
                continue

-            if len(self._chunks) <= self._human.get_stride_left_size() + self._human.get_stride_right_size():
-                # print('Chunk2Mal queue.Empty')
+            if type_ == 0:
                continue

            logging.info('np.concatenate')
-            inputs = np.concatenate(self._chunks)  # [N * chunk]
-            mel = audio.melspectrogram(inputs)
-            left = max(0, self._human.get_stride_left_size() * 80 / 50)
-            right = min(len(mel[0]), len(mel[0]) - self._human.get_stride_right_size() * 80 / 50)
-            mel_idx_multiplier = 80. * 2 / self._human.get_fps()
+            mel = audio.melspectrogram(chunk)
+            if np.isnan(mel.reshape(-1)).sum() > 0:
+                raise ValueError(
+                    'Mel contains nan! Using a TTS voice? Add a small epsilon noise to the wav file and try again')
+
            mel_step_size = 16
+
+            print('fps:', self._human.get_fps())
+            mel_idx_multiplier = 80. / self._human.get_fps()
+            print('mel_idx_multiplier:', mel_idx_multiplier)
+
            i = 0
-            mel_chunks = []
+            while 1:
+                start_idx = int(i * mel_idx_multiplier)
+                if start_idx + mel_step_size > len(mel[0]):
+                    # mel_chunks.append(mel[:, len(mel[0]) - mel_step_size:])
+                    self._human.push_mel_chunks_queue(mel[:, len(mel[0]) - mel_step_size:])
+                    break
+                # mel_chunks.append(mel[:, start_idx: start_idx + mel_step_size])
+                self._human.push_mel_chunks_queue(mel[:, start_idx: start_idx + mel_step_size])
+                i += 1
+
+            batch_size = 128
+
+            '''
            while i < (len(self._chunks) - self._human.get_stride_left_size()
                       - self._human.get_stride_right_size()) / 2:
                start_idx = int(left + i * mel_idx_multiplier)
@ -62,6 +78,7 @@ class Chunk2Mal:

            # discard the old part to save memory
            self._chunks = self._chunks[-(self._human.get_stride_left_size() + self._human.get_stride_right_size()):]
+            '''

        logging.info('chunk2mal exit')

--- a/tts/TTSBase.py
+++ b/tts/TTSBase.py
@ -5,6 +5,7 @@ import time

 import edge_tts
 import numpy as np
+import pyaudio
 import soundfile
 import resampy
 import queue
@ -12,6 +13,8 @@ from io import BytesIO
 from queue import Queue
 from threading import Thread, Event

+from pydub import AudioSegment
+
 logger = logging.getLogger(__name__)


@ -23,12 +26,15 @@ class TTSBase:
        self._exit_event = None
        self._io_stream = BytesIO()
        self._sample_rate = 16000
-        self._chunk = self._sample_rate // self._human.get_fps()
+        self._chunk_len = self._sample_rate // self._human.get_fps()

        self._exit_event = Event()
        self._thread = Thread(target=self._on_run)
        self._exit_event.set()
        self._thread.start()
+        self._pcm_player = pyaudio.PyAudio()
+        self._pcm_stream = self._pcm_player.open(format=pyaudio.paInt16,
+                                                 channels=1, rate=16000, output=True)
        logging.info('tts start')

    def _on_run(self):
@ -51,10 +57,15 @@ class TTSBase:
        stream = self.__create_bytes_stream(self._io_stream)
        stream_len = stream.shape[0]
        index = 0
-        while stream_len >= self._chunk:
-            self._human.push_audio_chunk(stream[index:index + self._chunk])
-            stream_len -= self._chunk
-            index += self._chunk
+        while stream_len >= self._chunk_len:
+            audio_chunk = stream[index:index + self._chunk_len]
+            # self._pcm_stream.write(audio_chunk)
+            # self._pcm_stream.write(AudioSegment.from_mp3(audio_chunk))
+            # self._human.push_audio_chunk(audio_chunk)
+            # self._human.push_mel_chunks_queue(audio_chunk)
+            self._human.push_audio_chunk(audio_chunk)
+            stream_len -= self._chunk_len
+            index += self._chunk_len

    def __create_bytes_stream(self, io_stream):
        stream, sample_rate = soundfile.read(io_stream)
@ -74,14 +85,38 @@ class TTSBase:
    async def __on_request(self, voice, txt):
        communicate = edge_tts.Communicate(txt, voice)
        first = True
-        async for chuck in communicate.stream():
-            if first:
-                first = False
+        # total_data = b''
+        # CHUNK_SIZE = self._chunk_len
+        async for chunk in communicate.stream():
+            if chunk["type"] == "audio" and chunk["data"]:
+                self._io_stream.write(chunk['data'])
+                # total_data += chunk["data"]
+                # if len(total_data) >= CHUNK_SIZE:
+                #     print(f"Time elapsed: {time.time() - start_time:.2f} seconds")  # Print time
+                    # audio_data = AudioSegment.from_mp3(BytesIO(total_data[:CHUNK_SIZE])) #.raw_data
+                    # audio_data = audio_data.set_frame_rate(self._human.get_audio_sample_rate())
+                    # self._human.push_audio_chunk(audio_data)
+                    # self._pcm_stream.write(audio_data.raw_data)
+                    # play_audio(total_data[:CHUNK_SIZE], stream)  # Play first CHUNK_SIZE bytes
+                    # total_data = total_data[CHUNK_SIZE:]  # Remove played data

-            if chuck['type'] == 'audio':
-                self._io_stream.write(chuck['data'])
+            # if first:
+            #     first = False
+
+            # if chuck['type'] == 'audio':
+            #     # self._io_stream.write(chuck['data'])
+            #     self._io_stream.write(AudioSegment.from_mp3(BytesIO(total_data[:CHUNK_SIZE])).raw_data)
+        # if len(total_data) > 0:
+             # self._pcm_stream.write(AudioSegment.from_mp3(BytesIO(total_data)).raw_data)
+             # audio_data = AudioSegment.from_mp3(BytesIO(total_data))  # .raw_data
+             # audio_data = audio_data.set_frame_rate(self._human.get_audio_sample_rate())
+             # self._human.push_audio_chunk(audio_data)
+        # self._io_stream.write(AudioSegment.from_mp3(BytesIO(total_data)).raw_data)

    def stop(self):
+        self._pcm_stream.stop_stream()
+        self._pcm_player.close(self._pcm_stream)
+        self._pcm_player.terminate()
        if self._exit_event is None:
            return

--- a/ui.py
+++ b/ui.py
@ -1,14 +1,18 @@
 #encoding = utf8
 import json
 import logging
+import os
 from logging import handlers
 import tkinter
 import tkinter.messagebox
 import customtkinter
 import cv2
 import requests
+import winsound
 from PIL import Image, ImageTk

+from playsound import playsound
+
 from Human import Human
 from tts.EdgeTTS import EdgeTTS

@ -25,7 +29,7 @@ class App(customtkinter.CTk):
        self._tts_url = 'http://localhost:8080'

        # configure window
-        self.title("数字人测试demo")
+        self.title("TTS demo")
        self.geometry(f"{1100}x{580}")

        self.grid_columnconfigure(1, weight=1)
@ -49,13 +53,24 @@ class App(customtkinter.CTk):

        self._init_image_canvas()

+        self._is_play_audio = False
        self._human = Human()
        self._render()
+        # self.play_audio()

    def on_destroy(self):
        """Log the shutdown banner, then call ``on_destroy`` on the wrapped Human."""
        logger.info('------------App destroy------------')
        managed_human = self._human
        managed_human.on_destroy()

+    def play_audio(self):
+        if self._is_play_audio:
+            return
+        self._is_play_audio = True
+        file = os.path.curdir + '/audio/audio1.wav'
+        print(file)
+        winsound.PlaySound(file, winsound.SND_ASYNC or winsound.SND_FILENAME)
+        # playsound(file)
+
    def _init_image_canvas(self):
        self._canvas = customtkinter.CTkCanvas(self.image_frame)
        self._canvas.pack(fill=customtkinter.BOTH, expand=customtkinter.YES)
@ -66,6 +81,7 @@ class App(customtkinter.CTk):
            self.after(100, self._render)
            return

+        self.play_audio()
        iheight, iwidth = image.shape[0], image.shape[1]
        width = self.winfo_width()
        height = self.winfo_height()
@ -88,10 +104,11 @@ class App(customtkinter.CTk):
        height = self.winfo_height() * 0.5
        self._canvas.create_image(width, height, anchor=customtkinter.CENTER, image=imgtk)
        self._canvas.update()
-        self.after(60, self._render)
+        self.after(34, self._render)

    def request_tts(self):
        content = self.entry.get()
+        content = 'Hello, this is a test of the Edge TTS service.'
        print('content:', content)
        self.entry.delete(0, customtkinter.END)
        self._human.read(content)