修改tts录制文件

2024-09-26 20:28:49 +08:00 · 2024-09-26 20:28:49 +08:00 · bef51d5c47
commit bef51d5c47
parent 2127982650
5 changed files with 225 additions and 38 deletions
--- a/Human.py
+++ b/Human.py
@ -9,6 +9,7 @@ import time
 import numpy as np
 import pyaudio
 import audio
 import face_detection
@ -291,14 +292,16 @@ class Human:
        self._output_queue = mp.Queue()
        self._res_frame_queue = mp.Queue(self._batch_size * 2)
-        # self._chunk_2_mal = Chunk2Mal(self)
+        self._chunk_2_mal = Chunk2Mal(self)
-        # self._tts = TTSBase(self)
+        self._tts = TTSBase(self)
        self.mel_chunks_queue_ = Queue()
        self.audio_chunks_queue_ = Queue()
        self._test_image_queue = Queue()
        self._thread = None
        # self.test()
        # self.play_pcm()
        # face_images_path = r'./face/'
        # self._face_image_paths = utils.read_files_path(face_images_path)
@ -309,6 +312,19 @@ class Human:
        #                                    )).start()
        # self.render_event.set()
    # def play_pcm(self):
    #     p = pyaudio.PyAudio()
    #     stream = p.open(format=p.get_format_from_width(2), channels=1, rate=16000, output=True)
    #     file1 = r'./audio/en_weather.pcm'
    #
    #     # 将 pcm 数据直接写入 PyAudio 的数据流
    #     with open(file1, "rb") as f:
    #         stream.write(f.read())
    #
    #     stream.stop_stream()
    #     stream.close()
    #     p.terminate()
    def test(self):
        wav = audio.load_wav(r'./audio/audio1.wav', 16000)
        mel = audio.melspectrogram(wav)
@ -346,8 +362,8 @@ class Human:
        print("Model loaded")
        frame_h, frame_w = face_list_cycle[0].shape[:-1]
-        out = cv2.VideoWriter('temp/resul_tttt.avi',
+        # out = cv2.VideoWriter('temp/resul_tttt.avi',
-                              cv2.VideoWriter_fourcc(*'DIVX'), 25, (frame_w, frame_h))
+        #                       cv2.VideoWriter_fourcc(*'DIVX'), 25, (frame_w, frame_h))
        face_det_results = face_detect(face_list_cycle)
@ -374,12 +390,12 @@ class Human:
                # j = j + 1
                p = cv2.cvtColor(f, cv2.COLOR_BGR2RGB)
                self._test_image_queue.put(p)
-                out.write(f)
+                # out.write(f)
-
+        #
-        out.release()
+        # out.release()
-        command = 'ffmpeg -y -i {} -i {} -strict -2 -q:v 1 {}'.format('./audio/audio1.wav', 'temp/resul_tttt.avi',
+        # command = 'ffmpeg -y -i {} -i {} -strict -2 -q:v 1 {}'.format('./audio/audio1.wav', 'temp/resul_tttt.avi',
-                                                                      'temp/resul_tttt.mp4')
+        #                                                               'temp/resul_tttt.mp4')
-        subprocess.call(command, shell=platform.system() != 'Windows')
+        # subprocess.call(command, shell=platform.system() != 'Windows')
        # gen = datagen(face_list_cycle, self.mel_chunks_queue_)
@ -407,18 +423,18 @@ class Human:
        logging.info('human destroy')
    def read(self, txt):
-        # if self._tts is None:
+        if self._tts is None:
-        #     logging.warning('tts is none')
+            logging.warning('tts is none')
-        #     return
+            return
-
+        self._tts.push_txt(txt)
        if self._thread is None:
            self._thread = threading.Thread(target=self.test)
            self._thread.start()
            # self._tts.push_txt(txt)
    def push_audio_chunk(self, audio_chunk):
        """Forward one audio chunk to the Chunk2Mal worker via ``push_chunk``."""
        self._chunk_2_mal.push_chunk(audio_chunk)
    def push_mel_chunks_queue(self, mel_chunk):
        """Enqueue one mel chunk on ``mel_chunks_queue_``."""
        self.mel_chunks_queue_.put(mel_chunk)
        # self.audio_chunks_queue_.put(audio_chunk)
    def push_feat_queue(self, mel_chunks):
        print("push_feat_queue")
        self._feat_queue.put(mel_chunks)
--- a/edge_tts_test.py
+++ b/edge_tts_test.py
@ -0,0 +1,102 @@
 #encoding = utf8
 import edge_tts
 import asyncio
 import pyaudio
 from pydub import AudioSegment
 from io import BytesIO
 # 如果在 Jupyter Notebook 中使用，解除事件循环限制
 try:
    import nest_asyncio
    nest_asyncio.apply()
 except ImportError:
    pass
 def play_audio(data: bytes, stream: pyaudio.Stream) -> None:
    """Decode MP3 bytes with pydub and write the decoded raw data to *stream*."""
    segment = AudioSegment.from_mp3(BytesIO(data))
    stream.write(segment.raw_data)
 CHUNK_SIZE = 20 * 1024  # number of buffered MP3 bytes decoded and played per slice in play_tts
 async def play_tts(text, voice):
    """Synthesize *text* with Edge TTS and play it on the default audio output.

    MP3 data received from the service is buffered and decoded in slices of
    ``CHUNK_SIZE`` bytes. Once the stream ends, whatever is still buffered is
    decoded and played as well, so utterances smaller than one slice (and the
    tail of longer ones) are audible.

    Args:
        text: Text to speak.
        voice: Voice name handed to ``edge_tts.Communicate``.
    """
    communicate = edge_tts.Communicate(text, voice)

    # Set up PyAudio; the stream is opened as 16-bit mono output at 24000 Hz.
    audio = pyaudio.PyAudio()
    try:
        stream = audio.open(format=pyaudio.paInt16, channels=1, rate=24000, output=True)
        try:
            pending = b''
            for chunk in communicate.stream_sync():
                # Skip metadata events and empty audio payloads.
                if chunk["type"] != "audio" or not chunk["data"]:
                    continue
                pending += chunk["data"]
                # Play every full slice that is available.
                while len(pending) >= CHUNK_SIZE:
                    stream.write(AudioSegment.from_mp3(BytesIO(pending[:CHUNK_SIZE])).raw_data)
                    pending = pending[CHUNK_SIZE:]
            # Play the remainder that never reached a full slice.
            if pending:
                stream.write(AudioSegment.from_mp3(BytesIO(pending)).raw_data)
        finally:
            # Stop and close the audio stream even if decoding/streaming failed.
            stream.stop_stream()
            stream.close()
    finally:
        audio.terminate()
 async def save_to_file(text, voice, filename):
    """Stream Edge TTS output for *text* and write its audio payloads to *filename*."""
    tts = edge_tts.Communicate(text, voice)
    with open(filename, "wb") as out:
        async for piece in tts.stream():
            # Only audio events carry bytes to store; skip everything else.
            if piece['type'] != 'audio':
                continue
            out.write(piece['data'])
 if __name__ == "__main__":
    # Demo entry point: speak one fixed sentence through play_tts.
    sample_text = "Hello, this is a test of the Edge TTS service."
    sample_voice = "en-US-JessaNeural"

    # Run the coroutine to completion with asyncio.run().
    asyncio.run(play_tts(text=sample_text, voice=sample_voice))
    # To write the audio to disk instead, run save_to_file(...) the same way.
 #
 # import edge_tts
 # import pyaudio
 # from io import BytesIO
 # from pydub import AudioSegment
 # import time
 #
 # TEXT = 'Hello World! How are you guys doing? I hope great, cause I am having fun and honestly it has been a blast'
 # VOICE = "en-US-AndrewMultilingualNeural"
 # CHUNK_SIZE = 20 * 1024  # Assuming around 1024 bytes per chunk (adjust based on format)
 #
 # def main() -> None:
 #   start_time = time.time()
 #   communicator = edge_tts.Communicate(TEXT, VOICE)
 #
 #   pyaudio_instance = pyaudio.PyAudio()
 #   audio_stream = pyaudio_instance.open(format=pyaudio.paInt16, channels=1, rate=16000, output=True)
 #
 #   total_data = b''  # Store audio data instead of chunks
 #
 #   for chunk in communicator.stream_sync():
 #     if chunk["type"] == "audio" and chunk["data"]:
 #       total_data += chunk["data"]
 #       if len(total_data) >= CHUNK_SIZE:
 #         print(f"Time elapsed: {time.time() - start_time:.2f} seconds")  # Print time
 #         play_audio(total_data[:CHUNK_SIZE], audio_stream)  # Play first CHUNK_SIZE bytes
 #         total_data = total_data[CHUNK_SIZE:]  # Remove played data
 #
 #   # Play remaining audio
 #   play_audio(total_data, audio_stream)
 #
 #   audio_stream.stop_stream()
 #   audio_stream.close()
 #   pyaudio_instance.terminate()
 #
 # def play_audio(data: bytes, stream: pyaudio.Stream) -> None:
 #   stream.write(AudioSegment.from_mp3(BytesIO(data)).raw_data)
 #
 # if __name__ == "__main__":
 #   main()
--- a/tts/Chunk2Mal.py
+++ b/tts/Chunk2Mal.py
@ -36,19 +36,35 @@ class Chunk2Mal:
                # print('Chunk2Mal queue.Empty')
                continue
-            if len(self._chunks) <= self._human.get_stride_left_size() + self._human.get_stride_right_size():
+            if type_ == 0:
                # print('Chunk2Mal queue.Empty')
                continue
            logging.info('np.concatenate')
-            inputs = np.concatenate(self._chunks)  # [N * chunk]
+            mel = audio.melspectrogram(chunk)
-            mel = audio.melspectrogram(inputs)
+            if np.isnan(mel.reshape(-1)).sum() > 0:
-            left = max(0, self._human.get_stride_left_size() * 80 / 50)
+                raise ValueError(
-            right = min(len(mel[0]), len(mel[0]) - self._human.get_stride_right_size() * 80 / 50)
+                    'Mel contains nan! Using a TTS voice? Add a small epsilon noise to the wav file and try again')
-            mel_idx_multiplier = 80. * 2 / self._human.get_fps()
+
            mel_step_size = 16
            print('fps:', self._human.get_fps())
            mel_idx_multiplier = 80. / self._human.get_fps()
            print('mel_idx_multiplier:', mel_idx_multiplier)
            i = 0
-            mel_chunks = []
+            while 1:
                start_idx = int(i * mel_idx_multiplier)
                if start_idx + mel_step_size > len(mel[0]):
                    # mel_chunks.append(mel[:, len(mel[0]) - mel_step_size:])
                    self._human.push_mel_chunks_queue(mel[:, len(mel[0]) - mel_step_size:])
                    break
                # mel_chunks.append(mel[:, start_idx: start_idx + mel_step_size])
                self._human.push_mel_chunks_queue(mel[:, start_idx: start_idx + mel_step_size])
                i += 1
            batch_size = 128
            '''
            while i < (len(self._chunks) - self._human.get_stride_left_size()
                       - self._human.get_stride_right_size()) / 2:
                start_idx = int(left + i * mel_idx_multiplier)
@ -62,6 +78,7 @@ class Chunk2Mal:
            # discard the old part to save memory
            self._chunks = self._chunks[-(self._human.get_stride_left_size() + self._human.get_stride_right_size()):]
            '''
        logging.info('chunk2mal exit')
--- a/tts/TTSBase.py
+++ b/tts/TTSBase.py
@ -5,6 +5,7 @@ import time
 import edge_tts
 import numpy as np
 import pyaudio
 import soundfile
 import resampy
 import queue
@ -12,6 +13,8 @@ from io import BytesIO
 from queue import Queue
 from threading import Thread, Event
 from pydub import AudioSegment
 logger = logging.getLogger(__name__)
@ -23,12 +26,15 @@ class TTSBase:
        self._exit_event = None
        self._io_stream = BytesIO()
        self._sample_rate = 16000
-        self._chunk = self._sample_rate // self._human.get_fps()
+        self._chunk_len = self._sample_rate // self._human.get_fps()
        self._exit_event = Event()
        self._thread = Thread(target=self._on_run)
        self._exit_event.set()
        self._thread.start()
        self._pcm_player = pyaudio.PyAudio()
        self._pcm_stream = self._pcm_player.open(format=pyaudio.paInt16,
                                                 channels=1, rate=16000, output=True)
        logging.info('tts start')
    def _on_run(self):
@ -51,10 +57,15 @@ class TTSBase:
        stream = self.__create_bytes_stream(self._io_stream)
        stream_len = stream.shape[0]
        index = 0
-        while stream_len >= self._chunk:
+        while stream_len >= self._chunk_len:
-            self._human.push_audio_chunk(stream[index:index + self._chunk])
+            audio_chunk = stream[index:index + self._chunk_len]
-            stream_len -= self._chunk
+            # self._pcm_stream.write(audio_chunk)
-            index += self._chunk
+            # self._pcm_stream.write(AudioSegment.from_mp3(audio_chunk))
            # self._human.push_audio_chunk(audio_chunk)
            # self._human.push_mel_chunks_queue(audio_chunk)
            self._human.push_audio_chunk(audio_chunk)
            stream_len -= self._chunk_len
            index += self._chunk_len
    def __create_bytes_stream(self, io_stream):
        stream, sample_rate = soundfile.read(io_stream)
@ -74,14 +85,38 @@ class TTSBase:
    async def __on_request(self, voice, txt):
        communicate = edge_tts.Communicate(txt, voice)
        first = True
-        async for chuck in communicate.stream():
+        # total_data = b''
-            if first:
+        # CHUNK_SIZE = self._chunk_len
-                first = False
+        async for chunk in communicate.stream():
            if chunk["type"] == "audio" and chunk["data"]:
                self._io_stream.write(chunk['data'])
                # total_data += chunk["data"]
                # if len(total_data) >= CHUNK_SIZE:
                #     print(f"Time elapsed: {time.time() - start_time:.2f} seconds")  # Print time
                    # audio_data = AudioSegment.from_mp3(BytesIO(total_data[:CHUNK_SIZE])) #.raw_data
                    # audio_data = audio_data.set_frame_rate(self._human.get_audio_sample_rate())
                    # self._human.push_audio_chunk(audio_data)
                    # self._pcm_stream.write(audio_data.raw_data)
                    # play_audio(total_data[:CHUNK_SIZE], stream)  # Play first CHUNK_SIZE bytes
                    # total_data = total_data[CHUNK_SIZE:]  # Remove played data
-            if chuck['type'] == 'audio':
+            # if first:
-                self._io_stream.write(chuck['data'])
+            #     first = False
            # if chuck['type'] == 'audio':
            #     # self._io_stream.write(chuck['data'])
            #     self._io_stream.write(AudioSegment.from_mp3(BytesIO(total_data[:CHUNK_SIZE])).raw_data)
        # if len(total_data) > 0:
             # self._pcm_stream.write(AudioSegment.from_mp3(BytesIO(total_data)).raw_data)
             # audio_data = AudioSegment.from_mp3(BytesIO(total_data))  # .raw_data
             # audio_data = audio_data.set_frame_rate(self._human.get_audio_sample_rate())
             # self._human.push_audio_chunk(audio_data)
        # self._io_stream.write(AudioSegment.from_mp3(BytesIO(total_data)).raw_data)
    def stop(self):
        self._pcm_stream.stop_stream()
        self._pcm_player.close(self._pcm_stream)
        self._pcm_player.terminate()
        if self._exit_event is None:
            return
--- a/ui.py
+++ b/ui.py
@ -1,14 +1,18 @@
 #encoding = utf8
 import json
 import logging
 import os
 from logging import handlers
 import tkinter
 import tkinter.messagebox
 import customtkinter
 import cv2
 import requests
 import winsound
 from PIL import Image, ImageTk
 from playsound import playsound
 from Human import Human
 from tts.EdgeTTS import EdgeTTS
@ -25,7 +29,7 @@ class App(customtkinter.CTk):
        self._tts_url = 'http://localhost:8080'
        # configure window
-        self.title("数字人测试demo")
+        self.title("TTS demo")
        self.geometry(f"{1100}x{580}")
        self.grid_columnconfigure(1, weight=1)
@ -49,13 +53,24 @@ class App(customtkinter.CTk):
        self._init_image_canvas()
        self._is_play_audio = False
        self._human = Human()
        self._render()
        # self.play_audio()
    def on_destroy(self):
        """Log the application shutdown and call ``on_destroy`` on the Human instance."""
        logger.info('------------App destroy------------')
        self._human.on_destroy()
    def play_audio(self):
        """Start asynchronous playback of the bundled ``audio1.wav`` file.

        Returns immediately without playing anything when ``_is_play_audio``
        is already set; otherwise sets the flag and starts playback.
        """
        if self._is_play_audio:
            return

        self._is_play_audio = True
        file = os.path.curdir + '/audio/audio1.wav'
        print(file)
        # winsound flags form a bit mask and must be combined with ``|``.
        # The boolean ``or`` evaluates to SND_ASYNC alone, silently dropping
        # SND_FILENAME.
        winsound.PlaySound(file, winsound.SND_ASYNC | winsound.SND_FILENAME)
    def _init_image_canvas(self):
        self._canvas = customtkinter.CTkCanvas(self.image_frame)
        self._canvas.pack(fill=customtkinter.BOTH, expand=customtkinter.YES)
@ -66,6 +81,7 @@ class App(customtkinter.CTk):
            self.after(100, self._render)
            return
        self.play_audio()
        iheight, iwidth = image.shape[0], image.shape[1]
        width = self.winfo_width()
        height = self.winfo_height()
@ -88,10 +104,11 @@ class App(customtkinter.CTk):
        height = self.winfo_height() * 0.5
        self._canvas.create_image(width, height, anchor=customtkinter.CENTER, image=imgtk)
        self._canvas.update()
-        self.after(60, self._render)
+        self.after(34, self._render)
    def request_tts(self):
        content = self.entry.get()
        content = 'Hello, this is a test of the Edge TTS service.'
        print('content:', content)
        self.entry.delete(0, customtkinter.END)
        self._human.read(content)