add audio render

jiegeaiai 2024-09-29 02:47:04 +08:00
parent f848529859
commit 472a17f896
11 changed files with 213 additions and 173 deletions

View File

@@ -246,12 +246,12 @@ def datagen(frames, mels):

 def datagen_signal(frame, mel, face_det_results):
-    img_batch, mel_batch, frame_batch, coords_batch = [], [], [], []
+    img_batch, mel_batch, frame_batch, coord_batch = [], [], [], []
     # for i, m in enumerate(mels):
     idx = 0
     frame_to_save = frame.copy()
-    face, coords = face_det_results[idx].copy()
+    face, coord = face_det_results[idx].copy()
     face = cv2.resize(face, (img_size, img_size))
     m = mel
@@ -259,7 +259,7 @@ def datagen_signal(frame, mel, face_det_results):
     img_batch.append(face)
     mel_batch.append(m)
     frame_batch.append(frame_to_save)
-    coords_batch.append(coords)
+    coord_batch.append(coord)

     if len(img_batch) >= wav2lip_batch_size:
         img_batch, mel_batch = np.asarray(img_batch), np.asarray(mel_batch)
@@ -269,7 +269,7 @@ def datagen_signal(frame, mel, face_det_results):
         img_batch = np.concatenate((img_masked, img_batch), axis=3) / 255.
         mel_batch = np.reshape(mel_batch, [len(mel_batch), mel_batch.shape[1], mel_batch.shape[2], 1])
-        return img_batch, mel_batch, frame_batch, coords_batch
+        return img_batch, mel_batch, frame_batch, coord_batch

     if len(img_batch) > 0:
         img_batch, mel_batch = np.asarray(img_batch), np.asarray(mel_batch)
@@ -279,7 +279,7 @@ def datagen_signal(frame, mel, face_det_results):
         img_batch = np.concatenate((img_masked, img_batch), axis=3) / 255.
         mel_batch = np.reshape(mel_batch, [len(mel_batch), mel_batch.shape[1], mel_batch.shape[2], 1])
-        return img_batch, mel_batch, frame_batch, coords_batch
+        return img_batch, mel_batch, frame_batch, coord_batch

 # Load audio data from a byte stream
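Note: the hunks above only rename coords to coord, but they all pass through the same Wav2Lip-style masking step (the img_masked / np.concatenate lines kept as context). A minimal sketch of that step, assuming img_size = 96 (the Wav2Lip default; the repo defines its own value):

    import numpy as np

    img_size = 96  # assumption: Wav2Lip's default face-crop size

    def mask_lower_half(img_batch: np.ndarray) -> np.ndarray:
        # img_batch: (N, img_size, img_size, 3) face crops
        img_masked = img_batch.copy()
        img_masked[:, img_size // 2:] = 0  # zero the mouth half so audio must drive it
        # 6-channel input: masked target plus full reference, scaled to [0, 1]
        return np.concatenate((img_masked, img_batch), axis=3) / 255.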
@@ -294,7 +294,7 @@ def load_audio_from_bytes(byte_data):

 class Human:
     def __init__(self):
-        self._fps = 25  # 20 ms per frame
+        self._fps = 25  # 40 ms per frame
         self._batch_size = 16
         self._sample_rate = 16000
         self._stride_left_size = 10
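Note: the corrected comment is the real content of this hunk. At 25 fps a video frame lasts 1000 / 25 = 40 ms, not 20 ms, so each frame maps to 640 samples of 16 kHz audio (chunk2mal's _chunk_len). A quick check:

    fps = 25
    sample_rate = 16000
    ms_per_frame = 1000 / fps                # 40.0 ms per video frame
    samples_per_frame = sample_rate // fps   # 640 samples per frame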
@@ -340,13 +340,14 @@ class Human:
     #     p.terminate()

     def test(self):
-        wav = audio.load_wav(r'./audio/test.wav', 16000)
+        wav = audio.load_wav(r'./audio/audio.wav', 16000)
         # with open(r'./audio/test.wav', 'rb') as f:
         #     byte_data = f.read()
         #
         #     byte_data = byte_data[16:]
         # inputs = np.concatenate(byte_data)  # [N * chunk]
         # wav = load_audio_from_bytes(inputs)
+        print('wav length:', len(wav))
         mel = audio.melspectrogram(wav)
         if np.isnan(mel.reshape(-1)).sum() > 0:
             raise ValueError(
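Note: the NaN guard's message points at the standard Wav2Lip workaround for silence-heavy TTS audio. A hypothetical helper, not part of this commit, that applies it:

    import numpy as np

    def add_epsilon_noise(wav: np.ndarray, eps: float = 1e-6) -> np.ndarray:
        # break up exact-zero stretches so the log-mel never takes log(0)
        return wav + eps * np.random.randn(len(wav)).astype(wav.dtype)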
@@ -405,9 +406,9 @@ class Human:
             p = cv2.resize(p.astype(np.uint8), (x2 - x1, y2 - y1))
             f[y1:y2, x1:x2] = p
-            # name = "%04d" % j
-            # cv2.imwrite(f'temp/images/{j}.jpg', p)
-            # j = j + 1
+            name = "%04d" % j
+            cv2.imwrite(f'temp/images/{j}.jpg', p)
+            j = j + 1
             p = cv2.cvtColor(f, cv2.COLOR_BGR2RGB)
             self._test_image_queue.put(p)
             # out.write(f)
@@ -460,7 +461,6 @@ class Human:
         self._feat_queue.put(mel_chunks)

     def push_audio_frames(self, chunk, type_):
-        print("push_audio_frames")
         self._output_queue.put((chunk, type_))

     def push_render_image(self, image):

3 binary files changed (contents not shown).

audio_render/__init__.py (new file, +3)

@@ -0,0 +1,3 @@
+#encoding = utf8
+from .audio_render import AudioRender

audio_render/audio_render.py (new file)

@@ -0,0 +1,35 @@
+#encoding = utf8
+from ctypes import *
+import os
+
+current = os.path.dirname(__file__)
+dynamic_path = os.path.join(current, 'AudioRender.dll')
+
+
+def audio_render_log_callback(level, log, size):
+    print(f'level={level}, log={log}, len={size}')
+
+
+class AudioRender:
+    def __init__(self):
+        self.__audio_render_obj = WinDLL(dynamic_path)
+        print(self.__audio_render_obj)
+        if self.__audio_render_obj is not None:
+            CALLBACK_TYPE = CFUNCTYPE(None, c_int, c_ubyte, c_uint)
+            c_callback = CALLBACK_TYPE(audio_render_log_callback)
+            self.__init = self.__audio_render_obj.Initialize(c_callback)
+            print('AudioRender init', self.__init)
+
+    def __del__(self):
+        print('AudioRender __del__')
+        if self.__audio_render_obj is None:
+            return
+        if self.__init:
+            self.__audio_render_obj.Uninitialize()
+
+    def write(self, data, size):
+        if not self.__init:
+            return False
+        # argtypes belongs on the Write function, not on the DLL handle
+        self.__audio_render_obj.Write.argtypes = (POINTER(c_ubyte), c_uint)
+        return self.__audio_render_obj.Write(data.ctypes.data_as(POINTER(c_ubyte)), size)
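Note: a hypothetical caller of this wrapper, for context. Assumptions: Windows only (WinDLL), AudioRender.dll sits next to the module, and, matching the chunk2mal change below, write() receives 16 kHz int16 PCM with the sample count as size:

    import numpy as np
    from audio_render import AudioRender

    render = AudioRender()
    t = np.arange(16000) / 16000.0                                 # one second at 16 kHz
    pcm = (0.2 * np.sin(2 * np.pi * 440.0 * t) * 32767).astype(np.int16)
    render.write(pcm, len(pcm))                                    # same call shape as chunk2mal uses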

View File

@@ -1,102 +1,105 @@
 #encoding = utf8
+import edge_tts
+import asyncio
+import pyaudio
+from pydub import AudioSegment
+from io import BytesIO
+
+# When used inside a Jupyter Notebook, lift the event-loop restriction
+try:
+    import nest_asyncio
+    nest_asyncio.apply()
+except ImportError:
+    pass
+
+
+def play_audio(data: bytes, stream: pyaudio.Stream) -> None:
+    stream.write(AudioSegment.from_mp3(BytesIO(data)).raw_data)
+
+
+CHUNK_SIZE = 20 * 1024
+
+
+async def play_tts(text, voice):
+    communicate = edge_tts.Communicate(text, voice)
+
+    # Set up PyAudio
+    audio = pyaudio.PyAudio()
+    stream = audio.open(format=pyaudio.paInt16, channels=1, rate=16000, output=True)
+
+    # async for chunk in communicate.stream():  # use the stream method
+    #     if chunk['type'] == 'audio':  # make sure the chunk is a byte stream
+    #         stream.write(chunk['data'])
+
+    total_data = b''
+    for chunk in communicate.stream_sync():
+        if chunk["type"] == "audio" and chunk["data"]:
+            total_data += chunk["data"]
+            if len(total_data) >= CHUNK_SIZE:
+                # print(f"Time elapsed: {time.time() - start_time:.2f} seconds")  # Print time
+                stream.write(AudioSegment.from_mp3(BytesIO(total_data[:CHUNK_SIZE])).raw_data)
+                # play_audio(total_data[:CHUNK_SIZE], stream)  # Play first CHUNK_SIZE bytes
+                total_data = total_data[CHUNK_SIZE:]  # Remove played data
+
+    # play_audio(total_data, stream)
+    # Stop and close the audio stream
+    stream.stop_stream()
+    stream.close()
+    audio.terminate()
+
+
+async def save_to_file(text, voice, filename):
+    communicate = edge_tts.Communicate(text, voice)
+
+    with open(filename, "wb") as f:
+        async for chunk in communicate.stream():
+            if chunk['type'] == 'audio':
+                f.write(chunk['data'])
+
+
+if __name__ == "__main__":
+    text = "Hello, this is a test of the Edge TTS service."
+    voice = "en-US-JessaNeural"
+
+    # Run the async function with asyncio.run()
+    asyncio.run(play_tts(text, voice))
+    # asyncio.run(save_to_file(text, voice, "output.wav"))
+
 #
 # import edge_tts
+# import asyncio
 # import pyaudio
-# from io import BytesIO
 # from pydub import AudioSegment
-# import time
+# from io import BytesIO
 #
-# TEXT = 'Hello World! How are you guys doing? I hope great, cause I am having fun and honestly it has been a blast'
-# VOICE = "en-US-AndrewMultilingualNeural"
-# CHUNK_SIZE = 20 * 1024  # Assuming around 1024 bytes per chunk (adjust based on format)
-#
-# def main() -> None:
-#     start_time = time.time()
-#     communicator = edge_tts.Communicate(TEXT, VOICE)
-#
-#     pyaudio_instance = pyaudio.PyAudio()
-#     audio_stream = pyaudio_instance.open(format=pyaudio.paInt16, channels=1, rate=16000, output=True)
-#
-#     total_data = b''  # Store audio data instead of chunks
-#
-#     for chunk in communicator.stream_sync():
-#         if chunk["type"] == "audio" and chunk["data"]:
-#             total_data += chunk["data"]
-#             if len(total_data) >= CHUNK_SIZE:
-#                 print(f"Time elapsed: {time.time() - start_time:.2f} seconds")  # Print time
-#                 play_audio(total_data[:CHUNK_SIZE], audio_stream)  # Play first CHUNK_SIZE bytes
-#                 total_data = total_data[CHUNK_SIZE:]  # Remove played data
-#
-#     # Play remaining audio
-#     play_audio(total_data, audio_stream)
-#
-#     audio_stream.stop_stream()
-#     audio_stream.close()
-#     pyaudio_instance.terminate()
+# # When used inside a Jupyter Notebook, lift the event-loop restriction
+# try:
+#     import nest_asyncio
+#     nest_asyncio.apply()
+# except ImportError:
+#     pass
 #
 # def play_audio(data: bytes, stream: pyaudio.Stream) -> None:
 #     stream.write(AudioSegment.from_mp3(BytesIO(data)).raw_data)
 #
+# CHUNK_SIZE = 20 * 1024
+# async def play_tts(text, voice):
+#     communicate = edge_tts.Communicate(text, voice)
+#
+#     # Set up PyAudio
+#     audio = pyaudio.PyAudio()
+#     stream = audio.open(format=pyaudio.paInt16, channels=1, rate=24000, output=True)
+#
+#     # async for chunk in communicate.stream():  # use the stream method
+#     #     if chunk['type'] == 'audio':  # make sure the chunk is a byte stream
+#     #         stream.write(chunk['data'])
+#
+#     total_data = b''
+#     for chunk in communicate.stream_sync():
+#         if chunk["type"] == "audio" and chunk["data"]:
+#             total_data += chunk["data"]
+#             if len(total_data) >= CHUNK_SIZE:
+#                 # print(f"Time elapsed: {time.time() - start_time:.2f} seconds")  # Print time
+#                 stream.write(AudioSegment.from_mp3(BytesIO(total_data[:CHUNK_SIZE])).raw_data)
+#                 # play_audio(total_data[:CHUNK_SIZE], stream)  # Play first CHUNK_SIZE bytes
+#                 total_data = total_data[CHUNK_SIZE:]  # Remove played data
+#     # play_audio(total_data, stream)
+#     # Stop and close the audio stream
+#     stream.stop_stream()
+#     stream.close()
+#     audio.terminate()
+#
+#
+# async def save_to_file(text, voice, filename):
+#     communicate = edge_tts.Communicate(text, voice)
+#
+#     with open(filename, "wb") as f:
+#         async for chunk in communicate.stream():
+#             if chunk['type'] == 'audio':
+#                 f.write(chunk['data'])
+#
 # if __name__ == "__main__":
-#     main()
+#     text = "Hello, this is a test of the Edge TTS service."
+#     voice = "en-US-JessaNeural"
+#
+#     # Run the async function with asyncio.run()
+#     asyncio.run(play_tts(text, voice))
+#     # asyncio.run(save_to_file(text, voice, "output.wav"))
-
-import edge_tts
-import pyaudio
-from io import BytesIO
-from pydub import AudioSegment
-import time
-
-TEXT = 'Hello World! How are you guys doing? I hope great, cause I am having fun and honestly it has been a blast'
-VOICE = "en-US-AndrewMultilingualNeural"
-CHUNK_SIZE = 20 * 1024  # Assuming around 1024 bytes per chunk (adjust based on format)
-
-
-def main() -> None:
-    start_time = time.time()
-    communicator = edge_tts.Communicate(TEXT, VOICE)
-
-    pyaudio_instance = pyaudio.PyAudio()
-    audio_stream = pyaudio_instance.open(format=pyaudio.paInt16, channels=1, rate=24000, output=True)
-
-    total_data = b''  # Store audio data instead of chunks
-
-    for chunk in communicator.stream_sync():
-        if chunk["type"] == "audio" and chunk["data"]:
-            total_data += chunk["data"]
-            if len(total_data) >= CHUNK_SIZE:
-                print(f"Time elapsed: {time.time() - start_time:.2f} seconds")  # Print time
-                play_audio(total_data[:CHUNK_SIZE], audio_stream)  # Play first CHUNK_SIZE bytes
-                total_data = total_data[CHUNK_SIZE:]  # Remove played data
-
-    # Play remaining audio
-    play_audio(total_data, audio_stream)
-
-    audio_stream.stop_stream()
-    audio_stream.close()
-    pyaudio_instance.terminate()
-
-
-def play_audio(data: bytes, stream: pyaudio.Stream) -> None:
-    stream.write(AudioSegment.from_mp3(BytesIO(data)).raw_data)
-
-
-if __name__ == "__main__":
-    main()
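Note: both the removed main() and the new play_tts() slice the accumulated MP3 byte stream at fixed CHUNK_SIZE offsets and decode each slice independently, which can cut an MP3 frame in half at a boundary. A sketch of a whole-buffer variant, using the same imports as the active code above:

    def tts_to_pcm(text: str, voice: str) -> bytes:
        # collect the full MP3 stream, then decode once: no mid-frame cuts
        buf = b''.join(chunk["data"]
                       for chunk in edge_tts.Communicate(text, voice).stream_sync()
                       if chunk["type"] == "audio")
        seg = AudioSegment.from_mp3(BytesIO(buf))
        return seg.set_frame_rate(16000).set_channels(1).raw_data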

View File

@@ -1,5 +1,6 @@
 #encoding = utf8
 import queue
+import time
 from queue import Queue
 from threading import Thread, Event
 import logging
@@ -169,6 +170,7 @@ class Infer:
         j = 0
+        count = 0
         while self._exit_event.is_set():
             try:
                 m = self._queue.get(block=True, timeout=1)
@@ -180,6 +182,8 @@ class Infer:
             img_batch = torch.FloatTensor(np.transpose(img_batch, (0, 3, 1, 2))).to(device)
             mel_batch = torch.FloatTensor(np.transpose(mel_batch, (0, 3, 1, 2))).to(device)
+
+            time.sleep(0.01)
             with torch.no_grad():
                 pred = model(mel_batch, img_batch)
@@ -189,12 +193,14 @@ class Infer:
                 p = cv2.resize(p.astype(np.uint8), (x2 - x1, y2 - y1))
                 f[y1:y2, x1:x2] = p
-                name = "%04d" % j
+                # name = "%04d" % j
                 cv2.imwrite(f'temp/images/{j}.jpg', p)
                 j = j + 1
+                # count = count + 1
                 p = cv2.cvtColor(f, cv2.COLOR_BGR2RGB)
                 self._human.push_render_image(p)
                 # out.write(f)
+            # print('infer count:', count)

     def push(self, chunk):
         self._queue.put(chunk)

View File

@@ -1,5 +1,5 @@
 #encoding = utf8
-import ctypes
 import logging
 import queue
 import time
@@ -8,6 +8,7 @@ from threading import Thread, Event
 import numpy as np

 import audio
+from audio_render import AudioRender


 class Chunk2Mal:
@@ -17,6 +18,7 @@ class Chunk2Mal:
         self._thread = None
         self._chunks = []
+        self._audio_chunks = []

         # 320 samples per audio_chunk (20 ms * 16000 / 1000)
         self._chunk_len = self._human.get_audio_sample_rate() // self._human.get_fps()
@@ -24,63 +26,67 @@ class Chunk2Mal:
         self._thread = Thread(target=self._on_run)
         self._exit_event.set()
         self._thread.start()
+        self._audio_render = AudioRender()
         logging.info('chunk2mal start')

+    def _concatenate(self):
+        logging.info('np.concatenate')
+        inputs = np.concatenate(self._chunks)  # [5 * chunk]
+        self._chunks = []
+        mel = audio.melspectrogram(inputs)
+        if np.isnan(mel.reshape(-1)).sum() > 0:
+            raise ValueError(
+                'Mel contains nan! Using a TTS voice? Add a small epsilon noise to the wav file and try again')
+
+        mel_step_size = 16
+        # print('fps:', self._human.get_fps())
+        mel_idx_multiplier = 80. / self._human.get_fps()
+        # print('mel_idx_multiplier:', mel_idx_multiplier)
+        count = 0
+        i = 0
+        while 1:
+            count = count + 1
+            start_idx = int(i * mel_idx_multiplier)
+            print('i', i, 'start_idx', start_idx, 'mel len:', len(mel[0]))
+            if start_idx + mel_step_size > len(mel[0]):
+                self._human.push_mel_chunks_queue(mel[:, len(mel[0]) - mel_step_size:])
+                break
+            self._human.push_mel_chunks_queue(mel[:, start_idx: start_idx + mel_step_size])
+            i += 1
+
+        wav = np.concatenate(self._audio_chunks)  # [5 * chunk]
+        wav *= 32767 / max(0.01, np.max(np.abs(wav)))
+        wav = wav.astype(np.int16)
+        self._audio_render.write(wav, len(wav))
+        self._audio_chunks = []
+        print('mel_chunks count:', count)
+
     def _on_run(self):
         logging.info('chunk2mal run')
         while self._exit_event.is_set():
             if self._audio_chunk_queue.empty():
-                time.sleep(0.5)
+                if len(self._chunks) > 0:
+                    self._concatenate()
+                else:
+                    time.sleep(0.5)
                 continue
             try:
                 chunk = self._audio_chunk_queue.get(block=True, timeout=1)
                 self._chunks.append(chunk)
-                self._human.push_audio_frames(chunk, 0)
-                if len(self._chunks) < 10:
+                self._audio_chunks.append(chunk.copy())
+                # print(type(chunk))
+                # self._human.push_audio_frames(chunk, 0)
+                if len(self._chunks) < 102:  # 200ms
                     continue
             except queue.Empty:
                 # print('Chunk2Mal queue.Empty')
                 continue

-            logging.info('np.concatenate')
-            inputs = np.concatenate(self._chunks)  # [N * chunk]
-            mel = audio.melspectrogram(inputs)
-            if np.isnan(mel.reshape(-1)).sum() > 0:
-                raise ValueError(
-                    'Mel contains nan! Using a TTS voice? Add a small epsilon noise to the wav file and try again')
-
-            mel_step_size = 16
-            print('fps:', self._human.get_fps())
-            mel_idx_multiplier = 80. / self._human.get_fps()
-            print('mel_idx_multiplier:', mel_idx_multiplier)
-            i = 0
-            while 1:
-                start_idx = int(i * mel_idx_multiplier)
-                if start_idx + mel_step_size > len(mel[0]):
-                    self._human.push_mel_chunks_queue(mel[:, len(mel[0]) - mel_step_size:])
-                    break
-                self._human.push_mel_chunks_queue(mel[:, start_idx: start_idx + mel_step_size])
-                i += 1
-
-            batch_size = 128
-            '''
-            while i < (len(self._chunks) - self._human.get_stride_left_size()
-                       - self._human.get_stride_right_size()) / 2:
-                start_idx = int(left + i * mel_idx_multiplier)
-                # print(start_idx)
-                if start_idx + mel_step_size > len(mel[0]):
-                    mel_chunks.append(mel[:, len(mel[0]) - mel_step_size:])
-                else:
-                    mel_chunks.append(mel[:, start_idx: start_idx + mel_step_size])
-                i += 1
-            self._human.push_feat_queue(mel_chunks)
-
-            # discard the old part to save memory
-            self._chunks = self._chunks[-(self._human.get_stride_left_size() + self._human.get_stride_right_size()):]
-            '''
+            print('len(self._chunks):', len(self._chunks))
+            self._concatenate()

         logging.info('chunk2mal exit')
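Note: worked numbers for the indexing in _concatenate(), assuming the repo's 16 kHz sample rate and the usual 80 mel frames per second (hop_length = 200):

    fps = 25
    mel_idx_multiplier = 80. / fps   # 3.2 mel frames advanced per video frame
    mel_step_size = 16               # each pushed window covers 16 / 80 = 0.2 s of audio
    chunk_len = 16000 // fps         # 640 PCM samples per video frame

With those values, each loop iteration pushes one 16-frame mel window per video frame until the buffer runs out.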

View File

@@ -57,33 +57,19 @@ class TTSBase:
         self._io_stream.seek(0)
         stream = self.__create_bytes_stream(self._io_stream)
+        # audio.save_wav(stream, "./temp/audio/test1.wav", 16000)

         stream_len = stream.shape[0]
-        sr = 16000
-        soundfile.read('./temp/audio/audio.wav', stream, sr)
-        # audio_chunks = audio.split_audio(stream, sr, 4)
-        # display(audio.play_audio_chunk(audio_chunks[0], sr=sr))
-        # Save the split segments
-        # audio.save_chunks(stream[0:-1], sr, './temp/audio/')
-        # audio.save_chunks(audio_chunks, sr, './temp/audio/')
-        # try:
-        #     sounddevice.play(stream, samplerate=self._human.get_audio_sample_rate())
-        #     sounddevice.wait()  # wait for playback to finish
-        # except Exception as e:
-        #     logger.error(f"Audio playback error: {e}")
+        print("stream_len:", stream_len, " _chunk_len:", self._chunk_len)
         index = 0
+        segment = 0
         while stream_len >= self._chunk_len:
             audio_chunk = stream[index:index + self._chunk_len]
-            # sounddevice.play(audio_chunk, samplerate=self._human.get_audio_sample_rate())
-            # self._pcm_stream.write(audio_chunk)
-            # self._pcm_stream.write(audio_chunk.tobytes())
-            # self._human.push_audio_chunk(audio_chunk)
-            # self._human.push_mel_chunks_queue(audio_chunk)
             self._human.push_audio_chunk(audio_chunk)
             stream_len -= self._chunk_len
             index += self._chunk_len
+            segment = segment + 1
+        print("segment:", segment)

         self._io_stream.seek(0)
         self._io_stream.truncate()
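Note: the while loop above is a plain fixed-stride slicer that drops the trailing partial chunk. An equivalent standalone sketch, assuming stream is a 1-D numpy array:

    import numpy as np

    def iter_chunks(stream: np.ndarray, chunk_len: int):
        # yields len(stream) // chunk_len full chunks; the remainder is discarded
        for index in range(0, len(stream) - chunk_len + 1, chunk_len):
            yield stream[index:index + chunk_len]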

ui.py (19 changed lines)

@@ -44,6 +44,7 @@ class App(customtkinter.CTk):
         # self.logo_label.grid(row=0, column=0, padx=20, pady=(20, 10))

         self.entry = customtkinter.CTkEntry(self, placeholder_text="输入内容")
+        self.entry.insert(0, "基本信息,北京九零科技有限公司成立于2015年位于北京市是一家以从事科技推广和应用服务业为主的企业。企业注册资本500万人民币。")
         self.entry.grid(row=2, column=0, columnspan=2, padx=(20, 0), pady=(20, 20), sticky="nsew")

         self.main_button_1 = customtkinter.CTkButton(master=self, fg_color="transparent", border_width=2,
@@ -63,13 +64,13 @@ class App(customtkinter.CTk):
         self._human.on_destroy()

     def play_audio(self):
-        # return
-        if self._is_play_audio:
-            return
-        self._is_play_audio = True
-        file = os.path.curdir + '/audio/test.wav'
-        print(file)
-        winsound.PlaySound(file, winsound.SND_ASYNC or winsound.SND_FILENAME)
+        return
+        # if self._is_play_audio:
+        #     return
+        # self._is_play_audio = True
+        # file = os.path.curdir + '/audio/test1.wav'
+        # print(file)
+        # winsound.PlaySound(file, winsound.SND_ASYNC or winsound.SND_FILENAME)
         # playsound(file)
def _init_image_canvas(self): def _init_image_canvas(self):
@@ -105,11 +106,11 @@ class App(customtkinter.CTk):
         height = self.winfo_height() * 0.5
         self._canvas.create_image(width, height, anchor=customtkinter.CENTER, image=imgtk)
         self._canvas.update()
-        self.after(33, self._render)
+        self.after(40, self._render)

     def request_tts(self):
         content = self.entry.get()
-        content = 'Hello, this is a test of the Edge TTS service.'
+        # content = ''
         print('content:', content)
         self.entry.delete(0, customtkinter.END)
         self._human.read(content)
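Note: the 40 ms timer now matches the 25 fps pipeline, but after(40, ...) re-arms only once each render finishes, so render time accumulates as drift. A hypothetical drift-compensating variant:

    import time

    def schedule_render(widget, render_fn, period_ms=40):
        start = time.perf_counter()

        def tick(n=1):
            render_fn()
            # aim for the absolute n-th deadline instead of a fixed delay
            target = start + n * period_ms / 1000.0
            delay_ms = max(1, int((target - time.perf_counter()) * 1000))
            widget.after(delay_ms, tick, n + 1)

        widget.after(period_ms, tick)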