add audio render

parent f848529859
commit 472a17f896

Human.py (22 lines changed)
@@ -246,12 +246,12 @@ def datagen(frames, mels):

def datagen_signal(frame, mel, face_det_results):
-    img_batch, mel_batch, frame_batch, coords_batch = [], [], [], []
+    img_batch, mel_batch, frame_batch, coord_batch = [], [], [], []

    # for i, m in enumerate(mels):
    idx = 0
    frame_to_save = frame.copy()
-    face, coords = face_det_results[idx].copy()
+    face, coord = face_det_results[idx].copy()

    face = cv2.resize(face, (img_size, img_size))
    m = mel

@@ -259,7 +259,7 @@ def datagen_signal(frame, mel, face_det_results):
    img_batch.append(face)
    mel_batch.append(m)
    frame_batch.append(frame_to_save)
-    coords_batch.append(coords)
+    coord_batch.append(coord)

    if len(img_batch) >= wav2lip_batch_size:
        img_batch, mel_batch = np.asarray(img_batch), np.asarray(mel_batch)

@@ -269,7 +269,7 @@ def datagen_signal(frame, mel, face_det_results):
        img_batch = np.concatenate((img_masked, img_batch), axis=3) / 255.
        mel_batch = np.reshape(mel_batch, [len(mel_batch), mel_batch.shape[1], mel_batch.shape[2], 1])

-        return img_batch, mel_batch, frame_batch, coords_batch
+        return img_batch, mel_batch, frame_batch, coord_batch

    if len(img_batch) > 0:
        img_batch, mel_batch = np.asarray(img_batch), np.asarray(mel_batch)

@@ -279,7 +279,7 @@ def datagen_signal(frame, mel, face_det_results):
        img_batch = np.concatenate((img_masked, img_batch), axis=3) / 255.
        mel_batch = np.reshape(mel_batch, [len(mel_batch), mel_batch.shape[1], mel_batch.shape[2], 1])

-        return img_batch, mel_batch, frame_batch, coords_batch
+        return img_batch, mel_batch, frame_batch, coord_batch


# Load audio data from a byte stream
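Note: the `img_masked` referenced in these hunks is the standard Wav2Lip input convention: the lower half of each face crop is zeroed so the model must synthesize the mouth region, and the masked copy is stacked with the reference crop along the channel axis, giving a 6-channel input. A minimal sketch of the shape logic, assuming the usual `img_size = 96` crop (the masking line itself is outside this hunk):

```python
import numpy as np

img_size = 96                                      # Wav2Lip face-crop resolution (assumption)
img_batch = np.zeros((16, img_size, img_size, 3))  # dummy NHWC batch

img_masked = img_batch.copy()
img_masked[:, img_size // 2:] = 0                  # zero the lower half (mouth region)

model_input = np.concatenate((img_masked, img_batch), axis=3) / 255.
print(model_input.shape)                           # (16, 96, 96, 6)
```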
@@ -294,7 +294,7 @@ def load_audio_from_bytes(byte_data):

class Human:
    def __init__(self):
-        self._fps = 25  # 20 ms per frame
+        self._fps = 25  # 40 ms per frame
        self._batch_size = 16
        self._sample_rate = 16000
        self._stride_left_size = 10
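The corrected comment now matches the arithmetic used throughout the pipeline: at 25 fps one video frame spans 1000 / 25 = 40 ms, which at the 16 kHz sample rate is 640 audio samples, the same `sample_rate // fps` value Chunk2Mal computes as its `_chunk_len`. A quick check:

```python
fps = 25
sample_rate = 16000

print(1000 / fps)          # 40.0 ms per video frame
print(sample_rate // fps)  # 640 audio samples per frame
```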
@@ -340,13 +340,14 @@ class Human:
        # p.terminate()

    def test(self):
-        wav = audio.load_wav(r'./audio/test.wav', 16000)
+        wav = audio.load_wav(r'./audio/audio.wav', 16000)
        # with open(r'./audio/test.wav', 'rb') as f:
        #     byte_data = f.read()
        #
        # byte_data = byte_data[16:]
        # inputs = np.concatenate(byte_data)  # [N * chunk]
        # wav = load_audio_from_bytes(inputs)
+        print('wav length:', len(wav))
        mel = audio.melspectrogram(wav)
        if np.isnan(mel.reshape(-1)).sum() > 0:
            raise ValueError(
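The NaN guard exists because silence produces all-zero analysis windows, and the log-mel computation then emits NaNs (log of zero); the error message's suggested workaround is to dither the waveform with a tiny amount of noise. A minimal sketch of that fix, assuming `wav` is a float NumPy array:

```python
import numpy as np

def dither(wav: np.ndarray, eps: float = 1e-6) -> np.ndarray:
    # Tiny Gaussian noise ensures no analysis window is exactly zero,
    # so the log-mel computation never sees log(0).
    return wav + eps * np.random.randn(len(wav)).astype(wav.dtype)
```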
@@ -405,9 +406,9 @@ class Human:
            p = cv2.resize(p.astype(np.uint8), (x2 - x1, y2 - y1))

            f[y1:y2, x1:x2] = p
-            # name = "%04d" % j
-            # cv2.imwrite(f'temp/images/{j}.jpg', p)
-            # j = j + 1
+            name = "%04d" % j
+            cv2.imwrite(f'temp/images/{j}.jpg', p)
+            j = j + 1
            p = cv2.cvtColor(f, cv2.COLOR_BGR2RGB)
            self._test_image_queue.put(p)
            # out.write(f)
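One observation on the re-enabled debug dump: `name` holds the zero-padded index but the f-string writes the raw `j`, so files come out as `0.jpg, 1.jpg, ..., 10.jpg` and sort out of frame order. If padded names are the intent, a one-line fix (hypothetical, not part of this commit) would be:

```python
name = "%04d" % j
cv2.imwrite(f'temp/images/{name}.jpg', p)  # use the padded name, not the raw index
j = j + 1
```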
@@ -460,7 +461,6 @@ class Human:
        self._feat_queue.put(mel_chunks)

    def push_audio_frames(self, chunk, type_):
-        print("push_audio_frames")
        self._output_queue.put((chunk, type_))

    def push_render_image(self, image):
BIN audio_render/AudioRender.dll (new file, binary not shown)
BIN audio_render/AudioRender.lib (new file, binary not shown)
BIN audio_render/AudioRender.pdb (new file, binary not shown)

audio_render/__init__.py (new file, 3 lines)
@@ -0,0 +1,3 @@
+#encoding = utf8
+
+from .audio_render import AudioRender
audio_render/audio_render.py (new file, 35 lines)

@@ -0,0 +1,35 @@
+#encoding = utf8
+
+from ctypes import *
+import os
+
+current = os.path.dirname(__file__)
+dynamic_path = os.path.join(current, 'AudioRender.dll')
+
+
+def audio_render_log_callback(level, log, size):
+    print(f'level={level}, log={log}, len={size}')
+
+
+class AudioRender:
+    def __init__(self):
+        self.__audio_render_obj = WinDLL(dynamic_path)
+        print(self.__audio_render_obj)
+        if self.__audio_render_obj is not None:
+            CALLBACK_TYPE = CFUNCTYPE(None, c_int, c_ubyte, c_uint)
+            c_callback = CALLBACK_TYPE(audio_render_log_callback)
+            self.__init = self.__audio_render_obj.Initialize(c_callback)
+            print('AudioRender init', self.__init)
+
+    def __del__(self):
+        print('AudioRender __del__')
+        if self.__audio_render_obj is None:
+            return
+        if self.__init:
+            self.__audio_render_obj.Uninitialize()
+
+    def write(self, data, size):
+        if not self.__init:
+            return False
+
+        self.__audio_render_obj.argtypes = (POINTER(c_ubyte), c_uint)
+        return self.__audio_render_obj.Write(data.ctypes.data_as(POINTER(c_ubyte)), size)
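Two ctypes details in this new wrapper are worth flagging. First, `self.__audio_render_obj.argtypes = ...` sets an attribute on the WinDLL handle rather than on the exported function, so the prototype is never applied; the conventional form is `lib.Write.argtypes = ...`. Second, `c_callback` is a local, and a ctypes callback must stay referenced for as long as the native side may invoke it, so it should be kept as an attribute. A sketch of the usual pattern, assuming the DLL's exports match what the class implies and that the log argument is a C string (both assumptions; the DLL's real header is not shown):

```python
from ctypes import CFUNCTYPE, POINTER, WinDLL, c_bool, c_char_p, c_int, c_ubyte, c_uint

LOG_CALLBACK = CFUNCTYPE(None, c_int, c_char_p, c_uint)  # (level, message, length), assumed

def log_cb(level, log, size):
    print(f'level={level}, log={log}, len={size}')

lib = WinDLL('AudioRender.dll')

# Keep a reference to the wrapped callback; if it is garbage-collected
# while the DLL still holds the pointer, the next log call crashes.
keep_alive_cb = LOG_CALLBACK(log_cb)
lib.Initialize(keep_alive_cb)

# Prototypes go on the exported function, not on the WinDLL handle.
lib.Write.argtypes = (POINTER(c_ubyte), c_uint)
lib.Write.restype = c_bool
```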
edge_tts_test.py (185 lines changed)

@@ -1,102 +1,105 @@
#encoding = utf8

import edge_tts
import asyncio
import pyaudio
from pydub import AudioSegment
from io import BytesIO

# When running inside a Jupyter Notebook, lift the event-loop restriction
try:
    import nest_asyncio
    nest_asyncio.apply()
except ImportError:
    pass

def play_audio(data: bytes, stream: pyaudio.Stream) -> None:
    stream.write(AudioSegment.from_mp3(BytesIO(data)).raw_data)

CHUNK_SIZE = 20 * 1024

async def play_tts(text, voice):
    communicate = edge_tts.Communicate(text, voice)

    # Set up PyAudio
    audio = pyaudio.PyAudio()
    stream = audio.open(format=pyaudio.paInt16, channels=1, rate=16000, output=True)

    # async for chunk in communicate.stream():  # use the stream method
    #     if chunk['type'] == 'audio':  # make sure the chunk is a byte stream
    #         stream.write(chunk['data'])

    total_data = b''
    for chunk in communicate.stream_sync():
        if chunk["type"] == "audio" and chunk["data"]:
            total_data += chunk["data"]
            if len(total_data) >= CHUNK_SIZE:
                # print(f"Time elapsed: {time.time() - start_time:.2f} seconds")  # Print time
                stream.write(AudioSegment.from_mp3(BytesIO(total_data[:CHUNK_SIZE])).raw_data)
                # play_audio(total_data[:CHUNK_SIZE], stream)  # Play first CHUNK_SIZE bytes
                total_data = total_data[CHUNK_SIZE:]  # Remove played data
    # play_audio(total_data, stream)
    # Stop and close the audio stream
    stream.stop_stream()
    stream.close()
    audio.terminate()


async def save_to_file(text, voice, filename):
    communicate = edge_tts.Communicate(text, voice)

    with open(filename, "wb") as f:
        async for chunk in communicate.stream():
            if chunk['type'] == 'audio':
                f.write(chunk['data'])

if __name__ == "__main__":
    text = "Hello, this is a test of the Edge TTS service."
    voice = "en-US-JessaNeural"

    # Run the async function with asyncio.run()
    asyncio.run(play_tts(text, voice))
    # asyncio.run(save_to_file(text, voice, "output.wav"))

#
# import edge_tts
# import asyncio
# import pyaudio
# from io import BytesIO
# from pydub import AudioSegment
# import time
# from io import BytesIO
#
# TEXT = 'Hello World! How are you guys doing? I hope great, cause I am having fun and honestly it has been a blast'
# VOICE = "en-US-AndrewMultilingualNeural"
# CHUNK_SIZE = 20 * 1024  # Assuming around 1024 bytes per chunk (adjust based on format)
#
# def main() -> None:
#     start_time = time.time()
#     communicator = edge_tts.Communicate(TEXT, VOICE)
#
#     pyaudio_instance = pyaudio.PyAudio()
#     audio_stream = pyaudio_instance.open(format=pyaudio.paInt16, channels=1, rate=16000, output=True)
#
#     total_data = b''  # Store audio data instead of chunks
#
#     for chunk in communicator.stream_sync():
#         if chunk["type"] == "audio" and chunk["data"]:
#             total_data += chunk["data"]
#             if len(total_data) >= CHUNK_SIZE:
#                 print(f"Time elapsed: {time.time() - start_time:.2f} seconds")  # Print time
#                 play_audio(total_data[:CHUNK_SIZE], audio_stream)  # Play first CHUNK_SIZE bytes
#                 total_data = total_data[CHUNK_SIZE:]  # Remove played data
#
#     # Play remaining audio
#     play_audio(total_data, audio_stream)
#
#     audio_stream.stop_stream()
#     audio_stream.close()
#     pyaudio_instance.terminate()
# # When running inside a Jupyter Notebook, lift the event-loop restriction
# try:
#     import nest_asyncio
#     nest_asyncio.apply()
# except ImportError:
#     pass
#
# def play_audio(data: bytes, stream: pyaudio.Stream) -> None:
#     stream.write(AudioSegment.from_mp3(BytesIO(data)).raw_data)
#
# CHUNK_SIZE = 20 * 1024
# async def play_tts(text, voice):
#     communicate = edge_tts.Communicate(text, voice)
#
#     # Set up PyAudio
#     audio = pyaudio.PyAudio()
#     stream = audio.open(format=pyaudio.paInt16, channels=1, rate=24000, output=True)
#
#     # async for chunk in communicate.stream():  # use the stream method
#     #     if chunk['type'] == 'audio':  # make sure the chunk is a byte stream
#     #         stream.write(chunk['data'])
#
#     total_data = b''
#     for chunk in communicate.stream_sync():
#         if chunk["type"] == "audio" and chunk["data"]:
#             total_data += chunk["data"]
#             if len(total_data) >= CHUNK_SIZE:
#                 # print(f"Time elapsed: {time.time() - start_time:.2f} seconds")  # Print time
#                 stream.write(AudioSegment.from_mp3(BytesIO(total_data[:CHUNK_SIZE])).raw_data)
#                 # play_audio(total_data[:CHUNK_SIZE], stream)  # Play first CHUNK_SIZE bytes
#                 total_data = total_data[CHUNK_SIZE:]  # Remove played data
#     # play_audio(total_data, stream)
#     # Stop and close the audio stream
#     stream.stop_stream()
#     stream.close()
#     audio.terminate()
#
#
# async def save_to_file(text, voice, filename):
#     communicate = edge_tts.Communicate(text, voice)
#
#     with open(filename, "wb") as f:
#         async for chunk in communicate.stream():
#             if chunk['type'] == 'audio':
#                 f.write(chunk['data'])
#
# if __name__ == "__main__":
#     main()
#     text = "Hello, this is a test of the Edge TTS service."
#     voice = "en-US-JessaNeural"
#
#     # Run the async function with asyncio.run()
#     asyncio.run(play_tts(text, voice))
#     # asyncio.run(save_to_file(text, voice, "output.wav"))


import edge_tts
import pyaudio
from io import BytesIO
from pydub import AudioSegment
import time

TEXT = 'Hello World! How are you guys doing? I hope great, cause I am having fun and honestly it has been a blast'
VOICE = "en-US-AndrewMultilingualNeural"
CHUNK_SIZE = 20 * 1024  # Assuming around 1024 bytes per chunk (adjust based on format)


def main() -> None:
    start_time = time.time()
    communicator = edge_tts.Communicate(TEXT, VOICE)

    pyaudio_instance = pyaudio.PyAudio()
    audio_stream = pyaudio_instance.open(format=pyaudio.paInt16, channels=1, rate=24000, output=True)

    total_data = b''  # Store audio data instead of chunks

    for chunk in communicator.stream_sync():
        if chunk["type"] == "audio" and chunk["data"]:
            total_data += chunk["data"]
            if len(total_data) >= CHUNK_SIZE:
                print(f"Time elapsed: {time.time() - start_time:.2f} seconds")  # Print time
                play_audio(total_data[:CHUNK_SIZE], audio_stream)  # Play first CHUNK_SIZE bytes
                total_data = total_data[CHUNK_SIZE:]  # Remove played data

    # Play remaining audio
    play_audio(total_data, audio_stream)

    audio_stream.stop_stream()
    audio_stream.close()
    pyaudio_instance.terminate()


def play_audio(data: bytes, stream: pyaudio.Stream) -> None:
    stream.write(AudioSegment.from_mp3(BytesIO(data)).raw_data)


if __name__ == "__main__":
    main()
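A caveat that applies to both versions of this script: edge-tts delivers an MP3 stream, and slicing it at a fixed `CHUNK_SIZE` byte boundary does not respect MP3 frame boundaries, so `AudioSegment.from_mp3` is handed fragments that may begin or end mid-frame; decoding often still works because the decoder resynchronizes, but clicks or decode errors are possible. The decoded PCM for these voices is also 24 kHz mono, so the 16000 Hz output stream in `play_tts` above would play slowed and pitch-shifted (the older `main()` uses 24000). A minimal sketch of the simpler, safer shape, buffering the whole utterance and playing at the decoded rate:

```python
import edge_tts
import pyaudio
from io import BytesIO
from pydub import AudioSegment

def speak(text: str, voice: str = "en-US-JessaNeural") -> None:
    # Buffer the complete MP3, decode once, then play at the decoded rate.
    mp3 = b''.join(c["data"] for c in edge_tts.Communicate(text, voice).stream_sync()
                   if c["type"] == "audio" and c["data"])
    seg = AudioSegment.from_mp3(BytesIO(mp3))
    pa = pyaudio.PyAudio()
    out = pa.open(format=pa.get_format_from_width(seg.sample_width),
                  channels=seg.channels, rate=seg.frame_rate, output=True)
    out.write(seg.raw_data)
    out.stop_stream(); out.close(); pa.terminate()
```

This trades latency for robustness; the chunked loops above are the low-latency variant.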
infer.py (8 lines changed)

@@ -1,5 +1,6 @@
#encoding = utf8
+import queue
import time
from queue import Queue
from threading import Thread, Event
import logging

@@ -169,6 +170,7 @@ class Infer:

        j = 0

+        count = 0
        while self._exit_event.is_set():
            try:
                m = self._queue.get(block=True, timeout=1)

@@ -180,6 +182,8 @@ class Infer:
                img_batch = torch.FloatTensor(np.transpose(img_batch, (0, 3, 1, 2))).to(device)
                mel_batch = torch.FloatTensor(np.transpose(mel_batch, (0, 3, 1, 2))).to(device)

+                time.sleep(0.01)
+
                with torch.no_grad():
                    pred = model(mel_batch, img_batch)

@@ -189,12 +193,14 @@ class Infer:
                    p = cv2.resize(p.astype(np.uint8), (x2 - x1, y2 - y1))

                    f[y1:y2, x1:x2] = p
-                    name = "%04d" % j
+                    # name = "%04d" % j
                    cv2.imwrite(f'temp/images/{j}.jpg', p)
                    j = j + 1
+                    # count = count + 1
                    p = cv2.cvtColor(f, cv2.COLOR_BGR2RGB)
                    self._human.push_render_image(p)
                    # out.write(f)
+        # print('infer count:', count)

    def push(self, chunk):
        self._queue.put(chunk)
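The `np.transpose(..., (0, 3, 1, 2))` calls convert batches from the NHWC layout that NumPy/OpenCV stacking produces to the NCHW layout PyTorch convolutions expect. A quick shape check, assuming the usual Wav2Lip shapes (96 px crops, 6 input channels after masking, (80, 16) mel windows):

```python
import numpy as np

img_batch = np.zeros((16, 96, 96, 6), dtype=np.float32)  # NHWC
mel_batch = np.zeros((16, 80, 16, 1), dtype=np.float32)  # mel windows + channel dim

print(np.transpose(img_batch, (0, 3, 1, 2)).shape)  # (16, 6, 96, 96) -> NCHW
print(np.transpose(mel_batch, (0, 3, 1, 2)).shape)  # (16, 1, 80, 16)
```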
@@ -1,5 +1,5 @@
#encoding = utf8

import ctypes
import logging
import queue
import time

@@ -8,6 +8,7 @@ from threading import Thread, Event

import numpy as np
import audio
+from audio_render import AudioRender


class Chunk2Mal:

@@ -17,6 +18,7 @@ class Chunk2Mal:
        self._thread = None

        self._chunks = []
+        self._audio_chunks = []
        # 640 samples per chunk (40 ms * 16000 / 1000)
        self._chunk_len = self._human.get_audio_sample_rate() // self._human.get_fps()

@@ -24,63 +26,67 @@ class Chunk2Mal:
        self._thread = Thread(target=self._on_run)
        self._exit_event.set()
        self._thread.start()
+        self._audio_render = AudioRender()
        logging.info('chunk2mal start')

+    def _concatenate(self):
+        logging.info('np.concatenate')
+        inputs = np.concatenate(self._chunks)  # [5 * chunk]
+        self._chunks = []
+        mel = audio.melspectrogram(inputs)
+        if np.isnan(mel.reshape(-1)).sum() > 0:
+            raise ValueError(
+                'Mel contains nan! Using a TTS voice? Add a small epsilon noise to the wav file and try again')
+
+        mel_step_size = 16
+        # print('fps:', self._human.get_fps())
+        mel_idx_multiplier = 80. / self._human.get_fps()
+        # print('mel_idx_multiplier:', mel_idx_multiplier)
+        count = 0
+        i = 0
+        while 1:
+            count = count + 1
+            start_idx = int(i * mel_idx_multiplier)
+            print('i', i, 'start_idx', start_idx, 'mel len:', len(mel[0]))
+            if start_idx + mel_step_size > len(mel[0]):
+                self._human.push_mel_chunks_queue(mel[:, len(mel[0]) - mel_step_size:])
+                break
+            self._human.push_mel_chunks_queue(mel[:, start_idx: start_idx + mel_step_size])
+            i += 1
+
+        wav = np.concatenate(self._audio_chunks)  # [5 * chunk]
+        wav *= 32767 / max(0.01, np.max(np.abs(wav)))
+        wav = wav.astype(np.int16)
+        self._audio_render.write(wav, len(wav))
+        self._audio_chunks = []
+
+        print('mel_chunks count:', count)

    def _on_run(self):
        logging.info('chunk2mal run')
        while self._exit_event.is_set():
            if self._audio_chunk_queue.empty():
-                time.sleep(0.5)
+                if len(self._chunks) > 0:
+                    self._concatenate()
+                else:
+                    time.sleep(0.5)
                continue
            try:
                chunk = self._audio_chunk_queue.get(block=True, timeout=1)
                self._chunks.append(chunk)
-                self._human.push_audio_frames(chunk, 0)
-                if len(self._chunks) < 10:
+                self._audio_chunks.append(chunk.copy())
                # print(type(chunk))

+                # self._human.push_audio_frames(chunk, 0)
+                if len(self._chunks) < 102:  # 200ms
                    continue
            except queue.Empty:
                # print('Chunk2Mal queue.Empty')
                continue

+        print('len(self._chunks):', len(self._chunks))
+        self._concatenate()

-        logging.info('np.concatenate')
-        inputs = np.concatenate(self._chunks)  # [N * chunk]
-        mel = audio.melspectrogram(inputs)
-        if np.isnan(mel.reshape(-1)).sum() > 0:
-            raise ValueError(
-                'Mel contains nan! Using a TTS voice? Add a small epsilon noise to the wav file and try again')
-
-        mel_step_size = 16
-        print('fps:', self._human.get_fps())
-        mel_idx_multiplier = 80. / self._human.get_fps()
-        print('mel_idx_multiplier:', mel_idx_multiplier)
-
-        i = 0
-        while 1:
-            start_idx = int(i * mel_idx_multiplier)
-            if start_idx + mel_step_size > len(mel[0]):
-                self._human.push_mel_chunks_queue(mel[:, len(mel[0]) - mel_step_size:])
-                break
-            self._human.push_mel_chunks_queue(mel[:, start_idx: start_idx + mel_step_size])
-            i += 1
-
-        batch_size = 128
-
-        '''
-        while i < (len(self._chunks) - self._human.get_stride_left_size()
-                   - self._human.get_stride_right_size()) / 2:
-            start_idx = int(left + i * mel_idx_multiplier)
-            # print(start_idx)
-            if start_idx + mel_step_size > len(mel[0]):
-                mel_chunks.append(mel[:, len(mel[0]) - mel_step_size:])
-            else:
-                mel_chunks.append(mel[:, start_idx: start_idx + mel_step_size])
-            i += 1
-        self._human.push_feat_queue(mel_chunks)
-
-        # discard the old part to save memory
-        self._chunks = self._chunks[-(self._human.get_stride_left_size() + self._human.get_stride_right_size()):]
-        '''

        logging.info('chunk2mal exit')
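For reference on the arithmetic in the new `_concatenate`: with the hop length the standard Wav2Lip audio config uses (200 samples at 16 kHz, an assumption here since the audio module is not shown), the mel spectrogram runs at 80 frames per second, so `mel_idx_multiplier = 80 / fps` is the number of mel frames per video frame (3.2 at 25 fps) and each `mel_step_size = 16` window covers 0.2 s of context. The `wav *= 32767 / max(0.01, np.max(np.abs(wav)))` line peak-normalizes the float chunks into int16 range before handing them to the AudioRender DLL. A sketch of the windowing:

```python
import numpy as np

fps = 25
mel_step_size = 16               # mel frames per window (~0.2 s)
mel_idx_multiplier = 80.0 / fps  # 3.2 mel frames per video frame
mel = np.zeros((80, 80))         # dummy 1-second mel spectrogram

chunks, i = [], 0
while True:
    start = int(i * mel_idx_multiplier)
    if start + mel_step_size > mel.shape[1]:
        chunks.append(mel[:, -mel_step_size:])  # final window, padded from the end
        break
    chunks.append(mel[:, start:start + mel_step_size])
    i += 1
print(len(chunks), chunks[0].shape)  # ~one window per video frame, each (80, 16)
```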
@@ -57,33 +57,19 @@ class TTSBase:

        self._io_stream.seek(0)
        stream = self.__create_bytes_stream(self._io_stream)
        # audio.save_wav(stream, "./temp/audio/test1.wav", 16000)
        stream_len = stream.shape[0]

-        sr = 16000
-        soundfile.read('./temp/audio/audio.wav', stream, sr)
-        # audio_chunks = audio.split_audio(stream, sr, 4)
-
-        # display(audio.play_audio_chunk(audio_chunks[0], sr=sr))
-
-        # Save the split segments
-        # audio.save_chunks(stream[0:-1], sr, './temp/audio/')
-        # audio.save_chunks(audio_chunks, sr, './temp/audio/')
-        # try:
-        #     sounddevice.play(stream, samplerate=self._human.get_audio_sample_rate())
-        #     sounddevice.wait()  # wait for playback to finish
-        # except Exception as e:
-        #     logger.error(f"Audio playback error: {e}")  playrec
        print("stream_len:", stream_len, " _chunk_len:", self._chunk_len)
        index = 0
        segment = 0
        while stream_len >= self._chunk_len:
            audio_chunk = stream[index:index + self._chunk_len]
            # sounddevice.play(audio_chunk, samplerate=self._human.get_audio_sample_rate())
            # self._pcm_stream.write(audio_chunk)
            # self._pcm_stream.write(audio_chunk.tobytes())
            # self._human.push_audio_chunk(audio_chunk)
            # self._human.push_mel_chunks_queue(audio_chunk)
            self._human.push_audio_chunk(audio_chunk)
            stream_len -= self._chunk_len
            index += self._chunk_len
            segment = segment + 1
        print("segment:", segment)
        self._io_stream.seek(0)
        self._io_stream.truncate()
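The loop above feeds the pipeline in fixed `_chunk_len` slices (640 samples, one 40 ms video frame at 16 kHz / 25 fps); any tail shorter than a full chunk is dropped. A minimal sketch of that slicing, assuming a 1-D float array:

```python
import numpy as np

sample_rate, fps = 16000, 25
chunk_len = sample_rate // fps  # 640 samples = one 40 ms video frame

def iter_chunks(stream: np.ndarray):
    # Yield only full frames; the trailing partial chunk is discarded,
    # mirroring the while-loop above.
    for index in range(0, len(stream) - chunk_len + 1, chunk_len):
        yield stream[index:index + chunk_len]

print(sum(1 for _ in iter_chunks(np.zeros(16000))))  # 25 chunks per second of audio
```

The `soundfile.read('./temp/audio/audio.wav', stream, sr)` line this commit deletes also looked suspect: soundfile.read takes (file, frames, start, ...) positionally, so passing the stream array and sample rate there would not have done what the names suggest.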
ui.py (19 lines changed)

@@ -44,6 +44,7 @@ class App(customtkinter.CTk):
        # self.logo_label.grid(row=0, column=0, padx=20, pady=(20, 10))

        self.entry = customtkinter.CTkEntry(self, placeholder_text="输入内容")
+        self.entry.insert(0, "基本信息,北京九零科技有限公司,成立于2015年,位于北京市,是一家以从事科技推广和应用服务业为主的企业。企业注册资本500万人民币。")
        self.entry.grid(row=2, column=0, columnspan=2, padx=(20, 0), pady=(20, 20), sticky="nsew")

        self.main_button_1 = customtkinter.CTkButton(master=self, fg_color="transparent", border_width=2,

@@ -63,13 +64,13 @@ class App(customtkinter.CTk):
        self._human.on_destroy()

    def play_audio(self):
-        # return
-        if self._is_play_audio:
-            return
-        self._is_play_audio = True
-        file = os.path.curdir + '/audio/test.wav'
-        print(file)
-        winsound.PlaySound(file, winsound.SND_ASYNC or winsound.SND_FILENAME)
+        return
+        # if self._is_play_audio:
+        #     return
+        # self._is_play_audio = True
+        # file = os.path.curdir + '/audio/test1.wav'
+        # print(file)
+        # winsound.PlaySound(file, winsound.SND_ASYNC or winsound.SND_FILENAME)
        # playsound(file)

    def _init_image_canvas(self):
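A side note on the playback call this hunk comments out: winsound flags are bit masks, so they combine with bitwise `|`; `winsound.SND_ASYNC or winsound.SND_FILENAME` is a boolean expression that simply evaluates to `SND_ASYNC`, leaving `SND_FILENAME` unset. If this path is revived, the conventional form is:

```python
import winsound

file = './audio/test.wav'
# SND_FILENAME marks the argument as a path; SND_ASYNC returns immediately
# instead of blocking until playback completes.
winsound.PlaySound(file, winsound.SND_FILENAME | winsound.SND_ASYNC)
```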
@@ -105,11 +106,11 @@ class App(customtkinter.CTk):
        height = self.winfo_height() * 0.5
        self._canvas.create_image(width, height, anchor=customtkinter.CENTER, image=imgtk)
        self._canvas.update()
-        self.after(33, self._render)
+        self.after(40, self._render)

    def request_tts(self):
        content = self.entry.get()
        content = 'Hello, this is a test of the Edge TTS service.'
        # content = ''
        print('content:', content)
        self.entry.delete(0, customtkinter.END)
        self._human.read(content)
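The render-timer change lines up with the fps fix in Human.py: `self.after(40, ...)` re-arms the canvas update every 40 ms, i.e. 1000 / 40 = 25 frames per second, matching `_fps = 25`, whereas the previous 33 ms corresponded to roughly 30 fps and would poll faster than frames are produced.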