diff --git a/Human.py b/Human.py
index 2450c20..de253c4 100644
--- a/Human.py
+++ b/Human.py
@@ -246,12 +246,12 @@ def datagen(frames, mels):
 
 
 def datagen_signal(frame, mel, face_det_results):
-    img_batch, mel_batch, frame_batch, coords_batch = [], [], [], []
+    img_batch, mel_batch, frame_batch, coord_batch = [], [], [], []
 
     # for i, m in enumerate(mels):
     idx = 0
     frame_to_save = frame.copy()
-    face, coords = face_det_results[idx].copy()
+    face, coord = face_det_results[idx].copy()
 
     face = cv2.resize(face, (img_size, img_size))
     m = mel
@@ -259,7 +259,7 @@ def datagen_signal(frame, mel, face_det_results):
     img_batch.append(face)
     mel_batch.append(m)
     frame_batch.append(frame_to_save)
-    coords_batch.append(coords)
+    coord_batch.append(coord)
 
     if len(img_batch) >= wav2lip_batch_size:
         img_batch, mel_batch = np.asarray(img_batch), np.asarray(mel_batch)
@@ -269,7 +269,7 @@ def datagen_signal(frame, mel, face_det_results):
         img_batch = np.concatenate((img_masked, img_batch), axis=3) / 255.
         mel_batch = np.reshape(mel_batch, [len(mel_batch), mel_batch.shape[1], mel_batch.shape[2], 1])
 
-        return img_batch, mel_batch, frame_batch, coords_batch
+        return img_batch, mel_batch, frame_batch, coord_batch
 
     if len(img_batch) > 0:
         img_batch, mel_batch = np.asarray(img_batch), np.asarray(mel_batch)
@@ -279,7 +279,7 @@ def datagen_signal(frame, mel, face_det_results):
         img_batch = np.concatenate((img_masked, img_batch), axis=3) / 255.
         mel_batch = np.reshape(mel_batch, [len(mel_batch), mel_batch.shape[1], mel_batch.shape[2], 1])
 
-    return img_batch, mel_batch, frame_batch, coords_batch
+    return img_batch, mel_batch, frame_batch, coord_batch
 
 
 # Load audio data from a byte stream
@@ -294,7 +294,7 @@ def load_audio_from_bytes(byte_data):
 
 class Human:
     def __init__(self):
-        self._fps = 25  # 20 ms per frame
+        self._fps = 25  # 40 ms per frame
         self._batch_size = 16
         self._sample_rate = 16000
         self._stride_left_size = 10
@@ -340,13 +340,14 @@ class Human:
     #     p.terminate()
 
     def test(self):
-        wav = audio.load_wav(r'./audio/test.wav', 16000)
+        wav = audio.load_wav(r'./audio/audio.wav', 16000)
 
         # with open(r'./audio/test.wav', 'rb') as f:
         #     byte_data = f.read()
         # # byte_data = byte_data[16:]
         # inputs = np.concatenate(byte_data)  # [N * chunk]
         # wav = load_audio_from_bytes(inputs)
+        print('wav length:', len(wav))
         mel = audio.melspectrogram(wav)
         if np.isnan(mel.reshape(-1)).sum() > 0:
             raise ValueError(
@@ -405,9 +406,9 @@ class Human:
                 p = cv2.resize(p.astype(np.uint8), (x2 - x1, y2 - y1))
                 f[y1:y2, x1:x2] = p
 
-                # name = "%04d" % j
-                # cv2.imwrite(f'temp/images/{j}.jpg', p)
-                # j = j + 1
+                name = "%04d" % j
+                cv2.imwrite(f'temp/images/{j}.jpg', p)
+                j = j + 1
 
                 p = cv2.cvtColor(f, cv2.COLOR_BGR2RGB)
                 self._test_image_queue.put(p)
                 # out.write(f)
@@ -460,7 +461,6 @@ class Human:
             self._feat_queue.put(mel_chunks)
 
     def push_audio_frames(self, chunk, type_):
-        print("push_audio_frames")
         self._output_queue.put((chunk, type_))
 
     def push_render_image(self, image):
diff --git a/audio_render/AudioRender.dll b/audio_render/AudioRender.dll
new file mode 100644
index 0000000..af1e722
Binary files /dev/null and b/audio_render/AudioRender.dll differ
diff --git a/audio_render/AudioRender.lib b/audio_render/AudioRender.lib
new file mode 100644
index 0000000..1872fe7
Binary files /dev/null and b/audio_render/AudioRender.lib differ
diff --git a/audio_render/AudioRender.pdb b/audio_render/AudioRender.pdb
new file mode 100644
index 0000000..c23b0e1
Binary files /dev/null and b/audio_render/AudioRender.pdb differ
diff --git a/audio_render/__init__.py b/audio_render/__init__.py
new file mode 100644
index 0000000..8208c73
--- /dev/null
+++ b/audio_render/__init__.py
@@ -0,0 +1,3 @@
+#encoding = utf8
+
+from .audio_render import AudioRender
diff --git a/audio_render/audio_render.py b/audio_render/audio_render.py
new file mode 100644
index 0000000..db59f11
--- /dev/null
+++ b/audio_render/audio_render.py
@@ -0,0 +1,35 @@
+#encoding = utf8
+
+from ctypes import *
+import os
+current = os.path.dirname(__file__)
+dynamic_path = os.path.join(current, 'AudioRender.dll')
+
+
+def audio_render_log_callback(level, log, size):
+    print(f'level={level}, log={log}, len={size}')
+
+
+class AudioRender:
+    def __init__(self):
+        self.__audio_render_obj = WinDLL(dynamic_path)
+        print(self.__audio_render_obj)
+        if self.__audio_render_obj is not None:
+            CALLBACK_TYPE = CFUNCTYPE(None, c_int, c_ubyte, c_uint)
+            c_callback = CALLBACK_TYPE(audio_render_log_callback)
+            self.__init = self.__audio_render_obj.Initialize(c_callback)
+            print('AudioRender init', self.__init)
+
+    def __del__(self):
+        print('AudioRender __del__')
+        if self.__audio_render_obj is None:
+            return
+        if self.__init:
+            self.__audio_render_obj.Uninitialize()
+
+    def write(self, data, size):
+        if not self.__init:
+            return False
+
+        self.__audio_render_obj.argtypes = (POINTER(c_ubyte), c_uint)
+        return self.__audio_render_obj.Write(data.ctypes.data_as(POINTER(c_ubyte)), size)
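
Review note (outside the diff): in the new audio_render.py, `argtypes` is assigned on the WinDLL handle itself rather than on the exported `Write` function, so ctypes never applies the declaration; and the CFUNCTYPE callback lives only in the local `c_callback`, which Python may garbage-collect while the DLL still holds the pointer. A minimal sketch of the usual pattern, assuming the same `Initialize`/`Write` exports (their real signatures come from the DLL's header, not from this patch):

    # Sketch only: ctypes wrapper with the usual safety points.
    from ctypes import WinDLL, CFUNCTYPE, POINTER, c_int, c_ubyte, c_uint

    CALLBACK_TYPE = CFUNCTYPE(None, c_int, c_ubyte, c_uint)  # types taken from the patch

    class AudioRenderSketch:
        def __init__(self, dll_path):
            self._dll = WinDLL(dll_path)
            # argtypes/restype belong on the exported function, not on the DLL handle
            self._dll.Write.argtypes = (POINTER(c_ubyte), c_uint)
            self._dll.Write.restype = c_int
            # keep the callback referenced for the DLL's lifetime; ctypes does not
            # hold it, and a collected callback crashes on the first log call
            self._callback = CALLBACK_TYPE(lambda level, log, size: None)
            self._initialized = self._dll.Initialize(self._callback)
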
diff --git a/edge_tts_test.py b/edge_tts_test.py
index 682b318..cefe719 100644
--- a/edge_tts_test.py
+++ b/edge_tts_test.py
@@ -1,102 +1,105 @@
 #encoding = utf8
-
-import edge_tts
-import asyncio
-import pyaudio
-from pydub import AudioSegment
-from io import BytesIO
-
-# If running inside a Jupyter Notebook, lift the event-loop restriction
-try:
-    import nest_asyncio
-    nest_asyncio.apply()
-except ImportError:
-    pass
-
-def play_audio(data: bytes, stream: pyaudio.Stream) -> None:
-    stream.write(AudioSegment.from_mp3(BytesIO(data)).raw_data)
-
-CHUNK_SIZE = 20 * 1024
-async def play_tts(text, voice):
-    communicate = edge_tts.Communicate(text, voice)
-
-    # Set up PyAudio
-    audio = pyaudio.PyAudio()
-    stream = audio.open(format=pyaudio.paInt16, channels=1, rate=16000, output=True)
-
-    # async for chunk in communicate.stream():  # use the stream method
-    #     if chunk['type'] == 'audio':  # make sure the chunk is audio bytes
-    #         stream.write(chunk['data'])
-
-    total_data = b''
-    for chunk in communicate.stream_sync():
-        if chunk["type"] == "audio" and chunk["data"]:
-            total_data += chunk["data"]
-            if len(total_data) >= CHUNK_SIZE:
-                # print(f"Time elapsed: {time.time() - start_time:.2f} seconds")  # Print time
-                stream.write(AudioSegment.from_mp3(BytesIO(total_data[:CHUNK_SIZE])).raw_data)
-                # play_audio(total_data[:CHUNK_SIZE], stream)  # Play first CHUNK_SIZE bytes
-                total_data = total_data[CHUNK_SIZE:]  # Remove played data
-    # play_audio(total_data, stream)
-    # Stop and close the audio stream
-    stream.stop_stream()
-    stream.close()
-    audio.terminate()
-
-
-async def save_to_file(text, voice, filename):
-    communicate = edge_tts.Communicate(text, voice)
-
-    with open(filename, "wb") as f:
-        async for chunk in communicate.stream():
-            if chunk['type'] == 'audio':
-                f.write(chunk['data'])
-
-if __name__ == "__main__":
-    text = "Hello, this is a test of the Edge TTS service."
-    voice = "en-US-JessaNeural"
-
-    # Run the async function with asyncio.run()
-    asyncio.run(play_tts(text, voice))
-    # asyncio.run(save_to_file(text, voice, "output.wav"))
-
-# # import edge_tts
-# import pyaudio
-# from io import BytesIO
-# from pydub import AudioSegment
-# import time
-#
-# TEXT = 'Hello World! How are you guys doing? I hope great, cause I am having fun and honestly it has been a blast'
-# VOICE = "en-US-AndrewMultilingualNeural"
-# CHUNK_SIZE = 20 * 1024  # Assuming around 1024 bytes per chunk (adjust based on format)
-#
-# def main() -> None:
-#     start_time = time.time()
-#     communicator = edge_tts.Communicate(TEXT, VOICE)
-#
-#     pyaudio_instance = pyaudio.PyAudio()
-#     audio_stream = pyaudio_instance.open(format=pyaudio.paInt16, channels=1, rate=16000, output=True)
-#
-#     total_data = b''  # Store audio data instead of chunks
-#
-#     for chunk in communicator.stream_sync():
-#         if chunk["type"] == "audio" and chunk["data"]:
-#             total_data += chunk["data"]
-#             if len(total_data) >= CHUNK_SIZE:
-#                 print(f"Time elapsed: {time.time() - start_time:.2f} seconds")  # Print time
-#                 play_audio(total_data[:CHUNK_SIZE], audio_stream)  # Play first CHUNK_SIZE bytes
-#                 total_data = total_data[CHUNK_SIZE:]  # Remove played data
-#
-#     # Play remaining audio
-#     play_audio(total_data, audio_stream)
-#
-#     audio_stream.stop_stream()
-#     audio_stream.close()
-#     pyaudio_instance.terminate()
-#
-# def play_audio(data: bytes, stream: pyaudio.Stream) -> None:
-#     stream.write(AudioSegment.from_mp3(BytesIO(data)).raw_data)
-#
-# if __name__ == "__main__":
-#     main()
\ No newline at end of file
+# import asyncio
+# import pyaudio
+# from pydub import AudioSegment
+# from io import BytesIO
+#
+# # If running inside a Jupyter Notebook, lift the event-loop restriction
+# try:
+#     import nest_asyncio
+#     nest_asyncio.apply()
+# except ImportError:
+#     pass
+#
+# def play_audio(data: bytes, stream: pyaudio.Stream) -> None:
+#     stream.write(AudioSegment.from_mp3(BytesIO(data)).raw_data)
+#
+# CHUNK_SIZE = 20 * 1024
+# async def play_tts(text, voice):
+#     communicate = edge_tts.Communicate(text, voice)
+#
+#     # Set up PyAudio
+#     audio = pyaudio.PyAudio()
+#     stream = audio.open(format=pyaudio.paInt16, channels=1, rate=24000, output=True)
+#
+#     # async for chunk in communicate.stream():  # use the stream method
+#     #     if chunk['type'] == 'audio':  # make sure the chunk is audio bytes
+#     #         stream.write(chunk['data'])
+#
+#     total_data = b''
+#     for chunk in communicate.stream_sync():
+#         if chunk["type"] == "audio" and chunk["data"]:
+#             total_data += chunk["data"]
+#             if len(total_data) >= CHUNK_SIZE:
+#                 # print(f"Time elapsed: {time.time() - start_time:.2f} seconds")  # Print time
+#                 stream.write(AudioSegment.from_mp3(BytesIO(total_data[:CHUNK_SIZE])).raw_data)
+#                 # play_audio(total_data[:CHUNK_SIZE], stream)  # Play first CHUNK_SIZE bytes
+#                 total_data = total_data[CHUNK_SIZE:]  # Remove played data
+#     # play_audio(total_data, stream)
+#     # Stop and close the audio stream
+#     stream.stop_stream()
+#     stream.close()
+#     audio.terminate()
+#
+#
+# async def save_to_file(text, voice, filename):
+#     communicate = edge_tts.Communicate(text, voice)
+#
+#     with open(filename, "wb") as f:
+#         async for chunk in communicate.stream():
+#             if chunk['type'] == 'audio':
+#                 f.write(chunk['data'])
+#
+# if __name__ == "__main__":
+#     text = "Hello, this is a test of the Edge TTS service."
+#     voice = "en-US-JessaNeural"
+#
+#     # Run the async function with asyncio.run()
+#     asyncio.run(play_tts(text, voice))
+#     # asyncio.run(save_to_file(text, voice, "output.wav"))
+
+
+import edge_tts
+import pyaudio
+from io import BytesIO
+from pydub import AudioSegment
+import time
+
+TEXT = 'Hello World! How are you guys doing? I hope great, cause I am having fun and honestly it has been a blast'
+VOICE = "en-US-AndrewMultilingualNeural"
+CHUNK_SIZE = 20 * 1024  # Assuming around 1024 bytes per chunk (adjust based on format)
+
+
+def main() -> None:
+    start_time = time.time()
+    communicator = edge_tts.Communicate(TEXT, VOICE)
+
+    pyaudio_instance = pyaudio.PyAudio()
+    audio_stream = pyaudio_instance.open(format=pyaudio.paInt16, channels=1, rate=24000, output=True)
+
+    total_data = b''  # Store audio data instead of chunks
+
+    for chunk in communicator.stream_sync():
+        if chunk["type"] == "audio" and chunk["data"]:
+            total_data += chunk["data"]
+            if len(total_data) >= CHUNK_SIZE:
+                print(f"Time elapsed: {time.time() - start_time:.2f} seconds")  # Print time
+                play_audio(total_data[:CHUNK_SIZE], audio_stream)  # Play first CHUNK_SIZE bytes
+                total_data = total_data[CHUNK_SIZE:]  # Remove played data
+
+    # Play remaining audio
+    play_audio(total_data, audio_stream)
+
+    audio_stream.stop_stream()
+    audio_stream.close()
+    pyaudio_instance.terminate()
+
+
+def play_audio(data: bytes, stream: pyaudio.Stream) -> None:
+    stream.write(AudioSegment.from_mp3(BytesIO(data)).raw_data)
+
+
+if __name__ == "__main__":
+    main()
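
Review note (outside the diff): the rewrite opens PyAudio at 24000 Hz, which matches the 24 kHz mono MP3 that edge-tts voices emit; the old code played that audio at 16000 Hz, i.e. slowed down and pitch-shifted. One remaining caveat: slicing the accumulated MP3 bytes at arbitrary CHUNK_SIZE boundaries can cut an MP3 frame in half before `AudioSegment.from_mp3` sees it. A hedged sketch of a decode helper that also normalizes the format (assumes pydub/ffmpeg, which the script already requires):

    # Sketch: decode an MP3 buffer and force it to the player's format.
    from io import BytesIO
    from pydub import AudioSegment

    def mp3_to_pcm(data: bytes, rate: int = 24000) -> bytes:
        seg = AudioSegment.from_mp3(BytesIO(data))
        if seg.frame_rate != rate:
            seg = seg.set_frame_rate(rate)  # resample if the source differs
        if seg.channels != 1:
            seg = seg.set_channels(1)       # downmix to mono
        return seg.raw_data                 # 16-bit PCM, suitable for paInt16
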
diff --git a/infer.py b/infer.py
index 5415dd1..d6f34eb 100644
--- a/infer.py
+++ b/infer.py
@@ -1,5 +1,6 @@
 #encoding = utf8
 import queue
+import time
 from queue import Queue
 from threading import Thread, Event
 import logging
@@ -169,6 +170,7 @@ class Infer:
 
         j = 0
 
+        count = 0
        while self._exit_event.is_set():
             try:
                 m = self._queue.get(block=True, timeout=1)
@@ -180,6 +182,8 @@ class Infer:
             img_batch = torch.FloatTensor(np.transpose(img_batch, (0, 3, 1, 2))).to(device)
             mel_batch = torch.FloatTensor(np.transpose(mel_batch, (0, 3, 1, 2))).to(device)
 
+            time.sleep(0.01)
+
             with torch.no_grad():
                 pred = model(mel_batch, img_batch)
 
@@ -189,12 +193,14 @@ class Infer:
                 p = cv2.resize(p.astype(np.uint8), (x2 - x1, y2 - y1))
                 f[y1:y2, x1:x2] = p
 
-                name = "%04d" % j
+                # name = "%04d" % j
                 cv2.imwrite(f'temp/images/{j}.jpg', p)
                 j = j + 1
+                # count = count + 1
                 p = cv2.cvtColor(f, cv2.COLOR_BGR2RGB)
                 self._human.push_render_image(p)
                 # out.write(f)
+                # print('infer count:', count)
 
     def push(self, chunk):
         self._queue.put(chunk)
\ No newline at end of file
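
Review note (outside the diff): this hunk adds a small `time.sleep(0.01)` throttle before inference and keeps the per-frame debug dump, but `name = "%04d" % j` is now commented out while the write still uses the unpadded `{j}.jpg`, so files sort as 0, 1, 10, 11, ..., 2 in a directory listing; the mirrored hunk in Human.py above computes `name` and never uses it either. A small sketch of the zero-padded variant (the helper name is illustrative, not from the repo):

    # Sketch: zero-padded debug frame dump so listings keep frame order.
    import os
    import cv2

    def dump_debug_frame(frame, index, out_dir='temp/images'):
        os.makedirs(out_dir, exist_ok=True)  # tolerate a missing temp dir
        cv2.imwrite(os.path.join(out_dir, f'{index:04d}.jpg'), frame)
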
diff --git a/tts/Chunk2Mal.py b/tts/Chunk2Mal.py
index 5dff798..16b71c3 100644
--- a/tts/Chunk2Mal.py
+++ b/tts/Chunk2Mal.py
@@ -1,5 +1,5 @@
 #encoding = utf8
-
+import ctypes
 import logging
 import queue
 import time
@@ -8,6 +8,7 @@ from threading import Thread, Event
 
 import numpy as np
 import audio
+from audio_render import AudioRender
 
 
 class Chunk2Mal:
@@ -17,6 +18,7 @@ class Chunk2Mal:
         self._thread = None
 
         self._chunks = []
+        self._audio_chunks = []
 
         # 320 samples per chunk (20ms * 16000 / 1000)audio_chunk
         self._chunk_len = self._human.get_audio_sample_rate() // self._human.get_fps()
@@ -24,63 +26,67 @@ class Chunk2Mal:
         self._thread = Thread(target=self._on_run)
         self._exit_event.set()
         self._thread.start()
+        self._audio_render = AudioRender()
         logging.info('chunk2mal start')
 
+    def _concatenate(self):
+        logging.info('np.concatenate')
+        inputs = np.concatenate(self._chunks)  # [5 * chunk]
+        self._chunks = []
+        mel = audio.melspectrogram(inputs)
+        if np.isnan(mel.reshape(-1)).sum() > 0:
+            raise ValueError(
+                'Mel contains nan! Using a TTS voice? Add a small epsilon noise to the wav file and try again')
+
+        mel_step_size = 16
+        # print('fps:', self._human.get_fps())
+        mel_idx_multiplier = 80. / self._human.get_fps()
+        # print('mel_idx_multiplier:', mel_idx_multiplier)
+        count = 0
+        i = 0
+        while 1:
+            count = count + 1
+            start_idx = int(i * mel_idx_multiplier)
+            print('i', i, 'start_idx', start_idx, 'mel len:', len(mel[0]))
+            if start_idx + mel_step_size > len(mel[0]):
+                self._human.push_mel_chunks_queue(mel[:, len(mel[0]) - mel_step_size:])
+                break
+            self._human.push_mel_chunks_queue(mel[:, start_idx: start_idx + mel_step_size])
+            i += 1
+
+        wav = np.concatenate(self._audio_chunks)  # [5 * chunk]self._audio_chunks
+        wav *= 32767 / max(0.01, np.max(np.abs(wav)))
+        wav = wav.astype(np.int16)
+        self._audio_render.write(wav, len(wav))
+        self._audio_chunks = []
+
+        print('mel_chunks count:', count)
+
     def _on_run(self):
         logging.info('chunk2mal run')
         while self._exit_event.is_set():
             if self._audio_chunk_queue.empty():
-                time.sleep(0.5)
+                if len(self._chunks) > 0:
+                    self._concatenate()
+                else:
+                    time.sleep(0.5)
                 continue
             try:
                 chunk = self._audio_chunk_queue.get(block=True, timeout=1)
                 self._chunks.append(chunk)
-                self._human.push_audio_frames(chunk, 0)
-                if len(self._chunks) < 10:
+                self._audio_chunks.append(chunk.copy())
+                # print(type(chunk))
+
+                # self._human.push_audio_frames(chunk, 0)
+                if len(self._chunks) < 102:  # 200ms
                     continue
             except queue.Empty:
                 # print('Chunk2Mal queue.Empty')
                 continue
 
+            print('len(self._chunks):', len(self._chunks))
+            self._concatenate()
-            logging.info('np.concatenate')
-            inputs = np.concatenate(self._chunks)  # [N * chunk]
-            mel = audio.melspectrogram(inputs)
-            if np.isnan(mel.reshape(-1)).sum() > 0:
-                raise ValueError(
-                    'Mel contains nan! Using a TTS voice? Add a small epsilon noise to the wav file and try again')
-
-            mel_step_size = 16
-            print('fps:', self._human.get_fps())
-            mel_idx_multiplier = 80. / self._human.get_fps()
-            print('mel_idx_multiplier:', mel_idx_multiplier)
-
-            i = 0
-            while 1:
-                start_idx = int(i * mel_idx_multiplier)
-                if start_idx + mel_step_size > len(mel[0]):
-                    self._human.push_mel_chunks_queue(mel[:, len(mel[0]) - mel_step_size:])
-                    break
-                self._human.push_mel_chunks_queue(mel[:, start_idx: start_idx + mel_step_size])
-                i += 1
-
-            batch_size = 128
-
-            '''
-            while i < (len(self._chunks) - self._human.get_stride_left_size()
-                       - self._human.get_stride_right_size()) / 2:
-                start_idx = int(left + i * mel_idx_multiplier)
-                # print(start_idx)
-                if start_idx + mel_step_size > len(mel[0]):
-                    mel_chunks.append(mel[:, len(mel[0]) - mel_step_size:])
-                else:
-                    mel_chunks.append(mel[:, start_idx: start_idx + mel_step_size])
-                i += 1
-            self._human.push_feat_queue(mel_chunks)
-
-            # discard the old part to save memory
-            self._chunks = self._chunks[-(self._human.get_stride_left_size() + self._human.get_stride_right_size()):]
-            '''
 
         logging.info('chunk2mal exit')
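
Review note (outside the diff): `_concatenate` reproduces the usual wav2lip windowing. The mel spectrogram advances at 80 frames per second, so at 25 fps each video frame steps `mel_idx_multiplier = 80 / 25 = 3.2` mel frames, every window is `mel_step_size = 16` mel frames (0.2 s) wide, and the final window is right-aligned to the end of the spectrogram. A worked example of the loop's arithmetic (the mel length is an arbitrary assumed value):

    # Worked example of the window arithmetic in _concatenate (values assumed).
    fps = 25
    mel_step_size = 16               # window width in mel frames (~0.2 s)
    mel_idx_multiplier = 80.0 / fps  # 3.2 mel frames per video frame
    mel_len = 408                    # e.g. a ~5.1 s utterance

    i = 0
    while True:
        start_idx = int(i * mel_idx_multiplier)
        if start_idx + mel_step_size > mel_len:
            print('tail window:', mel_len - mel_step_size, '->', mel_len)
            break
        print('window', i, ':', start_idx, '->', start_idx + mel_step_size)
        i += 1

The int16 conversion before `AudioRender.write` follows the same pattern: `wav *= 32767 / max(0.01, np.max(np.abs(wav)))` peak-normalizes the float chunks to full scale so the cast to `np.int16` does not clip.
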
diff --git a/tts/TTSBase.py b/tts/TTSBase.py
index e747517..b5e7f8f 100644
--- a/tts/TTSBase.py
+++ b/tts/TTSBase.py
@@ -57,33 +57,19 @@ class TTSBase:
         self._io_stream.seek(0)
         stream = self.__create_bytes_stream(self._io_stream)
 
+        # audio.save_wav(stream, "./temp/audio/test1.wav", 16000)
         stream_len = stream.shape[0]
 
-        sr = 16000
-        soundfile.read('./temp/audio/audio.wav', stream, sr)
-        # audio_chunks = audio.split_audio(stream, sr, 4)
-
-        # display(audio.play_audio_chunk(audio_chunks[0], sr=sr))
-
-        # Save the split segments
-        # audio.save_chunks(stream[0:-1], sr, './temp/audio/')
-        # audio.save_chunks(audio_chunks, sr, './temp/audio/')
-        # try:
-        #     sounddevice.play(stream, samplerate=self._human.get_audio_sample_rate())
-        #     sounddevice.wait()  # wait until playback finishes
-        # except Exception as e:
-        #     logger.error(f"播放音频出错: {e}")  playrec
+        print("stream_len:", stream_len, " _chunk_len:", self._chunk_len)
         index = 0
+        segment = 0
         while stream_len >= self._chunk_len:
             audio_chunk = stream[index:index + self._chunk_len]
-            # sounddevice.play(audio_chunk, samplerate=self._human.get_audio_sample_rate())
-            # self._pcm_stream.write(audio_chunk)
-            # self._pcm_stream.write(audio_chunk.tobytes())
-            # self._human.push_audio_chunk(audio_chunk)
-            # self._human.push_mel_chunks_queue(audio_chunk)
             self._human.push_audio_chunk(audio_chunk)
             stream_len -= self._chunk_len
             index += self._chunk_len
+            segment = segment + 1
+        print("segment:", segment)
 
         self._io_stream.seek(0)
         self._io_stream.truncate()
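
Review note (outside the diff): besides deleting the dead `soundfile.read` call (which was passing arguments `soundfile.read` does not accept), this hunk leaves the chunking behavior unchanged: only whole `_chunk_len` slices are forwarded, and whatever tail remains when `stream_len < _chunk_len` is discarded once `_io_stream` is truncated, so up to one frame of audio per utterance can be lost. A hedged sketch of a splitter that zero-pads the final chunk instead (NumPy only; the function name is illustrative):

    # Sketch: fixed-size chunking that keeps the tail by zero-padding it.
    import numpy as np

    def split_into_chunks(stream: np.ndarray, chunk_len: int) -> list:
        chunks = []
        for index in range(0, len(stream), chunk_len):
            chunk = stream[index:index + chunk_len]
            if len(chunk) < chunk_len:
                chunk = np.pad(chunk, (0, chunk_len - len(chunk)))  # pad with silence
            chunks.append(chunk)
        return chunks
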
{e}") playrec + print("stream_len:", stream_len, " _chunk_len:", self._chunk_len) index = 0 + segment = 0 while stream_len >= self._chunk_len: audio_chunk = stream[index:index + self._chunk_len] - # sounddevice.play(audio_chunk, samplerate=self._human.get_audio_sample_rate()) - # self._pcm_stream.write(audio_chunk) - # self._pcm_stream.write(audio_chunk.tobytes()) - # self._human.push_audio_chunk(audio_chunk) - # self._human.push_mel_chunks_queue(audio_chunk) self._human.push_audio_chunk(audio_chunk) stream_len -= self._chunk_len index += self._chunk_len + segment = segment + 1 + print("segment:", segment) self._io_stream.seek(0) self._io_stream.truncate() diff --git a/ui.py b/ui.py index 9dd6895..303453e 100644 --- a/ui.py +++ b/ui.py @@ -44,6 +44,7 @@ class App(customtkinter.CTk): # self.logo_label.grid(row=0, column=0, padx=20, pady=(20, 10)) self.entry = customtkinter.CTkEntry(self, placeholder_text="输入内容") + self.entry.insert(0, "基本信息,北京九零科技有限公司,成立于2015年,位于北京市,是一家以从事科技推广和应用服务业为主的企业。企业注册资本500万人民币。") self.entry.grid(row=2, column=0, columnspan=2, padx=(20, 0), pady=(20, 20), sticky="nsew") self.main_button_1 = customtkinter.CTkButton(master=self, fg_color="transparent", border_width=2, @@ -63,13 +64,13 @@ class App(customtkinter.CTk): self._human.on_destroy() def play_audio(self): - # return - if self._is_play_audio: - return - self._is_play_audio = True - file = os.path.curdir + '/audio/test.wav' - print(file) - winsound.PlaySound(file, winsound.SND_ASYNC or winsound.SND_FILENAME) + return + # if self._is_play_audio: + # return + # self._is_play_audio = True + # file = os.path.curdir + '/audio/test1.wav' + # print(file) + # winsound.PlaySound(file, winsound.SND_ASYNC or winsound.SND_FILENAME) # playsound(file) def _init_image_canvas(self): @@ -105,11 +106,11 @@ class App(customtkinter.CTk): height = self.winfo_height() * 0.5 self._canvas.create_image(width, height, anchor=customtkinter.CENTER, image=imgtk) self._canvas.update() - self.after(33, self._render) + self.after(40, self._render) def request_tts(self): content = self.entry.get() - content = 'Hello, this is a test of the Edge TTS service.' + # content = '' print('content:', content) self.entry.delete(0, customtkinter.END) self._human.read(content)