diff --git a/Human.py b/Human.py index 02c4097..41f7d17 100644 --- a/Human.py +++ b/Human.py @@ -9,6 +9,7 @@ import time import numpy as np +import pyaudio import audio import face_detection @@ -291,14 +292,16 @@ class Human: self._output_queue = mp.Queue() self._res_frame_queue = mp.Queue(self._batch_size * 2) - # self._chunk_2_mal = Chunk2Mal(self) - # self._tts = TTSBase(self) + self._chunk_2_mal = Chunk2Mal(self) + self._tts = TTSBase(self) self.mel_chunks_queue_ = Queue() + self.audio_chunks_queue_ = Queue() self._test_image_queue = Queue() self._thread = None # self.test() + # self.play_pcm() # face_images_path = r'./face/' # self._face_image_paths = utils.read_files_path(face_images_path) @@ -309,6 +312,19 @@ class Human: # )).start() # self.render_event.set() + # def play_pcm(self): + # p = pyaudio.PyAudio() + # stream = p.open(format=p.get_format_from_width(2), channels=1, rate=16000, output=True) + # file1 = r'./audio/en_weather.pcm' + # + # # 将 pcm 数据直接写入 PyAudio 的数据流 + # with open(file1, "rb") as f: + # stream.write(f.read()) + # + # stream.stop_stream() + # stream.close() + # p.terminate() + def test(self): wav = audio.load_wav(r'./audio/audio1.wav', 16000) mel = audio.melspectrogram(wav) @@ -346,8 +362,8 @@ class Human: print("Model loaded") frame_h, frame_w = face_list_cycle[0].shape[:-1] - out = cv2.VideoWriter('temp/resul_tttt.avi', - cv2.VideoWriter_fourcc(*'DIVX'), 25, (frame_w, frame_h)) + # out = cv2.VideoWriter('temp/resul_tttt.avi', + # cv2.VideoWriter_fourcc(*'DIVX'), 25, (frame_w, frame_h)) face_det_results = face_detect(face_list_cycle) @@ -374,12 +390,12 @@ class Human: # j = j + 1 p = cv2.cvtColor(f, cv2.COLOR_BGR2RGB) self._test_image_queue.put(p) - out.write(f) - - out.release() - command = 'ffmpeg -y -i {} -i {} -strict -2 -q:v 1 {}'.format('./audio/audio1.wav', 'temp/resul_tttt.avi', - 'temp/resul_tttt.mp4') - subprocess.call(command, shell=platform.system() != 'Windows') + # out.write(f) + # + # out.release() + # command = 'ffmpeg -y -i {} -i {} -strict -2 -q:v 1 {}'.format('./audio/audio1.wav', 'temp/resul_tttt.avi', + # 'temp/resul_tttt.mp4') + # subprocess.call(command, shell=platform.system() != 'Windows') # gen = datagen(face_list_cycle, self.mel_chunks_queue_) @@ -407,18 +423,18 @@ class Human: logging.info('human destroy') def read(self, txt): - # if self._tts is None: - # logging.warning('tts is none') - # return - - if self._thread is None: - self._thread = threading.Thread(target=self.test) - self._thread.start() - # self._tts.push_txt(txt) + if self._tts is None: + logging.warning('tts is none') + return + self._tts.push_txt(txt) def push_audio_chunk(self, audio_chunk): self._chunk_2_mal.push_chunk(audio_chunk) + def push_mel_chunks_queue(self, mel_chunk): + self.mel_chunks_queue_.put(mel_chunk) + # self.audio_chunks_queue_.put(audio_chunk) + def push_feat_queue(self, mel_chunks): print("push_feat_queue") self._feat_queue.put(mel_chunks) diff --git a/edge_tts_test.py b/edge_tts_test.py new file mode 100644 index 0000000..9b72790 --- /dev/null +++ b/edge_tts_test.py @@ -0,0 +1,102 @@ +#encoding = utf8 + +import edge_tts +import asyncio +import pyaudio +from pydub import AudioSegment +from io import BytesIO + +# 如果在 Jupyter Notebook 中使用,解除事件循环限制 +try: + import nest_asyncio + nest_asyncio.apply() +except ImportError: + pass + +def play_audio(data: bytes, stream: pyaudio.Stream) -> None: + stream.write(AudioSegment.from_mp3(BytesIO(data)).raw_data) + +CHUNK_SIZE = 20 * 1024 +async def play_tts(text, voice): + communicate = edge_tts.Communicate(text, voice) + + # 设置 PyAudio + audio = pyaudio.PyAudio() + stream = audio.open(format=pyaudio.paInt16, channels=1, rate=24000, output=True) + + # async for chunk in communicate.stream(): # 使用 stream 方法 + # if chunk['type'] == 'audio': # 确保 chunk 是字节流 + # stream.write(chunk['data']) + + total_data = b'' + for chunk in communicate.stream_sync(): + if chunk["type"] == "audio" and chunk["data"]: + total_data += chunk["data"] + if len(total_data) >= CHUNK_SIZE: + # print(f"Time elapsed: {time.time() - start_time:.2f} seconds") # Print time + stream.write(AudioSegment.from_mp3(BytesIO(total_data[:CHUNK_SIZE])).raw_data) + # play_audio(total_data[:CHUNK_SIZE], stream) # Play first CHUNK_SIZE bytes + total_data = total_data[CHUNK_SIZE:] # Remove played data + # play_audio(total_data, stream) + # 停止和关闭音频流 + stream.stop_stream() + stream.close() + audio.terminate() + + +async def save_to_file(text, voice, filename): + communicate = edge_tts.Communicate(text, voice) + + with open(filename, "wb") as f: + async for chunk in communicate.stream(): + if chunk['type'] == 'audio': + f.write(chunk['data']) + +if __name__ == "__main__": + text = "Hello, this is a test of the Edge TTS service." + voice = "en-US-JessaNeural" + + # 使用 asyncio.run() 运行异步函数 + asyncio.run(play_tts(text, voice)) + # asyncio.run(save_to_file(text, voice, "output.wav")) + +# +# import edge_tts +# import pyaudio +# from io import BytesIO +# from pydub import AudioSegment +# import time +# +# TEXT = 'Hello World! How are you guys doing? I hope great, cause I am having fun and honestly it has been a blast' +# VOICE = "en-US-AndrewMultilingualNeural" +# CHUNK_SIZE = 20 * 1024 # Assuming around 1024 bytes per chunk (adjust based on format) +# +# def main() -> None: +# start_time = time.time() +# communicator = edge_tts.Communicate(TEXT, VOICE) +# +# pyaudio_instance = pyaudio.PyAudio() +# audio_stream = pyaudio_instance.open(format=pyaudio.paInt16, channels=1, rate=16000, output=True) +# +# total_data = b'' # Store audio data instead of chunks +# +# for chunk in communicator.stream_sync(): +# if chunk["type"] == "audio" and chunk["data"]: +# total_data += chunk["data"] +# if len(total_data) >= CHUNK_SIZE: +# print(f"Time elapsed: {time.time() - start_time:.2f} seconds") # Print time +# play_audio(total_data[:CHUNK_SIZE], audio_stream) # Play first CHUNK_SIZE bytes +# total_data = total_data[CHUNK_SIZE:] # Remove played data +# +# # Play remaining audio +# play_audio(total_data, audio_stream) +# +# audio_stream.stop_stream() +# audio_stream.close() +# pyaudio_instance.terminate() +# +# def play_audio(data: bytes, stream: pyaudio.Stream) -> None: +# stream.write(AudioSegment.from_mp3(BytesIO(data)).raw_data) +# +# if __name__ == "__main__": +# main() \ No newline at end of file diff --git a/tts/Chunk2Mal.py b/tts/Chunk2Mal.py index 783a2a7..7388b3b 100644 --- a/tts/Chunk2Mal.py +++ b/tts/Chunk2Mal.py @@ -36,19 +36,35 @@ class Chunk2Mal: # print('Chunk2Mal queue.Empty') continue - if len(self._chunks) <= self._human.get_stride_left_size() + self._human.get_stride_right_size(): - # print('Chunk2Mal queue.Empty') + if type_ == 0: continue logging.info('np.concatenate') - inputs = np.concatenate(self._chunks) # [N * chunk] - mel = audio.melspectrogram(inputs) - left = max(0, self._human.get_stride_left_size() * 80 / 50) - right = min(len(mel[0]), len(mel[0]) - self._human.get_stride_right_size() * 80 / 50) - mel_idx_multiplier = 80. * 2 / self._human.get_fps() + mel = audio.melspectrogram(chunk) + if np.isnan(mel.reshape(-1)).sum() > 0: + raise ValueError( + 'Mel contains nan! Using a TTS voice? Add a small epsilon noise to the wav file and try again') + mel_step_size = 16 + + print('fps:', self._human.get_fps()) + mel_idx_multiplier = 80. / self._human.get_fps() + print('mel_idx_multiplier:', mel_idx_multiplier) + i = 0 - mel_chunks = [] + while 1: + start_idx = int(i * mel_idx_multiplier) + if start_idx + mel_step_size > len(mel[0]): + # mel_chunks.append(mel[:, len(mel[0]) - mel_step_size:]) + self._human.push_mel_chunks_queue(mel[:, len(mel[0]) - mel_step_size:]) + break + # mel_chunks.append(mel[:, start_idx: start_idx + mel_step_size]) + self._human.push_mel_chunks_queue(mel[:, start_idx: start_idx + mel_step_size]) + i += 1 + + batch_size = 128 + + ''' while i < (len(self._chunks) - self._human.get_stride_left_size() - self._human.get_stride_right_size()) / 2: start_idx = int(left + i * mel_idx_multiplier) @@ -62,6 +78,7 @@ class Chunk2Mal: # discard the old part to save memory self._chunks = self._chunks[-(self._human.get_stride_left_size() + self._human.get_stride_right_size()):] + ''' logging.info('chunk2mal exit') diff --git a/tts/TTSBase.py b/tts/TTSBase.py index d38a539..49f3b01 100644 --- a/tts/TTSBase.py +++ b/tts/TTSBase.py @@ -5,6 +5,7 @@ import time import edge_tts import numpy as np +import pyaudio import soundfile import resampy import queue @@ -12,6 +13,8 @@ from io import BytesIO from queue import Queue from threading import Thread, Event +from pydub import AudioSegment + logger = logging.getLogger(__name__) @@ -23,12 +26,15 @@ class TTSBase: self._exit_event = None self._io_stream = BytesIO() self._sample_rate = 16000 - self._chunk = self._sample_rate // self._human.get_fps() + self._chunk_len = self._sample_rate // self._human.get_fps() self._exit_event = Event() self._thread = Thread(target=self._on_run) self._exit_event.set() self._thread.start() + self._pcm_player = pyaudio.PyAudio() + self._pcm_stream = self._pcm_player.open(format=pyaudio.paInt16, + channels=1, rate=16000, output=True) logging.info('tts start') def _on_run(self): @@ -51,10 +57,15 @@ class TTSBase: stream = self.__create_bytes_stream(self._io_stream) stream_len = stream.shape[0] index = 0 - while stream_len >= self._chunk: - self._human.push_audio_chunk(stream[index:index + self._chunk]) - stream_len -= self._chunk - index += self._chunk + while stream_len >= self._chunk_len: + audio_chunk = stream[index:index + self._chunk_len] + # self._pcm_stream.write(audio_chunk) + # self._pcm_stream.write(AudioSegment.from_mp3(audio_chunk)) + # self._human.push_audio_chunk(audio_chunk) + # self._human.push_mel_chunks_queue(audio_chunk) + self._human.push_audio_chunk(audio_chunk) + stream_len -= self._chunk_len + index += self._chunk_len def __create_bytes_stream(self, io_stream): stream, sample_rate = soundfile.read(io_stream) @@ -74,14 +85,38 @@ class TTSBase: async def __on_request(self, voice, txt): communicate = edge_tts.Communicate(txt, voice) first = True - async for chuck in communicate.stream(): - if first: - first = False + # total_data = b'' + # CHUNK_SIZE = self._chunk_len + async for chunk in communicate.stream(): + if chunk["type"] == "audio" and chunk["data"]: + self._io_stream.write(chunk['data']) + # total_data += chunk["data"] + # if len(total_data) >= CHUNK_SIZE: + # print(f"Time elapsed: {time.time() - start_time:.2f} seconds") # Print time + # audio_data = AudioSegment.from_mp3(BytesIO(total_data[:CHUNK_SIZE])) #.raw_data + # audio_data = audio_data.set_frame_rate(self._human.get_audio_sample_rate()) + # self._human.push_audio_chunk(audio_data) + # self._pcm_stream.write(audio_data.raw_data) + # play_audio(total_data[:CHUNK_SIZE], stream) # Play first CHUNK_SIZE bytes + # total_data = total_data[CHUNK_SIZE:] # Remove played data - if chuck['type'] == 'audio': - self._io_stream.write(chuck['data']) + # if first: + # first = False + + # if chuck['type'] == 'audio': + # # self._io_stream.write(chuck['data']) + # self._io_stream.write(AudioSegment.from_mp3(BytesIO(total_data[:CHUNK_SIZE])).raw_data) + # if len(total_data) > 0: + # self._pcm_stream.write(AudioSegment.from_mp3(BytesIO(total_data)).raw_data) + # audio_data = AudioSegment.from_mp3(BytesIO(total_data)) # .raw_data + # audio_data = audio_data.set_frame_rate(self._human.get_audio_sample_rate()) + # self._human.push_audio_chunk(audio_data) + # self._io_stream.write(AudioSegment.from_mp3(BytesIO(total_data)).raw_data) def stop(self): + self._pcm_stream.stop_stream() + self._pcm_player.close(self._pcm_stream) + self._pcm_player.terminate() if self._exit_event is None: return diff --git a/ui.py b/ui.py index 12190f8..a4a80cc 100644 --- a/ui.py +++ b/ui.py @@ -1,14 +1,18 @@ #encoding = utf8 import json import logging +import os from logging import handlers import tkinter import tkinter.messagebox import customtkinter import cv2 import requests +import winsound from PIL import Image, ImageTk +from playsound import playsound + from Human import Human from tts.EdgeTTS import EdgeTTS @@ -25,7 +29,7 @@ class App(customtkinter.CTk): self._tts_url = 'http://localhost:8080' # configure window - self.title("数字人测试demo") + self.title("TTS demo") self.geometry(f"{1100}x{580}") self.grid_columnconfigure(1, weight=1) @@ -49,13 +53,24 @@ class App(customtkinter.CTk): self._init_image_canvas() + self._is_play_audio = False self._human = Human() self._render() + # self.play_audio() def on_destroy(self): logger.info('------------App destroy------------') self._human.on_destroy() + def play_audio(self): + if self._is_play_audio: + return + self._is_play_audio = True + file = os.path.curdir + '/audio/audio1.wav' + print(file) + winsound.PlaySound(file, winsound.SND_ASYNC or winsound.SND_FILENAME) + # playsound(file) + def _init_image_canvas(self): self._canvas = customtkinter.CTkCanvas(self.image_frame) self._canvas.pack(fill=customtkinter.BOTH, expand=customtkinter.YES) @@ -66,6 +81,7 @@ class App(customtkinter.CTk): self.after(100, self._render) return + self.play_audio() iheight, iwidth = image.shape[0], image.shape[1] width = self.winfo_width() height = self.winfo_height() @@ -88,10 +104,11 @@ class App(customtkinter.CTk): height = self.winfo_height() * 0.5 self._canvas.create_image(width, height, anchor=customtkinter.CENTER, image=imgtk) self._canvas.update() - self.after(60, self._render) + self.after(34, self._render) def request_tts(self): content = self.entry.get() + content = 'Hello, this is a test of the Edge TTS service.' print('content:', content) self.entry.delete(0, customtkinter.END) self._human.read(content)