修改tts录制文件

This commit is contained in:
brige 2024-09-26 20:28:49 +08:00
parent 2127982650
commit bef51d5c47
5 changed files with 225 additions and 38 deletions

View File

@ -9,6 +9,7 @@ import time
import numpy as np import numpy as np
import pyaudio
import audio import audio
import face_detection import face_detection
@ -291,14 +292,16 @@ class Human:
self._output_queue = mp.Queue() self._output_queue = mp.Queue()
self._res_frame_queue = mp.Queue(self._batch_size * 2) self._res_frame_queue = mp.Queue(self._batch_size * 2)
# self._chunk_2_mal = Chunk2Mal(self) self._chunk_2_mal = Chunk2Mal(self)
# self._tts = TTSBase(self) self._tts = TTSBase(self)
self.mel_chunks_queue_ = Queue() self.mel_chunks_queue_ = Queue()
self.audio_chunks_queue_ = Queue()
self._test_image_queue = Queue() self._test_image_queue = Queue()
self._thread = None self._thread = None
# self.test() # self.test()
# self.play_pcm()
# face_images_path = r'./face/' # face_images_path = r'./face/'
# self._face_image_paths = utils.read_files_path(face_images_path) # self._face_image_paths = utils.read_files_path(face_images_path)
@ -309,6 +312,19 @@ class Human:
# )).start() # )).start()
# self.render_event.set() # self.render_event.set()
# def play_pcm(self):
# p = pyaudio.PyAudio()
# stream = p.open(format=p.get_format_from_width(2), channels=1, rate=16000, output=True)
# file1 = r'./audio/en_weather.pcm'
#
# # 将 pcm 数据直接写入 PyAudio 的数据流
# with open(file1, "rb") as f:
# stream.write(f.read())
#
# stream.stop_stream()
# stream.close()
# p.terminate()
def test(self): def test(self):
wav = audio.load_wav(r'./audio/audio1.wav', 16000) wav = audio.load_wav(r'./audio/audio1.wav', 16000)
mel = audio.melspectrogram(wav) mel = audio.melspectrogram(wav)
@ -346,8 +362,8 @@ class Human:
print("Model loaded") print("Model loaded")
frame_h, frame_w = face_list_cycle[0].shape[:-1] frame_h, frame_w = face_list_cycle[0].shape[:-1]
out = cv2.VideoWriter('temp/resul_tttt.avi', # out = cv2.VideoWriter('temp/resul_tttt.avi',
cv2.VideoWriter_fourcc(*'DIVX'), 25, (frame_w, frame_h)) # cv2.VideoWriter_fourcc(*'DIVX'), 25, (frame_w, frame_h))
face_det_results = face_detect(face_list_cycle) face_det_results = face_detect(face_list_cycle)
@ -374,12 +390,12 @@ class Human:
# j = j + 1 # j = j + 1
p = cv2.cvtColor(f, cv2.COLOR_BGR2RGB) p = cv2.cvtColor(f, cv2.COLOR_BGR2RGB)
self._test_image_queue.put(p) self._test_image_queue.put(p)
out.write(f) # out.write(f)
#
out.release() # out.release()
command = 'ffmpeg -y -i {} -i {} -strict -2 -q:v 1 {}'.format('./audio/audio1.wav', 'temp/resul_tttt.avi', # command = 'ffmpeg -y -i {} -i {} -strict -2 -q:v 1 {}'.format('./audio/audio1.wav', 'temp/resul_tttt.avi',
'temp/resul_tttt.mp4') # 'temp/resul_tttt.mp4')
subprocess.call(command, shell=platform.system() != 'Windows') # subprocess.call(command, shell=platform.system() != 'Windows')
# gen = datagen(face_list_cycle, self.mel_chunks_queue_) # gen = datagen(face_list_cycle, self.mel_chunks_queue_)
@ -407,18 +423,18 @@ class Human:
logging.info('human destroy') logging.info('human destroy')
def read(self, txt): def read(self, txt):
# if self._tts is None: if self._tts is None:
# logging.warning('tts is none') logging.warning('tts is none')
# return return
self._tts.push_txt(txt)
if self._thread is None:
self._thread = threading.Thread(target=self.test)
self._thread.start()
# self._tts.push_txt(txt)
def push_audio_chunk(self, audio_chunk): def push_audio_chunk(self, audio_chunk):
self._chunk_2_mal.push_chunk(audio_chunk) self._chunk_2_mal.push_chunk(audio_chunk)
def push_mel_chunks_queue(self, mel_chunk):
    """Enqueue one mel chunk on ``self.mel_chunks_queue_``.

    Args:
        mel_chunk: The item to append to the queue; it is stored as given.
    """
    mel_queue = self.mel_chunks_queue_
    mel_queue.put(mel_chunk)
    # self.audio_chunks_queue_.put(audio_chunk)
def push_feat_queue(self, mel_chunks): def push_feat_queue(self, mel_chunks):
print("push_feat_queue") print("push_feat_queue")
self._feat_queue.put(mel_chunks) self._feat_queue.put(mel_chunks)

102
edge_tts_test.py Normal file
View File

@ -0,0 +1,102 @@
#encoding = utf8
import edge_tts
import asyncio
import pyaudio
from pydub import AudioSegment
from io import BytesIO
# When running inside a Jupyter Notebook, lift the nested event-loop restriction
try:
import nest_asyncio
nest_asyncio.apply()
except ImportError:
pass
def play_audio(data: bytes, stream: pyaudio.Stream) -> None:
    """Decode an MP3 byte string and write the raw samples to ``stream``.

    Args:
        data: MP3-encoded audio bytes.
        stream: PyAudio stream that receives the decoded raw data.
    """
    mp3_buffer = BytesIO(data)
    segment = AudioSegment.from_mp3(mp3_buffer)
    pcm_bytes = segment.raw_data
    stream.write(pcm_bytes)
CHUNK_SIZE = 20 * 1024
async def play_tts(text, voice):
    """Synthesize ``text`` with Edge TTS and play it through PyAudio.

    MP3 data received from the service is accumulated and decoded/played in
    ``CHUNK_SIZE``-byte slices.  Whatever is left once the stream ends is
    played as a final, shorter slice, so the tail of the utterance (or an
    utterance shorter than ``CHUNK_SIZE``) is no longer dropped.

    Args:
        text: Text to synthesize.
        voice: Edge TTS voice name passed to ``edge_tts.Communicate``.
    """
    communicate = edge_tts.Communicate(text, voice)

    # Set up PyAudio: 16-bit mono output stream at 24 kHz.
    audio = pyaudio.PyAudio()
    try:
        stream = audio.open(format=pyaudio.paInt16, channels=1, rate=24000, output=True)
        try:
            total_data = b''
            # NOTE(review): stream_sync() is a synchronous iterator, so this
            # coroutine does not yield to the event loop while it plays.
            for chunk in communicate.stream_sync():
                if chunk["type"] == "audio" and chunk["data"]:
                    total_data += chunk["data"]
                # NOTE(review): slices are cut at byte offsets, not at MP3
                # frame boundaries -- confirm the decoder tolerates this.
                while len(total_data) >= CHUNK_SIZE:
                    segment = AudioSegment.from_mp3(BytesIO(total_data[:CHUNK_SIZE]))
                    stream.write(segment.raw_data)
                    total_data = total_data[CHUNK_SIZE:]  # Remove played data

            # Play the remainder that never reached CHUNK_SIZE.
            if total_data:
                stream.write(AudioSegment.from_mp3(BytesIO(total_data)).raw_data)
        finally:
            # Stop and close the audio stream even if decoding/playback failed.
            stream.stop_stream()
            stream.close()
    finally:
        audio.terminate()
async def save_to_file(text, voice, filename):
    """Stream synthesized audio for ``text`` and write it to ``filename``.

    Only chunks whose ``'type'`` is ``'audio'`` are written; their ``'data'``
    bytes are stored exactly as received.

    Args:
        text: Text to synthesize.
        voice: Edge TTS voice name passed to ``edge_tts.Communicate``.
        filename: Path of the file to create (opened in binary write mode).
    """
    communicate = edge_tts.Communicate(text, voice)
    with open(filename, "wb") as out_file:
        async for piece in communicate.stream():
            if piece['type'] != 'audio':
                continue
            out_file.write(piece['data'])
if __name__ == "__main__":
    text = "Hello, this is a test of the Edge TTS service."
    voice = "en-US-JessaNeural"

    # Drive the coroutine to completion with asyncio.run().
    playback = play_tts(text, voice)
    asyncio.run(playback)
    # asyncio.run(save_to_file(text, voice, "output.wav"))
#
# import edge_tts
# import pyaudio
# from io import BytesIO
# from pydub import AudioSegment
# import time
#
# TEXT = 'Hello World! How are you guys doing? I hope great, cause I am having fun and honestly it has been a blast'
# VOICE = "en-US-AndrewMultilingualNeural"
# CHUNK_SIZE = 20 * 1024 # Assuming around 1024 bytes per chunk (adjust based on format)
#
# def main() -> None:
# start_time = time.time()
# communicator = edge_tts.Communicate(TEXT, VOICE)
#
# pyaudio_instance = pyaudio.PyAudio()
# audio_stream = pyaudio_instance.open(format=pyaudio.paInt16, channels=1, rate=16000, output=True)
#
# total_data = b'' # Store audio data instead of chunks
#
# for chunk in communicator.stream_sync():
# if chunk["type"] == "audio" and chunk["data"]:
# total_data += chunk["data"]
# if len(total_data) >= CHUNK_SIZE:
# print(f"Time elapsed: {time.time() - start_time:.2f} seconds") # Print time
# play_audio(total_data[:CHUNK_SIZE], audio_stream) # Play first CHUNK_SIZE bytes
# total_data = total_data[CHUNK_SIZE:] # Remove played data
#
# # Play remaining audio
# play_audio(total_data, audio_stream)
#
# audio_stream.stop_stream()
# audio_stream.close()
# pyaudio_instance.terminate()
#
# def play_audio(data: bytes, stream: pyaudio.Stream) -> None:
# stream.write(AudioSegment.from_mp3(BytesIO(data)).raw_data)
#
# if __name__ == "__main__":
# main()

View File

@ -36,19 +36,35 @@ class Chunk2Mal:
# print('Chunk2Mal queue.Empty') # print('Chunk2Mal queue.Empty')
continue continue
if len(self._chunks) <= self._human.get_stride_left_size() + self._human.get_stride_right_size(): if type_ == 0:
# print('Chunk2Mal queue.Empty')
continue continue
logging.info('np.concatenate') logging.info('np.concatenate')
inputs = np.concatenate(self._chunks) # [N * chunk] mel = audio.melspectrogram(chunk)
mel = audio.melspectrogram(inputs) if np.isnan(mel.reshape(-1)).sum() > 0:
left = max(0, self._human.get_stride_left_size() * 80 / 50) raise ValueError(
right = min(len(mel[0]), len(mel[0]) - self._human.get_stride_right_size() * 80 / 50) 'Mel contains nan! Using a TTS voice? Add a small epsilon noise to the wav file and try again')
mel_idx_multiplier = 80. * 2 / self._human.get_fps()
mel_step_size = 16 mel_step_size = 16
print('fps:', self._human.get_fps())
mel_idx_multiplier = 80. / self._human.get_fps()
print('mel_idx_multiplier:', mel_idx_multiplier)
i = 0 i = 0
mel_chunks = [] while 1:
start_idx = int(i * mel_idx_multiplier)
if start_idx + mel_step_size > len(mel[0]):
# mel_chunks.append(mel[:, len(mel[0]) - mel_step_size:])
self._human.push_mel_chunks_queue(mel[:, len(mel[0]) - mel_step_size:])
break
# mel_chunks.append(mel[:, start_idx: start_idx + mel_step_size])
self._human.push_mel_chunks_queue(mel[:, start_idx: start_idx + mel_step_size])
i += 1
batch_size = 128
'''
while i < (len(self._chunks) - self._human.get_stride_left_size() while i < (len(self._chunks) - self._human.get_stride_left_size()
- self._human.get_stride_right_size()) / 2: - self._human.get_stride_right_size()) / 2:
start_idx = int(left + i * mel_idx_multiplier) start_idx = int(left + i * mel_idx_multiplier)
@ -62,6 +78,7 @@ class Chunk2Mal:
# discard the old part to save memory # discard the old part to save memory
self._chunks = self._chunks[-(self._human.get_stride_left_size() + self._human.get_stride_right_size()):] self._chunks = self._chunks[-(self._human.get_stride_left_size() + self._human.get_stride_right_size()):]
'''
logging.info('chunk2mal exit') logging.info('chunk2mal exit')

View File

@ -5,6 +5,7 @@ import time
import edge_tts import edge_tts
import numpy as np import numpy as np
import pyaudio
import soundfile import soundfile
import resampy import resampy
import queue import queue
@ -12,6 +13,8 @@ from io import BytesIO
from queue import Queue from queue import Queue
from threading import Thread, Event from threading import Thread, Event
from pydub import AudioSegment
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -23,12 +26,15 @@ class TTSBase:
self._exit_event = None self._exit_event = None
self._io_stream = BytesIO() self._io_stream = BytesIO()
self._sample_rate = 16000 self._sample_rate = 16000
self._chunk = self._sample_rate // self._human.get_fps() self._chunk_len = self._sample_rate // self._human.get_fps()
self._exit_event = Event() self._exit_event = Event()
self._thread = Thread(target=self._on_run) self._thread = Thread(target=self._on_run)
self._exit_event.set() self._exit_event.set()
self._thread.start() self._thread.start()
self._pcm_player = pyaudio.PyAudio()
self._pcm_stream = self._pcm_player.open(format=pyaudio.paInt16,
channels=1, rate=16000, output=True)
logging.info('tts start') logging.info('tts start')
def _on_run(self): def _on_run(self):
@ -51,10 +57,15 @@ class TTSBase:
stream = self.__create_bytes_stream(self._io_stream) stream = self.__create_bytes_stream(self._io_stream)
stream_len = stream.shape[0] stream_len = stream.shape[0]
index = 0 index = 0
while stream_len >= self._chunk: while stream_len >= self._chunk_len:
self._human.push_audio_chunk(stream[index:index + self._chunk]) audio_chunk = stream[index:index + self._chunk_len]
stream_len -= self._chunk # self._pcm_stream.write(audio_chunk)
index += self._chunk # self._pcm_stream.write(AudioSegment.from_mp3(audio_chunk))
# self._human.push_audio_chunk(audio_chunk)
# self._human.push_mel_chunks_queue(audio_chunk)
self._human.push_audio_chunk(audio_chunk)
stream_len -= self._chunk_len
index += self._chunk_len
def __create_bytes_stream(self, io_stream): def __create_bytes_stream(self, io_stream):
stream, sample_rate = soundfile.read(io_stream) stream, sample_rate = soundfile.read(io_stream)
@ -74,14 +85,38 @@ class TTSBase:
async def __on_request(self, voice, txt): async def __on_request(self, voice, txt):
communicate = edge_tts.Communicate(txt, voice) communicate = edge_tts.Communicate(txt, voice)
first = True first = True
async for chuck in communicate.stream(): # total_data = b''
if first: # CHUNK_SIZE = self._chunk_len
first = False async for chunk in communicate.stream():
if chunk["type"] == "audio" and chunk["data"]:
self._io_stream.write(chunk['data'])
# total_data += chunk["data"]
# if len(total_data) >= CHUNK_SIZE:
# print(f"Time elapsed: {time.time() - start_time:.2f} seconds") # Print time
# audio_data = AudioSegment.from_mp3(BytesIO(total_data[:CHUNK_SIZE])) #.raw_data
# audio_data = audio_data.set_frame_rate(self._human.get_audio_sample_rate())
# self._human.push_audio_chunk(audio_data)
# self._pcm_stream.write(audio_data.raw_data)
# play_audio(total_data[:CHUNK_SIZE], stream) # Play first CHUNK_SIZE bytes
# total_data = total_data[CHUNK_SIZE:] # Remove played data
if chuck['type'] == 'audio': # if first:
self._io_stream.write(chuck['data']) # first = False
# if chuck['type'] == 'audio':
# # self._io_stream.write(chuck['data'])
# self._io_stream.write(AudioSegment.from_mp3(BytesIO(total_data[:CHUNK_SIZE])).raw_data)
# if len(total_data) > 0:
# self._pcm_stream.write(AudioSegment.from_mp3(BytesIO(total_data)).raw_data)
# audio_data = AudioSegment.from_mp3(BytesIO(total_data)) # .raw_data
# audio_data = audio_data.set_frame_rate(self._human.get_audio_sample_rate())
# self._human.push_audio_chunk(audio_data)
# self._io_stream.write(AudioSegment.from_mp3(BytesIO(total_data)).raw_data)
def stop(self): def stop(self):
self._pcm_stream.stop_stream()
self._pcm_player.close(self._pcm_stream)
self._pcm_player.terminate()
if self._exit_event is None: if self._exit_event is None:
return return

21
ui.py
View File

@ -1,14 +1,18 @@
#encoding = utf8 #encoding = utf8
import json import json
import logging import logging
import os
from logging import handlers from logging import handlers
import tkinter import tkinter
import tkinter.messagebox import tkinter.messagebox
import customtkinter import customtkinter
import cv2 import cv2
import requests import requests
import winsound
from PIL import Image, ImageTk from PIL import Image, ImageTk
from playsound import playsound
from Human import Human from Human import Human
from tts.EdgeTTS import EdgeTTS from tts.EdgeTTS import EdgeTTS
@ -25,7 +29,7 @@ class App(customtkinter.CTk):
self._tts_url = 'http://localhost:8080' self._tts_url = 'http://localhost:8080'
# configure window # configure window
self.title("数字人测试demo") self.title("TTS demo")
self.geometry(f"{1100}x{580}") self.geometry(f"{1100}x{580}")
self.grid_columnconfigure(1, weight=1) self.grid_columnconfigure(1, weight=1)
@ -49,13 +53,24 @@ class App(customtkinter.CTk):
self._init_image_canvas() self._init_image_canvas()
self._is_play_audio = False
self._human = Human() self._human = Human()
self._render() self._render()
# self.play_audio()
def on_destroy(self): def on_destroy(self):
logger.info('------------App destroy------------') logger.info('------------App destroy------------')
self._human.on_destroy() self._human.on_destroy()
def play_audio(self):
    """Start asynchronous playback of the demo WAV file, at most once.

    The first call sets ``self._is_play_audio`` and hands the file to
    ``winsound.PlaySound``; every later call returns immediately.
    """
    if self._is_play_audio:
        return
    self._is_play_audio = True

    file = os.path.curdir + '/audio/audio1.wav'
    print(file)
    # The flags form a bit mask and must be combined with ``|``.  The former
    # ``SND_ASYNC or SND_FILENAME`` evaluated to SND_ASYNC alone, silently
    # dropping SND_FILENAME.
    winsound.PlaySound(file, winsound.SND_ASYNC | winsound.SND_FILENAME)
    # playsound(file)
def _init_image_canvas(self): def _init_image_canvas(self):
self._canvas = customtkinter.CTkCanvas(self.image_frame) self._canvas = customtkinter.CTkCanvas(self.image_frame)
self._canvas.pack(fill=customtkinter.BOTH, expand=customtkinter.YES) self._canvas.pack(fill=customtkinter.BOTH, expand=customtkinter.YES)
@ -66,6 +81,7 @@ class App(customtkinter.CTk):
self.after(100, self._render) self.after(100, self._render)
return return
self.play_audio()
iheight, iwidth = image.shape[0], image.shape[1] iheight, iwidth = image.shape[0], image.shape[1]
width = self.winfo_width() width = self.winfo_width()
height = self.winfo_height() height = self.winfo_height()
@ -88,10 +104,11 @@ class App(customtkinter.CTk):
height = self.winfo_height() * 0.5 height = self.winfo_height() * 0.5
self._canvas.create_image(width, height, anchor=customtkinter.CENTER, image=imgtk) self._canvas.create_image(width, height, anchor=customtkinter.CENTER, image=imgtk)
self._canvas.update() self._canvas.update()
self.after(60, self._render) self.after(34, self._render)
def request_tts(self): def request_tts(self):
content = self.entry.get() content = self.entry.get()
content = 'Hello, this is a test of the Edge TTS service.'
print('content:', content) print('content:', content)
self.entry.delete(0, customtkinter.END) self.entry.delete(0, customtkinter.END)
self._human.read(content) self._human.read(content)