修改tts录制文件

This commit is contained in:
brige 2024-09-26 20:28:49 +08:00
parent 2127982650
commit bef51d5c47
5 changed files with 225 additions and 38 deletions

View File

@ -9,6 +9,7 @@ import time
import numpy as np
import pyaudio
import audio
import face_detection
@ -291,14 +292,16 @@ class Human:
self._output_queue = mp.Queue()
self._res_frame_queue = mp.Queue(self._batch_size * 2)
# self._chunk_2_mal = Chunk2Mal(self)
# self._tts = TTSBase(self)
self._chunk_2_mal = Chunk2Mal(self)
self._tts = TTSBase(self)
self.mel_chunks_queue_ = Queue()
self.audio_chunks_queue_ = Queue()
self._test_image_queue = Queue()
self._thread = None
# self.test()
# self.play_pcm()
# face_images_path = r'./face/'
# self._face_image_paths = utils.read_files_path(face_images_path)
@ -309,6 +312,19 @@ class Human:
# )).start()
# self.render_event.set()
# def play_pcm(self):
# p = pyaudio.PyAudio()
# stream = p.open(format=p.get_format_from_width(2), channels=1, rate=16000, output=True)
# file1 = r'./audio/en_weather.pcm'
#
# # 将 pcm 数据直接写入 PyAudio 的数据流
# with open(file1, "rb") as f:
# stream.write(f.read())
#
# stream.stop_stream()
# stream.close()
# p.terminate()
def test(self):
wav = audio.load_wav(r'./audio/audio1.wav', 16000)
mel = audio.melspectrogram(wav)
@ -346,8 +362,8 @@ class Human:
print("Model loaded")
frame_h, frame_w = face_list_cycle[0].shape[:-1]
out = cv2.VideoWriter('temp/resul_tttt.avi',
cv2.VideoWriter_fourcc(*'DIVX'), 25, (frame_w, frame_h))
# out = cv2.VideoWriter('temp/resul_tttt.avi',
# cv2.VideoWriter_fourcc(*'DIVX'), 25, (frame_w, frame_h))
face_det_results = face_detect(face_list_cycle)
@ -374,12 +390,12 @@ class Human:
# j = j + 1
p = cv2.cvtColor(f, cv2.COLOR_BGR2RGB)
self._test_image_queue.put(p)
out.write(f)
out.release()
command = 'ffmpeg -y -i {} -i {} -strict -2 -q:v 1 {}'.format('./audio/audio1.wav', 'temp/resul_tttt.avi',
'temp/resul_tttt.mp4')
subprocess.call(command, shell=platform.system() != 'Windows')
# out.write(f)
#
# out.release()
# command = 'ffmpeg -y -i {} -i {} -strict -2 -q:v 1 {}'.format('./audio/audio1.wav', 'temp/resul_tttt.avi',
# 'temp/resul_tttt.mp4')
# subprocess.call(command, shell=platform.system() != 'Windows')
# gen = datagen(face_list_cycle, self.mel_chunks_queue_)
@ -407,18 +423,18 @@ class Human:
logging.info('human destroy')
def read(self, txt):
# if self._tts is None:
# logging.warning('tts is none')
# return
if self._thread is None:
self._thread = threading.Thread(target=self.test)
self._thread.start()
# self._tts.push_txt(txt)
if self._tts is None:
logging.warning('tts is none')
return
self._tts.push_txt(txt)
def push_audio_chunk(self, audio_chunk):
    """Hand one audio chunk to the Chunk2Mal instance in ``self._chunk_2_mal``."""
    converter = self._chunk_2_mal
    converter.push_chunk(audio_chunk)
def push_mel_chunks_queue(self, mel_chunk):
    """Enqueue ``mel_chunk`` on ``self.mel_chunks_queue_``."""
    pending = self.mel_chunks_queue_
    pending.put(mel_chunk)
def push_feat_queue(self, mel_chunks):
print("push_feat_queue")
self._feat_queue.put(mel_chunks)

102
edge_tts_test.py Normal file
View File

@ -0,0 +1,102 @@
# -*- coding: utf-8 -*-
import edge_tts
import asyncio
import pyaudio
from pydub import AudioSegment
from io import BytesIO
# 如果在 Jupyter Notebook 中使用,解除事件循环限制
try:
import nest_asyncio
nest_asyncio.apply()
except ImportError:
pass
def play_audio(data: bytes, stream: pyaudio.Stream) -> None:
    """Decode ``data`` as MP3 and write the decoded raw samples to ``stream``."""
    segment = AudioSegment.from_mp3(BytesIO(data))
    stream.write(segment.raw_data)
# Number of encoded (MP3) bytes buffered before a slice is decoded and played.
CHUNK_SIZE = 20 * 1024


async def play_tts(text, voice):
    """Synthesize ``text`` with Edge TTS and play it through a PyAudio stream.

    Encoded audio is buffered as it arrives and played in ``CHUNK_SIZE``-byte
    slices. Whatever is still buffered when the stream ends is played as a
    final, shorter slice, so the tail of the utterance is not dropped.

    Args:
        text: Text to synthesize.
        voice: Voice name passed to ``edge_tts.Communicate``.
    """
    communicate = edge_tts.Communicate(text, voice)

    # Set up PyAudio: 16-bit mono output at 24000 Hz.
    player = pyaudio.PyAudio()
    stream = None
    try:
        stream = player.open(format=pyaudio.paInt16, channels=1, rate=24000, output=True)

        # bytearray lets played bytes be removed in place instead of
        # re-copying the whole buffer on every incoming chunk.
        pending = bytearray()

        # Use the async iterator (as save_to_file does) rather than the
        # blocking stream_sync() wrapper, since this is already a coroutine.
        async for chunk in communicate.stream():
            if chunk["type"] != "audio" or not chunk["data"]:
                continue
            pending += chunk["data"]
            # NOTE(review): slices are cut at a fixed byte count, not at MP3
            # frame boundaries -- confirm the decoder tolerates this.
            while len(pending) >= CHUNK_SIZE:
                play_audio(bytes(pending[:CHUNK_SIZE]), stream)
                del pending[:CHUNK_SIZE]

        # Flush the remainder that never reached CHUNK_SIZE.
        if pending:
            play_audio(bytes(pending), stream)
    finally:
        # Stop and close the audio stream even if synthesis or decoding failed.
        if stream is not None:
            stream.stop_stream()
            stream.close()
        player.terminate()
async def save_to_file(text, voice, filename):
    """Synthesize ``text`` with ``voice`` and write the audio chunks to ``filename``.

    Only chunks whose ``type`` is ``'audio'`` are written; their ``data`` is
    appended to the file in the order received.
    """
    communicate = edge_tts.Communicate(text, voice)
    with open(filename, "wb") as out_file:
        async for piece in communicate.stream():
            if piece['type'] != 'audio':
                continue
            out_file.write(piece['data'])
if __name__ == "__main__":
    # Demo entry point: speak one fixed sentence with a fixed voice.
    voice = "en-US-JessaNeural"
    text = "Hello, this is a test of the Edge TTS service."
    # asyncio.run() runs the coroutine to completion.
    asyncio.run(play_tts(text, voice))
    # Alternative: write the audio to a file instead of playing it.
    # asyncio.run(save_to_file(text, voice, "output.wav"))
#
# import edge_tts
# import pyaudio
# from io import BytesIO
# from pydub import AudioSegment
# import time
#
# TEXT = 'Hello World! How are you guys doing? I hope great, cause I am having fun and honestly it has been a blast'
# VOICE = "en-US-AndrewMultilingualNeural"
# CHUNK_SIZE = 20 * 1024 # Assuming around 1024 bytes per chunk (adjust based on format)
#
# def main() -> None:
# start_time = time.time()
# communicator = edge_tts.Communicate(TEXT, VOICE)
#
# pyaudio_instance = pyaudio.PyAudio()
# audio_stream = pyaudio_instance.open(format=pyaudio.paInt16, channels=1, rate=16000, output=True)
#
# total_data = b'' # Store audio data instead of chunks
#
# for chunk in communicator.stream_sync():
# if chunk["type"] == "audio" and chunk["data"]:
# total_data += chunk["data"]
# if len(total_data) >= CHUNK_SIZE:
# print(f"Time elapsed: {time.time() - start_time:.2f} seconds") # Print time
# play_audio(total_data[:CHUNK_SIZE], audio_stream) # Play first CHUNK_SIZE bytes
# total_data = total_data[CHUNK_SIZE:] # Remove played data
#
# # Play remaining audio
# play_audio(total_data, audio_stream)
#
# audio_stream.stop_stream()
# audio_stream.close()
# pyaudio_instance.terminate()
#
# def play_audio(data: bytes, stream: pyaudio.Stream) -> None:
# stream.write(AudioSegment.from_mp3(BytesIO(data)).raw_data)
#
# if __name__ == "__main__":
# main()

View File

@ -36,19 +36,35 @@ class Chunk2Mal:
# print('Chunk2Mal queue.Empty')
continue
if len(self._chunks) <= self._human.get_stride_left_size() + self._human.get_stride_right_size():
# print('Chunk2Mal queue.Empty')
if type_ == 0:
continue
logging.info('np.concatenate')
inputs = np.concatenate(self._chunks) # [N * chunk]
mel = audio.melspectrogram(inputs)
left = max(0, self._human.get_stride_left_size() * 80 / 50)
right = min(len(mel[0]), len(mel[0]) - self._human.get_stride_right_size() * 80 / 50)
mel_idx_multiplier = 80. * 2 / self._human.get_fps()
mel = audio.melspectrogram(chunk)
if np.isnan(mel.reshape(-1)).sum() > 0:
raise ValueError(
'Mel contains nan! Using a TTS voice? Add a small epsilon noise to the wav file and try again')
mel_step_size = 16
print('fps:', self._human.get_fps())
mel_idx_multiplier = 80. / self._human.get_fps()
print('mel_idx_multiplier:', mel_idx_multiplier)
i = 0
mel_chunks = []
while 1:
start_idx = int(i * mel_idx_multiplier)
if start_idx + mel_step_size > len(mel[0]):
# mel_chunks.append(mel[:, len(mel[0]) - mel_step_size:])
self._human.push_mel_chunks_queue(mel[:, len(mel[0]) - mel_step_size:])
break
# mel_chunks.append(mel[:, start_idx: start_idx + mel_step_size])
self._human.push_mel_chunks_queue(mel[:, start_idx: start_idx + mel_step_size])
i += 1
batch_size = 128
'''
while i < (len(self._chunks) - self._human.get_stride_left_size()
- self._human.get_stride_right_size()) / 2:
start_idx = int(left + i * mel_idx_multiplier)
@ -62,6 +78,7 @@ class Chunk2Mal:
# discard the old part to save memory
self._chunks = self._chunks[-(self._human.get_stride_left_size() + self._human.get_stride_right_size()):]
'''
logging.info('chunk2mal exit')

View File

@ -5,6 +5,7 @@ import time
import edge_tts
import numpy as np
import pyaudio
import soundfile
import resampy
import queue
@ -12,6 +13,8 @@ from io import BytesIO
from queue import Queue
from threading import Thread, Event
from pydub import AudioSegment
logger = logging.getLogger(__name__)
@ -23,12 +26,15 @@ class TTSBase:
self._exit_event = None
self._io_stream = BytesIO()
self._sample_rate = 16000
self._chunk = self._sample_rate // self._human.get_fps()
self._chunk_len = self._sample_rate // self._human.get_fps()
self._exit_event = Event()
self._thread = Thread(target=self._on_run)
self._exit_event.set()
self._thread.start()
self._pcm_player = pyaudio.PyAudio()
self._pcm_stream = self._pcm_player.open(format=pyaudio.paInt16,
channels=1, rate=16000, output=True)
logging.info('tts start')
def _on_run(self):
@ -51,10 +57,15 @@ class TTSBase:
stream = self.__create_bytes_stream(self._io_stream)
stream_len = stream.shape[0]
index = 0
while stream_len >= self._chunk:
self._human.push_audio_chunk(stream[index:index + self._chunk])
stream_len -= self._chunk
index += self._chunk
while stream_len >= self._chunk_len:
audio_chunk = stream[index:index + self._chunk_len]
# self._pcm_stream.write(audio_chunk)
# self._pcm_stream.write(AudioSegment.from_mp3(audio_chunk))
# self._human.push_audio_chunk(audio_chunk)
# self._human.push_mel_chunks_queue(audio_chunk)
self._human.push_audio_chunk(audio_chunk)
stream_len -= self._chunk_len
index += self._chunk_len
def __create_bytes_stream(self, io_stream):
stream, sample_rate = soundfile.read(io_stream)
@ -74,14 +85,38 @@ class TTSBase:
async def __on_request(self, voice, txt):
communicate = edge_tts.Communicate(txt, voice)
first = True
async for chuck in communicate.stream():
if first:
first = False
# total_data = b''
# CHUNK_SIZE = self._chunk_len
async for chunk in communicate.stream():
if chunk["type"] == "audio" and chunk["data"]:
self._io_stream.write(chunk['data'])
# total_data += chunk["data"]
# if len(total_data) >= CHUNK_SIZE:
# print(f"Time elapsed: {time.time() - start_time:.2f} seconds") # Print time
# audio_data = AudioSegment.from_mp3(BytesIO(total_data[:CHUNK_SIZE])) #.raw_data
# audio_data = audio_data.set_frame_rate(self._human.get_audio_sample_rate())
# self._human.push_audio_chunk(audio_data)
# self._pcm_stream.write(audio_data.raw_data)
# play_audio(total_data[:CHUNK_SIZE], stream) # Play first CHUNK_SIZE bytes
# total_data = total_data[CHUNK_SIZE:] # Remove played data
if chuck['type'] == 'audio':
self._io_stream.write(chuck['data'])
# if first:
# first = False
# if chuck['type'] == 'audio':
# # self._io_stream.write(chuck['data'])
# self._io_stream.write(AudioSegment.from_mp3(BytesIO(total_data[:CHUNK_SIZE])).raw_data)
# if len(total_data) > 0:
# self._pcm_stream.write(AudioSegment.from_mp3(BytesIO(total_data)).raw_data)
# audio_data = AudioSegment.from_mp3(BytesIO(total_data)) # .raw_data
# audio_data = audio_data.set_frame_rate(self._human.get_audio_sample_rate())
# self._human.push_audio_chunk(audio_data)
# self._io_stream.write(AudioSegment.from_mp3(BytesIO(total_data)).raw_data)
def stop(self):
self._pcm_stream.stop_stream()
self._pcm_player.close(self._pcm_stream)
self._pcm_player.terminate()
if self._exit_event is None:
return

21
ui.py
View File

@ -1,14 +1,18 @@
# -*- coding: utf-8 -*-
import json
import logging
import os
from logging import handlers
import tkinter
import tkinter.messagebox
import customtkinter
import cv2
import requests
import winsound
from PIL import Image, ImageTk
from playsound import playsound
from Human import Human
from tts.EdgeTTS import EdgeTTS
@ -25,7 +29,7 @@ class App(customtkinter.CTk):
self._tts_url = 'http://localhost:8080'
# configure window
self.title("数字人测试demo")
self.title("TTS demo")
self.geometry(f"{1100}x{580}")
self.grid_columnconfigure(1, weight=1)
@ -49,13 +53,24 @@ class App(customtkinter.CTk):
self._init_image_canvas()
self._is_play_audio = False
self._human = Human()
self._render()
# self.play_audio()
def on_destroy(self):
    """Log the shutdown banner, then forward the teardown to ``self._human``."""
    banner = '------------App destroy------------'
    logger.info(banner)
    self._human.on_destroy()
def play_audio(self):
    """Start asynchronous playback of ``audio/audio1.wav``, at most once.

    The first call sets ``self._is_play_audio`` and hands the file to
    ``winsound.PlaySound``; every later call returns immediately.
    """
    if self._is_play_audio:
        return

    self._is_play_audio = True
    file = os.path.curdir + '/audio/audio1.wav'
    print(file)
    # winsound flags form a bit mask and must be combined with ``|``.
    # ``SND_ASYNC or SND_FILENAME`` evaluates to SND_ASYNC alone, so
    # SND_FILENAME would never be passed.
    winsound.PlaySound(file, winsound.SND_ASYNC | winsound.SND_FILENAME)
    # playsound(file)
def _init_image_canvas(self):
self._canvas = customtkinter.CTkCanvas(self.image_frame)
self._canvas.pack(fill=customtkinter.BOTH, expand=customtkinter.YES)
@ -66,6 +81,7 @@ class App(customtkinter.CTk):
self.after(100, self._render)
return
self.play_audio()
iheight, iwidth = image.shape[0], image.shape[1]
width = self.winfo_width()
height = self.winfo_height()
@ -88,10 +104,11 @@ class App(customtkinter.CTk):
height = self.winfo_height() * 0.5
self._canvas.create_image(width, height, anchor=customtkinter.CENTER, image=imgtk)
self._canvas.update()
self.after(60, self._render)
self.after(34, self._render)
def request_tts(self):
content = self.entry.get()
content = 'Hello, this is a test of the Edge TTS service.'
print('content:', content)
self.entry.delete(0, customtkinter.END)
self._human.read(content)