modify render and sync audio

This commit is contained in:
jiegeaiai 2024-11-01 02:31:59 +08:00
parent f631fbb067
commit 82eac73454
12 changed files with 36 additions and 24 deletions

View File

@@ -61,7 +61,7 @@ class SherpaNcnnAsr(AsrBase):
last_result = ""
logger.info(f'_recognize_loop')
while not self._stop_event.is_set():
self._notify_complete('中国人民万岁')
self._notify_complete('介绍中国5000年历史文学')
segment_id += 1
time.sleep(60)
#

Binary file not shown.

Before

Width:  |  Height:  |  Size: 452 KiB

View File

@@ -40,7 +40,7 @@ class AudioInferenceHandler(AudioHandler):
super().on_message(message)
def __on_run(self):
wav2lip_path = os.path.join(current_file_path, '..', 'checkpoints', 'wav2lip.pth')
wav2lip_path = os.path.join(current_file_path, '..', 'checkpoints', 'wav2lip_gan.pth')
logger.info(f'AudioInferenceHandler init, path:{wav2lip_path}')
model = load_model(wav2lip_path)
logger.info("Model loaded")

View File

@@ -17,7 +17,8 @@ class HumanRender(AudioHandler):
self._voice_render = VoiceRender(play_clock, context)
self._video_render = VideoRender(play_clock, context, self)
self._image_render = None
self._last_ps = 0
self._last_audio_ps = 0
self._last_video_ps = 0
def set_image_render(self, render):
self._image_render = render
@@ -31,12 +32,13 @@ class HumanRender(AudioHandler):
def on_handle(self, stream, index):
res_frame, idx, audio_frames = stream
self._voice_render.put(audio_frames, self._last_ps)
self._voice_render.put(audio_frames, self._last_audio_ps)
self._last_audio_ps = self._last_audio_ps + 0.2
type_ = 1
if audio_frames[0][1] != 0 and audio_frames[1][1] != 0:
type_ = 0
self._video_render.put((res_frame, idx, type_), self._last_ps)
self._last_ps = self._last_ps + 0.2
self._video_render.put((res_frame, idx, type_), self._last_video_ps)
self._last_video_ps = self._last_video_ps + 0.4
if self._voice_render.is_full():
self._context.notify({'msg_id': MessageType.Video_Render_Queue_Full})

View File

@@ -44,7 +44,7 @@ class DouBao(NLPBase):
sec = ''
async for completion in stream:
sec = sec + completion.choices[0].delta.content
# print(sec)
print('DouBao content:', sec)
sec, message = self._split_handle.handle(sec)
if len(message) > 0:
self._on_callback(message)

View File

@@ -14,6 +14,7 @@ class PunctuationSplit(NLPSplit):
self._pattern = r'(?<!\d)[,.,。?!:;、]'
def handle(self, message: str):
message = message.replace('*', '')
match = re.search(self._pattern, message)
if match:
pos = match.start() + 1

View File

@@ -31,6 +31,7 @@ class BaseRender(ABC):
logging.info(f'{self._type} render exit')
def put(self, frame, ps):
print('put:', ps)
self._queue.put((frame, ps))
def size(self):

View File

@@ -13,7 +13,7 @@ from human.message_type import MessageType
class VideoRender(BaseRender):
def __init__(self, play_clock, context, human_render):
super().__init__(play_clock, context, 'Video', 0.035)
super().__init__(play_clock, context, 'Video', 0.038)
self._human_render = human_render
self._diff_avg_count = 0
@@ -36,7 +36,7 @@ class VideoRender(BaseRender):
else:
if time_difference < -self._play_clock.audio_diff_threshold:
sleep_time = abs(time_difference)
# print("Video frame waiting to catch up with audio", sleep_time)
print("Video frame waiting to catch up with audio", sleep_time)
if sleep_time <= 1.0:
time.sleep(sleep_time)
@@ -47,7 +47,7 @@ class VideoRender(BaseRender):
else:
self._diff_avg_count = 0
print('video render:', ps, ' ', clock_time, ' ', time_difference,
print('video render:', ps, clock_time, time_difference,
'get face', self._queue.size(), self._diff_avg_count)
if type_ == 0:

View File

@@ -12,7 +12,7 @@ logger = logging.getLogger(__name__)
class TTSBase(NLPCallback):
def __init__(self, handle):
self._handle = handle
self._message_queue = AsyncTaskQueue(1)
self._message_queue = AsyncTaskQueue(5)
@property
def handle(self):

27
ui.py
View File

@@ -14,16 +14,13 @@ import requests
import winsound
from PIL import Image, ImageTk
from playsound import playsound
from audio_render import AudioRender
# from Human import Human
from human import HumanContext
from utils import config_logging
from utils import config_logging, read_image
# from tts.EdgeTTS import EdgeTTS
logger = logging.getLogger(__name__)
current_file_path = os.path.dirname(os.path.abspath(__file__))
customtkinter.set_appearance_mode("System") # Modes: "System" (standard), "Dark", "Light"
customtkinter.set_default_color_theme("green") # Themes: "blue" (standard), "green", "dark-blue"
@@ -50,14 +47,17 @@ class App(customtkinter.CTk):
font=customtkinter.CTkFont(size=20, weight="bold"))
# self.logo_label.grid(row=0, column=0, padx=20, pady=(20, 10))
self.entry = customtkinter.CTkEntry(self, placeholder_text="输入内容")
self.entry.insert(0, "大家好,测试虚拟数字人。")
self.entry.grid(row=2, column=0, columnspan=2, padx=(20, 0), pady=(20, 20), sticky="nsew")
self.main_button_1 = customtkinter.CTkButton(master=self, fg_color="transparent", border_width=2,
text_color=("gray10", "#DCE4EE"), text='发送',
command=self.request_tts)
self.main_button_1.grid(row=2, column=2, padx=(20, 20), pady=(20, 20), sticky="nsew")
# self.entry = customtkinter.CTkEntry(self, placeholder_text="输入内容")
# self.entry.insert(0, "大家好,测试虚拟数字人。")
# self.entry.grid(row=2, column=0, columnspan=2, padx=(20, 0), pady=(20, 20), sticky="nsew")
#
# self.main_button_1 = customtkinter.CTkButton(master=self, fg_color="transparent", border_width=2,
# text_color=("gray10", "#DCE4EE"), text='发送',
# command=self.request_tts)
# self.main_button_1.grid(row=2, column=2, padx=(20, 20), pady=(20, 20), sticky="nsew")
background = os.path.join(current_file_path, 'data', 'background', 'background.webp')
logger.info(f'background: {background}')
self._background = ImageTk.PhotoImage(read_image(background))
self._init_image_canvas()
@@ -110,6 +110,7 @@ class App(customtkinter.CTk):
self._canvas.delete("all")
self._canvas.imgtk = imgtk
width = self.winfo_width() * 0.5
height = self.winfo_height() * 0.5
self._canvas.create_image(width, height, anchor=customtkinter.CENTER, image=imgtk)

View File

@@ -3,4 +3,5 @@
from .async_task_queue import AsyncTaskQueue
from .sync_queue import SyncQueue
from .utils import mirror_index, load_model, get_device, load_avatar, config_logging
from .utils import read_image
from .audio_utils import melspectrogram, save_wav

View File

@@ -6,6 +6,7 @@ import cv2
import numpy as np
import torch
from tqdm import tqdm
from PIL import Image
import face_detection
from models import Wav2Lip
@@ -23,6 +24,11 @@ def mirror_index(size, index):
return size - res - 1
def read_image(path):
image = Image.open(path)
return image
def read_images(img_list):
frames = []
print('reading images...')