modify render and sync audio
This commit is contained in:
parent
f631fbb067
commit
82eac73454
@ -61,7 +61,7 @@ class SherpaNcnnAsr(AsrBase):
|
||||
last_result = ""
|
||||
logger.info(f'_recognize_loop')
|
||||
while not self._stop_event.is_set():
|
||||
self._notify_complete('中国人民万岁')
|
||||
self._notify_complete('介绍中国5000年历史文学')
|
||||
segment_id += 1
|
||||
time.sleep(60)
|
||||
#
|
||||
|
Binary file not shown.
Before Width: | Height: | Size: 452 KiB |
@ -40,7 +40,7 @@ class AudioInferenceHandler(AudioHandler):
|
||||
super().on_message(message)
|
||||
|
||||
def __on_run(self):
|
||||
wav2lip_path = os.path.join(current_file_path, '..', 'checkpoints', 'wav2lip.pth')
|
||||
wav2lip_path = os.path.join(current_file_path, '..', 'checkpoints', 'wav2lip_gan.pth')
|
||||
logger.info(f'AudioInferenceHandler init, path:{wav2lip_path}')
|
||||
model = load_model(wav2lip_path)
|
||||
logger.info("Model loaded")
|
||||
|
@ -17,7 +17,8 @@ class HumanRender(AudioHandler):
|
||||
self._voice_render = VoiceRender(play_clock, context)
|
||||
self._video_render = VideoRender(play_clock, context, self)
|
||||
self._image_render = None
|
||||
self._last_ps = 0
|
||||
self._last_audio_ps = 0
|
||||
self._last_video_ps = 0
|
||||
|
||||
def set_image_render(self, render):
|
||||
self._image_render = render
|
||||
@ -31,12 +32,13 @@ class HumanRender(AudioHandler):
|
||||
|
||||
def on_handle(self, stream, index):
|
||||
res_frame, idx, audio_frames = stream
|
||||
self._voice_render.put(audio_frames, self._last_ps)
|
||||
self._voice_render.put(audio_frames, self._last_audio_ps)
|
||||
self._last_audio_ps = self._last_audio_ps + 0.2
|
||||
type_ = 1
|
||||
if audio_frames[0][1] != 0 and audio_frames[1][1] != 0:
|
||||
type_ = 0
|
||||
self._video_render.put((res_frame, idx, type_), self._last_ps)
|
||||
self._last_ps = self._last_ps + 0.2
|
||||
self._video_render.put((res_frame, idx, type_), self._last_video_ps)
|
||||
self._last_video_ps = self._last_video_ps + 0.4
|
||||
|
||||
if self._voice_render.is_full():
|
||||
self._context.notify({'msg_id': MessageType.Video_Render_Queue_Full})
|
||||
|
@ -44,7 +44,7 @@ class DouBao(NLPBase):
|
||||
sec = ''
|
||||
async for completion in stream:
|
||||
sec = sec + completion.choices[0].delta.content
|
||||
# print(sec)
|
||||
print('DouBao content:', sec)
|
||||
sec, message = self._split_handle.handle(sec)
|
||||
if len(message) > 0:
|
||||
self._on_callback(message)
|
||||
|
@ -14,6 +14,7 @@ class PunctuationSplit(NLPSplit):
|
||||
self._pattern = r'(?<!\d)[,.,。?!:;、]'
|
||||
|
||||
def handle(self, message: str):
|
||||
message = message.replace('*', '')
|
||||
match = re.search(self._pattern, message)
|
||||
if match:
|
||||
pos = match.start() + 1
|
||||
|
@ -31,6 +31,7 @@ class BaseRender(ABC):
|
||||
logging.info(f'{self._type} render exit')
|
||||
|
||||
def put(self, frame, ps):
|
||||
print('put:', ps)
|
||||
self._queue.put((frame, ps))
|
||||
|
||||
def size(self):
|
||||
|
@ -13,7 +13,7 @@ from human.message_type import MessageType
|
||||
|
||||
class VideoRender(BaseRender):
|
||||
def __init__(self, play_clock, context, human_render):
|
||||
super().__init__(play_clock, context, 'Video', 0.035)
|
||||
super().__init__(play_clock, context, 'Video', 0.038)
|
||||
self._human_render = human_render
|
||||
self._diff_avg_count = 0
|
||||
|
||||
@ -36,7 +36,7 @@ class VideoRender(BaseRender):
|
||||
else:
|
||||
if time_difference < -self._play_clock.audio_diff_threshold:
|
||||
sleep_time = abs(time_difference)
|
||||
# print("Video frame waiting to catch up with audio", sleep_time)
|
||||
print("Video frame waiting to catch up with audio", sleep_time)
|
||||
if sleep_time <= 1.0:
|
||||
time.sleep(sleep_time)
|
||||
|
||||
@ -47,7 +47,7 @@ class VideoRender(BaseRender):
|
||||
else:
|
||||
self._diff_avg_count = 0
|
||||
|
||||
print('video render:', ps, ' ', clock_time, ' ', time_difference,
|
||||
print('video render:', ps, clock_time, time_difference,
|
||||
'get face', self._queue.size(), self._diff_avg_count)
|
||||
|
||||
if type_ == 0:
|
||||
|
@ -12,7 +12,7 @@ logger = logging.getLogger(__name__)
|
||||
class TTSBase(NLPCallback):
|
||||
def __init__(self, handle):
|
||||
self._handle = handle
|
||||
self._message_queue = AsyncTaskQueue(1)
|
||||
self._message_queue = AsyncTaskQueue(5)
|
||||
|
||||
@property
|
||||
def handle(self):
|
||||
|
27
ui.py
27
ui.py
@ -14,16 +14,13 @@ import requests
|
||||
import winsound
|
||||
from PIL import Image, ImageTk
|
||||
|
||||
from playsound import playsound
|
||||
|
||||
from audio_render import AudioRender
|
||||
# from Human import Human
|
||||
from human import HumanContext
|
||||
from utils import config_logging
|
||||
from utils import config_logging, read_image
|
||||
|
||||
# from tts.EdgeTTS import EdgeTTS
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
current_file_path = os.path.dirname(os.path.abspath(__file__))
|
||||
|
||||
customtkinter.set_appearance_mode("System") # Modes: "System" (standard), "Dark", "Light"
|
||||
customtkinter.set_default_color_theme("green") # Themes: "blue" (standard), "green", "dark-blue"
|
||||
@ -50,14 +47,17 @@ class App(customtkinter.CTk):
|
||||
font=customtkinter.CTkFont(size=20, weight="bold"))
|
||||
# self.logo_label.grid(row=0, column=0, padx=20, pady=(20, 10))
|
||||
|
||||
self.entry = customtkinter.CTkEntry(self, placeholder_text="输入内容")
|
||||
self.entry.insert(0, "大家好,测试虚拟数字人。")
|
||||
self.entry.grid(row=2, column=0, columnspan=2, padx=(20, 0), pady=(20, 20), sticky="nsew")
|
||||
|
||||
self.main_button_1 = customtkinter.CTkButton(master=self, fg_color="transparent", border_width=2,
|
||||
text_color=("gray10", "#DCE4EE"), text='发送',
|
||||
command=self.request_tts)
|
||||
self.main_button_1.grid(row=2, column=2, padx=(20, 20), pady=(20, 20), sticky="nsew")
|
||||
# self.entry = customtkinter.CTkEntry(self, placeholder_text="输入内容")
|
||||
# self.entry.insert(0, "大家好,测试虚拟数字人。")
|
||||
# self.entry.grid(row=2, column=0, columnspan=2, padx=(20, 0), pady=(20, 20), sticky="nsew")
|
||||
#
|
||||
# self.main_button_1 = customtkinter.CTkButton(master=self, fg_color="transparent", border_width=2,
|
||||
# text_color=("gray10", "#DCE4EE"), text='发送',
|
||||
# command=self.request_tts)
|
||||
# self.main_button_1.grid(row=2, column=2, padx=(20, 20), pady=(20, 20), sticky="nsew")
|
||||
background = os.path.join(current_file_path, 'data', 'background', 'background.webp')
|
||||
logger.info(f'background: {background}')
|
||||
self._background = ImageTk.PhotoImage(read_image(background))
|
||||
|
||||
self._init_image_canvas()
|
||||
|
||||
@ -110,6 +110,7 @@ class App(customtkinter.CTk):
|
||||
self._canvas.delete("all")
|
||||
|
||||
self._canvas.imgtk = imgtk
|
||||
|
||||
width = self.winfo_width() * 0.5
|
||||
height = self.winfo_height() * 0.5
|
||||
self._canvas.create_image(width, height, anchor=customtkinter.CENTER, image=imgtk)
|
||||
|
@ -3,4 +3,5 @@
|
||||
from .async_task_queue import AsyncTaskQueue
|
||||
from .sync_queue import SyncQueue
|
||||
from .utils import mirror_index, load_model, get_device, load_avatar, config_logging
|
||||
from .utils import read_image
|
||||
from .audio_utils import melspectrogram, save_wav
|
||||
|
@ -6,6 +6,7 @@ import cv2
|
||||
import numpy as np
|
||||
import torch
|
||||
from tqdm import tqdm
|
||||
from PIL import Image
|
||||
|
||||
import face_detection
|
||||
from models import Wav2Lip
|
||||
@ -23,6 +24,11 @@ def mirror_index(size, index):
|
||||
return size - res - 1
|
||||
|
||||
|
||||
def read_image(path):
|
||||
image = Image.open(path)
|
||||
return image
|
||||
|
||||
|
||||
def read_images(img_list):
|
||||
frames = []
|
||||
print('reading images...')
|
||||
|
Loading…
Reference in New Issue
Block a user