modify render and sync audio
This commit is contained in:
parent
f631fbb067
commit
82eac73454
@ -61,7 +61,7 @@ class SherpaNcnnAsr(AsrBase):
|
|||||||
last_result = ""
|
last_result = ""
|
||||||
logger.info(f'_recognize_loop')
|
logger.info(f'_recognize_loop')
|
||||||
while not self._stop_event.is_set():
|
while not self._stop_event.is_set():
|
||||||
self._notify_complete('中国人民万岁')
|
self._notify_complete('介绍中国5000年历史文学')
|
||||||
segment_id += 1
|
segment_id += 1
|
||||||
time.sleep(60)
|
time.sleep(60)
|
||||||
#
|
#
|
||||||
|
Binary file not shown.
Before Width: | Height: | Size: 452 KiB |
@ -40,7 +40,7 @@ class AudioInferenceHandler(AudioHandler):
|
|||||||
super().on_message(message)
|
super().on_message(message)
|
||||||
|
|
||||||
def __on_run(self):
|
def __on_run(self):
|
||||||
wav2lip_path = os.path.join(current_file_path, '..', 'checkpoints', 'wav2lip.pth')
|
wav2lip_path = os.path.join(current_file_path, '..', 'checkpoints', 'wav2lip_gan.pth')
|
||||||
logger.info(f'AudioInferenceHandler init, path:{wav2lip_path}')
|
logger.info(f'AudioInferenceHandler init, path:{wav2lip_path}')
|
||||||
model = load_model(wav2lip_path)
|
model = load_model(wav2lip_path)
|
||||||
logger.info("Model loaded")
|
logger.info("Model loaded")
|
||||||
|
@ -17,7 +17,8 @@ class HumanRender(AudioHandler):
|
|||||||
self._voice_render = VoiceRender(play_clock, context)
|
self._voice_render = VoiceRender(play_clock, context)
|
||||||
self._video_render = VideoRender(play_clock, context, self)
|
self._video_render = VideoRender(play_clock, context, self)
|
||||||
self._image_render = None
|
self._image_render = None
|
||||||
self._last_ps = 0
|
self._last_audio_ps = 0
|
||||||
|
self._last_video_ps = 0
|
||||||
|
|
||||||
def set_image_render(self, render):
|
def set_image_render(self, render):
|
||||||
self._image_render = render
|
self._image_render = render
|
||||||
@ -31,12 +32,13 @@ class HumanRender(AudioHandler):
|
|||||||
|
|
||||||
def on_handle(self, stream, index):
|
def on_handle(self, stream, index):
|
||||||
res_frame, idx, audio_frames = stream
|
res_frame, idx, audio_frames = stream
|
||||||
self._voice_render.put(audio_frames, self._last_ps)
|
self._voice_render.put(audio_frames, self._last_audio_ps)
|
||||||
|
self._last_audio_ps = self._last_audio_ps + 0.2
|
||||||
type_ = 1
|
type_ = 1
|
||||||
if audio_frames[0][1] != 0 and audio_frames[1][1] != 0:
|
if audio_frames[0][1] != 0 and audio_frames[1][1] != 0:
|
||||||
type_ = 0
|
type_ = 0
|
||||||
self._video_render.put((res_frame, idx, type_), self._last_ps)
|
self._video_render.put((res_frame, idx, type_), self._last_video_ps)
|
||||||
self._last_ps = self._last_ps + 0.2
|
self._last_video_ps = self._last_video_ps + 0.4
|
||||||
|
|
||||||
if self._voice_render.is_full():
|
if self._voice_render.is_full():
|
||||||
self._context.notify({'msg_id': MessageType.Video_Render_Queue_Full})
|
self._context.notify({'msg_id': MessageType.Video_Render_Queue_Full})
|
||||||
|
@ -44,7 +44,7 @@ class DouBao(NLPBase):
|
|||||||
sec = ''
|
sec = ''
|
||||||
async for completion in stream:
|
async for completion in stream:
|
||||||
sec = sec + completion.choices[0].delta.content
|
sec = sec + completion.choices[0].delta.content
|
||||||
# print(sec)
|
print('DouBao content:', sec)
|
||||||
sec, message = self._split_handle.handle(sec)
|
sec, message = self._split_handle.handle(sec)
|
||||||
if len(message) > 0:
|
if len(message) > 0:
|
||||||
self._on_callback(message)
|
self._on_callback(message)
|
||||||
|
@ -14,6 +14,7 @@ class PunctuationSplit(NLPSplit):
|
|||||||
self._pattern = r'(?<!\d)[,.,。?!:;、]'
|
self._pattern = r'(?<!\d)[,.,。?!:;、]'
|
||||||
|
|
||||||
def handle(self, message: str):
|
def handle(self, message: str):
|
||||||
|
message = message.replace('*', '')
|
||||||
match = re.search(self._pattern, message)
|
match = re.search(self._pattern, message)
|
||||||
if match:
|
if match:
|
||||||
pos = match.start() + 1
|
pos = match.start() + 1
|
||||||
|
@ -31,6 +31,7 @@ class BaseRender(ABC):
|
|||||||
logging.info(f'{self._type} render exit')
|
logging.info(f'{self._type} render exit')
|
||||||
|
|
||||||
def put(self, frame, ps):
|
def put(self, frame, ps):
|
||||||
|
print('put:', ps)
|
||||||
self._queue.put((frame, ps))
|
self._queue.put((frame, ps))
|
||||||
|
|
||||||
def size(self):
|
def size(self):
|
||||||
|
@ -13,7 +13,7 @@ from human.message_type import MessageType
|
|||||||
|
|
||||||
class VideoRender(BaseRender):
|
class VideoRender(BaseRender):
|
||||||
def __init__(self, play_clock, context, human_render):
|
def __init__(self, play_clock, context, human_render):
|
||||||
super().__init__(play_clock, context, 'Video', 0.035)
|
super().__init__(play_clock, context, 'Video', 0.038)
|
||||||
self._human_render = human_render
|
self._human_render = human_render
|
||||||
self._diff_avg_count = 0
|
self._diff_avg_count = 0
|
||||||
|
|
||||||
@ -36,7 +36,7 @@ class VideoRender(BaseRender):
|
|||||||
else:
|
else:
|
||||||
if time_difference < -self._play_clock.audio_diff_threshold:
|
if time_difference < -self._play_clock.audio_diff_threshold:
|
||||||
sleep_time = abs(time_difference)
|
sleep_time = abs(time_difference)
|
||||||
# print("Video frame waiting to catch up with audio", sleep_time)
|
print("Video frame waiting to catch up with audio", sleep_time)
|
||||||
if sleep_time <= 1.0:
|
if sleep_time <= 1.0:
|
||||||
time.sleep(sleep_time)
|
time.sleep(sleep_time)
|
||||||
|
|
||||||
@ -47,7 +47,7 @@ class VideoRender(BaseRender):
|
|||||||
else:
|
else:
|
||||||
self._diff_avg_count = 0
|
self._diff_avg_count = 0
|
||||||
|
|
||||||
print('video render:', ps, ' ', clock_time, ' ', time_difference,
|
print('video render:', ps, clock_time, time_difference,
|
||||||
'get face', self._queue.size(), self._diff_avg_count)
|
'get face', self._queue.size(), self._diff_avg_count)
|
||||||
|
|
||||||
if type_ == 0:
|
if type_ == 0:
|
||||||
|
@ -12,7 +12,7 @@ logger = logging.getLogger(__name__)
|
|||||||
class TTSBase(NLPCallback):
|
class TTSBase(NLPCallback):
|
||||||
def __init__(self, handle):
|
def __init__(self, handle):
|
||||||
self._handle = handle
|
self._handle = handle
|
||||||
self._message_queue = AsyncTaskQueue(1)
|
self._message_queue = AsyncTaskQueue(5)
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def handle(self):
|
def handle(self):
|
||||||
|
27
ui.py
27
ui.py
@ -14,16 +14,13 @@ import requests
|
|||||||
import winsound
|
import winsound
|
||||||
from PIL import Image, ImageTk
|
from PIL import Image, ImageTk
|
||||||
|
|
||||||
from playsound import playsound
|
|
||||||
|
|
||||||
from audio_render import AudioRender
|
|
||||||
# from Human import Human
|
|
||||||
from human import HumanContext
|
from human import HumanContext
|
||||||
from utils import config_logging
|
from utils import config_logging, read_image
|
||||||
|
|
||||||
# from tts.EdgeTTS import EdgeTTS
|
# from tts.EdgeTTS import EdgeTTS
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
current_file_path = os.path.dirname(os.path.abspath(__file__))
|
||||||
|
|
||||||
customtkinter.set_appearance_mode("System") # Modes: "System" (standard), "Dark", "Light"
|
customtkinter.set_appearance_mode("System") # Modes: "System" (standard), "Dark", "Light"
|
||||||
customtkinter.set_default_color_theme("green") # Themes: "blue" (standard), "green", "dark-blue"
|
customtkinter.set_default_color_theme("green") # Themes: "blue" (standard), "green", "dark-blue"
|
||||||
@ -50,14 +47,17 @@ class App(customtkinter.CTk):
|
|||||||
font=customtkinter.CTkFont(size=20, weight="bold"))
|
font=customtkinter.CTkFont(size=20, weight="bold"))
|
||||||
# self.logo_label.grid(row=0, column=0, padx=20, pady=(20, 10))
|
# self.logo_label.grid(row=0, column=0, padx=20, pady=(20, 10))
|
||||||
|
|
||||||
self.entry = customtkinter.CTkEntry(self, placeholder_text="输入内容")
|
# self.entry = customtkinter.CTkEntry(self, placeholder_text="输入内容")
|
||||||
self.entry.insert(0, "大家好,测试虚拟数字人。")
|
# self.entry.insert(0, "大家好,测试虚拟数字人。")
|
||||||
self.entry.grid(row=2, column=0, columnspan=2, padx=(20, 0), pady=(20, 20), sticky="nsew")
|
# self.entry.grid(row=2, column=0, columnspan=2, padx=(20, 0), pady=(20, 20), sticky="nsew")
|
||||||
|
#
|
||||||
self.main_button_1 = customtkinter.CTkButton(master=self, fg_color="transparent", border_width=2,
|
# self.main_button_1 = customtkinter.CTkButton(master=self, fg_color="transparent", border_width=2,
|
||||||
text_color=("gray10", "#DCE4EE"), text='发送',
|
# text_color=("gray10", "#DCE4EE"), text='发送',
|
||||||
command=self.request_tts)
|
# command=self.request_tts)
|
||||||
self.main_button_1.grid(row=2, column=2, padx=(20, 20), pady=(20, 20), sticky="nsew")
|
# self.main_button_1.grid(row=2, column=2, padx=(20, 20), pady=(20, 20), sticky="nsew")
|
||||||
|
background = os.path.join(current_file_path, 'data', 'background', 'background.webp')
|
||||||
|
logger.info(f'background: {background}')
|
||||||
|
self._background = ImageTk.PhotoImage(read_image(background))
|
||||||
|
|
||||||
self._init_image_canvas()
|
self._init_image_canvas()
|
||||||
|
|
||||||
@ -110,6 +110,7 @@ class App(customtkinter.CTk):
|
|||||||
self._canvas.delete("all")
|
self._canvas.delete("all")
|
||||||
|
|
||||||
self._canvas.imgtk = imgtk
|
self._canvas.imgtk = imgtk
|
||||||
|
|
||||||
width = self.winfo_width() * 0.5
|
width = self.winfo_width() * 0.5
|
||||||
height = self.winfo_height() * 0.5
|
height = self.winfo_height() * 0.5
|
||||||
self._canvas.create_image(width, height, anchor=customtkinter.CENTER, image=imgtk)
|
self._canvas.create_image(width, height, anchor=customtkinter.CENTER, image=imgtk)
|
||||||
|
@ -3,4 +3,5 @@
|
|||||||
from .async_task_queue import AsyncTaskQueue
|
from .async_task_queue import AsyncTaskQueue
|
||||||
from .sync_queue import SyncQueue
|
from .sync_queue import SyncQueue
|
||||||
from .utils import mirror_index, load_model, get_device, load_avatar, config_logging
|
from .utils import mirror_index, load_model, get_device, load_avatar, config_logging
|
||||||
|
from .utils import read_image
|
||||||
from .audio_utils import melspectrogram, save_wav
|
from .audio_utils import melspectrogram, save_wav
|
||||||
|
@ -6,6 +6,7 @@ import cv2
|
|||||||
import numpy as np
|
import numpy as np
|
||||||
import torch
|
import torch
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
from PIL import Image
|
||||||
|
|
||||||
import face_detection
|
import face_detection
|
||||||
from models import Wav2Lip
|
from models import Wav2Lip
|
||||||
@ -23,6 +24,11 @@ def mirror_index(size, index):
|
|||||||
return size - res - 1
|
return size - res - 1
|
||||||
|
|
||||||
|
|
||||||
|
def read_image(path):
|
||||||
|
image = Image.open(path)
|
||||||
|
return image
|
||||||
|
|
||||||
|
|
||||||
def read_images(img_list):
|
def read_images(img_list):
|
||||||
frames = []
|
frames = []
|
||||||
print('reading images...')
|
print('reading images...')
|
||||||
|
Loading…
Reference in New Issue
Block a user