modify render and sync audio

2024-11-01 02:31:59 +08:00 · 2024-11-01 02:31:59 +08:00 · 82eac73454
commit 82eac73454
parent f631fbb067
12 changed files with 36 additions and 24 deletions
--- a/asr/sherpa_ncnn_asr.py
+++ b/asr/sherpa_ncnn_asr.py
@@ -61,7 +61,7 @@ class SherpaNcnnAsr(AsrBase):
        last_result = ""
        logger.info(f'_recognize_loop')
        while not self._stop_event.is_set():
-            self._notify_complete('中国人民万岁')
+            self._notify_complete('介绍中国5000年历史文学')
            segment_id += 1
            time.sleep(60)
        #
--- a/face/img00016.jpg
+++ b/face/img00016.jpg
--- a/human/audio_inference_handler.py
+++ b/human/audio_inference_handler.py
@@ -40,7 +40,7 @@ class AudioInferenceHandler(AudioHandler):
        super().on_message(message)
    def __on_run(self):
-        wav2lip_path = os.path.join(current_file_path, '..', 'checkpoints', 'wav2lip.pth')
+        wav2lip_path = os.path.join(current_file_path, '..', 'checkpoints', 'wav2lip_gan.pth')
        logger.info(f'AudioInferenceHandler init, path:{wav2lip_path}')
        model = load_model(wav2lip_path)
        logger.info("Model loaded")
--- a/human/human_render.py
+++ b/human/human_render.py
@@ -17,7 +17,8 @@ class HumanRender(AudioHandler):
        self._voice_render = VoiceRender(play_clock, context)
        self._video_render = VideoRender(play_clock, context, self)
        self._image_render = None
-        self._last_ps = 0
+        self._last_audio_ps = 0
        self._last_video_ps = 0
    def set_image_render(self, render):
        self._image_render = render
@@ -31,12 +32,13 @@ class HumanRender(AudioHandler):
    def on_handle(self, stream, index):
        res_frame, idx, audio_frames = stream
-        self._voice_render.put(audio_frames, self._last_ps)
+        self._voice_render.put(audio_frames, self._last_audio_ps)
        self._last_audio_ps = self._last_audio_ps + 0.2
        type_ = 1
        if audio_frames[0][1] != 0 and audio_frames[1][1] != 0:
            type_ = 0
-        self._video_render.put((res_frame, idx, type_), self._last_ps)
+        self._video_render.put((res_frame, idx, type_), self._last_video_ps)
-        self._last_ps = self._last_ps + 0.2
+        self._last_video_ps = self._last_video_ps + 0.4
        if self._voice_render.is_full():
            self._context.notify({'msg_id': MessageType.Video_Render_Queue_Full})
--- a/nlp/nlp_doubao.py
+++ b/nlp/nlp_doubao.py
@@ -44,7 +44,7 @@ class DouBao(NLPBase):
            sec = ''
            async for completion in stream:
                sec = sec + completion.choices[0].delta.content
-                # print(sec)
+                print('DouBao content:', sec)
                sec, message = self._split_handle.handle(sec)
                if len(message) > 0:
                    self._on_callback(message)
--- a/nlp/nlp_split.py
+++ b/nlp/nlp_split.py
@@ -14,6 +14,7 @@ class PunctuationSplit(NLPSplit):
        self._pattern = r'(?<!\d)[,.，。？！：；、]'
    def handle(self, message: str):
        message = message.replace('*', '')
        match = re.search(self._pattern, message)
        if match:
            pos = match.start() + 1
--- a/render/base_render.py
+++ b/render/base_render.py
@@ -31,6 +31,7 @@ class BaseRender(ABC):
        logging.info(f'{self._type} render exit')
    def put(self, frame, ps):
        print('put:', ps)
        self._queue.put((frame, ps))
    def size(self):
--- a/render/video_render.py
+++ b/render/video_render.py
@@ -13,7 +13,7 @@ from human.message_type import MessageType
 class VideoRender(BaseRender):
    def __init__(self, play_clock, context, human_render):
-        super().__init__(play_clock, context, 'Video', 0.035)
+        super().__init__(play_clock, context, 'Video', 0.038)
        self._human_render = human_render
        self._diff_avg_count = 0
@@ -36,7 +36,7 @@ class VideoRender(BaseRender):
                else:
                    if time_difference < -self._play_clock.audio_diff_threshold:
                        sleep_time = abs(time_difference)
-                        # print("Video frame waiting to catch up with audio", sleep_time)
+                        print("Video frame waiting to catch up with audio", sleep_time)
                        if sleep_time <= 1.0:
                            time.sleep(sleep_time)
@@ -47,7 +47,7 @@ class VideoRender(BaseRender):
            else:
                self._diff_avg_count = 0
-            print('video render:', ps, '  ', clock_time, '  ', time_difference,
+            print('video render:', ps, clock_time, time_difference,
                  'get face', self._queue.size(), self._diff_avg_count)
            if type_ == 0:
--- a/tts/tts_base.py
+++ b/tts/tts_base.py
@@ -12,7 +12,7 @@ logger = logging.getLogger(__name__)
 class TTSBase(NLPCallback):
    def __init__(self, handle):
        self._handle = handle
-        self._message_queue = AsyncTaskQueue(1)
+        self._message_queue = AsyncTaskQueue(5)
    @property
    def handle(self):
--- a/ui.py
+++ b/ui.py
@@ -14,16 +14,13 @@ import requests
 import winsound
 from PIL import Image, ImageTk
 from playsound import playsound
 from audio_render import AudioRender
 # from Human import Human
 from human import HumanContext
-from utils import config_logging
+from utils import config_logging, read_image
 # from tts.EdgeTTS import EdgeTTS
 logger = logging.getLogger(__name__)
 current_file_path = os.path.dirname(os.path.abspath(__file__))
 customtkinter.set_appearance_mode("System")  # Modes: "System" (standard), "Dark", "Light"
 customtkinter.set_default_color_theme("green")  # Themes: "blue" (standard), "green", "dark-blue"
@@ -50,14 +47,17 @@ class App(customtkinter.CTk):
                                                 font=customtkinter.CTkFont(size=20, weight="bold"))
        # self.logo_label.grid(row=0, column=0, padx=20, pady=(20, 10))
-        self.entry = customtkinter.CTkEntry(self, placeholder_text="输入内容")
+        # self.entry = customtkinter.CTkEntry(self, placeholder_text="输入内容")
-        self.entry.insert(0, "大家好，测试虚拟数字人。")
+        # self.entry.insert(0, "大家好，测试虚拟数字人。")
-        self.entry.grid(row=2, column=0, columnspan=2, padx=(20, 0), pady=(20, 20), sticky="nsew")
+        # self.entry.grid(row=2, column=0, columnspan=2, padx=(20, 0), pady=(20, 20), sticky="nsew")
-
+        #
-        self.main_button_1 = customtkinter.CTkButton(master=self, fg_color="transparent", border_width=2,
+        # self.main_button_1 = customtkinter.CTkButton(master=self, fg_color="transparent", border_width=2,
-                                                     text_color=("gray10", "#DCE4EE"), text='发送',
+        #                                              text_color=("gray10", "#DCE4EE"), text='发送',
-                                                     command=self.request_tts)
+        #                                              command=self.request_tts)
-        self.main_button_1.grid(row=2, column=2, padx=(20, 20), pady=(20, 20), sticky="nsew")
+        # self.main_button_1.grid(row=2, column=2, padx=(20, 20), pady=(20, 20), sticky="nsew")
        background = os.path.join(current_file_path, 'data', 'background', 'background.webp')
        logger.info(f'background: {background}')
        self._background = ImageTk.PhotoImage(read_image(background))
        self._init_image_canvas()
@@ -110,6 +110,7 @@ class App(customtkinter.CTk):
        self._canvas.delete("all")
        self._canvas.imgtk = imgtk
        width = self.winfo_width() * 0.5
        height = self.winfo_height() * 0.5
        self._canvas.create_image(width, height, anchor=customtkinter.CENTER, image=imgtk)
--- a/utils/__init__.py
+++ b/utils/__init__.py
@@ -3,4 +3,5 @@
 from .async_task_queue import AsyncTaskQueue
 from .sync_queue import SyncQueue
 from .utils import mirror_index, load_model, get_device, load_avatar, config_logging
 from .utils import read_image
 from .audio_utils import melspectrogram, save_wav
--- a/utils/utils.py
+++ b/utils/utils.py
@@ -6,6 +6,7 @@ import cv2
 import numpy as np
 import torch
 from tqdm import tqdm
 from PIL import Image
 import face_detection
 from models import Wav2Lip
@@ -23,6 +24,11 @@ def mirror_index(size, index):
        return size - res - 1
 def read_image(path):
    image = Image.open(path)
    return image
 def read_images(img_list):
    frames = []
    print('reading images...')