modify render talking

jiegeaiai 2024-12-06 08:27:18 +08:00
parent 31f9ec50cb
commit 0eefa7b1ce
3 changed files with 37 additions and 14 deletions

View File

@@ -3,7 +3,6 @@ import logging
 import os
 import queue
 import time
-from queue import Queue
 from threading import Event, Thread
 
 import cv2
@@ -13,6 +12,7 @@ import torch
 from eventbus import EventBus
 from human_handler import AudioHandler
 from utils import load_model, mirror_index, get_device, SyncQueue
+from .huaman_status import HumanStatus
 
 logger = logging.getLogger(__name__)
 current_file_path = os.path.dirname(os.path.abspath(__file__))
@@ -74,7 +74,7 @@ class AudioInferenceHandler(AudioHandler):
         count_time = 0
         logger.info('start inference')
 
         silence_length = 133
-        # human_status = HumanStatus(length, silence_length)
+        human_status = HumanStatus(length, silence_length)
         device = get_device()
         logger.info(f'use device:{device}')
@@ -109,18 +109,22 @@
                 for i in range(batch_size):
                     if not self._is_running:
                         break
-                    self.on_next_handle((None, mirror_index(silence_length, index),
-                    # self.on_next_handle((None, human_status.get_index(),
+                    # self.on_next_handle((None, mirror_index(silence_length, index),
+                    self.on_next_handle((None, human_status.get_index(),
                                          audio_frames[i * 2:i * 2 + 2]), 0)
-                    index = index + 1
+                    # index = index + 1
             else:
+                human_status.start_talking()
                 logger.info(f'infer======= {current_text}')
                 # human_status.try_to_talk()
                 t = time.perf_counter()
                 img_batch = []
                 index_list = []
                 # for i in range(batch_size):
                 for i in range(len(mel_batch)):
-                    idx = mirror_index(length, index + i)
+                    # idx = mirror_index(length, index + i)
+                    idx = human_status.get_index()
                     index_list.append(idx)
                     face = face_list_cycle[idx]
                     img_batch.append(face)
@@ -154,9 +158,10 @@
                     if not self._is_running:
                         break
                     self.on_next_handle(
-                        (res_frame, mirror_index(length, index), audio_frames[i * 2:i * 2 + 2]),
+                        # (res_frame, mirror_index(length, index), audio_frames[i * 2:i * 2 + 2]),
+                        (res_frame, index_list[i], audio_frames[i * 2:i * 2 + 2]),
                         0)
                     index = index + 1
                 logger.info(f'total batch time: {time.perf_counter() - start_time}')
             else:
                 time.sleep(1)
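
Note: mirror_index, which human_status.get_index() replaces above, is defined in utils and is not part of this diff. For reference, a minimal sketch of the usual ping-pong indexing it performs (an assumption about its behaviour, not code from this commit):

    def mirror_index(size, index):
        # Fold an ever-growing frame counter into 0..size-1, reversing
        # direction on every pass so playback never jumps from the last
        # frame back to the first.
        turn, res = divmod(index, size)
        return res if turn % 2 == 0 else size - res - 1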

View File

@@ -12,11 +12,13 @@ class HumanStatusEnum(Enum):
 
 
 class HumanStatus:
-    def __init__(self, total_frames=0, last_silence_frame=0):
+    def __init__(self, total_frames=0, silence_length=0):
         self._status = HumanStatusEnum.silence
         self._total_frames = total_frames
-        self._last_silence_frame = last_silence_frame
+        self._silence_length = silence_length
+        self._talking_length = total_frames - silence_length
         self._current_frame = 0
+        self._is_talking = False
 
     def get_status(self):
         return self._status
@@ -27,10 +29,26 @@
 
     def try_to_talk(self):
         if self._status == HumanStatusEnum.silence:
-            if self._current_frame - self._last_silence_frame < 0:
+            if self._current_frame - self._silence_length < 0:
                 return False
             self._status = HumanStatusEnum.talking
         return True
 
     def get_index(self):
-        return self._total_frames
+        if self._is_talking:
+            if self._current_frame < self._silence_length:
+                index = self._current_frame
+            else:
+                index = self._silence_length + (self._current_frame - self._silence_length) % self._talking_length
+        else:
+            index = self._current_frame % self._silence_length
+        self._current_frame = (self._current_frame + 1) % self._total_frames
+        return index
+
+    def start_talking(self):
+        self._is_talking = True
+
+    def stop_talking(self):
+        self._is_talking = False
+        self._current_frame = 0
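
A quick trace of the new get_index() above (a standalone sketch; total_frames=10 and silence_length=4 are invented for illustration):

    from huaman_status import HumanStatus  # import path may differ per package layout

    status = HumanStatus(total_frames=10, silence_length=4)

    # While silent, the index loops inside the silence segment [0, 4):
    print([status.get_index() for _ in range(6)])  # [0, 1, 2, 3, 0, 1]

    # _current_frame has meanwhile advanced to 6, so talking resumes there,
    # plays out the talking segment [4, 10), then wraps through the head
    # of the clip again:
    status.start_talking()
    print([status.get_index() for _ in range(6)])  # [6, 7, 8, 9, 0, 1]

    # stop_talking() resets playback to the start of the silence segment:
    status.stop_talking()
    print(status.get_index())  # 0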

View File

@@ -125,8 +125,8 @@ class HumanContext:
         self._tts_handle = TTSAudioSplitHandle(self, self._mal_handler)
         self._tts = TTSEdgeHttp(self._tts_handle)
         split = PunctuationSplit()
-        # self._nlp = DouBao(self, split, self._tts)
-        self._nlp = Kimi(self, split, self._tts)
+        self._nlp = DouBao(self, split, self._tts)
+        # self._nlp = Kimi(self, split, self._tts)
 
         self._asr = SherpaNcnnAsr()
         self._asr.attach(self._nlp)