From 0eefa7b1ce39201cf4a89934ea1fa7d3225d6742 Mon Sep 17 00:00:00 2001
From: jiegeaiai
Date: Fri, 6 Dec 2024 08:27:18 +0800
Subject: [PATCH] modify render talking

---
 human/audio_inference_handler.py | 21 +++++++++++++--------
 human/huaman_status.py           | 26 ++++++++++++++++++++++----
 human/human_context.py           |  4 ++--
 3 files changed, 37 insertions(+), 14 deletions(-)

diff --git a/human/audio_inference_handler.py b/human/audio_inference_handler.py
index 537b3a8..590f316 100644
--- a/human/audio_inference_handler.py
+++ b/human/audio_inference_handler.py
@@ -3,7 +3,6 @@ import logging
 import os
 import queue
 import time
-from queue import Queue
 from threading import Event, Thread
 
 import cv2
@@ -13,6 +12,7 @@ import torch
 from eventbus import EventBus
 from human_handler import AudioHandler
 from utils import load_model, mirror_index, get_device, SyncQueue
+from .huaman_status import HumanStatus
 
 logger = logging.getLogger(__name__)
 current_file_path = os.path.dirname(os.path.abspath(__file__))
@@ -74,7 +74,7 @@ class AudioInferenceHandler(AudioHandler):
         count_time = 0
         logger.info('start inference')
         silence_length = 133
-        # human_status = HumanStatus(length, silence_length)
+        human_status = HumanStatus(length, silence_length)
 
         device = get_device()
         logger.info(f'use device:{device}')
@@ -109,18 +109,22 @@ class AudioInferenceHandler(AudioHandler):
                 for i in range(batch_size):
                     if not self._is_running:
                         break
-                    self.on_next_handle((None, mirror_index(silence_length, index),
-                    # self.on_next_handle((None, human_status.get_index(),
+                    # self.on_next_handle((None, mirror_index(silence_length, index),
+                    self.on_next_handle((None, human_status.get_index(),
                                          audio_frames[i * 2:i * 2 + 2]), 0)
-                    index = index + 1
+                    # index = index + 1
             else:
+                human_status.start_talking()
                 logger.info(f'infer======= {current_text}')
                 # human_status.try_to_talk()
                 t = time.perf_counter()
                 img_batch = []
+                index_list = []
                 # for i in range(batch_size):
                 for i in range(len(mel_batch)):
-                    idx = mirror_index(length, index + i)
+                    # idx = mirror_index(length, index + i)
+                    idx = human_status.get_index()
+                    index_list.append(idx)
                     face = face_list_cycle[idx]
                     img_batch.append(face)
 
@@ -154,9 +158,10 @@
                     if not self._is_running:
                         break
                     self.on_next_handle(
-                        (res_frame, mirror_index(length, index), audio_frames[i * 2:i * 2 + 2]),
+                        # (res_frame, mirror_index(length, index), audio_frames[i * 2:i * 2 + 2]),
+                        (res_frame, index_list[i], audio_frames[i * 2:i * 2 + 2]),
                         0)
-                    index = index + 1
+
                 logger.info(f'total batch time: {time.perf_counter() - start_time}')
             else:
                 time.sleep(1)
diff --git a/human/huaman_status.py b/human/huaman_status.py
index a031f95..46d4c13 100644
--- a/human/huaman_status.py
+++ b/human/huaman_status.py
@@ -12,11 +12,13 @@ class HumanStatusEnum(Enum):
 
 
 class HumanStatus:
-    def __init__(self, total_frames=0, last_silence_frame=0):
+    def __init__(self, total_frames=0, silence_length=0):
         self._status = HumanStatusEnum.silence
         self._total_frames = total_frames
-        self._last_silence_frame = last_silence_frame
+        self._silence_length = silence_length
+        self._talking_length = total_frames - silence_length
         self._current_frame = 0
+        self._is_talking = False
 
     def get_status(self):
         return self._status
@@ -27,10 +29,26 @@
     def try_to_talk(self):
         if self._status == HumanStatusEnum.silence:
-            if self._current_frame - self._last_silence_frame < 0:
+            if self._current_frame - self._silence_length < 0:
                 return False
             self._status = HumanStatusEnum.talking
         return True
 
     def get_index(self):
-        return self._total_frames
+        if self._is_talking:
+            if self._current_frame < self._silence_length:
+                index = self._current_frame
+            else:
+                index = self._silence_length + (self._current_frame - self._silence_length) % self._talking_length
+        else:
+            index = self._current_frame % self._silence_length
+
+        self._current_frame = (self._current_frame + 1) % self._total_frames
+        return index
+
+    def start_talking(self):
+        self._is_talking = True
+
+    def stop_talking(self):
+        self._is_talking = False
+        self._current_frame = 0
 
diff --git a/human/human_context.py b/human/human_context.py
index cc28cc0..f6e6add 100644
--- a/human/human_context.py
+++ b/human/human_context.py
@@ -125,8 +125,8 @@ class HumanContext:
         self._tts_handle = TTSAudioSplitHandle(self, self._mal_handler)
         self._tts = TTSEdgeHttp(self._tts_handle)
         split = PunctuationSplit()
-        # self._nlp = DouBao(self, split, self._tts)
-        self._nlp = Kimi(self, split, self._tts)
+        self._nlp = DouBao(self, split, self._tts)
+        # self._nlp = Kimi(self, split, self._tts)
         self._asr = SherpaNcnnAsr()
         self._asr.attach(self._nlp)
 
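
The frame scheduling this patch introduces in get_index() can be exercised in
isolation: while idle it loops over the first silence_length frames of the
face cycle, and while talking it walks the remaining talking_length frames
before wrapping. The sketch below mirrors the patched class body from
human/huaman_status.py; the 10-frame cycle and the driver loop are invented
illustrative values, not project code.

# Minimal sketch of the patched scheduling; frame counts below are made up.

class HumanStatus:
    def __init__(self, total_frames=0, silence_length=0):
        self._total_frames = total_frames        # full length of face_list_cycle
        self._silence_length = silence_length    # frames [0, silence_length) loop while idle
        self._talking_length = total_frames - silence_length
        self._current_frame = 0
        self._is_talking = False

    def get_index(self):
        if self._is_talking:
            if self._current_frame < self._silence_length:
                index = self._current_frame
            else:
                index = (self._silence_length
                         + (self._current_frame - self._silence_length) % self._talking_length)
        else:
            index = self._current_frame % self._silence_length
        self._current_frame = (self._current_frame + 1) % self._total_frames
        return index

    def start_talking(self):
        self._is_talking = True

    def stop_talking(self):
        self._is_talking = False
        self._current_frame = 0


if __name__ == '__main__':
    status = HumanStatus(total_frames=10, silence_length=4)
    print([status.get_index() for _ in range(6)])  # [0, 1, 2, 3, 0, 1]: idle loops the silence frames
    status.start_talking()
    print([status.get_index() for _ in range(8)])  # [6, 7, 8, 9, 0, 1, 2, 3]: talking walks the tail, then wraps
    status.stop_talking()
    print([status.get_index() for _ in range(3)])  # [0, 1, 2]: reset back to the silence loop

One behavioural consequence worth noting: start_talking() does not reset
_current_frame, so the first talking index depends on where the silence loop
stopped, whereas stop_talking() snaps the cycle back to frame 0.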
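The companion change in audio_inference_handler.py records index_list because
get_index() now advances internal state on every call: each mel frame must
consume exactly one index at batch-build time, and the same indices must be
replayed when the rendered frames are emitted, where the old code could simply
recompute mirror_index(length, index) at emit time. A sketch of that
record-then-replay pattern, with render() and emit() as illustrative stand-ins
for the model forward pass and on_next_handle() (they are not project APIs):

# Illustrative harness only; reuses the HumanStatus sketch above.

def render(img_batch, mel_batch):
    # Stand-in for the lip-sync model forward pass: one output frame per mel chunk.
    return [f'rendered<face {face}>' for face in img_batch]

def run_batch(status, face_list_cycle, mel_batch, emit):
    status.start_talking()
    index_list = []                      # face index consumed per mel frame
    img_batch = []
    for _ in mel_batch:
        idx = status.get_index()         # stateful: call exactly once per frame
        index_list.append(idx)
        img_batch.append(face_list_cycle[idx])
    for res_frame, idx in zip(render(img_batch, mel_batch), index_list):
        emit(res_frame, idx)             # re-pair each output with the index that produced it

# e.g., with the sketch above:
# run_batch(HumanStatus(10, 4), list(range(10)), [object()] * 5, print)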