modify render talking

jiegeaiai 2024-12-06 08:27:18 +08:00
parent 31f9ec50cb
commit 0eefa7b1ce
3 changed files with 37 additions and 14 deletions

View File

@@ -3,7 +3,6 @@ import logging
 import os
 import queue
 import time
-from queue import Queue
 from threading import Event, Thread
 import cv2
@@ -13,6 +12,7 @@ import torch
 from eventbus import EventBus
 from human_handler import AudioHandler
 from utils import load_model, mirror_index, get_device, SyncQueue
+from .huaman_status import HumanStatus

 logger = logging.getLogger(__name__)
 current_file_path = os.path.dirname(os.path.abspath(__file__))
@@ -74,7 +74,7 @@ class AudioInferenceHandler(AudioHandler):
 count_time = 0
 logger.info('start inference')
 silence_length = 133
-# human_status = HumanStatus(length, silence_length)
+human_status = HumanStatus(length, silence_length)
 device = get_device()
 logger.info(f'use device:{device}')
@@ -109,18 +109,22 @@ class AudioInferenceHandler(AudioHandler):
     for i in range(batch_size):
         if not self._is_running:
             break
-        self.on_next_handle((None, mirror_index(silence_length, index),
-        # self.on_next_handle((None, human_status.get_index(),
+        # self.on_next_handle((None, mirror_index(silence_length, index),
+        self.on_next_handle((None, human_status.get_index(),
                              audio_frames[i * 2:i * 2 + 2]), 0)
-        index = index + 1
+        # index = index + 1
 else:
+    human_status.start_talking()
     logger.info(f'infer======= {current_text}')
     # human_status.try_to_talk()
     t = time.perf_counter()
     img_batch = []
+    index_list = []
     # for i in range(batch_size):
     for i in range(len(mel_batch)):
-        idx = mirror_index(length, index + i)
+        # idx = mirror_index(length, index + i)
+        idx = human_status.get_index()
+        index_list.append(idx)
         face = face_list_cycle[idx]
         img_batch.append(face)
@@ -154,9 +158,10 @@ class AudioInferenceHandler(AudioHandler):
         if not self._is_running:
             break
         self.on_next_handle(
-            (res_frame, mirror_index(length, index), audio_frames[i * 2:i * 2 + 2]),
+            # (res_frame, mirror_index(length, index), audio_frames[i * 2:i * 2 + 2]),
+            (res_frame, index_list[i], audio_frames[i * 2:i * 2 + 2]),
             0)
         index = index + 1
     logger.info(f'total batch time: {time.perf_counter() - start_time}')
 else:
     time.sleep(1)
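
Note on the render-index change above: human_status.get_index() advances internal state on every call, so the index used to select a face for inference is cached in index_list and replayed when the corresponding rendered frame is emitted; calling get_index() again at emit time would return a different frame. A minimal sketch of this capture-then-replay pattern (the Counter class and frame strings are illustrative stand-ins, not from the codebase):

# Sketch: a stateful index source must be sampled once per item and cached,
# because a second call would advance past the value that was actually used.
class Counter:
    def __init__(self):
        self._i = 0

    def get_index(self):
        i = self._i      # value used for this frame
        self._i += 1     # state advances on every call
        return i

status = Counter()
index_list = [status.get_index() for _ in range(4)]   # capture at selection time
# ... inference on the selected faces happens here ...
for i, frame in enumerate(['f0', 'f1', 'f2', 'f3']):
    print(frame, index_list[i])   # replay the cached index for each result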

View File

@@ -12,11 +12,13 @@ class HumanStatusEnum(Enum):
 class HumanStatus:
-    def __init__(self, total_frames=0, last_silence_frame=0):
+    def __init__(self, total_frames=0, silence_length=0):
         self._status = HumanStatusEnum.silence
         self._total_frames = total_frames
-        self._last_silence_frame = last_silence_frame
+        self._silence_length = silence_length
+        self._talking_length = total_frames - silence_length
         self._current_frame = 0
+        self._is_talking = False

     def get_status(self):
         return self._status
@@ -27,10 +29,26 @@ class HumanStatus:
     def try_to_talk(self):
         if self._status == HumanStatusEnum.silence:
-            if self._current_frame - self._last_silence_frame < 0:
+            if self._current_frame - self._silence_length < 0:
                 return False
             self._status = HumanStatusEnum.talking
         return True

     def get_index(self):
-        return self._total_frames
+        if self._is_talking:
+            if self._current_frame < self._silence_length:
+                index = self._current_frame
+            else:
+                index = self._silence_length + (self._current_frame - self._silence_length) % self._talking_length
+        else:
+            index = self._current_frame % self._silence_length
+        self._current_frame = (self._current_frame + 1) % self._total_frames
+        return index
+
+    def start_talking(self):
+        self._is_talking = True
+
+    def stop_talking(self):
+        self._is_talking = False
+        self._current_frame = 0
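
For reference, the new HumanStatus.get_index() treats the source frames as an idle prefix (the first silence_length frames) followed by a talking tail of talking_length frames: while silent only the idle prefix loops, and while talking the index runs forward and wraps within the talking tail. A standalone restatement of that arithmetic (total_frames=10 and silence_length=4 are illustrative values only):

class HumanStatusSketch:
    # Copy of the new get_index() logic, for illustration.
    def __init__(self, total_frames, silence_length):
        self._total_frames = total_frames
        self._silence_length = silence_length
        self._talking_length = total_frames - silence_length
        self._current_frame = 0
        self._is_talking = False

    def start_talking(self):
        self._is_talking = True

    def get_index(self):
        if self._is_talking:
            if self._current_frame < self._silence_length:
                index = self._current_frame   # idle prefix plays through once
            else:                             # then the talking tail loops
                index = self._silence_length + (self._current_frame - self._silence_length) % self._talking_length
        else:
            index = self._current_frame % self._silence_length   # loop idle prefix only
        self._current_frame = (self._current_frame + 1) % self._total_frames
        return index

s = HumanStatusSketch(total_frames=10, silence_length=4)
print([s.get_index() for _ in range(6)])   # [0, 1, 2, 3, 0, 1] while silent
s.start_talking()
print([s.get_index() for _ in range(4)])   # [6, 7, 8, 9] inside the talking tail

Note that stop_talking() also resets _current_frame to 0, so playback restarts from the beginning of the idle loop once speech ends.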

View File

@@ -125,8 +125,8 @@ class HumanContext:
         self._tts_handle = TTSAudioSplitHandle(self, self._mal_handler)
         self._tts = TTSEdgeHttp(self._tts_handle)
         split = PunctuationSplit()
-        # self._nlp = DouBao(self, split, self._tts)
-        self._nlp = Kimi(self, split, self._tts)
+        self._nlp = DouBao(self, split, self._tts)
+        # self._nlp = Kimi(self, split, self._tts)
         self._asr = SherpaNcnnAsr()
         self._asr.attach(self._nlp)
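
The last hunk swaps the NLP backend from Kimi back to DouBao while keeping the same pipeline wiring: the ASR is the subject and the NLP handler is attached as its observer, so recognized text flows ASR -> NLP -> sentence split -> TTS. A minimal sketch of that attach/notify pattern (all class and method names besides attach are hypothetical stand-ins, not the project's real interfaces):

# Hypothetical observer wiring mirroring _asr.attach(self._nlp).
class SketchAsr:
    def __init__(self):
        self._observers = []

    def attach(self, observer):
        self._observers.append(observer)

    def on_recognized(self, text):
        for ob in self._observers:
            ob.handle(text)   # push recognized text downstream

class SketchNlp:
    def handle(self, text):
        print('nlp got:', text)   # a real handler would split and forward to TTS

asr = SketchAsr()
asr.attach(SketchNlp())
asr.on_recognized('hello world')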