modify render talking

jiegeaiai 2024-12-06 08:27:18 +08:00
parent 31f9ec50cb
commit 0eefa7b1ce
3 changed files with 37 additions and 14 deletions

View File

@@ -3,7 +3,6 @@ import logging
 import os
 import queue
 import time
-from queue import Queue
 from threading import Event, Thread
 
 import cv2
@@ -13,6 +12,7 @@ import torch
 from eventbus import EventBus
 from human_handler import AudioHandler
 from utils import load_model, mirror_index, get_device, SyncQueue
+from .huaman_status import HumanStatus
 
 logger = logging.getLogger(__name__)
 current_file_path = os.path.dirname(os.path.abspath(__file__))
@@ -74,7 +74,7 @@ class AudioInferenceHandler(AudioHandler):
         count_time = 0
         logger.info('start inference')
 
         silence_length = 133
-        # human_status = HumanStatus(length, silence_length)
+        human_status = HumanStatus(length, silence_length)
         device = get_device()
         logger.info(f'use device:{device}')
@@ -109,18 +109,22 @@
                 for i in range(batch_size):
                     if not self._is_running:
                         break
-                    self.on_next_handle((None, mirror_index(silence_length, index),
-                    # self.on_next_handle((None, human_status.get_index(),
+                    # self.on_next_handle((None, mirror_index(silence_length, index),
+                    self.on_next_handle((None, human_status.get_index(),
                                          audio_frames[i * 2:i * 2 + 2]), 0)
-                    index = index + 1
+                    # index = index + 1
             else:
+                human_status.start_talking()
                 logger.info(f'infer======= {current_text}')
                 # human_status.try_to_talk()
                 t = time.perf_counter()
                 img_batch = []
                 index_list = []
                 # for i in range(batch_size):
                 for i in range(len(mel_batch)):
-                    idx = mirror_index(length, index + i)
+                    # idx = mirror_index(length, index + i)
+                    idx = human_status.get_index()
                     index_list.append(idx)
                     face = face_list_cycle[idx]
                     img_batch.append(face)
@@ -154,9 +158,10 @@
                     if not self._is_running:
                         break
                     self.on_next_handle(
-                        (res_frame, mirror_index(length, index), audio_frames[i * 2:i * 2 + 2]),
+                        # (res_frame, mirror_index(length, index), audio_frames[i * 2:i * 2 + 2]),
+                        (res_frame, index_list[i], audio_frames[i * 2:i * 2 + 2]),
                         0)
                     index = index + 1
                 logger.info(f'total batch time: {time.perf_counter() - start_time}')
             else:
                 time.sleep(1)
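
Note: mirror_index, which human_status.get_index() replaces above, is defined in utils and is not part of this diff. For reference, a minimal sketch of the usual ping-pong indexing it performs (an assumption about its behaviour, not code from this commit):

    def mirror_index(size, index):
        # Fold an ever-growing frame counter into 0..size-1, reversing
        # direction on every pass so playback never jumps from the last
        # frame back to the first.
        turn, res = divmod(index, size)
        return res if turn % 2 == 0 else size - res - 1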

View File

@@ -12,11 +12,13 @@ class HumanStatusEnum(Enum):
 
 
 class HumanStatus:
-    def __init__(self, total_frames=0, last_silence_frame=0):
+    def __init__(self, total_frames=0, silence_length=0):
         self._status = HumanStatusEnum.silence
         self._total_frames = total_frames
-        self._last_silence_frame = last_silence_frame
+        self._silence_length = silence_length
+        self._talking_length = total_frames - silence_length
         self._current_frame = 0
+        self._is_talking = False
 
     def get_status(self):
         return self._status
@@ -27,10 +29,26 @@
 
     def try_to_talk(self):
         if self._status == HumanStatusEnum.silence:
-            if self._current_frame - self._last_silence_frame < 0:
+            if self._current_frame - self._silence_length < 0:
                 return False
             self._status = HumanStatusEnum.talking
         return True
 
     def get_index(self):
-        return self._total_frames
+        if self._is_talking:
+            if self._current_frame < self._silence_length:
+                index = self._current_frame
+            else:
+                index = self._silence_length + (self._current_frame - self._silence_length) % self._talking_length
+        else:
+            index = self._current_frame % self._silence_length
+        self._current_frame = (self._current_frame + 1) % self._total_frames
+        return index
+
+    def start_talking(self):
+        self._is_talking = True
+
+    def stop_talking(self):
+        self._is_talking = False
+        self._current_frame = 0
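
A quick trace of the new get_index() above (a standalone sketch; total_frames=10 and silence_length=4 are invented for illustration):

    from huaman_status import HumanStatus  # import path may differ per package layout

    status = HumanStatus(total_frames=10, silence_length=4)

    # While silent, the index loops inside the silence segment [0, 4):
    print([status.get_index() for _ in range(6)])  # [0, 1, 2, 3, 0, 1]

    # _current_frame has meanwhile advanced to 6, so talking resumes there,
    # plays out the talking segment [4, 10), then wraps through the head
    # of the clip again:
    status.start_talking()
    print([status.get_index() for _ in range(6)])  # [6, 7, 8, 9, 0, 1]

    # stop_talking() resets playback to the start of the silence segment:
    status.stop_talking()
    print(status.get_index())  # 0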

View File

@@ -125,8 +125,8 @@ class HumanContext:
         self._tts_handle = TTSAudioSplitHandle(self, self._mal_handler)
         self._tts = TTSEdgeHttp(self._tts_handle)
         split = PunctuationSplit()
-        # self._nlp = DouBao(self, split, self._tts)
-        self._nlp = Kimi(self, split, self._tts)
+        self._nlp = DouBao(self, split, self._tts)
+        # self._nlp = Kimi(self, split, self._tts)
 
         self._asr = SherpaNcnnAsr()
         self._asr.attach(self._nlp)