modify render talking

parent 31f9ec50cb
commit 0eefa7b1ce
@@ -3,7 +3,6 @@ import logging
 import os
 import queue
 import time
-from queue import Queue
 from threading import Event, Thread
 
 import cv2
@@ -13,6 +12,7 @@ import torch
 from eventbus import EventBus
 from human_handler import AudioHandler
 from utils import load_model, mirror_index, get_device, SyncQueue
+from .huaman_status import HumanStatus
 
 logger = logging.getLogger(__name__)
 current_file_path = os.path.dirname(os.path.abspath(__file__))
@@ -74,7 +74,7 @@ class AudioInferenceHandler(AudioHandler):
         count_time = 0
         logger.info('start inference')
         silence_length = 133
-        # human_status = HumanStatus(length, silence_length)
+        human_status = HumanStatus(length, silence_length)
 
         device = get_device()
         logger.info(f'use device:{device}')
@@ -109,18 +109,22 @@ class AudioInferenceHandler(AudioHandler):
                     for i in range(batch_size):
                         if not self._is_running:
                             break
-                        self.on_next_handle((None, mirror_index(silence_length, index),
-                        # self.on_next_handle((None, human_status.get_index(),
+                        # self.on_next_handle((None, mirror_index(silence_length, index),
+                        self.on_next_handle((None, human_status.get_index(),
                                              audio_frames[i * 2:i * 2 + 2]), 0)
-                        index = index + 1
+                        # index = index + 1
                 else:
+                    human_status.start_talking()
                     logger.info(f'infer======= {current_text}')
                     # human_status.try_to_talk()
                     t = time.perf_counter()
                     img_batch = []
+                    index_list = []
                     # for i in range(batch_size):
                     for i in range(len(mel_batch)):
-                        idx = mirror_index(length, index + i)
+                        # idx = mirror_index(length, index + i)
+                        idx = human_status.get_index()
+                        index_list.append(idx)
                         face = face_list_cycle[idx]
                         img_batch.append(face)
 
@@ -154,9 +158,10 @@ class AudioInferenceHandler(AudioHandler):
                         if not self._is_running:
                             break
                         self.on_next_handle(
-                            (res_frame, mirror_index(length, index), audio_frames[i * 2:i * 2 + 2]),
+                            # (res_frame, mirror_index(length, index), audio_frames[i * 2:i * 2 + 2]),
+                            (res_frame, index_list[i], audio_frames[i * 2:i * 2 + 2]),
                             0)
-                        index = index + 1
+
                     logger.info(f'total batch time: {time.perf_counter() - start_time}')
             else:
                 time.sleep(1)
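
The two hunks above move frame selection from recomputing mirror_index(length, index) at emit time to drawing indices from human_status.get_index() while the batch is assembled, remembering them in index_list so each inferred res_frame is paired with the exact face index it was rendered from. A minimal sketch of the pattern, where infer() and emit() are hypothetical stand-ins for the model call and on_next_handle, not the handler's real API:

def render_batch(human_status, face_list_cycle, mel_batch, infer, emit):
    # Build the batch, recording the face index used for every slot.
    img_batch, index_list = [], []
    for _ in range(len(mel_batch)):
        idx = human_status.get_index()          # advances the shared frame cursor
        index_list.append(idx)
        img_batch.append(face_list_cycle[idx])
    # Emit each result with the same index that produced it.
    for i, res_frame in enumerate(infer(img_batch, mel_batch)):
        emit(res_frame, index_list[i])
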
@@ -12,11 +12,13 @@ class HumanStatusEnum(Enum):
 
 
 class HumanStatus:
-    def __init__(self, total_frames=0, last_silence_frame=0):
+    def __init__(self, total_frames=0, silence_length=0):
         self._status = HumanStatusEnum.silence
         self._total_frames = total_frames
-        self._last_silence_frame = last_silence_frame
+        self._silence_length = silence_length
+        self._talking_length = total_frames - silence_length
         self._current_frame = 0
+        self._is_talking = False
 
     def get_status(self):
         return self._status
@@ -27,10 +29,26 @@ class HumanStatus:
 
     def try_to_talk(self):
         if self._status == HumanStatusEnum.silence:
-            if self._current_frame - self._last_silence_frame < 0:
+            if self._current_frame - self._silence_length < 0:
                 return False
             self._status = HumanStatusEnum.talking
         return True
 
     def get_index(self):
-        return self._total_frames
+        if self._is_talking:
+            if self._current_frame < self._silence_length:
+                index = self._current_frame
+            else:
+                index = self._silence_length + (self._current_frame - self._silence_length) % self._talking_length
+        else:
+            index = self._current_frame % self._silence_length
+
+        self._current_frame = (self._current_frame + 1) % self._total_frames
+        return index
+
+    def start_talking(self):
+        self._is_talking = True
+
+    def stop_talking(self):
+        self._is_talking = False
+        self._current_frame = 0
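
The rewritten get_index() treats the face list as a leading silence segment followed by a talking segment, driven by a single frame cursor that wraps at total_frames. A usage sketch of the sequences it produces, with made-up sizes (10 frames total, the first 4 silent) and assuming the HumanStatus class above is importable:

status = HumanStatus(total_frames=10, silence_length=4)

# Silent: the index loops inside the 4 silence frames while the
# cursor keeps advancing modulo total_frames.
print([status.get_index() for _ in range(6)])   # [0, 1, 2, 3, 0, 1]

status.start_talking()
# Talking: the index follows the cursor across the whole list, so
# playback continues from frame 6 and wraps through all 10 frames.
print([status.get_index() for _ in range(6)])   # [6, 7, 8, 9, 0, 1]

status.stop_talking()   # rewinds the cursor to frame 0
print([status.get_index() for _ in range(3)])   # [0, 1, 2]
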
@@ -125,8 +125,8 @@ class HumanContext:
         self._tts_handle = TTSAudioSplitHandle(self, self._mal_handler)
         self._tts = TTSEdgeHttp(self._tts_handle)
         split = PunctuationSplit()
-        # self._nlp = DouBao(self, split, self._tts)
-        self._nlp = Kimi(self, split, self._tts)
+        self._nlp = DouBao(self, split, self._tts)
+        # self._nlp = Kimi(self, split, self._tts)
         self._asr = SherpaNcnnAsr()
         self._asr.attach(self._nlp)
 