[ADD] Add loop-frame playback logic

This commit is contained in:
jocelyn 2025-06-10 15:04:35 +08:00
parent 2bd94b9680
commit c2871cac69
5 changed files with 468 additions and 14 deletions

View File

@ -13,20 +13,24 @@ from eventbus import EventBus
from human_handler import AudioHandler from human_handler import AudioHandler
from utils import load_model, mirror_index, get_device, SyncQueue from utils import load_model, mirror_index, get_device, SyncQueue
from .huaman_status import HumanStatus from .huaman_status import HumanStatus
from utils.log import logger
logger = logging.getLogger(__name__)
# logger = logging.getLogger(__name__)
current_file_path = os.path.dirname(os.path.abspath(__file__)) current_file_path = os.path.dirname(os.path.abspath(__file__))
class AudioInferenceHandler(AudioHandler): class AudioInferenceHandler(AudioHandler):
def __init__(self, context, handler): def __init__(self, context, handler, person_config):
super().__init__(context, handler) super().__init__(context, handler)
EventBus().register('stop', self._on_stop) EventBus().register('stop', self._on_stop)
self._mal_queue = SyncQueue(1, "AudioInferenceHandler_Mel") self._mal_queue = SyncQueue(1, "AudioInferenceHandler_Mel")
self._audio_queue = SyncQueue(context.batch_size * 2, "AudioInferenceHandler_Audio") self._audio_queue = SyncQueue(context.batch_size * 2, "AudioInferenceHandler_Audio")
self.person_config = person_config
self._is_running = True self._is_running = True
self.last_direction = 1
self._exit_event = Event() self._exit_event = Event()
self._run_thread = Thread(target=self.__on_run, name="AudioInferenceHandlerThread") self._run_thread = Thread(target=self.__on_run, name="AudioInferenceHandlerThread")
self._exit_event.set() self._exit_event.set()
@ -100,12 +104,12 @@ class AudioInferenceHandler(AudioHandler):
break break
if is_all_silence: if is_all_silence:
for i in range(batch_size): frame_indexes, self.last_direction = human_status.get_index_v2(self.person_config["frame_config"], self.last_direction, batch_size) # [1,3,4]
for i, frame_idx in zip(range(batch_size), frame_indexes):
if not self._is_running: if not self._is_running:
break break
# self.on_next_handle((None, mirror_index(length, index), # self.on_next_handle((None, mirror_index(length, index),
self.on_next_handle((None, human_status.get_index(), self.on_next_handle((None,frame_idx, audio_frames[i * 2:i * 2 + 2]), 0)
audio_frames[i * 2:i * 2 + 2]), 0)
# index = index + 1 # index = index + 1
else: else:
human_status.start_talking() human_status.start_talking()
@ -115,11 +119,12 @@ class AudioInferenceHandler(AudioHandler):
img_batch = [] img_batch = []
index_list = [] index_list = []
# for i in range(batch_size): # for i in range(batch_size):
for i in range(len(mel_batch)): frame_indexes,self.last_direction = human_status.get_index_v2(self.person_config["frame_config"], self.last_direction, batch_size) # [1,3,4]
# TODO: 推理状态下获取循环帧逻辑
for i, frame_idx in zip(range(len(mel_batch)), frame_indexes):
# idx = mirror_index(length, index + i) # idx = mirror_index(length, index + i)
idx = human_status.get_index() index_list.append(frame_idx)
index_list.append(idx) face = face_list_cycle[frame_idx]
face = face_list_cycle[idx]
img_batch.append(face) img_batch.append(face)
# print('orign img_batch:', len(img_batch), 'origin mel_batch:', len(mel_batch)) # print('orign img_batch:', len(img_batch), 'origin mel_batch:', len(mel_batch))

View File

@ -1,8 +1,5 @@
#encoding = utf8 #encoding = utf8
from utils.loop_frame_tool import play_in_loop_v2
import logging
from enum import Enum from enum import Enum
@ -20,6 +17,9 @@ class HumanStatus:
self._current_frame = 0 self._current_frame = 0
self._is_talking = False self._is_talking = False
self.last_frame_talking_status = "silent" # 记录上一帧讲话状态
self.next_frame_talking_status = "silent"
def get_status(self): def get_status(self):
return self._status return self._status
@ -51,6 +51,27 @@ class HumanStatus:
self._current_frame = (self._current_frame + 1) % self._total_frames self._current_frame = (self._current_frame + 1) % self._total_frames
return index return index
def get_index_v2(self, frame_config: list, last_direction: int = 1, batch_size: int = 5):
    """Return one batch of frame indexes chosen by the loop-frame walker.

    Args:
        frame_config: loop-frame segments ``[[start, end, loop_flag], ...]``.
        last_direction: direction used by the previous batch (1 forward, 0 backward).
        batch_size: number of frame indexes to return.

    Returns:
        (frame_indexes, direction): the batch of indexes and the direction
        in effect after the last one.
    """
    audio_frame_length = batch_size
    is_silent = not self._is_talking
    # First talking batch: previous frame was silent.
    first_speak = self._is_talking and self.last_frame_talking_status == "silent"
    # Last talking batch: talking has just ended.
    last_speak = (
        self.last_frame_talking_status == "talk"
        and self.next_frame_talking_status == "silent"
    )
    start_idx_list, last_direction = play_in_loop_v2(
        frame_config,
        # BUG FIX: the original passed the undefined local ``startfrom``
        # (NameError on every call); resume from the current position.
        self._current_frame,
        audio_frame_length,
        last_direction,
        is_silent,
        first_speak,  # just started talking -> move toward action frames
        last_speak,   # talking ended -> move back toward silent frames
    )
    # Persist the walker position so the next batch continues from here.
    # NOTE(review): the original assigned a local ``startfrom`` that was
    # discarded; ``self._current_frame`` looks like the intended store —
    # confirm it does not conflict with get_index() callers.
    self._current_frame = start_idx_list[-1]
    return start_idx_list, last_direction
def start_talking(self): def start_talking(self):
self._is_talking = True self._is_talking = True

View File

@ -47,6 +47,10 @@ class HumanContext:
self._m_frames = m_frames self._m_frames = m_frames
self._inv_m_frames = inv_m_frames self._inv_m_frames = inv_m_frames
face_images_length = len(self._face_list_cycle) face_images_length = len(self._face_list_cycle)
# TODO: get person config
self.person_config ={
"frame_config": [[1,face_frames-1, True]],
}
logging.info(f'face images length: {face_images_length}') logging.info(f'face images length: {face_images_length}')
print(f'face images length: {face_images_length}') print(f'face images length: {face_images_length}')
@ -113,7 +117,7 @@ class HumanContext:
def build(self, render_handler): def build(self, render_handler):
self._render_handler = render_handler self._render_handler = render_handler
self._infer_handler = AudioInferenceHandler(self, self._render_handler) self._infer_handler = AudioInferenceHandler(self, self._render_handler, self.person_config)
self._mal_handler = AudioMalHandler(self, self._infer_handler) self._mal_handler = AudioMalHandler(self, self._infer_handler)
self._tts_handle = TTSAudioSplitHandle(self, self._mal_handler) self._tts_handle = TTSAudioSplitHandle(self, self._mal_handler)
self._tts = TTSEdgeHttp(self._tts_handle) self._tts = TTSEdgeHttp(self._tts_handle)

86
utils/log.py Normal file
View File

@ -0,0 +1,86 @@
import logging
import os
import sys
from loguru import logger as logurulogger
import json
# Loguru log-line format: level, process/thread names, timestamp, pid,
# module.function:line, then the message. The color tags (<level>,
# <green>, ...) are rendered only on colorized sinks (stdout).
LOG_FORMAT = (
    "<level>{level: <8}</level> "
    "{process.name} | "  # process name
    "{thread.name} | "
    "<green>{time:YYYY-MM-DD HH:mm:ss.SSS}</green> - "
    "<blue>{process}</blue> "
    "<cyan>{module}</cyan>.<cyan>{function}</cyan>:<cyan>{line}</cyan> - "
    "<level>{message}</level>"
)
# Stdlib loggers whose handlers get redirected into loguru.
LOG_NAME = ["uvicorn", "uvicorn.access", "uvicorn.error", "flask"]
class InterceptHandler(logging.Handler):
    """Bridge stdlib ``logging`` records into loguru."""

    def emit(self, record):
        # Map the stdlib level name onto loguru's; fall back to the
        # numeric level's name when loguru does not know the level.
        try:
            level = logurulogger.level(record.levelname).name
        except AttributeError:
            level = logging._levelToName[record.levelno]

        # Walk up the stack past the logging module's own frames so the
        # reported call site is the caller's code, not logging internals.
        depth = 2
        frame = logging.currentframe()
        while frame.f_code.co_filename == logging.__file__:
            frame = frame.f_back
            depth += 1

        logurulogger.opt(depth=depth, exception=record.exc_info).log(
            level, record.getMessage()
        )
class Logging:
    """Project logger setup.

    Configures loguru with daily-rotated ``error``/``info`` file sinks
    plus a colorized DEBUG sink on stdout, and redirects well-known
    stdlib loggers (uvicorn, flask) into loguru.
    """

    def __init__(self):
        self.log_path = "logs"
        os.makedirs(self.log_path, exist_ok=True)
        self._initlogger()
        self._reset_log_handler()

    def _initlogger(self):
        """Install the loguru sinks (files + stdout)."""
        logurulogger.remove()
        # (filename pattern, level, sink-specific extras)
        file_sinks = [
            (
                "error.log.{time:YYYY-MM-DD}",
                logging.ERROR,
                {"backtrace": True, "diagnose": True},
            ),
            ("info.log.{time:YYYY-MM-DD}", logging.INFO, {}),
        ]
        for pattern, level, extras in file_sinks:
            # Rotate at midnight, keep one week, write via a queue so
            # logging from multiple processes/threads is safe.
            logurulogger.add(
                os.path.join(self.log_path, pattern),
                format=LOG_FORMAT,
                level=level,
                rotation="00:00",
                retention="1 week",
                enqueue=True,
                **extras,
            )
        logurulogger.add(
            sys.stdout,
            format=LOG_FORMAT,
            level=logging.DEBUG,
            colorize=True,
        )
        self.logger = logurulogger

    def _reset_log_handler(self):
        # Route records from the listed stdlib loggers through loguru.
        for name in LOG_NAME:
            logging.getLogger(name).handlers = [InterceptHandler()]

    def getlogger(self):
        return self.logger
# Module-level singleton: import ``logger`` from this module everywhere.
logger = Logging().getlogger()

338
utils/loop_frame_tool.py Normal file
View File

@ -0,0 +1,338 @@
from utils.log import logger
def play_in_loop_v2(
    segments,
    startfrom,
    batch_num,
    last_direction,
    is_silent,
    first_speak,
    last_speak,
):
    """Walk the configured loop segments and pick ``batch_num`` frame indexes.

    Design rules (translated from the original notes):
      1. While silent, loop inside the silent segment: forward from its
         left border, backward from its right border.
      2. Silent -> talking: move toward the nearest action segment.
      3. Talking -> silent: finish the action segment before returning
         to a silent one (the silent segment does not loop while speech
         continues).
      4. At the very first frame of the video only forward is possible;
         at the very last frame only backward.

    Args:
        segments: loop-frame config ``[[start, end, loop_flag], ...]``.
        startfrom: current position (last frame of the previous batch).
        batch_num: number of frame indexes to produce.
        last_direction: 1 forward, 0 backward.
        is_silent: truthy when the current batch is silent.
        first_speak: True on the first talking batch.
        last_speak: True on the batch where talking ends.

    Returns:
        (frames, direction): the chosen frame indexes and the direction
        in effect after the last one (``last_direction`` unchanged when
        ``batch_num`` is 0).
    """
    total = batch_num  # initial batch size, used to flag the first frame
    frames = []
    cur_pos = startfrom
    cur_direction = last_direction
    is_first_speak_frame = first_speak
    is_last_speak_frame = last_speak and batch_num == 1
    while batch_num != 0:
        # Which configured segment the current position falls in.
        sub_seg_idx = subseg_judge(cur_pos, segments)
        # Decide the movement direction (may also adjust the position).
        next_direction, next_pos = get_next_direction(
            segments,
            cur_pos,
            cur_direction,
            is_silent,
            sub_seg_idx,
            is_first_speak_frame,
            is_last_speak_frame,
        )
        # Step one frame in that direction.
        next_pos = get_next_frame(next_pos, next_direction)
        frames.append(next_pos)
        batch_num -= 1
        # BUG FIX: the original compared against the undefined name
        # ``config.batch_size`` (NameError). The intent is that only the
        # very first frame of a first-speak batch carries the flag, so
        # compare against the remembered initial batch size (always
        # False after the first decrement).
        is_first_speak_frame = first_speak and batch_num == total
        is_last_speak_frame = last_speak and batch_num == 1
        cur_direction = next_direction
        cur_pos = next_pos
    # BUG FIX: returning ``next_direction`` raised NameError when
    # batch_num was 0; ``cur_direction`` equals it after any iteration
    # and falls back to ``last_direction`` otherwise.
    return frames, cur_direction
def subseg_judge(cur_pos, segments):
    """Return the index of the segment containing ``cur_pos``.

    Falls back to 0 when ``cur_pos`` is 0 (i.e. before the first
    configured segment); any other out-of-range position yields None.
    """
    for idx, (left, right, _loop_flag) in enumerate(segments):
        if left <= cur_pos <= right:
            return idx
    return 0 if cur_pos == 0 else None
def get_next_direction(
    segments,
    cur_pos,
    cur_direction,
    is_silent,
    sub_seg_idx,
    is_first_speak_frame: bool = False,
    is_last_speak_frame: bool = False,
):
    """Dispatch to the per-state direction policy.

    Picks one of four handlers based on whether the current segment is
    loopable (a silent-capable segment) and whether the avatar is
    currently silent:

      * loopable + silent  -> ``pure_silent``   (keep looping in place)
      * loopable + talking -> ``silent2action`` (head toward an action segment)
      * action   + silent  -> ``action2silent`` (finish, then go silent)
      * action   + talking -> ``pure_action``   (keep playing the action)

    Args:
        segments: loop-frame config ``[[start, end, loop_flag], ...]``.
        cur_pos: current frame index.
        cur_direction: 1 forward, 0 backward.
        is_silent: truthy when the current batch is silent.
        sub_seg_idx: index into ``segments`` covering ``cur_pos``.
        is_first_speak_frame: flag for the first frame of speech.
        is_last_speak_frame: flag for the last frame of speech.

    Returns:
        (next_direct, next_pos) chosen by the selected policy.

    NOTE(review): ``is_silent`` is compared against 1/0, so bools work,
    but any other value skips both branches and the final ``return``
    raises ``UnboundLocalError`` — confirm callers only pass bool/0/1.
    """
    left, right, loop_flag = segments[sub_seg_idx]
    if loop_flag:
        # Loopable (silent-capable) segment.
        if is_silent == 1:
            next_direct, next_pos = pure_silent(
                segments, left, right, cur_pos, cur_direction, sub_seg_idx
            )
            logger.debug(
                f"cur_pos{cur_pos}, next_direct:{next_direct}, is_first_speak_frame:{is_first_speak_frame}"
            )
        elif is_silent == 0:
            next_direct, next_pos = silent2action(
                segments,
                left,
                right,
                cur_pos,
                cur_direction,
                sub_seg_idx,
                is_first_speak_frame,
            )
            logger.debug(
                f"cur_pos:{cur_pos}, next_direct:{next_direct}, is_first_speak_frame{is_first_speak_frame}"
            )
    else:
        # Action (non-loopable) segment.
        if is_silent == 1:
            next_direct, next_pos = action2silent(
                segments,
                left,
                right,
                cur_pos,
                cur_direction,
                sub_seg_idx,
                is_last_speak_frame,
            )
            logger.debug(
                f"cur_pos{cur_pos} next_direct:{next_direct},is_first_speak_frame{is_first_speak_frame},is_last_speak_frame:{is_last_speak_frame}"
            )
        elif is_silent == 0:
            next_direct, next_pos = pure_action(
                segments,
                left,
                right,
                cur_pos,
                cur_direction,
                sub_seg_idx,
                is_last_speak_frame,
            )
            logger.debug(
                f"cur_pos{cur_pos}, next_direct:{next_direct},is_first_speak_frame{is_first_speak_frame}, is_last_speak_frame:{is_last_speak_frame}"
            )
    return next_direct, next_pos
def get_next_frame(cur_pos, cur_direction):
    """Step one frame from ``cur_pos`` along ``cur_direction``.

    The caller guarantees the target frame exists, so no boundary
    checks are done here. 1 means forward, 0 backward; any other
    direction yields None.
    """
    step = {1: 1, 0: -1}.get(cur_direction)
    return None if step is None else cur_pos + step
def pure_silent(segments, left, right, cur_pos, cur_direction, sub_seg_idx):
    """Direction choice while silent inside a loopable segment.

    Bounce between the segment borders: forward from the left border,
    backward from the right border, otherwise keep the previous
    direction.

    Returns:
        (next_direction, cur_pos)
    """
    # Hard borders of the whole video force a direction.
    if cur_pos == segments[0][0]:
        return 1, cur_pos
    if cur_pos == segments[-1][1]:
        return 0, cur_pos
    # Borders of the current segment bounce the playhead back.
    if cur_pos == right:
        return 0, cur_pos
    if cur_pos == left:
        return 1, cur_pos
    # Interior: keep going forward if we were; otherwise go backward.
    keep_forward = cur_pos > left and cur_direction == 1
    return (1, cur_pos) if keep_forward else (0, cur_pos)
def pure_action(
    segments, left, right, cur_pos, cur_direction, sub_seg_idx, is_last_speak_frame
):
    """Direction choice while talking inside a non-loopable (action) segment.

    Mid-speech the playback direction is kept; when speech has just
    ended the playhead heads for the nearest silent segment (jumping
    behavior across segments is allowed).

    Args:
        is_last_speak_frame: True on the frame where speech ends.

    Returns:
        (next_direction, cur_pos)
    """
    # Whole-video borders force a direction.
    if cur_pos == segments[0][0]:
        return 1, cur_pos
    if cur_pos == segments[-1][1]:
        return 0, cur_pos
    if not is_last_speak_frame:
        # Mid-speech: stay on the current playback direction.
        return (1, cur_pos) if cur_direction == 1 else (0, cur_pos)
    # Speech just ended: leave toward the nearest silent segment.
    if sub_seg_idx == len(segments) - 1:
        return 0, cur_pos  # action segment is last -> search backward
    if sub_seg_idx == 0:
        return 1, cur_pos  # action segment is first -> search forward
    # Action segment in the middle: pick the closer side.
    midpoint = left + (right - left + 1) // 2
    return (0, cur_pos) if cur_pos < midpoint else (1, cur_pos)
def silent2action(
    segments,
    left,
    right,
    cur_pos,
    cur_direction,
    sub_seg_idx,
    is_first_speak_frame: bool = False,
):
    """Direction choice when talking while inside a silent (loopable) segment.

    Moves toward the nearest action segment. The nearest-side rule only
    applies on the very first talking frame; afterwards the previous
    direction is kept.

    Args:
        is_first_speak_frame: True on the first frame of speech.

    Returns:
        (next_direction, cur_pos)
    """
    # Whole-video borders: after a jump, keep moving inward regardless
    # of talking state.
    if cur_pos == segments[0][0]:
        return 1, cur_pos
    if cur_pos == segments[-1][1]:
        return 0, cur_pos
    # On either border of the silent segment, keep the current direction.
    if cur_pos in (left, right):
        return (1, cur_pos) if cur_direction == 1 else (0, cur_pos)
    midpoint = left + (right - left + 1) // 2
    if is_first_speak_frame:
        # First talking frame: leave via the only available side when
        # this loopable segment sits at an end of the video...
        if sub_seg_idx == 0 and segments[0][2]:
            return 1, cur_pos
        if sub_seg_idx == len(segments) - 1 and segments[-1][2]:
            return 0, cur_pos
        # ...otherwise pick the closer side.
        return (0, cur_pos) if cur_pos < midpoint else (1, cur_pos)
    # Not the first talking frame: follow the previous direction.
    if cur_direction == 1:
        return 1, cur_pos
    elif cur_direction == 0:
        return 0, cur_pos
def action2silent(
    segments, left, right, cur_pos, cur_direction, sub_seg_idx, is_last_speak_frame
):
    """Direction choice when silent while inside an action (non-loopable) segment.

    When speech has just ended, take the closer side of the action
    segment to reach silence; otherwise keep the previous direction.

    Returns:
        (next_direction, cur_pos)
    """
    # Whole-video borders force a direction.
    if cur_pos == segments[0][0]:
        return 1, cur_pos
    if cur_pos == segments[-1][1]:
        return 0, cur_pos
    if is_last_speak_frame:
        # Speech just ended: nearest-side rule into silence.
        midpoint = left + (right - left + 1) // 2
        return (0, cur_pos) if cur_pos < midpoint else (1, cur_pos)
    # Otherwise keep playing in the previous direction.
    return (1, cur_pos) if cur_direction == 1 else (0, cur_pos)
if __name__ == "__main__":
    # Smoke-test: walk the loop-frame picker over a single silent segment.
    startfrom = 0  # last frame of the previous batch
    frame_config = [[1, 200, True]]
    # BUG FIX: the original read ``len(mel_chunks)`` but ``mel_chunks``
    # was never defined (NameError on launch); use a fixed batch size.
    audio_frame_length = 5
    # Clamp the starting position into the configured frame range.
    startfrom = max(startfrom, frame_config[0][0])
    first_speak, last_speak = True, False
    is_silent = True  # whether the current batch is silent
    # NOTE(review): the original comment said "-1 is backward" but the
    # walker itself uses 0 for backward — 1/0 is used here.
    last_direction = 1
    for _ in range(11):
        start_idx_list, last_direction = play_in_loop_v2(
            frame_config,
            startfrom,
            audio_frame_length,
            last_direction,
            is_silent,
            first_speak,
            last_speak,
        )
        startfrom = start_idx_list[-1]