From c2871cac69793002d55f272e714f4f4a5ceeee30 Mon Sep 17 00:00:00 2001 From: jocelyn Date: Tue, 10 Jun 2025 15:04:35 +0800 Subject: [PATCH] [ADD]add logic of loop frame --- human/audio_inference_handler.py | 23 ++- human/huaman_status.py | 29 ++- human/human_context.py | 6 +- utils/log.py | 86 ++++++++ utils/loop_frame_tool.py | 338 +++++++++++++++++++++++++++++++ 5 files changed, 468 insertions(+), 14 deletions(-) create mode 100644 utils/log.py create mode 100644 utils/loop_frame_tool.py diff --git a/human/audio_inference_handler.py b/human/audio_inference_handler.py index 1f57b6f..2042184 100644 --- a/human/audio_inference_handler.py +++ b/human/audio_inference_handler.py @@ -13,20 +13,24 @@ from eventbus import EventBus from human_handler import AudioHandler from utils import load_model, mirror_index, get_device, SyncQueue from .huaman_status import HumanStatus +from utils.log import logger -logger = logging.getLogger(__name__) + +# logger = logging.getLogger(__name__) current_file_path = os.path.dirname(os.path.abspath(__file__)) class AudioInferenceHandler(AudioHandler): - def __init__(self, context, handler): + def __init__(self, context, handler, person_config): super().__init__(context, handler) EventBus().register('stop', self._on_stop) self._mal_queue = SyncQueue(1, "AudioInferenceHandler_Mel") self._audio_queue = SyncQueue(context.batch_size * 2, "AudioInferenceHandler_Audio") + self.person_config = person_config self._is_running = True + self.last_direction = 1 self._exit_event = Event() self._run_thread = Thread(target=self.__on_run, name="AudioInferenceHandlerThread") self._exit_event.set() @@ -100,12 +104,12 @@ class AudioInferenceHandler(AudioHandler): break if is_all_silence: - for i in range(batch_size): + frame_indexes, self.last_direction = human_status.get_index_v2(self.person_config["frame_config"], self.last_direction, batch_size) # [1,3,4] + for i, frame_idx in zip(range(batch_size), frame_indexes): if not self._is_running: break # 
self.on_next_handle((None, mirror_index(length, index), - self.on_next_handle((None, human_status.get_index(), - audio_frames[i * 2:i * 2 + 2]), 0) + self.on_next_handle((None,frame_idx, audio_frames[i * 2:i * 2 + 2]), 0) # index = index + 1 else: human_status.start_talking() @@ -115,11 +119,12 @@ class AudioInferenceHandler(AudioHandler): img_batch = [] index_list = [] # for i in range(batch_size): - for i in range(len(mel_batch)): + frame_indexes,self.last_direction = human_status.get_index_v2(self.person_config["frame_config"], self.last_direction, batch_size) # [1,3,4] + # TODO: 推理状态下获取循环帧逻辑 + for i, frame_idx in zip(range(len(mel_batch)), frame_indexes): # idx = mirror_index(length, index + i) - idx = human_status.get_index() - index_list.append(idx) - face = face_list_cycle[idx] + index_list.append(frame_idx) + face = face_list_cycle[frame_idx] img_batch.append(face) # print('orign img_batch:', len(img_batch), 'origin mel_batch:', len(mel_batch)) diff --git a/human/huaman_status.py b/human/huaman_status.py index fe24139..934af9e 100644 --- a/human/huaman_status.py +++ b/human/huaman_status.py @@ -1,8 +1,5 @@ #encoding = utf8 - -import logging - - +from utils.loop_frame_tool import play_in_loop_v2 from enum import Enum @@ -20,6 +17,9 @@ class HumanStatus: self._current_frame = 0 self._is_talking = False + self.last_frame_talking_status = "silent" # 记录上一帧讲话状态 + self.next_frame_talking_status = "silent" + def get_status(self): return self._status @@ -51,6 +51,27 @@ class HumanStatus: self._current_frame = (self._current_frame + 1) % self._total_frames return index + + def get_index_v2(self, frame_config:list, last_direction:int=1, batch_size:int=5): + """ + """ + audio_frame_length = batch_size + is_silent = True if not self._is_talking else False + first_speak = True if self._is_talking and self.last_frame_talking_status == "silent" else False + last_speak = True if self.last_frame_talking_status == "talk" and self.next_frame_talking_status == "silent" else 
False
+        start_idx_list, last_direction = play_in_loop_v2(
+            frame_config,
+            self._current_frame,  # resume from the last frame handed out (was: undefined `startfrom`)
+            audio_frame_length,
+            last_direction,
+            is_silent,
+            first_speak,  # speech just started: move toward a talking segment
+            last_speak,  # speech just ended: move toward a silent segment
+        )
+        self._current_frame = start_idx_list[-1]
+        # return one batch of frame indexes at a time
+        return start_idx_list, last_direction
+
     def start_talking(self):
         self._is_talking = True
 
diff --git a/human/human_context.py b/human/human_context.py
index 6d55e8e..65b1613 100644
--- a/human/human_context.py
+++ b/human/human_context.py
@@ -47,6 +47,10 @@ class HumanContext:
         self._m_frames = m_frames
         self._inv_m_frames = inv_m_frames
         face_images_length = len(self._face_list_cycle)
+        # TODO: get person config from a real config source
+        self.person_config = {
+            "frame_config": [[1, face_images_length - 1, True]],
+        }
         logging.info(f'face images length: {face_images_length}')
         print(f'face images length: {face_images_length}')
 
@@ -113,7 +117,7 @@ class HumanContext:
 
     def build(self, render_handler):
         self._render_handler = render_handler
-        self._infer_handler = AudioInferenceHandler(self, self._render_handler)
+        self._infer_handler = AudioInferenceHandler(self, self._render_handler, self.person_config)
         self._mal_handler = AudioMalHandler(self, self._infer_handler)
         self._tts_handle = TTSAudioSplitHandle(self, self._mal_handler)
         self._tts = TTSEdgeHttp(self._tts_handle)
diff --git a/utils/log.py b/utils/log.py
new file mode 100644
index 0000000..28e5877
--- /dev/null
+++ b/utils/log.py
@@ -0,0 +1,86 @@
+import logging
+import os
+import sys
+from loguru import logger as logurulogger
+import json
+
+LOG_FORMAT = (
+    "{level: <8} "
+    "{process.name} | "  # process name
+    "{thread.name} | "
+    "{time:YYYY-MM-DD HH:mm:ss.SSS} - "
+    "{process} "
+    "{module}.{function}:{line} - "
+    "{message}"
+)
+LOG_NAME = ["uvicorn", "uvicorn.access", "uvicorn.error", "flask"]
+
+
+class InterceptHandler(logging.Handler):
+    def emit(self, record):
+        try:
+            level = logurulogger.level(record.levelname).name
+        except AttributeError:
+            level = logging._levelToName[record.levelno]
+
+        frame,
depth = logging.currentframe(), 2 + while frame.f_code.co_filename == logging.__file__: + frame = frame.f_back + depth += 1 + + logurulogger.opt(depth=depth, exception=record.exc_info).log( + level, record.getMessage() + ) + +class Logging: + """自定义日志""" + + def __init__(self): + self.log_path = "logs" + os.makedirs(self.log_path, exist_ok=True) + self._initlogger() + self._reset_log_handler() + + + def _initlogger(self): + """初始化loguru配置""" + logurulogger.remove() + logurulogger.add( + os.path.join(self.log_path, "error.log.{time:YYYY-MM-DD}"), + format=LOG_FORMAT, + level=logging.ERROR, + rotation="00:00", + retention="1 week", + backtrace=True, + diagnose=True, + enqueue=True + ) + logurulogger.add( + os.path.join(self.log_path, "info.log.{time:YYYY-MM-DD}"), + format=LOG_FORMAT, + level=logging.INFO, + rotation="00:00", + retention="1 week", + enqueue=True + ) + logurulogger.add( + sys.stdout, + format=LOG_FORMAT, + level=logging.DEBUG, + colorize=True, + ) + + self.logger = logurulogger + + + + def _reset_log_handler(self): + for log in LOG_NAME: + logger = logging.getLogger(log) + logger.handlers = [InterceptHandler()] + + def getlogger(self): + return self.logger + +logger = Logging().getlogger() + diff --git a/utils/loop_frame_tool.py b/utils/loop_frame_tool.py new file mode 100644 index 0000000..4b4978f --- /dev/null +++ b/utils/loop_frame_tool.py @@ -0,0 +1,338 @@ +from utils.log import logger + + +def play_in_loop_v2( + segments, + startfrom, + batch_num, + last_direction, + is_silent, + first_speak, + last_speak, +): + """ + batch_num: 初始和结束,每一帧都这么判断 + 1、静默时,在静默段循环, 左边界正向,右边界反向, 根据上一次方向和位置,给出新的方向和位置 + 2、静默转说话: 就近到说话段,pre_falg, post_flag, 都为true VS 其中一个为true + 3、说话转静默: 动作段播完,再进入静默(如果还在持续说话,静默段不循环) + 4、在整个视频左端点: 开始端只能正向,静默时循环,说话时走2 + 5、在整个视频右端点: 开始时只能反向,静默时循环,说话时走2 + 6、根据方向获取batch_num 数量的视频帧,return batch_idxes, current_direction + Args: + segments: 循环帧配置 [[st, ed, True], ...] 
+        startfrom: cur_pos
+        batch_num: 5
+        last_direction: 0 = backward, 1 = forward
+        is_silent: silent/talking flag (compared against 1/0 below)
+        first_speak: whether speech has just started
+        last_speak: whether speech is just about to end
+    """
+    frames = []
+    cur_pos = startfrom
+    cur_direction = last_direction
+    is_first_speak_frame = first_speak
+    is_last_speak_frame = True if last_speak and batch_num == 1 else False
+    while batch_num != 0:
+        # locate the sub-segment that contains the current frame
+        sub_seg_idx = subseg_judge(cur_pos, segments)
+        # work out which way to move
+        next_direction, next_pos = get_next_direction(
+            segments,
+            cur_pos,
+            cur_direction,
+            is_silent,
+            sub_seg_idx,
+            is_first_speak_frame,
+            is_last_speak_frame,
+        )
+        # step one frame in that direction
+        next_pos = get_next_frame(next_pos, next_direction)
+        frames.append(next_pos)
+        batch_num -= 1
+        is_first_speak_frame = (
+            False  # "first speak" only holds for the batch's first frame (was: undefined `config.batch_size`)
+        )
+        is_last_speak_frame = True if last_speak and batch_num == 1 else False
+
+        cur_direction = next_direction
+        cur_pos = next_pos
+    return frames, next_direction
+
+
+def subseg_judge(cur_pos, segments):
+    for idx, frame_seg in enumerate(segments):
+        if cur_pos >= frame_seg[0] and cur_pos <= frame_seg[1]:
+            return idx
+    # fallback: positions outside every segment (e.g. 0) clamp to the first one
+    return 0
+
+def get_next_direction(
+    segments,
+    cur_pos,
+    cur_direction,
+    is_silent,
+    sub_seg_idx,
+    is_first_speak_frame: bool = False,
+    is_last_speak_frame: bool = False,
+):
+    """
+    3.3.0 loop-frame requirement: reach the expected state as soon as possible
+    if action segment:
+        if speech starts:
+            if at a border:
+                if forward:
+                    pass
+                else:
+                    pass
+            else:
+                if forward:
+                    pass
+                else:
+                    pass
+        elif silent:
+            same as above
+        elif speaking:
+            same as above
+        elif speech ends:
+            same as above
+    elif silent segment:
+        same as above
+    Args:
+        is_first_speak_frame: speech-start flag
+        is_last_speak_frame: speech-end flag
+    """
+    left, right, loop_flag = segments[sub_seg_idx]
+    if loop_flag:
+        if is_silent == 1:
+            next_direct, next_pos = pure_silent(
+                segments, left, right, cur_pos, cur_direction, sub_seg_idx
+            )
+            logger.debug(
+                f"cur_pos:{cur_pos}, next_direct:{next_direct}, is_first_speak_frame:{is_first_speak_frame}"
+            )
+        elif is_silent == 0:
+            next_direct, next_pos = silent2action(
+                segments,
+                left,
+                right,
+                cur_pos,
+                cur_direction,
+                sub_seg_idx,
+                is_first_speak_frame,
+            )
+            logger.debug(
+                f"cur_pos:{cur_pos}, next_direct:{next_direct}, is_first_speak_frame{is_first_speak_frame}"
+            )
+    else:
+        if is_silent == 1:
+            next_direct, next_pos = action2silent(
+                segments,
+                left,
+                right,
+                cur_pos,
+                cur_direction,
+                sub_seg_idx,
+                is_last_speak_frame,
+            )
+            logger.debug(
+                f"cur_pos{cur_pos}, next_direct:{next_direct},is_first_speak_frame{is_first_speak_frame},is_last_speak_frame:{is_last_speak_frame}"
+            )
+        elif is_silent == 0:
+            next_direct, next_pos = pure_action(
+                segments,
+                left,
+                right,
+                cur_pos,
+                cur_direction,
+                sub_seg_idx,
+                is_last_speak_frame,
+            )
+            logger.debug(
+                f"cur_pos:{cur_pos}, next_direct:{next_direct},is_first_speak_frame{is_first_speak_frame}, is_last_speak_frame:{is_last_speak_frame}"
+            )
+    return next_direct, next_pos
+
+def get_next_frame(cur_pos, cur_direction):
+    """Return the next frame for the given direction; callers must ensure
+    the frame in that direction exists, so no extra boundary check is done.
+    """
+    # forward
+    if cur_direction == 1:
+        return cur_pos + 1
+    # backward -- any non-forward direction steps back, so we never return None
+    else:
+        return cur_pos - 1
+
+def pure_silent(segments, left, right, cur_pos, cur_direction, sub_seg_idx):
+    """
+    loop_flag == True and is_silent==1
+    whether border
+    whether forward
+    Return:
+        next_direction
+    """
+    # forward at the left border, backward at the right border
+    if cur_pos == segments[0][0]:
+        return 1, cur_pos
+    if cur_pos == segments[-1][1]:
+        return 0, cur_pos
+    # right border: go backward
+    if cur_pos == right:
+        return 0, cur_pos
+    # left border: go forward
+    if cur_pos == left:
+        return 1, cur_pos
+    # not at a border: keep going forward if we were, otherwise backward
+    if cur_pos > left and cur_direction == 1:
+        return 1, cur_pos
+    else:
+        return 0, cur_pos
+
+
+def pure_action(
+    segments, left, right, cur_pos, cur_direction, sub_seg_idx, is_last_speak_frame
+):
+    """
+    loop_flag ==False and is_silent == 0
+    play the action segment out, then move on to a silent segment (may jump)
+    whether border
+    whether forward  # play direction
+    Args:
+        is_last_speak_frame: the moment speech ends
+    Return: next_direction
+    """
+    if cur_pos == segments[0][0]:
+        return 1, cur_pos
+    if cur_pos == segments[-1][1]:
+        return 0, cur_pos
+
+    if is_last_speak_frame:
+        # action segment at the very end: search backward for silence
+        if sub_seg_idx == len(segments) - 1:
+            return 0, cur_pos
+        # action segment at the very start: go forward
+        if sub_seg_idx == 0:
+            return 1, cur_pos
+        # action segment in the middle: head for the nearer side
+        mid = left + (right - left + 1) // 2
+        # nearest-side rule takes priority
+        if cur_pos < mid:
+            return 0, cur_pos
+        else:
+            return 1, cur_pos
+
+    else:
+        # otherwise keep the current play direction
+        if cur_direction == 1:
+            return 1, cur_pos
+        else:
+            return 0, cur_pos
+
+
+def silent2action(
+    segments,
+    left,
+    right,
+    cur_pos,
+    cur_direction,
+    sub_seg_idx,
+    is_first_speak_frame: bool = False,
+):
+    """
+    inside a silent segment but currently speaking
+    loop_flag=True and is_silent == 0
+    whether border
+    whether forward
+
+    Return: next_direction
+    """
+    # move toward the nearest action segment (if there is none on the left)
+    # TODO: confirm the logic below is correct
+    if (
+        cur_pos == segments[0][0]
+    ):  # after a jump, keep going forward whatever the new segment is
+        return 1, cur_pos
+    if cur_pos == segments[-1][1]:
+        return 0, cur_pos
+    # at the silent segment's left border while still speaking
+    if cur_pos == left:
+        if cur_direction == 1:
+            return 1, cur_pos
+        else:
+            return 0, cur_pos
+    # at the silent segment's right border while still speaking
+    elif cur_pos == right:
+        if cur_direction == 1:
+            return 1, cur_pos
+        else:
+            return 0, cur_pos
+    else:
+        mid = left + (right - left + 1) // 2
+        # !! nearest-side rule only applies to the first speech frame; otherwise keep last state
+        if is_first_speak_frame:
+            # first segment
+            if sub_seg_idx == 0 and segments[0][2]:
+                return 1, cur_pos
+            # last segment
+            elif sub_seg_idx == len(segments) - 1 and segments[-1][2]:
+                return 0, cur_pos
+
+            if cur_pos < mid:
+                return 0, cur_pos
+            else:
+                return 1, cur_pos
+        else:
+            if cur_direction == 1:
+                return 1, cur_pos
+            else:  # any non-forward value counts as backward; never fall through to None
+                return 0, cur_pos
+
+
+def action2silent(
+    segments, left, right, cur_pos, cur_direction, sub_seg_idx, is_last_speak_frame
+):
+    """
+    loop_flag=False and is_silent==1
+    whether border
+    Return: next_direction
+    """
+    if cur_pos == segments[0][0]:
+        return 1, cur_pos
+    if cur_pos == segments[-1][1]:
+        return 0, cur_pos
+    # speech just ended inside an action segment: take the nearer way out into silence
+    if is_last_speak_frame:
+        mid = left + (right - left + 1) // 2
+        if cur_pos < mid:
+            return 0, cur_pos
+        else:
+            return 1, cur_pos
+
+    else:
+        if
cur_direction == 1:
+            return 1, cur_pos
+        else:
+            return 0, cur_pos
+
+
+if __name__ == "__main__":
+    startfrom = 0  # last frame of the previous batch
+    frame_config= [[1, 200, True]]
+    audio_frame_length = 5  # frames per batch; TODO: confirm it equals batch_size (was: undefined `mel_chunks`)
+    startfrom = startfrom if startfrom>= frame_config[0][0] else frame_config[0][0]
+    first_speak, last_speak = True, False
+    is_silent= True  # whether the current batch is silent
+    last_direction = 1  # 0 means backward (matches play_in_loop_v2's 0/1 convention)
+    i = 0
+    while i<=10:
+        start_idx_list, last_direction = play_in_loop_v2(
+            frame_config,
+            startfrom,
+            audio_frame_length,
+            last_direction,
+            is_silent,
+            first_speak,
+            last_speak,
+        )
+        startfrom = start_idx_list[-1]
+        i+=1
\ No newline at end of file