diff --git a/human/__init__.py b/human/__init__.py
index cb6be4d..06c491a 100644
--- a/human/__init__.py
+++ b/human/__init__.py
@@ -5,4 +5,4 @@ from .audio_mal_handler import AudioMalHandler
 from .audio_inference_handler import AudioInferenceHandler
 from .audio_inference_onnx_handler import AudioInferenceOnnxHandler
 from .huaman_status import HumanStatusEnum, HumanStatus
-from .human_render import HumanRender
+from .human_render import HumanRender, RenderStatus
diff --git a/human/audio_inference_handler.py b/human/audio_inference_handler.py
index 1874d43..3f64d84 100644
--- a/human/audio_inference_handler.py
+++ b/human/audio_inference_handler.py
@@ -103,10 +103,10 @@ class AudioInferenceHandler(AudioHandler):
             for i in range(batch_size):
                 if not self._is_running:
                     break
-                # self.on_next_handle((None, mirror_index(silence_length, index),
-                self.on_next_handle((None, human_status.get_index(),
+                self.on_next_handle((None, mirror_index(length, index),
+                # self.on_next_handle((None, human_status.get_index(),
                                     audio_frames[i * 2:i * 2 + 2]), 0)
-                # index = index + 1
+                index = index + 1
         else:
             human_status.start_talking()
             logger.info(f'infer======= {current_text}')
@@ -116,9 +116,9 @@ class AudioInferenceHandler(AudioHandler):
             index_list = []
             # for i in range(batch_size):
             for i in range(len(mel_batch)):
-                # idx = mirror_index(length, index + i)
-                idx = human_status.get_index()
-                index_list.append(idx)
+                idx = mirror_index(length, index + i)
+                # idx = human_status.get_index()
+                # index_list.append(idx)
                 face = face_list_cycle[idx]
                 img_batch.append(face)
@@ -152,9 +152,10 @@ class AudioInferenceHandler(AudioHandler):
                 if not self._is_running:
                     break
                 self.on_next_handle(
-                    # (res_frame, mirror_index(length, index), audio_frames[i * 2:i * 2 + 2]),
-                    (res_frame, index_list[i], audio_frames[i * 2:i * 2 + 2]),
+                    (res_frame, mirror_index(length, index), audio_frames[i * 2:i * 2 + 2]),
+                    # (res_frame, index_list[i], audio_frames[i * 2:i * 2 + 2]),
                     0)
+                index = index + 1
 
             logger.info(f'total batch time: {time.perf_counter() - start_time}')
         else:
@@ -174,6 +175,4 @@ class AudioInferenceHandler(AudioHandler):
     def pause_talk(self):
         print('AudioInferenceHandler pause_talk', self._audio_queue.size(), self._mal_queue.size())
         self._audio_queue.clear()
-        print('AudioInferenceHandler111')
         self._mal_queue.clear()
-        print('AudioInferenceHandler222')
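The hunks above switch `AudioInferenceHandler` back from `human_status.get_index()` to `mirror_index`-based frame selection. `mirror_index` itself is defined elsewhere in the repo and not shown in this diff; as a reading aid, here is a minimal sketch of the ping-pong indexing the call sites imply (an assumption about the helper, not code from this change):

```python
# Hypothetical sketch of mirror_index: walk the avatar frame cycle forward,
# then backward, so playback reverses at the ends instead of jumping from
# the last frame back to the first.
def mirror_index(size: int, index: int) -> int:
    turn, res = divmod(index, size)  # which pass we are on, offset within it
    return res if turn % 2 == 0 else size - res - 1

# e.g. size=4 yields 0 1 2 3 3 2 1 0 0 1 2 3 ... as index increases
```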
diff --git a/human/audio_mal_handler.py b/human/audio_mal_handler.py
index 2c81d42..bb78507 100644
--- a/human/audio_mal_handler.py
+++ b/human/audio_mal_handler.py
@@ -55,8 +55,7 @@ class AudioMalHandler(AudioHandler):
         logging.info('chunk2mal run')
         while self._exit_event.is_set() and self._is_running:
             self._run_step()
-            time.sleep(0.02)
-
+            # time.sleep(0.01)
         logging.info('chunk2mal exit')
 
     def _run_step(self):
@@ -107,6 +106,7 @@ class AudioMalHandler(AudioHandler):
             chunk = np.zeros(self.chunk, dtype=np.float32)
             frame = (chunk, '')
             type_ = 1
+            # time.sleep(0.02)
         # logging.info(f'AudioMalHandler get_audio_frame type:{type_}')
         return frame, type_
diff --git a/human/human_context.py b/human/human_context.py
index f6e6add..585803f 100644
--- a/human/human_context.py
+++ b/human/human_context.py
@@ -4,10 +4,8 @@ import os
 
 from asr import SherpaNcnnAsr
 from eventbus import EventBus
-from .audio_inference_onnx_handler import AudioInferenceOnnxHandler
 from .audio_inference_handler import AudioInferenceHandler
 from .audio_mal_handler import AudioMalHandler
-from .human_render import HumanRender
 from nlp import PunctuationSplit, DouBao, Kimi
 from tts import TTSEdge, TTSAudioSplitHandle, TTSEdgeHttp
 from utils import load_avatar, get_device, object_stop, load_avatar_from_processed, load_avatar_from_256_processed
@@ -18,7 +16,7 @@ current_file_path = os.path.dirname(os.path.abspath(__file__))
 
 class HumanContext:
     def __init__(self):
-        self._fps = 25  # 20 ms per frame
+        self._fps = 50  # 20 ms per frame
         self._image_size = 288
         self._batch_size = 16
         self._sample_rate = 16000
@@ -118,8 +116,8 @@ class HumanContext:
         else:
             logger.info(f'notify message:{message}')
 
-    def build(self):
-        self._render_handler = HumanRender(self, None)
+    def build(self, render_handler):
+        self._render_handler = render_handler
         self._infer_handler = AudioInferenceHandler(self, self._render_handler)
         self._mal_handler = AudioMalHandler(self, self._infer_handler)
         self._tts_handle = TTSAudioSplitHandle(self, self._mal_handler)
diff --git a/human/human_render.py b/human/human_render.py
index a6902ed..0c9a25c 100644
--- a/human/human_render.py
+++ b/human/human_render.py
@@ -2,61 +2,73 @@
 import logging
 import time
 
+from enum import Enum
 from queue import Empty
-from threading import Event, Thread
 
 from eventbus import EventBus
-from human.message_type import MessageType
 from human_handler import AudioHandler
-from render import VoiceRender, VideoRender, PlayClock
 from utils import SyncQueue
 
 logger = logging.getLogger(__name__)
 
 
+class RenderStatus(Enum):
+    E_Normal = 0
+    E_Full = 1
+    E_Empty = 2
+
+
 class HumanRender(AudioHandler):
     def __init__(self, context, handler):
         super().__init__(context, handler)
 
         EventBus().register('stop', self._on_stop)
-        play_clock = PlayClock()
-        self._voice_render = VoiceRender(play_clock, context)
-        self._video_render = VideoRender(play_clock, context, self)
-        self._is_running = True
         self._queue = SyncQueue(context.batch_size, "HumanRender_queue")
-        self._exit_event = Event()
-        self._thread = Thread(target=self._on_run, name="AudioMalHandlerThread")
-        self._exit_event.set()
-        self._thread.start()
-        self._image_render = None
-        self._last_audio_ps = 0
-        self._last_video_ps = 0
         self._empty_log = True
+        self._should_exit = False
+        self._render_status = RenderStatus.E_Empty
 
     def __del__(self):
         EventBus().unregister('stop', self._on_stop)
 
     def _on_stop(self, *args, **kwargs):
+        self._should_exit = True
         self.stop()
 
-    def _on_run(self):
+    def _render(self, video_frame, voice_frame):
+        pass
+
+    def run(self):
         logging.info('human render run')
-        while self._exit_event.is_set() and self._is_running:
+        delay = 1000 / self._context.fps * 0.001
+        while not self._should_exit:
+            if self._render_status is RenderStatus.E_Full:
+                time.sleep(delay)
+                continue
+
+            t = time.perf_counter()
             self._run_step()
-            delay = 0.075
-            time.sleep(delay)
+            use = time.perf_counter() - t
+            if self._render_status is RenderStatus.E_Empty:
+                continue
+            real_delay = delay - use
+            print(f'send voice {use}')
+            if real_delay > 0:
+                time.sleep(real_delay)
+            else:
+                print(f'send voice {real_delay}')
 
         logging.info('human render exit')
 
     def _run_step(self):
         try:
-            value = self._queue.get(timeout=.005)
+            value = self._queue.get(timeout=1)
             if value is None:
                 return
             res_frame, idx, audio_frames = value
             if not self._empty_log:
                 self._empty_log = True
-                logging.info('render render:')
+                logging.info('human render:')
         except Empty:
             if self._empty_log:
                 self._empty_log = False
@@ -66,27 +78,15 @@
         type_ = 1
         if audio_frames[0][1] != 0 and audio_frames[1][1] != 0:
             type_ = 0
-        if self._voice_render is not None:
-            self._voice_render.render(audio_frames, self._last_audio_ps)
-            self._last_audio_ps = self._last_audio_ps + 0.4
-        if self._video_render is not None:
-            self._video_render.render((res_frame, idx, type_), self._last_video_ps)
-            self._last_video_ps = self._last_video_ps + 0.4
 
-    def set_image_render(self, render):
-        self._image_render = render
-
-    def put_image(self, image):
-        if self._image_render is not None:
-            self._image_render.on_render(image)
+        self._render((res_frame, idx, type_), audio_frames)
 
     def on_message(self, message):
         super().on_message(message)
 
     def on_handle(self, stream, index):
-        if not self._is_running:
+        if self._should_exit:
             return
-
         self._queue.put(stream)
 
     def pause_talk(self):
@@ -96,17 +96,7 @@
 
     def stop(self):
         logging.info('hunan render stop')
-        self._is_running = False
-        if self._exit_event is None:
-            return
-
+        self._should_exit = True
         self._queue.clear()
-        self._exit_event.clear()
-        if self._thread.is_alive():
-            self._thread.join()
-        logging.info('hunan render stop')
-        # self._voice_render.stop()
-        # self._video_render.stop()
-        # self._exit_event.clear()
-        # self._thread.join()
+        logging.info('human render stop')
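`HumanRender` is now a passive base class: subclasses override `_render()`, and the inherited `run()` loop paces `_run_step()` to one queue item per `1 / fps` seconds, sleeping a full period while `_render_status` is `E_Full` and skipping the delay entirely while it is `E_Empty`. A minimal consumer sketch (the subclass name and its body are illustrative, not part of this change):

```python
# Hypothetical HumanRender subclass: run() (inherited) pulls one item per
# frame period off the queue and hands it to _render(); a peer can pause
# the loop by setting _render_status to RenderStatus.E_Full.
from human import HumanRender, RenderStatus

class LoggingRender(HumanRender):
    def _render(self, video_frame, voice_frame):
        res_frame, idx, type_ = video_frame
        print(f'frame idx={idx} type={type_}, {len(voice_frame)} audio chunks')

# usage sketch: context.build(LoggingRender(context, None)) wires it as the
# tail of the handler chain; run() then blocks until the 'stop' event fires.
```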
diff --git a/main.py b/main.py
index 2a36bb6..959b63f 100644
--- a/main.py
+++ b/main.py
@@ -3,6 +3,7 @@
 import logging
 import os
 
+from human import HumanContext
 from ui import IpcRender
 from utils import config_logging
 
@@ -13,7 +14,9 @@ if __name__ == '__main__':
     config_logging('./logs/info.log', logging.INFO, logging.INFO)
 
     logger.info('------------start------------')
-    render = IpcRender()
+    context = HumanContext()
+    render = IpcRender(context)
+    context.build(render)
     render.run()
     render.stop()
     logger.info('------------finish------------')
\ No newline at end of file
diff --git a/render/__init__.py b/render/__init__.py
deleted file mode 100644
index 8d7f244..0000000
--- a/render/__init__.py
+++ /dev/null
@@ -1,5 +0,0 @@
-#encoding = utf8
-
-from .voice_render import VoiceRender
-from .video_render import VideoRender
-from .play_clock import PlayClock
diff --git a/render/base_render.py b/render/base_render.py
deleted file mode 100644
index 3bccaf5..0000000
--- a/render/base_render.py
+++ /dev/null
@@ -1,25 +0,0 @@
-#encoding = utf8
-import logging
-import time
-from abc import ABC, abstractmethod
-from queue import Queue
-from threading import Event, Thread
-
-from utils import SyncQueue
-
-logger = logging.getLogger(__name__)
-
-
-class BaseRender(ABC):
-    def __init__(self, play_clock, context, type_):
-        self._play_clock = play_clock
-        self._context = context
-        # self._queue = SyncQueue(context.batch_size, f'{type_}RenderQueue')
-        # self._exit_event = Event()
-        # self._thread = Thread(target=self._on_run, name=thread_name)
-        # self._exit_event.set()
-        # self._thread.start()
-
-    @abstractmethod
-    def render(self, frame, ps):
-        pass
diff --git a/render/play_clock.py b/render/play_clock.py
deleted file mode 100644
index 870aee4..0000000
--- a/render/play_clock.py
+++ /dev/null
@@ -1,37 +0,0 @@
-#encoding = utf8
-import time
-
-
-class PlayClock:
-    def __init__(self):
-        self._start = time.time()
-        self._current_time = 0
-        self._display_time = self._start
-        self._audio_diff_threshold = 0.01
-
-    @property
-    def start_time(self):
-        return self._start
-
-    @property
-    def current_time(self):
-        return self._current_time
-
-    @current_time.setter
-    def current_time(self, v):
-        self._current_time = v
-
-    @property
-    def audio_diff_threshold(self):
-        return self._audio_diff_threshold
-
-    @property
-    def display_time(self):
-        return self._display_time
-
-    def update_display_time(self):
-        self._display_time = time.time()
-
-    def clock_time(self):
-        elapsed = time.time() - self._display_time
-        return self.current_time + elapsed
diff --git a/render/video_render.py b/render/video_render.py
deleted file mode 100644
index 91426bd..0000000
--- a/render/video_render.py
+++ /dev/null
@@ -1,23 +0,0 @@
-#encoding = utf8
-import copy
-import logging
-import time
-
-import cv2
-import numpy as np
-
-from .base_render import BaseRender
-
-
-class VideoRender(BaseRender):
-    def __init__(self, play_clock, context, human_render):
-        super().__init__(play_clock, context, 'Video')
-        self._human_render = human_render
-        self.index = 0
-
-    def render(self, frame, ps):
-        if self._human_render is not None:
-            self._human_render.put_image(frame)
-
-        # image = cv2.cvtColor(image, cv2.COLOR_BGRA2RGBA)
-
diff --git a/render/voice_render.py b/render/voice_render.py
deleted file mode 100644
index eff4f19..0000000
--- a/render/voice_render.py
+++ /dev/null
@@ -1,39 +0,0 @@
-#encoding = utf8
-import logging
-import time
-from queue import Empty
-
-import numpy as np
-
-from audio_render import AudioRender
-from human.message_type import MessageType
-from .base_render import BaseRender
-
-logger = logging.getLogger(__name__)
-
-
-class VoiceRender(BaseRender):
-    def __init__(self, play_clock, context):
-        self._audio_render = AudioRender()
-        super().__init__(play_clock, context, 'Voice')
-        self._current_text = ''
-
-    def render(self, frame, ps):
-        self._play_clock.update_display_time()
-        self._play_clock.current_time = ps
-
-        for audio_frame in frame:
-            frame, type_ = audio_frame
-            chunk, txt = frame
-            if txt != self._current_text:
-                self._current_text = txt
-                logging.info(f'VoiceRender: {txt}')
-            chunk = (chunk * 32767).astype(np.int16)
-
-            if self._audio_render is not None:
-                try:
-                    chunk_len = int(chunk.shape[0] * 2)
-                    # print('audio frame:', frame.shape, chunk_len)
-                    self._audio_render.write(chunk.tobytes(), chunk_len)
-                except Exception as e:
-                    logging.error(f'Error writing audio frame: {e}')
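With the `render` package deleted, the PlayClock-based audio/video sync is gone entirely; pacing now rests on the `run()` loop period alone. A quick sanity check of the numbers already in this diff (`_fps = 50`, `_sample_rate = 16000`, and the delay formula from `human_render.py`):

```python
# Arithmetic only, using values from the hunks above; no repo code assumed.
fps = 50
sample_rate = 16000
delay = 1000 / fps * 0.001             # 0.02 s loop period, i.e. "20 ms per frame"
samples_per_tick = sample_rate // fps  # 320 audio samples per 20 ms tick
```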
diff --git a/tts/tts_audio_handle.py b/tts/tts_audio_handle.py
index 361ecea..1860aee 100644
--- a/tts/tts_audio_handle.py
+++ b/tts/tts_audio_handle.py
@@ -72,7 +72,7 @@ class TTSAudioSplitHandle(TTSAudioHandle):
         if chunks is not None:
             for chunk in chunks:
                 self.on_next_handle((chunk, txt), 0)
-                time.sleep(0.01)  # Sleep briefly to prevent busy-waiting
+                time.sleep(0.001)  # Sleep briefly to prevent busy-waiting
 
     def on_handle(self, stream, index):
         if not self._is_running:
diff --git a/ui/ipc_render.py b/ui/ipc_render.py
index 17c91d9..a15229e 100644
--- a/ui/ipc_render.py
+++ b/ui/ipc_render.py
@@ -3,9 +3,10 @@ import os
 import logging
 import time
-from queue import Queue
 
-from human import HumanContext
+import numpy as np
+
+from human import HumanRender, RenderStatus
 from ipc import IPCUtil
 from utils import render_image
 
@@ -13,60 +14,60 @@
 logger = logging.getLogger(__name__)
 
 current_file_path = os.path.dirname(os.path.abspath(__file__))
 
 
-class IpcRender:
-    def __init__(self):
-        self._human_context = None
-        self._queue = None
-        self._exit = False
-        self._ipc = None
+class IpcRender(HumanRender):
+    def __init__(self, context):
+        super().__init__(context, None)
+        self._ipc = IPCUtil('human_product', 'human_render')
+        self._current_text = ''
 
-    def _send_image(self, identifier, image):
+    def _send_image(self, image):
         height, width, channels = image.shape
-
+        t = time.perf_counter()
         width_bytes = width.to_bytes(4, byteorder='little')
         height_bytes = height.to_bytes(4, byteorder='little')
         bit_depth_bytes = channels.to_bytes(4, byteorder='little')
 
         img_bytes = image.tobytes()
+        identifier = b'\x01'
         data = identifier + width_bytes + height_bytes + bit_depth_bytes + img_bytes
         self._ipc.send_binary(data, len(data))
 
+    def _send_voice(self, voice):
+        voice_identifier = b'\x02'
+        data = voice_identifier
+        for audio_frame in voice:
+            frame, type_ = audio_frame
+            chunk, txt = frame
+            if txt != self._current_text:
+                self._current_text = txt
+                logging.info(f'VoiceRender: {txt}')
+            chunk = (chunk * 32767).astype(np.int16)
+            voice_bytes = chunk.tobytes()
+            data = data + voice_bytes
+
+        self._ipc.send_binary(data, len(data))
+
     def _on_reader_callback(self, data_str, size):
         data_str = data_str.decode('utf-8')
         print(f'on_reader_callback: {data_str}, size:{size}')
         if 'quit' == data_str:
-            self._exit = True
+            self._context.stop()
         elif 'heartbeat' == data_str:
             pass
+        elif 'full' == data_str:
+            self._render_status = RenderStatus.E_Full
+        elif 'empty' == data_str:
+            self._render_status = RenderStatus.E_Empty
+        elif 'normal' == data_str:
+            self._render_status = RenderStatus.E_Normal
 
     def run(self):
-        self._queue = Queue()
-        self._human_context = HumanContext()
-        self._human_context.build()
-        self._ipc = IPCUtil('human_product', 'human_render')
         self._ipc.set_reader_callback(self._on_reader_callback)
         logger.info(f'ipc listen:{self._ipc.listen()}')
+        super().run()
 
-        render = self._human_context.render_handler
-        render.set_image_render(self)
-
-        while not self._exit:
-            if not self._queue.empty():
-                while self._queue.qsize() > 5:
-                    self._queue.get()
-                    print('render queue is slower')
-
-                image = self._queue.get()
-                image = render_image(self._human_context, image)
-                self._send_image(b'\x01', image)
-            else:
-                time.sleep(0.02)
-
         logger.info('ipc render exit')
 
-    def stop(self):
-        if self._human_context is not None:
-            self._human_context.stop()
-
-    def on_render(self, image):
-        self._queue.put(image)
+    def _render(self, video_frame, voice_frame):
+        image = render_image(self._context, video_frame)
+        self._send_image(image)
+        self._send_voice(voice_frame)
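The IPC stream is now framed as: one identifier byte (`\x01` image, `\x02` voice), then for images three 4-byte little-endian integers (width, height, channels) followed by raw pixel bytes, and for voice the concatenated int16 PCM chunks. A hypothetical receiver-side parser, assuming 8-bit image channels (the function name and return shape are illustrative, not from the repo):

```python
import numpy as np

# Hypothetical parser for the frames _send_image/_send_voice emit over IPC.
def parse_message(data: bytes):
    identifier, payload = data[:1], data[1:]
    if identifier == b'\x01':  # image: 3 x uint32-LE header, then raw pixels
        width = int.from_bytes(payload[0:4], 'little')
        height = int.from_bytes(payload[4:8], 'little')
        channels = int.from_bytes(payload[8:12], 'little')
        pixels = np.frombuffer(payload[12:], dtype=np.uint8)
        return 'image', pixels.reshape(height, width, channels)
    if identifier == b'\x02':  # voice: concatenated 16 kHz int16 PCM chunks
        return 'voice', np.frombuffer(payload, dtype=np.int16)
    raise ValueError(f'unknown identifier: {identifier!r}')
```

The receiving side can throttle the stream by replying with the plain-text control messages 'full', 'empty', or 'normal', which `_on_reader_callback` maps onto `RenderStatus`.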