modify render

jiegeaiai 2024-12-09 01:20:48 +08:00
parent 23cab9d86b
commit 34787ae4d4
13 changed files with 94 additions and 232 deletions

View File

@@ -5,4 +5,4 @@ from .audio_mal_handler import AudioMalHandler
 from .audio_inference_handler import AudioInferenceHandler
 from .audio_inference_onnx_handler import AudioInferenceOnnxHandler
 from .huaman_status import HumanStatusEnum, HumanStatus
-from .human_render import HumanRender
+from .human_render import HumanRender, RenderStatus

View File

@@ -103,10 +103,10 @@ class AudioInferenceHandler(AudioHandler):
                 for i in range(batch_size):
                     if not self._is_running:
                         break
-                    # self.on_next_handle((None, mirror_index(silence_length, index),
-                    self.on_next_handle((None, human_status.get_index(),
+                    self.on_next_handle((None, mirror_index(length, index),
+                    # self.on_next_handle((None, human_status.get_index(),
                                          audio_frames[i * 2:i * 2 + 2]), 0)
-                    # index = index + 1
+                    index = index + 1
             else:
                 human_status.start_talking()
                 logger.info(f'infer======= {current_text}')
@@ -116,9 +116,9 @@ class AudioInferenceHandler(AudioHandler):
                 index_list = []
                 # for i in range(batch_size):
                 for i in range(len(mel_batch)):
-                    # idx = mirror_index(length, index + i)
-                    idx = human_status.get_index()
-                    index_list.append(idx)
+                    idx = mirror_index(length, index + i)
+                    # idx = human_status.get_index()
+                    # index_list.append(idx)
                     face = face_list_cycle[idx]
                     img_batch.append(face)
@@ -152,9 +152,10 @@ class AudioInferenceHandler(AudioHandler):
                     if not self._is_running:
                         break
                     self.on_next_handle(
-                        # (res_frame, mirror_index(length, index), audio_frames[i * 2:i * 2 + 2]),
-                        (res_frame, index_list[i], audio_frames[i * 2:i * 2 + 2]),
+                        (res_frame, mirror_index(length, index), audio_frames[i * 2:i * 2 + 2]),
+                        # (res_frame, index_list[i], audio_frames[i * 2:i * 2 + 2]),
                         0)
+                    index = index + 1
                 logger.info(f'total batch time: {time.perf_counter() - start_time}')
             else:
@@ -174,6 +175,4 @@ class AudioInferenceHandler(AudioHandler):
     def pause_talk(self):
         print('AudioInferenceHandler pause_talk', self._audio_queue.size(), self._mal_queue.size())
         self._audio_queue.clear()
-        print('AudioInferenceHandler111')
         self._mal_queue.clear()
-        print('AudioInferenceHandler222')
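
Note: this commit switches frame indexing back from `human_status.get_index()` to `mirror_index(length, index)`. The `mirror_index` helper itself is not shown in this diff; a common definition in similar talking-head pipelines (an assumption here, not code from this commit) ping-pongs through the cached face frames so playback reverses direction at the ends instead of jumping from the last frame back to the first:

    def mirror_index(size, index):
        # Map a monotonically increasing index onto 0..size-1, walking
        # forward then backward so the loop has no visible seam.
        turn = index // size
        res = index % size
        return res if turn % 2 == 0 else size - res - 1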

View File

@@ -55,8 +55,7 @@ class AudioMalHandler(AudioHandler):
         logging.info('chunk2mal run')
         while self._exit_event.is_set() and self._is_running:
             self._run_step()
-            time.sleep(0.02)
+            # time.sleep(0.01)

         logging.info('chunk2mal exit')

     def _run_step(self):
@@ -107,6 +106,7 @@ class AudioMalHandler(AudioHandler):
             chunk = np.zeros(self.chunk, dtype=np.float32)
             frame = (chunk, '')
             type_ = 1
+            # time.sleep(0.02)
         # logging.info(f'AudioMalHandler get_audio_frame type:{type_}')
         return frame, type_
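
Note: with the sleep removed, this worker no longer paces itself; timing now comes from the render loop (see `HumanRender.run` below). For reference, at the pipeline's 16 kHz sample rate and the new 50 fps cadence, one silence chunk as produced above corresponds to 20 ms of audio. A quick sanity check, assuming `self.chunk` is samples-per-frame (an assumption; its derivation is outside this diff):

    sample_rate = 16000            # HumanContext._sample_rate
    fps = 50                       # new value in this commit
    chunk = sample_rate // fps     # 320 float32 samples per frame
    assert chunk / sample_rate == 0.02   # 20 ms per frame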

View File

@@ -4,10 +4,8 @@ import os
 from asr import SherpaNcnnAsr
 from eventbus import EventBus

-from .audio_inference_onnx_handler import AudioInferenceOnnxHandler
 from .audio_inference_handler import AudioInferenceHandler
 from .audio_mal_handler import AudioMalHandler
-from .human_render import HumanRender
 from nlp import PunctuationSplit, DouBao, Kimi
 from tts import TTSEdge, TTSAudioSplitHandle, TTSEdgeHttp
 from utils import load_avatar, get_device, object_stop, load_avatar_from_processed, load_avatar_from_256_processed
@@ -18,7 +16,7 @@ current_file_path = os.path.dirname(os.path.abspath(__file__))
 class HumanContext:
     def __init__(self):
-        self._fps = 25  # 20 ms per frame
+        self._fps = 50  # 20 ms per frame
         self._image_size = 288
         self._batch_size = 16
         self._sample_rate = 16000
@@ -118,8 +116,8 @@ class HumanContext:
         else:
             logger.info(f'notify message:{message}')

-    def build(self):
-        self._render_handler = HumanRender(self, None)
+    def build(self, render_handler):
+        self._render_handler = render_handler
         self._infer_handler = AudioInferenceHandler(self, self._render_handler)
         self._mal_handler = AudioMalHandler(self, self._infer_handler)
         self._tts_handle = TTSAudioSplitHandle(self, self._mal_handler)
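
Note: `build` now receives the render handler from outside instead of constructing a `HumanRender` itself, which is what lets `IpcRender` subclass `HumanRender` later in this commit. Each handler is constructed with its downstream consumer, forming a chain TTS -> mal -> inference -> render. A minimal sketch of that chaining pattern (an illustration, not the project's actual base class):

    class ChainedHandler:
        def __init__(self, context, next_handler=None):
            self._context = context
            self._next = next_handler

        def on_next_handle(self, data, flag):
            # Forward this stage's output to the downstream stage, if any.
            if self._next is not None:
                self._next.on_handle(data, flag)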

View File

@@ -2,61 +2,73 @@
 import logging
 import time
+from enum import Enum
 from queue import Empty
-from threading import Event, Thread

 from eventbus import EventBus
-from human.message_type import MessageType
 from human_handler import AudioHandler
-from render import VoiceRender, VideoRender, PlayClock
 from utils import SyncQueue

 logger = logging.getLogger(__name__)


+class RenderStatus(Enum):
+    E_Normal = 0,
+    E_Full = 1,
+    E_Empty = 2
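
Note: the trailing commas after `0` and `1` make those enum member values one-element tuples rather than ints. The code only ever compares members by identity (`self._render_status is RenderStatus.E_Full`), so behavior is unaffected, but the values are not what they appear to be:

    assert RenderStatus.E_Full.value == (1,) and RenderStatus.E_Empty.value == 2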
 class HumanRender(AudioHandler):
     def __init__(self, context, handler):
         super().__init__(context, handler)
         EventBus().register('stop', self._on_stop)
-        play_clock = PlayClock()
-        self._voice_render = VoiceRender(play_clock, context)
-        self._video_render = VideoRender(play_clock, context, self)
-        self._is_running = True
         self._queue = SyncQueue(context.batch_size, "HumanRender_queue")
-        self._exit_event = Event()
-        self._thread = Thread(target=self._on_run, name="AudioMalHandlerThread")
-        self._exit_event.set()
-        self._thread.start()
-        self._image_render = None
-        self._last_audio_ps = 0
-        self._last_video_ps = 0
         self._empty_log = True
+        self._should_exit = False
+        self._render_status = RenderStatus.E_Empty

     def __del__(self):
         EventBus().unregister('stop', self._on_stop)

     def _on_stop(self, *args, **kwargs):
+        self._should_exit = True
         self.stop()
-    def _on_run(self):
+    def _render(self, video_frame, voice_frame):
+        pass
+
+    def run(self):
         logging.info('human render run')
-        while self._exit_event.is_set() and self._is_running:
+        delay = 1000 / self._context.fps * 0.001
+        while not self._should_exit:
+            if self._render_status is RenderStatus.E_Full:
+                time.sleep(delay)
+                continue
+
+            t = time.perf_counter()
             self._run_step()
-            delay = 0.075
-            time.sleep(delay)
+            use = time.perf_counter() - t
+            if self._render_status is RenderStatus.E_Empty:
+                continue
+
+            real_delay = delay - use
+            print(f'send voice {use}')
+            if real_delay > 0:
+                time.sleep(real_delay)
+            else:
+                print(f'send voice {real_delay}')

         logging.info('human render exit')
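
Note: `1000 / self._context.fps * 0.001` is simply `1 / fps` in seconds, i.e. a 20 ms frame budget at the new 50 fps, and each iteration sleeps only for whatever of that budget `_run_step` did not consume. A standalone sketch of the same fixed-rate pattern (illustrative, not code from this commit):

    import time

    def paced_loop(fps, step, should_exit):
        budget = 1.0 / fps                  # 0.02 s at 50 fps
        while not should_exit():
            start = time.perf_counter()
            step()
            remaining = budget - (time.perf_counter() - start)
            if remaining > 0:
                time.sleep(remaining)       # ahead of schedule: wait it out
            # else: the step overran its budget; start the next frame immediately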
     def _run_step(self):
         try:
-            value = self._queue.get(timeout=.005)
+            value = self._queue.get(timeout=1)
             if value is None:
                 return
             res_frame, idx, audio_frames = value
             if not self._empty_log:
                 self._empty_log = True
-                logging.info('render render:')
+                logging.info('human render:')
         except Empty:
             if self._empty_log:
                 self._empty_log = False
@@ -66,27 +78,15 @@ class HumanRender(AudioHandler):
         type_ = 1
         if audio_frames[0][1] != 0 and audio_frames[1][1] != 0:
             type_ = 0
-        if self._voice_render is not None:
-            self._voice_render.render(audio_frames, self._last_audio_ps)
-            self._last_audio_ps = self._last_audio_ps + 0.4
-        if self._video_render is not None:
-            self._video_render.render((res_frame, idx, type_), self._last_video_ps)
-            self._last_video_ps = self._last_video_ps + 0.4
-
-    def set_image_render(self, render):
-        self._image_render = render
-
-    def put_image(self, image):
-        if self._image_render is not None:
-            self._image_render.on_render(image)
+        self._render((res_frame, idx, type_), audio_frames)

     def on_message(self, message):
         super().on_message(message)

     def on_handle(self, stream, index):
-        if not self._is_running:
+        if self._should_exit:
             return
         self._queue.put(stream)

     def pause_talk(self):
@@ -96,17 +96,7 @@ class HumanRender(AudioHandler):
     def stop(self):
         logging.info('hunan render stop')
-        self._is_running = False
-        if self._exit_event is None:
-            return
+        self._should_exit = True
         self._queue.clear()
-        self._exit_event.clear()
-        if self._thread.is_alive():
-            self._thread.join()
-        logging.info('hunan render stop')
-        # self._voice_render.stop()
-        # self._video_render.stop()
-        # self._exit_event.clear()
-        # self._thread.join()
-        logging.info('hunan render stop')

View File

@@ -3,6 +3,7 @@
 import logging
 import os

+from human import HumanContext
 from ui import IpcRender
 from utils import config_logging
@@ -13,7 +14,9 @@ if __name__ == '__main__':
     config_logging('./logs/info.log', logging.INFO, logging.INFO)
     logger.info('------------start------------')
-    render = IpcRender()
+    context = HumanContext()
+    render = IpcRender(context)
+    context.build(render)
     render.run()
     render.stop()
     logger.info('------------finish------------')

View File

@@ -1,5 +0,0 @@
-#encoding = utf8
-from .voice_render import VoiceRender
-from .video_render import VideoRender
-from .play_clock import PlayClock
-

View File

@@ -1,25 +0,0 @@
-#encoding = utf8
-import logging
-import time
-from abc import ABC, abstractmethod
-from queue import Queue
-from threading import Event, Thread
-
-from utils import SyncQueue
-
-logger = logging.getLogger(__name__)
-
-
-class BaseRender(ABC):
-    def __init__(self, play_clock, context, type_):
-        self._play_clock = play_clock
-        self._context = context
-        # self._queue = SyncQueue(context.batch_size, f'{type_}RenderQueue')
-        # self._exit_event = Event()
-        # self._thread = Thread(target=self._on_run, name=thread_name)
-        # self._exit_event.set()
-        # self._thread.start()
-
-    @abstractmethod
-    def render(self, frame, ps):
-        pass

View File

@@ -1,37 +0,0 @@
-#encoding = utf8
-import time
-
-
-class PlayClock:
-    def __init__(self):
-        self._start = time.time()
-        self._current_time = 0
-        self._display_time = self._start
-        self._audio_diff_threshold = 0.01
-
-    @property
-    def start_time(self):
-        return self._start
-
-    @property
-    def current_time(self):
-        return self._current_time
-
-    @current_time.setter
-    def current_time(self, v):
-        self._current_time = v
-
-    @property
-    def audio_diff_threshold(self):
-        return self._audio_diff_threshold
-
-    @property
-    def display_time(self):
-        return self._display_time
-
-    def update_display_time(self):
-        self._display_time = time.time()
-
-    def clock_time(self):
-        elapsed = time.time() - self._display_time
-        return self.current_time + elapsed

View File

@@ -1,23 +0,0 @@
-#encoding = utf8
-import copy
-import logging
-import time
-
-import cv2
-import numpy as np
-
-from .base_render import BaseRender
-
-
-class VideoRender(BaseRender):
-    def __init__(self, play_clock, context, human_render):
-        super().__init__(play_clock, context, 'Video')
-        self._human_render = human_render
-        self.index = 0
-
-    def render(self, frame, ps):
-        if self._human_render is not None:
-            self._human_render.put_image(frame)
-        # image = cv2.cvtColor(image, cv2.COLOR_BGRA2RGBA)

View File

@@ -1,39 +0,0 @@
-#encoding = utf8
-import logging
-import time
-from queue import Empty
-
-import numpy as np
-
-from audio_render import AudioRender
-from human.message_type import MessageType
-from .base_render import BaseRender
-
-logger = logging.getLogger(__name__)
-
-
-class VoiceRender(BaseRender):
-    def __init__(self, play_clock, context):
-        self._audio_render = AudioRender()
-        super().__init__(play_clock, context, 'Voice')
-        self._current_text = ''
-
-    def render(self, frame, ps):
-        self._play_clock.update_display_time()
-        self._play_clock.current_time = ps
-
-        for audio_frame in frame:
-            frame, type_ = audio_frame
-            chunk, txt = frame
-            if txt != self._current_text:
-                self._current_text = txt
-                logging.info(f'VoiceRender: {txt}')
-            chunk = (chunk * 32767).astype(np.int16)
-
-            if self._audio_render is not None:
-                try:
-                    chunk_len = int(chunk.shape[0] * 2)
-                    # print('audio frame:', frame.shape, chunk_len)
-                    self._audio_render.write(chunk.tobytes(), chunk_len)
-                except Exception as e:
-                    logging.error(f'Error writing audio frame: {e}')

View File

@@ -72,7 +72,7 @@ class TTSAudioSplitHandle(TTSAudioHandle):
             if chunks is not None:
                 for chunk in chunks:
                     self.on_next_handle((chunk, txt), 0)
-                    time.sleep(0.01)  # Sleep briefly to prevent busy-waiting
+                    time.sleep(0.001)  # Sleep briefly to prevent busy-waiting

     def on_handle(self, stream, index):
         if not self._is_running:

View File

@@ -3,9 +3,10 @@
 import os
 import logging
 import time
-from queue import Queue

-from human import HumanContext
+import numpy as np
+
+from human import HumanRender, RenderStatus
 from ipc import IPCUtil
 from utils import render_image
@@ -13,60 +14,60 @@ logger = logging.getLogger(__name__)
 current_file_path = os.path.dirname(os.path.abspath(__file__))


-class IpcRender:
-    def __init__(self):
-        self._human_context = None
-        self._queue = None
-        self._exit = False
-        self._ipc = None
+class IpcRender(HumanRender):
+    def __init__(self, context):
+        super().__init__(context, None)
+        self._ipc = IPCUtil('human_product', 'human_render')
+        self._current_text = ''

-    def _send_image(self, identifier, image):
+    def _send_image(self, image):
         height, width, channels = image.shape
-        t = time.perf_counter()
         width_bytes = width.to_bytes(4, byteorder='little')
         height_bytes = height.to_bytes(4, byteorder='little')
         bit_depth_bytes = channels.to_bytes(4, byteorder='little')
         img_bytes = image.tobytes()
+        identifier = b'\x01'
         data = identifier + width_bytes + height_bytes + bit_depth_bytes + img_bytes
         self._ipc.send_binary(data, len(data))
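
Note: the image packet is one identifier byte `\x01`, three little-endian uint32 fields (width, height, channels), then the raw pixel bytes. A hypothetical receiver could unpack the header like this (a sketch; the render client is not part of this repo, and it assumes one byte per channel, which matches `image.tobytes()` for a uint8 image):

    import struct

    def parse_image_packet(data: bytes):
        assert data[0:1] == b'\x01'                   # image identifier
        # '<III': three little-endian uint32 -> width, height, channels
        width, height, channels = struct.unpack_from('<III', data, 1)
        pixels = data[13:]                            # 1 + 3 * 4 header bytes
        assert len(pixels) == width * height * channels
        return width, height, channels, pixels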
+    def _send_voice(self, voice):
+        voice_identifier = b'\x02'
+        data = voice_identifier
+        for audio_frame in voice:
+            frame, type_ = audio_frame
+            chunk, txt = frame
+            if txt != self._current_text:
+                self._current_text = txt
+                logging.info(f'VoiceRender: {txt}')
+            chunk = (chunk * 32767).astype(np.int16)
+            voice_bytes = chunk.tobytes()
+            data = data + voice_bytes
+        self._ipc.send_binary(data, len(data))
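
Note: the voice packet mirrors the image one: identifier byte `\x02` followed by the batch's audio frames concatenated as 16-bit PCM (the float chunks are scaled by 32767, exactly as the deleted `VoiceRender` did before writing to the audio device). A matching decode on the receiving side might look like this (hypothetical sketch, assuming numpy's native little-endian int16 layout):

    import numpy as np

    def parse_voice_packet(data: bytes) -> np.ndarray:
        assert data[0:1] == b'\x02'        # voice identifier
        # Remaining bytes are int16 PCM samples at the pipeline's 16 kHz rate.
        return np.frombuffer(data[1:], dtype=np.int16)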
     def _on_reader_callback(self, data_str, size):
         data_str = data_str.decode('utf-8')
         print(f'on_reader_callback: {data_str}, size:{size}')
         if 'quit' == data_str:
-            self._exit = True
+            self._context.stop()
         elif 'heartbeat' == data_str:
             pass
+        elif 'full' == data_str:
+            self._render_status = RenderStatus.E_Full
+        elif 'empty' == data_str:
+            self._render_status = RenderStatus.E_Empty
+        elif 'normal' == data_str:
+            self._render_status = RenderStatus.E_Normal
     def run(self):
-        self._queue = Queue()
-        self._human_context = HumanContext()
-        self._human_context.build()
-
-        self._ipc = IPCUtil('human_product', 'human_render')
         self._ipc.set_reader_callback(self._on_reader_callback)
         logger.info(f'ipc listen:{self._ipc.listen()}')
+        super().run()

-        render = self._human_context.render_handler
-        render.set_image_render(self)
-
-        while not self._exit:
-            if not self._queue.empty():
-                while self._queue.qsize() > 5:
-                    self._queue.get()
-                    print('render queue is slower')
-                image = self._queue.get()
-                image = render_image(self._human_context, image)
-                self._send_image(b'\x01', image)
-            else:
-                time.sleep(0.02)
-
-        logger.info('ipc render exit')
-
-    def stop(self):
-        if self._human_context is not None:
-            self._human_context.stop()
-
-    def on_render(self, image):
-        self._queue.put(image)
+    def _render(self, video_frame, voice_frame):
+        image = render_image(self._context, video_frame)
+        self._send_image(image)
+        self._send_voice(voice_frame)