modify render video async

This commit is contained in:
brige 2024-10-25 08:23:55 +08:00
parent 1cc2617022
commit 53edda7ebe
13 changed files with 128 additions and 69 deletions

View File

(binary image file changed; 452 KiB before and after)

View File

(binary image file changed; 556 KiB before and after)

View File

@@ -35,6 +35,10 @@ class AudioInferenceHandler(AudioHandler):
         elif type_ == 0:
             self._audio_queue.put(stream)

+    def on_message(self, message):
+        print('human render notify:', message)
+        super().on_message(message)
+
     def __on_run(self):
         wav2lip_path = os.path.join(current_file_path, '..', 'checkpoints', 'wav2lip.pth')
         logger.info(f'AudioInferenceHandler init, path:{wav2lip_path}')

View File

@@ -3,10 +3,11 @@ import logging
 import queue
 import time
 from queue import Queue
-from threading import Thread, Event
+from threading import Thread, Event, Condition

 import numpy as np

+from human.message_type import MessageType
 from human_handler import AudioHandler
 from utils import melspectrogram
@@ -18,6 +19,8 @@ class AudioMalHandler(AudioHandler):
         super().__init__(context, handler)

         self._queue = Queue()
+        self._wait = False
+        self._condition = Condition()
         self._exit_event = Event()
         self._thread = Thread(target=self._on_run)
         self._exit_event.set()
@@ -27,14 +30,31 @@ class AudioMalHandler(AudioHandler):
         self.chunk = context.sample_rate // context.fps
         logger.info("AudioMalHandler init")

+    def on_message(self, message):
+        if message['msg_id'] == MessageType.Video_Render_Queue_Empty:
+            with self._condition:
+                if self._wait:
+                    self._wait = False
+                    self._condition.notify()
+                    print('AudioMalHandler notify')
+        elif message['msg_id'] == MessageType.Video_Render_Queue_Full:
+            if not self._wait:
+                self._wait = True
+                print('AudioMalHandler wait')
+        else:
+            super().on_message(message)
+
     def on_handle(self, stream, index):
         self._queue.put(stream)

     def _on_run(self):
         logging.info('chunk2mal run')
         while self._exit_event.is_set():
+            with self._condition:
+                self._condition.wait_for(lambda: not self._wait)
+                print('AudioMalHandler run')
             self._run_step()
-            time.sleep(0.3)
+            time.sleep(0.02)

         logging.info('chunk2mal exit')
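This hunk is the core of the commit: the mel-chunk producer now paces itself off render-side feedback, blocking on a Condition when the render queue reports Full and waking on the next Empty notification. A minimal, self-contained sketch of the same wait/notify backpressure pattern (class and method names here are illustrative, not from the repo):

# Illustrative sketch of the wait/notify backpressure used above.
from threading import Condition
import time

class PacedProducer:
    def __init__(self):
        self._wait = False
        self._condition = Condition()
        self._running = True

    def on_queue_full(self):          # analogous to Video_Render_Queue_Full
        with self._condition:
            self._wait = True

    def on_queue_empty(self):         # analogous to Video_Render_Queue_Empty
        with self._condition:
            if self._wait:
                self._wait = False
                self._condition.notify()

    def run(self):
        while self._running:
            with self._condition:
                # Block here while the consumer is saturated.
                self._condition.wait_for(lambda: not self._wait)
            self.produce_step()
            time.sleep(0.02)

    def produce_step(self):
        pass  # generate one chunk of work

One caveat worth noting: in the diff above, the Full branch flips self._wait without taking the Condition's lock; setting the flag under the lock, as in the sketch, rules out a missed-wakeup race between the check in wait_for and the notify.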

View File

@@ -22,6 +22,7 @@ class HumanContext:
         self._sample_rate = 16000
         self._stride_left_size = 10
         self._stride_right_size = 10
+        self._render_batch = 5

         self._device = get_device()
         print(f'device:{self._device}')
@@ -61,6 +62,10 @@ class HumanContext:
     def image_size(self):
         return self._image_size

+    @property
+    def render_batch(self):
+        return self._render_batch
+
     @property
     def device(self):
         return self._device
@@ -97,6 +102,12 @@ class HumanContext:
     def render_handler(self):
         return self._render_handler

+    def notify(self, message):
+        if self._tts_handle is not None:
+            self._tts_handle.on_message(message)
+        else:
+            logger.info(f'notify message:{message}')
+
     def build(self):
         self._render_handler = HumanRender(self, None)
         self._infer_handler = AudioInferenceHandler(self, self._render_handler)
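notify injects control messages at the head of the pipeline (the TTS handler), from which they propagate down via each handler's on_message. Usage matching what this commit adds elsewhere, with the MessageType import shown for context:

# A render component reporting its queue status into the pipeline.
from human.message_type import MessageType

context.notify({'msg_id': MessageType.Video_Render_Queue_Full})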

View File

@@ -9,6 +9,7 @@ from threading import Thread, Event
 import cv2
 import numpy as np

+from human.message_type import MessageType
 from human_handler import AudioHandler
 from render import VoiceRender, VideoRender, PlayClock
@@ -20,10 +21,10 @@ class HumanRender(AudioHandler):
         super().__init__(context, handler)

         play_clock = PlayClock()
-        self._voice_render = VoiceRender(play_clock)
+        self._voice_render = VoiceRender(play_clock, context)
         self._video_render = VideoRender(play_clock, context, self)
-        self._queue = Queue(context.batch_size * 2)
         self._image_render = None
+        self._last_ps = 0

     def set_image_render(self, render):
         self._image_render = render
@@ -32,17 +33,21 @@ class HumanRender(AudioHandler):
         if self._image_render is not None:
             self._image_render.on_render(image)

+    def on_message(self, message):
+        print('human render notify:', message)
+        super().on_message(message)
+
     def on_handle(self, stream, index):
         res_frame, idx, audio_frames = stream
-        self._voice_render.put(audio_frames)
+        self._voice_render.put(audio_frames, self._last_ps)
         type_ = 1
         if audio_frames[0][1] != 0 and audio_frames[1][1] != 0:
             type_ = 0
-        self._video_render.put((res_frame, idx, type_))
+        self._video_render.put((res_frame, idx, type_), self._last_ps)
+        self._last_ps = self._last_ps + 0.2

-    def pause_handle(self):
-        if self._video_render.size() > self._context.batch_size * 2:
-            super().pause_handle()
+        if self._voice_render.is_full():
+            self._context.notify({'msg_id': MessageType.Video_Render_Queue_Full})

     def pause_talk(self):
         self._voice_render.pause_talk()
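Timestamps are no longer derived from wall-clock time at enqueue (see the BaseRender change below): HumanRender stamps both renders with a shared _last_ps and advances it by a fixed 0.2 s per handled batch. Presumably 0.2 s is the playback duration of one batch; a sketch of deriving that step from context parameters rather than hardcoding it (frames_per_batch is an assumed name, not from the repo):

# Illustrative: derive the presentation-timestamp step instead of hardcoding 0.2.
# Assumes each on_handle() batch carries `frames_per_batch` video frames at `fps`.
def ps_step(frames_per_batch: int, fps: int) -> float:
    return frames_per_batch / fps

# e.g. 5 frames at 25 fps -> 0.2 s, matching the constant used above
assert abs(ps_step(5, 25) - 0.2) < 1e-9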

human/message_type.py (new file, 9 additions)

View File

@@ -0,0 +1,9 @@
+#encoding = utf8
+from enum import Enum
+
+
+class MessageType(Enum):
+    Unknown = 0
+    Video_Render_Queue_Empty = 1
+    Video_Render_Queue_Not_Empty = 2
+    Video_Render_Queue_Full = 3

View File

@@ -14,12 +14,16 @@ class AudioHandler(ABC):
     def on_handle(self, stream, index):
         pass

-    @abstractmethod
-    def pause_handle(self):
-        if self._handler is not None:
-            self._handler.pause_handle()
-        else:
-            logging.info(f'_handler is None')
+    def on_message(self, message):
+        if self._handler is not None:
+            self._handler.on_message(message)
+
+    # @abstractmethod
+    # def pause_handle(self):
+    #     if self._handler is not None:
+    #         self._handler.pause_handle()
+    #     else:
+    #         logging.info(f'_handler is None')

     @abstractmethod
     def stop(self):

View File

@@ -9,8 +9,10 @@ logger = logging.getLogger(__name__)

 class BaseRender(ABC):
-    def __init__(self, play_clock, delay=0.02):
+    def __init__(self, play_clock, context, type_, delay=0.02):
         self._play_clock = play_clock
+        self._context = context
+        self._type = type_
         self._delay = delay
         self._queue = Queue()
         self._exit_event = Event()
@@ -19,15 +21,14 @@ class BaseRender(ABC):
         self._thread.start()

     def _on_run(self):
-        logging.info('Audio render run')
+        logging.info(f'{self._type} render run')
         while self._exit_event.is_set():
             self._run_step()
             time.sleep(self._delay)
-        logging.info('Audio render exit')
+        logging.info(f'{self._type} render exit')

-    def put(self, frame):
-        ps = time.time() - self._play_clock.start_time
+    def put(self, frame, ps):
         self._queue.put_nowait((frame, ps))

     def size(self):

View File

@@ -17,14 +17,14 @@ class PlayClock:
     def current_time(self):
         return self._current_time

-    @property
-    def audio_diff_threshold(self):
-        return self._audio_diff_threshold
-
     @current_time.setter
     def current_time(self, v):
         self._current_time = v

+    @property
+    def audio_diff_threshold(self):
+        return self._audio_diff_threshold
+
     @property
     def display_time(self):
         return self._display_time

View File

@@ -2,31 +2,47 @@
 import copy
 import time
 from queue import Empty
+from enum import Enum

 import cv2
 import numpy as np

 from .base_render import BaseRender
+from human.message_type import MessageType


 class VideoRender(BaseRender):
     def __init__(self, play_clock, context, human_render):
-        super().__init__(play_clock, 0.02)
-        self._context = context
+        super().__init__(play_clock, context, 'Video')
         self._human_render = human_render

     def _run_step(self):
+        while self._exit_event.is_set():
             try:
                 frame, ps = self._queue.get(block=True, timeout=0.01)
                 res_frame, idx, type_ = frame
+                print('video render queue size', self._queue.qsize())
             except Empty:
                 return

+            clock_time = self._play_clock.clock_time()
+            time_difference = clock_time - ps
+            print('video render:', ps, ' ', clock_time, ' ', time_difference)
+            if time_difference < -self._play_clock.audio_diff_threshold:
+                sleep_time = abs(time_difference + self._play_clock.audio_diff_threshold)
+                print("Video frame waiting to catch up with audio", sleep_time)
+                if sleep_time > 0:
+                    time.sleep(sleep_time)
+            elif time_difference > self._play_clock.audio_diff_threshold:  # video ahead of audio by more than the threshold
+                print("Video frame dropped to catch up with audio")
+                continue
+
+            print('get face', self._queue.qsize())
             if type_ == 0:
                 combine_frame = self._context.frame_list_cycle[idx]
             else:
-                print('get face', self._queue.qsize())
                 bbox = self._context.coord_list_cycle[idx]
                 combine_frame = copy.deepcopy(self._context.frame_list_cycle[idx])
                 y1, y2, x1, x2 = bbox
@@ -35,31 +51,10 @@ class VideoRender(BaseRender):
             except:
                 print('resize error')
                 return
-            # combine_frame = get_image(ori_frame,res_frame,bbox)
-            # t=time.perf_counter()
             combine_frame[y1:y2, x1:x2] = res_frame

-        clock_time = self._play_clock.clock_time()
-        time_difference = clock_time - ps
-        print('video render:', ps, ' ', clock_time, ' ', time_difference)
-        if time_difference < -0.01:  # audio ahead of video by more than 10 ms
-            sleep_time = abs(time_difference + 0.01)
-            print("Video frame waiting to catch up with audio", sleep_time)
-            if sleep_time > 0:
-                time.sleep(sleep_time)  # only sleep for positive values
-            return  # keep waiting
-        elif time_difference < -0.01:  # video ahead of audio by more than 10 ms
-            print("Video frame dropped to catch up with audio")
-            return  # drop the frame
-
-        # if time_difference > self._play_clock.audio_diff_threshold:
-        #     # print('video is slow')
-        #     return
-        # elif time_difference < self._play_clock.audio_diff_threshold:
             image = combine_frame
             image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
             if self._human_render is not None:
                 self._human_render.put_image(image)
-        return
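The A/V sync decision now runs before compositing and uses the clock's audio_diff_threshold symmetrically: a frame that is early (clock behind its ps) is delayed, a frame that is late is dropped with continue. A condensed, runnable sketch of that policy, with a stub standing in for PlayClock (whose internals this diff does not show; the 10 ms threshold is an assumption):

import time

# Stub in place of PlayClock; the real class is not shown in this diff.
class StubClock:
    audio_diff_threshold = 0.01     # assumed 10 ms tolerance

    def __init__(self):
        self._t0 = time.time()

    def clock_time(self):
        return time.time() - self._t0

def should_show(clock, ps):
    """Sleep if the frame is early; return False to drop it if late."""
    diff = clock.clock_time() - ps
    if diff < -clock.audio_diff_threshold:        # early: wait for audio
        time.sleep(abs(diff + clock.audio_diff_threshold))
        return True                               # then composite and show
    if diff > clock.audio_diff_threshold:         # late: drop to catch up
        return False
    return True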

View File

@@ -6,23 +6,35 @@ from queue import Empty

 import numpy as np

 from audio_render import AudioRender
+from human.message_type import MessageType
 from .base_render import BaseRender

 logger = logging.getLogger(__name__)


 class VoiceRender(BaseRender):
-    def __init__(self, play_clock):
-        super().__init__(play_clock)
+    def __init__(self, play_clock, context):
+        super().__init__(play_clock, context, 'Voice')
         self._audio_render = AudioRender()

+    def is_full(self):
+        return self._queue.qsize() >= self._context.render_batch * 2
+
     def _run_step(self):
         try:
             audio_frames, ps = self._queue.get(block=True, timeout=0.01)
             print('voice render queue size', self._queue.qsize())
         except Empty:
+            self._context.notify({'msg_id': MessageType.Video_Render_Queue_Empty})
             return

+        status = MessageType.Video_Render_Queue_Not_Empty
+        if self._queue.qsize() < self._context.render_batch:
+            status = MessageType.Video_Render_Queue_Empty
+        elif self._queue.qsize() >= self._context.render_batch * 2:
+            status = MessageType.Video_Render_Queue_Full
+        self._context.notify({'msg_id': status})
+
         self._play_clock.update_display_time()
         self._play_clock.current_time = ps
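VoiceRender now doubles as the flow-control sensor: after each dequeue it maps its own queue depth onto Empty / Not_Empty / Full using render_batch and render_batch * 2 as low and high watermarks, then broadcasts the result via context.notify. A standalone sketch of that mapping (enum and function names are illustrative):

# Standalone sketch of the queue-depth watermark mapping used above.
from enum import Enum

class Status(Enum):
    EMPTY = 1
    NOT_EMPTY = 2
    FULL = 3

def queue_status(depth: int, render_batch: int) -> Status:
    low, high = render_batch, render_batch * 2   # same thresholds as the diff
    if depth < low:
        return Status.EMPTY
    if depth >= high:
        return Status.FULL
    return Status.NOT_EMPTY

# e.g. with render_batch = 5: depth 3 -> EMPTY, 7 -> NOT_EMPTY, 10 -> FULL

The gap between the two watermarks provides hysteresis, so the producer is not toggled between wait and run on every single frame.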

View File

@@ -33,8 +33,6 @@ class TTSEdgeHttp(TTSBase):
             async with session.post(self._url, json=data) as response:
                 if response.status == 200:
                     stream = BytesIO(await response.read())
-                    print("Audio data received and saved to output_audio.wav")
-
                     return stream
                 else:
                     byte_stream = None