From b659e227089188e49084175e231e460e6e23a77b Mon Sep 17 00:00:00 2001
From: brige
Date: Wed, 23 Oct 2024 17:44:33 +0800
Subject: [PATCH] render: fix A/V sync and frame typing, use relative imports,
 stub ASR input for testing

---
 asr/sherpa_ncnn_asr.py | 46 ++++++++++++++++++++++++------------------
 human/human_render.py  |  2 +-
 nlp/nlp_doubao.py      |  2 +-
 render/base_render.py  | 11 +++++-----
 render/play_clock.py   |  2 +-
 render/video_render.py | 35 +++++++++++++++++++++++---------
 render/voice_render.py |  5 +++--
 ui.py                  |  1 -
 utils/utils.py         |  4 ++--
 9 files changed, 66 insertions(+), 42 deletions(-)

diff --git a/asr/sherpa_ncnn_asr.py b/asr/sherpa_ncnn_asr.py
index 331320c..cb94bff 100644
--- a/asr/sherpa_ncnn_asr.py
+++ b/asr/sherpa_ncnn_asr.py
@@ -2,6 +2,7 @@
 import logging
 import os
 import sys
+import time
 
 try:
     import sounddevice as sd
@@ -58,23 +59,28 @@
         segment_id = 0
         last_result = ""
         logger.info(f'_recognize_loop')
-        with sd.InputStream(channels=1, dtype="float32", samplerate=self._sample_rate) as s:
-            while not self._stop_event.is_set():
-                samples, _ = s.read(self._samples_per_read)  # a blocking read
-                samples = samples.reshape(-1)
-                self._recognizer.accept_waveform(self._sample_rate, samples)
-
-                is_endpoint = self._recognizer.is_endpoint
-
-                result = self._recognizer.text
-                if result and (last_result != result):
-                    last_result = result
-                    print("\r{}:{}".format(segment_id, result), end=".", flush=True)
-                    self._notify_process(result)
-
-                if is_endpoint:
-                    if result:
-                        print("\r{}:{}".format(segment_id, result), flush=True)
-                        self._notify_complete(result)
-                        segment_id += 1
-                    self._recognizer.reset()
+        while not self._stop_event.is_set():
+            self._notify_complete('中国人民万岁')  # fixed test phrase; the microphone path below is disabled
+            segment_id += 1
+            time.sleep(10)
+        #
+        # with sd.InputStream(channels=1, dtype="float32", samplerate=self._sample_rate) as s:
+        #     while not self._stop_event.is_set():
+        #         samples, _ = s.read(self._samples_per_read)  # a blocking read
+        #         samples = samples.reshape(-1)
+        #         self._recognizer.accept_waveform(self._sample_rate, samples)
+        #
+        #         is_endpoint = self._recognizer.is_endpoint
+        #
+        #         result = self._recognizer.text
+        #         if result and (last_result != result):
+        #             last_result = result
+        #             print("\r{}:{}".format(segment_id, result), end=".", flush=True)
+        #             self._notify_process(result)
+        #
+        #         if is_endpoint:
+        #             if result:
+        #                 print("\r{}:{}".format(segment_id, result), flush=True)
+        #                 self._notify_complete(result)
+        #                 segment_id += 1
+        #             self._recognizer.reset()
diff --git a/human/human_render.py b/human/human_render.py
index aa0a0c6..b6825bd 100644
--- a/human/human_render.py
+++ b/human/human_render.py
@@ -37,7 +37,7 @@ class HumanRender(AudioHandler):
         self._voice_render.put(audio_frames)
         type_ = 1
         if audio_frames[0][1] != 0 and audio_frames[1][1] != 0:
-            type_ = 1
+            type_ = 0
         self._video_render.put((res_frame, idx, type_))
 
     def stop(self):
diff --git a/nlp/nlp_doubao.py b/nlp/nlp_doubao.py
index 1bd370c..449519f 100644
--- a/nlp/nlp_doubao.py
+++ b/nlp/nlp_doubao.py
@@ -44,7 +44,7 @@ class DouBao(NLPBase):
         sec = ''
         async for completion in stream:
             sec = sec + completion.choices[0].delta.content
-            print(sec)
+            # print(sec)
             sec, message = self._split_handle.handle(sec)
             if len(message) > 0:
                 self._on_callback(message)
diff --git a/render/base_render.py b/render/base_render.py
index 4138ad7..3911e5c 100644
--- a/render/base_render.py
+++ b/render/base_render.py
@@ -9,8 +9,9 @@
 logger = logging.getLogger(__name__)
 
 class BaseRender(ABC):
-    def __init__(self, play_clock):
+    def __init__(self, play_clock, delay=0.02):
         self._play_clock = play_clock
+        self._delay = delay
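+        # seconds _on_run sleeps between _run_step calls; subclasses may pass their own value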
         self._queue = Queue()
         self._exit_event = Event()
         self._thread = Thread(target=self._on_run)
@@ -20,13 +21,13 @@
 
     def _on_run(self):
         logging.info('Audio render run')
         while self._exit_event.is_set():
-            self.__run_step()
-            time.sleep(0.02)
+            self._run_step()
+            time.sleep(self._delay)
         logging.info('Audio render exit')
 
     def put(self, frame):
-        ps = time.time() - self._play_clock.start_time()
+        ps = time.time() - self._play_clock.start_time
         self._queue.put_nowait((frame, ps))
 
     def stop(self):
@@ -35,7 +36,7 @@
         self._thread.join()
 
     @abstractmethod
-    def __run_step(self):
+    def _run_step(self):
         pass
 
diff --git a/render/play_clock.py b/render/play_clock.py
index 0fc02c6..8bcb767 100644
--- a/render/play_clock.py
+++ b/render/play_clock.py
@@ -6,7 +6,7 @@ class PlayClock:
     def __init__(self):
         self._start = time.time()
         self._current_time = 0
-        self._display_time = 0
+        self._display_time = self._start
 
         self._audio_diff_threshold = 0.01
 
     @property
diff --git a/render/video_render.py b/render/video_render.py
index 8e5c215..4df39c9 100644
--- a/render/video_render.py
+++ b/render/video_render.py
@@ -1,44 +1,61 @@
 #encoding = utf8
 import copy
+import time
 from queue import Empty
 
 import cv2
 import numpy as np
 
-from base_render import BaseRender
+from .base_render import BaseRender
 
 
 class VideoRender(BaseRender):
     def __init__(self, play_clock, context, human_render):
-        super().__init__(play_clock)
+        super().__init__(play_clock, 0.02)
         self._context = context
         self._human_render = human_render
 
-    def __run_step(self):
+    def _run_step(self):
         try:
-            res_frame, idx, type_, ps = self._queue.get(block=True, timeout=0.01)
+            frame, ps = self._queue.get(block=True, timeout=0.01)
+            res_frame, idx, type_ = frame
+            print('video render queue size', self._queue.qsize())
         except Empty:
             return
 
-        if type_ != 0:
+        if type_ == 0:
             combine_frame = self._context.frame_list_cycle[idx]
         else:
+            print('get face', self._queue.qsize())
             bbox = self._context.coord_list_cycle[idx]
             combine_frame = copy.deepcopy(self._context.frame_list_cycle[idx])
             y1, y2, x1, x2 = bbox
             try:
                 res_frame = cv2.resize(res_frame.astype(np.uint8), (x2 - x1, y2 - y1))
             except:
+                print('resize error')
                 return
             # combine_frame = get_image(ori_frame,res_frame,bbox)
             # t=time.perf_counter()
             combine_frame[y1:y2, x1:x2] = res_frame
 
         clock_time = self._play_clock.clock_time()
-        time_difference = abs(clock_time - ps)
-        if time_difference > self._play_clock.audio_diff_threshold:
-            print('video is slow')
-            return
+        time_difference = clock_time - ps
+
+        print('video render:', ps, ' ', clock_time, ' ', time_difference)
+        if time_difference < -0.01:
+            # Frame is more than 10 ms ahead of the audio clock: wait for the
+            # audio to catch up, then fall through and render this frame.
+            sleep_time = abs(time_difference + 0.01)
+            print("Video frame waiting to catch up with audio", sleep_time)
+            if sleep_time > 0:
+                time.sleep(sleep_time)  # only sleep for a positive duration
+        elif time_difference > 0.01:
+            # Frame is more than 10 ms behind the audio clock: drop it.
+            print("Video frame dropped to catch up with audio")
+            return
+        # if time_difference > self._play_clock.audio_diff_threshold:
+        #     # print('video is slow')
+        #     return
         # elif time_difference < self._play_clock.audio_diff_threshold:
 
         image = combine_frame
diff --git a/render/voice_render.py b/render/voice_render.py
index 0a703b0..aa0f283 100644
--- a/render/voice_render.py
+++ b/render/voice_render.py
@@ -6,7 +6,7 @@
 from queue import Empty
 
 import numpy as np
 from audio_render import AudioRender
-from base_render import BaseRender
+from .base_render import BaseRender
 
 logger = logging.getLogger(__name__)
@@ -16,9 +16,10 @@ class VoiceRender(BaseRender):
         super().__init__(play_clock)
         self._audio_render = AudioRender()
 
-    def __run_step(self):
+    def _run_step(self):
         try:
             audio_frames, ps = self._queue.get(block=True, timeout=0.01)
+            print('voice render queue size', self._queue.qsize())
         except Empty:
             return
diff --git a/ui.py b/ui.py
index c9dd9fa..459d670 100644
--- a/ui.py
+++ b/ui.py
@@ -67,7 +67,6 @@
         self._human_context.build()
         render = self._human_context.render_handler
         render.set_image_render(self)
-        render.set_audio_render(self._audio_render)
 
         self._render()
         # self.play_audio()
diff --git a/utils/utils.py b/utils/utils.py
index 5cad18a..92701ed 100644
--- a/utils/utils.py
+++ b/utils/utils.py
@@ -37,7 +37,7 @@ def read_files_path(path):
     file_paths = []
     files = os.listdir(path)
     for file in files:
-        if not os.path.isdir(file):
+        if not os.path.isdir(os.path.join(path, file)) and (file.endswith('.png') or file.endswith('.jpg')):
             file_paths.append(os.path.join(path, file))
     return file_paths
@@ -177,7 +177,7 @@
     return full_list_cycle, face_frames, coord_frames
 
 
-def config_logging(file_name: str, console_level: int=logging.INFO, file_level: int=logging.DEBUG):
+def config_logging(file_name: str, console_level: int = logging.INFO, file_level: int = logging.DEBUG):
     file_handler = logging.FileHandler(file_name, mode='a', encoding="utf8")
     file_handler.setFormatter(logging.Formatter(
         '%(asctime)s [%(levelname)s] %(module)s.%(lineno)d %(name)s:\t%(message)s'