Compare commits


18 Commits
V1.0.0 ... main

Author SHA1 Message Date
b4e74dd556 Merge pull request 'develop' (#4) from develop into main
Reviewed-on: #4
2024-12-23 22:43:51 +00:00
2bd94b9680 Merge branch 'main' into develop 2024-12-23 22:43:01 +00:00
421bdfb825 Merge branch 'opti_render' into develop 2024-12-24 06:39:55 +08:00
cbf1e360e8 modify ipc 2024-12-11 01:28:41 +08:00
e5bc16133c modify handle 2024-12-10 23:20:49 +08:00
74f993bdad opti render 2024-12-09 02:03:33 +08:00
34787ae4d4 modify render 2024-12-09 01:20:48 +08:00
23cab9d86b modify break logic 2024-12-08 13:04:14 +08:00
0706a661d8 modify doubao key 2024-12-07 19:01:39 +08:00
0eefa7b1ce modify render talking 2024-12-06 08:27:18 +08:00
31f9ec50cb add kimi nlp 2024-12-06 00:16:41 +08:00
322ff33c84 modify ipc 2024-12-05 00:47:17 +08:00
2a2a3cd349 add modify human render 2024-12-04 02:07:18 +08:00
e9691dbc81 add main ipc render 2024-12-03 00:14:00 +08:00
a23a33af1c modify ipc util 2024-12-02 01:30:44 +08:00
f489dcaf5d add ipc 2024-11-27 01:36:17 +08:00
496bb7fe55 modify render 2024-11-26 06:14:03 +08:00
0760f92064 first opti render 2024-11-26 05:27:00 +08:00
42 changed files with 600 additions and 363 deletions

View File

@ -30,11 +30,11 @@ class AsrBase:
pass
def _notify_process(self, message: str):
EventBus().post('clear_cache')
for observer in self._observers:
observer.process(message)
def _notify_complete(self, message: str):
EventBus().post('clear_cache')
for observer in self._observers:
observer.completed(message)
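For context, the notify helpers above fan out to observers that expose process() and completed(); a minimal observer sketch (illustrative only — the interface is inferred from the two calls above, not shown in this compare):

class PrintObserver:
    def process(self, message: str):
        # partial ASR hypothesis
        print('partial:', message)
    def completed(self, message: str):
        # final ASR result
        print('final:', message)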

View File

@ -83,15 +83,16 @@ class SherpaNcnnAsr(AsrBase):
self._notify_complete(result)
segment_id += 1
self._recognizer.reset()
'''
while self._stop_event.is_set():
while self._stop_event.is_set():
logger.info(f'_recognize_loop000')
self._notify_complete('介绍中国5000年历史文学')  # test utterance: "Introduce 5,000 years of Chinese history and literature"
logger.info(f'_recognize_loop111')
segment_id += 1
time.sleep(150)
logger.info(f'_recognize_loop222')
logger.info(f'_recognize_loop exit')
logger.info(f'_recognize_loop exit')
'''

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -1,3 +0,0 @@
#encoding = utf8
from .audio_render import AudioRender

View File

@ -1,39 +0,0 @@
#encoding = utf8
from ctypes import *
import os
import numpy as np
current = os.path.dirname(__file__)
dynamic_path = os.path.join(current, 'AudioRender.dll')
def audio_render_log_callback(level, log, size):
print(f'level={level}, log={log}, len={size}')
class AudioRender:
def __init__(self):
self.__audio_render_obj = WinDLL(dynamic_path)
print(self.__audio_render_obj)
if self.__audio_render_obj is not None:
CALLBACK_TYPE = CFUNCTYPE(None, c_int, c_ubyte, c_uint)
c_callback = CALLBACK_TYPE(audio_render_log_callback)
self.__init = self.__audio_render_obj.Initialize(c_callback)
print('AudioRender init', self.__init)
def __del__(self):
print('AudioRender __del__')
if self.__audio_render_obj is None:
return
if self.__init:
self.__audio_render_obj.Uninitialize()
def write(self, data, size):
if not self.__init:
return False
self.__audio_render_obj.argtypes = (POINTER(c_uint8), c_uint)
byte_data = np.frombuffer(data, dtype=np.uint8)
return self.__audio_render_obj.Write(byte_data.ctypes.data_as(POINTER(c_uint8)), size)

View File

@ -4,4 +4,5 @@ from .human_context import HumanContext
from .audio_mal_handler import AudioMalHandler
from .audio_inference_handler import AudioInferenceHandler
from .audio_inference_onnx_handler import AudioInferenceOnnxHandler
from .human_render import HumanRender
from .huaman_status import HumanStatusEnum, HumanStatus
from .human_render import HumanRender, RenderStatus

View File

@ -3,15 +3,16 @@ import logging
import os
import queue
import time
from queue import Queue
from threading import Event, Thread
import cv2
import numpy as np
import torch
from eventbus import EventBus
from human_handler import AudioHandler
from utils import load_model, mirror_index, get_device, SyncQueue
from .huaman_status import HumanStatus
logger = logging.getLogger(__name__)
current_file_path = os.path.dirname(os.path.abspath(__file__))
@ -22,7 +23,6 @@ class AudioInferenceHandler(AudioHandler):
super().__init__(context, handler)
EventBus().register('stop', self._on_stop)
EventBus().register('clear_cache', self.on_clear_cache)
self._mal_queue = SyncQueue(1, "AudioInferenceHandler_Mel")
self._audio_queue = SyncQueue(context.batch_size * 2, "AudioInferenceHandler_Audio")
@ -36,15 +36,10 @@ class AudioInferenceHandler(AudioHandler):
def __del__(self):
EventBus().unregister('stop', self._on_stop)
EventBus().unregister('clear_cache', self.on_clear_cache)
def _on_stop(self, *args, **kwargs):
self.stop()
def on_clear_cache(self, *args, **kwargs):
self._mal_queue.clear()
self._audio_queue.clear()
def on_handle(self, stream, type_):
if not self._is_running:
return
@ -67,12 +62,13 @@ class AudioInferenceHandler(AudioHandler):
logger.info("Model loaded")
face_list_cycle = self._context.face_list_cycle
length = len(face_list_cycle)
index = 0
count = 0
count_time = 0
logger.info('start inference')
silence_length = 133
human_status = HumanStatus(length, silence_length)
device = get_device()
logger.info(f'use device:{device}')
@ -107,16 +103,22 @@ class AudioInferenceHandler(AudioHandler):
for i in range(batch_size):
if not self._is_running:
break
self.on_next_handle((None, mirror_index(length, index), audio_frames[i * 2:i * 2 + 2]),
0)
index = index + 1
# self.on_next_handle((None, mirror_index(length, index),
self.on_next_handle((None, human_status.get_index(),
audio_frames[i * 2:i * 2 + 2]), 0)
# index = index + 1
else:
human_status.start_talking()
logger.info(f'infer======= {current_text}')
# human_status.try_to_talk()
t = time.perf_counter()
img_batch = []
index_list = []
# for i in range(batch_size):
for i in range(len(mel_batch)):
idx = mirror_index(length, index + i)
# idx = mirror_index(length, index + i)
idx = human_status.get_index()
index_list.append(idx)
face = face_list_cycle[idx]
img_batch.append(face)
@ -150,9 +152,11 @@ class AudioInferenceHandler(AudioHandler):
if not self._is_running:
break
self.on_next_handle(
(res_frame, mirror_index(length, index), audio_frames[i * 2:i * 2 + 2]),
# (res_frame, mirror_index(length, index), audio_frames[i * 2:i * 2 + 2]),
(res_frame, index_list[i], audio_frames[i * 2:i * 2 + 2]),
0)
index = index + 1
# index = index + 1
logger.info(f'total batch time: {time.perf_counter() - start_time}')
else:
time.sleep(1)
@ -171,6 +175,4 @@ class AudioInferenceHandler(AudioHandler):
def pause_talk(self):
print('AudioInferenceHandler pause_talk', self._audio_queue.size(), self._mal_queue.size())
self._audio_queue.clear()
print('AudioInferenceHandler111')
self._mal_queue.clear()
print('AudioInferenceHandler222')

View File

@ -5,7 +5,9 @@ import queue
import time
from threading import Event, Thread
from gfpgan import GFPGANer
import cv2
# from gfpgan import GFPGANer
from eventbus import EventBus
from human_handler import AudioHandler
from utils import load_model, mirror_index, get_device, SyncQueue
@ -16,32 +18,32 @@ current_file_path = os.path.dirname(os.path.abspath(__file__))
def load_gfpgan_model(model_path):
logger.info(f'load_gfpgan_model, path:{model_path}')
model = GFPGANer(
model_path=model_path,
upscale=1,
arch='clean',
channel_multiplier=2,
bg_upsampler=None,
)
return model
# model = GFPGANer(
# model_path=model_path,
# upscale=1,
# arch='clean',
# channel_multiplier=2,
# bg_upsampler=None,
# )
return None
#model
def load_model(model_path):
import onnxruntime as ort
sess_opt = ort.SessionOptions()
sess_opt.intra_op_num_threads = 8
sess = ort.InferenceSession(model_path, sess_options=sess_opt, providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])
return sess
# import onnxruntime as ort
# sess_opt = ort.SessionOptions()
# sess_opt.intra_op_num_threads = 8
# sess = ort.InferenceSession(model_path, sess_options=sess_opt, providers=['CUDAExecutionProvider', 'CPUExecutionProvider'])
#
# return sess
return None
class AudioInferenceOnnxHandler(AudioHandler):
def __init__(self, context, handler):
super().__init__(context, handler)
EventBus().register('stop', self._on_stop)
EventBus().register('clear_cache', self.on_clear_cache)
self._mal_queue = SyncQueue(1, "AudioInferenceHandler_Mel")
self._audio_queue = SyncQueue(context.batch_size * 2, "AudioInferenceHandler_Audio")
@ -55,15 +57,10 @@ class AudioInferenceOnnxHandler(AudioHandler):
def __del__(self):
EventBus().unregister('stop', self._on_stop)
EventBus().unregister('clear_cache', self.on_clear_cache)
def _on_stop(self, *args, **kwargs):
self.stop()
def on_clear_cache(self, *args, **kwargs):
self._mal_queue.clear()
self._audio_queue.clear()
def on_handle(self, stream, type_):
if not self._is_running:
return
@ -92,9 +89,9 @@ class AudioInferenceOnnxHandler(AudioHandler):
gfpgan_model = load_gfpgan_model(gfpgan_model_path)
face_list_cycle = self._context.face_list_cycle
length = len(face_list_cycle)
for i in range(length):
cv2.imwrite(f'face_{i}.png', face_list_cycle[i])
index = 0
count = 0
count_time = 0
@ -156,18 +153,6 @@ class AudioInferenceOnnxHandler(AudioHandler):
onnx_out = model_g.run(onnx_names, onnx_input)[0]
pred = onnx_out
# onnxruntime_inputs = {"audio_seqs__0": mel_batch, }
# onnxruntime_names = [output.name for output in model_a.get_outputs()]
# embeddings = model_a.run(onnxruntime_names, onnxruntime_inputs)[0]
#
# onnxruntime_inputs = {"audio_embedings__0": embeddings, "img_seqs__1": img_batch}
# onnxruntime_names = [output.name for output in model_g.get_outputs()]
#
# start_model = time.time()
# onnxruntime_output = model_g.run(onnxruntime_names, onnxruntime_inputs)[0]
# end_model = time.time()
# pred = onnxruntime_output
count_time += (time.perf_counter() - t)
count += batch_size
@ -205,3 +190,4 @@ class AudioInferenceOnnxHandler(AudioHandler):
print('AudioInferenceHandler111')
self._mal_queue.clear()
print('AudioInferenceHandler222')
super().pause_talk()

View File

@ -55,8 +55,7 @@ class AudioMalHandler(AudioHandler):
logging.info('chunk2mal run')
while self._exit_event.is_set() and self._is_running:
self._run_step()
time.sleep(0.02)
# time.sleep(0.01)
logging.info('chunk2mal exit')
def _run_step(self):
@ -80,7 +79,6 @@ class AudioMalHandler(AudioHandler):
# print(mel.shape[0],mel.shape,len(mel[0]),len(self.frames))
# cut off stride
left = max(0, self._context.stride_left_size * 80 / self._context.fps)
right = min(len(mel[0]), len(mel[0]) - self._context.stride_right_size * 80 / 50)
mel_idx_multiplier = 80. * 2 / self._context.fps
mel_step_size = 16
i = 0
@ -107,6 +105,7 @@ class AudioMalHandler(AudioHandler):
chunk = np.zeros(self.chunk, dtype=np.float32)
frame = (chunk, '')
type_ = 1
# time.sleep(0.02)
# logging.info(f'AudioMalHandler get_audio_frame type:{type_}')
return frame, type_
@ -124,3 +123,4 @@ class AudioMalHandler(AudioHandler):
def pause_talk(self):
print('AudioMalHandler pause_talk', self._queue.size())
self._queue.clear()
super().pause_talk()
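The left trim and mel_idx_multiplier above feed a wav2lip-style 16-frame mel window loop; the loop body falls outside this hunk, so the following is a hedged reconstruction (left, mel, mel_idx_multiplier and mel_step_size come from the hunk; batch_size and the clamping at the end are assumptions):

mel_chunks = []
i = 0
while i < batch_size:
    start_idx = int(left + i * mel_idx_multiplier)
    if start_idx + mel_step_size > len(mel[0]):
        # clamp the last window to the end of the spectrogram
        mel_chunks.append(mel[:, len(mel[0]) - mel_step_size:])
    else:
        mel_chunks.append(mel[:, start_idx:start_idx + mel_step_size])
    i += 1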

60
human/huaman_status.py Normal file
View File

@ -0,0 +1,60 @@
#encoding = utf8
import logging
from enum import Enum
class HumanStatusEnum(Enum):
silence = 1
talking = 2
class HumanStatus:
def __init__(self, total_frames=0, silence_length=0):
self._status = HumanStatusEnum.silence
self._total_frames = total_frames
self._silence_length = silence_length
self._talking_length = total_frames - silence_length
self._current_frame = 0
self._is_talking = False
def get_status(self):
return self._status
def set_status(self, status):
self._status = status
return self._status
def try_to_talk(self):
if self._status == HumanStatusEnum.silence:
if self._current_frame - self._silence_length < 0:
return False
self._status = HumanStatusEnum.talking
return True
def get_index(self):
if not self._is_talking:
index = self._current_frame % self._silence_length
if self._current_frame >= self._silence_length:
self._is_talking = True
self._current_frame = 0
else:
index = self._silence_length + (self._current_frame - self._silence_length) % self._talking_length
if self._current_frame >= self._silence_length + self._talking_length:
self._is_talking = False
self._current_frame = 0
self._current_frame = (self._current_frame + 1) % self._total_frames
return index
def start_talking(self):
self._is_talking = True
def stop_talking(self):
self._is_talking = False
self._current_frame = 0
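A quick usage sketch of the state machine above (hypothetical frame counts, not part of the diff): with 10 total frames and a 4-frame silence segment, get_index() loops over frames 0-3 while silent, and start_talking() moves it into the talking range 4-9.

status = HumanStatus(total_frames=10, silence_length=4)
idle = [status.get_index() for _ in range(4)]   # [0, 1, 2, 3]
status.start_talking()
talk = [status.get_index() for _ in range(6)]   # [4, 5, 6, 7, 8, 9]
status.stop_talking()                           # back to the silence loop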

View File

@ -4,11 +4,9 @@ import os
from asr import SherpaNcnnAsr
from eventbus import EventBus
from .audio_inference_onnx_handler import AudioInferenceOnnxHandler
from .audio_inference_handler import AudioInferenceHandler
from .audio_mal_handler import AudioMalHandler
from .human_render import HumanRender
from nlp import PunctuationSplit, DouBao
from nlp import PunctuationSplit, DouBao, Kimi
from tts import TTSEdge, TTSAudioSplitHandle, TTSEdgeHttp
from utils import load_avatar, get_device, object_stop, load_avatar_from_processed, load_avatar_from_256_processed
@ -18,13 +16,12 @@ current_file_path = os.path.dirname(os.path.abspath(__file__))
class HumanContext:
def __init__(self):
self._fps = 25 # 20 ms per frame
self._fps = 50 # 20 ms per frame
self._image_size = 288
self._batch_size = 16
self._batch_size = 6
self._sample_rate = 16000
self._stride_left_size = 10
self._stride_right_size = 10
self._render_batch = 5
self._stride_left_size = 4
self._stride_right_size = 4
self._asr = None
self._nlp = None
@ -64,10 +61,6 @@ class HumanContext:
def image_size(self):
return self._image_size
@property
def render_batch(self):
return self._render_batch
@property
def device(self):
return self._device
@ -118,14 +111,15 @@ class HumanContext:
else:
logger.info(f'notify message:{message}')
def build(self):
self._render_handler = HumanRender(self, None)
def build(self, render_handler):
self._render_handler = render_handler
self._infer_handler = AudioInferenceHandler(self, self._render_handler)
self._mal_handler = AudioMalHandler(self, self._infer_handler)
self._tts_handle = TTSAudioSplitHandle(self, self._mal_handler)
self._tts = TTSEdgeHttp(self._tts_handle)
split = PunctuationSplit()
self._nlp = DouBao(self, split, self._tts)
# self._nlp = Kimi(self, split, self._tts)
self._asr = SherpaNcnnAsr()
self._asr.attach(self._nlp)

View File

@ -2,66 +2,73 @@
import logging
import time
from enum import Enum
from queue import Empty
from threading import Event, Thread
from eventbus import EventBus
from human.message_type import MessageType
from human_handler import AudioHandler
from render import VoiceRender, VideoRender, PlayClock
from utils import SyncQueue
logger = logging.getLogger(__name__)
class RenderStatus(Enum):
E_Normal = 0
E_Full = 1
E_Empty = 2
class HumanRender(AudioHandler):
def __init__(self, context, handler):
super().__init__(context, handler)
EventBus().register('stop', self._on_stop)
EventBus().register('clear_cache', self.on_clear_cache)
play_clock = PlayClock()
self._voice_render = VoiceRender(play_clock, context)
self._video_render = VideoRender(play_clock, context, self)
self._is_running = True
self._queue = SyncQueue(context.batch_size, "HumanRender_queue")
self._exit_event = Event()
self._thread = Thread(target=self._on_run, name="AudioMalHandlerThread")
self._exit_event.set()
self._thread.start()
self._image_render = None
self._last_audio_ps = 0
self._last_video_ps = 0
self._empty_log = True
self._should_exit = False
self._render_status = RenderStatus.E_Empty
def __del__(self):
EventBus().unregister('stop', self._on_stop)
EventBus().unregister('clear_cache', self.on_clear_cache)
def _on_stop(self, *args, **kwargs):
self._should_exit = True
self.stop()
def on_clear_cache(self, *args, **kwargs):
self._queue.clear()
def _render(self, video_frame, voice_frame):
pass
def _on_run(self):
def run(self):
logging.info('human render run')
while self._exit_event.is_set() and self._is_running:
self._run_step()
delay = 0.04
delay = 1000 / self._context.fps * 0.001
while not self._should_exit:
if self._render_status is RenderStatus.E_Full:
time.sleep(delay)
continue
t = time.perf_counter()
self._run_step()
use = time.perf_counter() - t
if self._render_status is RenderStatus.E_Empty:
continue
real_delay = delay - use
# print(f'send voice {use}')
if real_delay > 0:
time.sleep(real_delay)
# else:
# print(f'send voice {real_delay}')
logging.info('human render exit')
def _run_step(self):
try:
value = self._queue.get(timeout=.005)
value = self._queue.get(timeout=1)
if value is None:
return
res_frame, idx, audio_frames = value
if not self._empty_log:
self._empty_log = True
logging.info('render render:')
logging.info('human render:')
except Empty:
if self._empty_log:
self._empty_log = False
@ -71,47 +78,25 @@ class HumanRender(AudioHandler):
type_ = 1
if audio_frames[0][1] != 0 and audio_frames[1][1] != 0:
type_ = 0
if self._voice_render is not None:
self._voice_render.render(audio_frames, self._last_audio_ps)
self._last_audio_ps = self._last_audio_ps + 0.4
if self._video_render is not None:
self._video_render.render((res_frame, idx, type_), self._last_video_ps)
self._last_video_ps = self._last_video_ps + 0.4
def set_image_render(self, render):
self._image_render = render
def put_image(self, image):
if self._image_render is not None:
self._image_render.on_render(image)
self._render((res_frame, idx, type_), audio_frames)
def on_message(self, message):
super().on_message(message)
def on_handle(self, stream, index):
if not self._is_running:
if self._should_exit:
return
self._queue.put(stream)
def pause_talk(self):
logging.info('human pause_talk')
# self._voice_render.pause_talk()
# self._video_render.pause_talk()
self._queue.clear()
super().pause_talk()
def stop(self):
logging.info('human render stop')
self._is_running = False
if self._exit_event is None:
return
self._should_exit = True
self._queue.clear()
self._exit_event.clear()
if self._thread.is_alive():
self._thread.join()
logging.info('human render stop')
# self._voice_render.stop()
# self._video_render.stop()
# self._exit_event.clear()
# self._thread.join()
logging.info('human render stop')
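The rewritten run() above amounts to a fixed-rate loop: delay is one frame period (20 ms at fps=50), E_Full backs off for a full period, E_Empty skips the sleep, and a slow _run_step simply eats into the next frame. A distilled sketch of the pacing, stripped of the status handling (illustrative only):

import time

def fixed_rate(step, fps=50, should_exit=lambda: False):
    delay = 1.0 / fps                  # 20 ms per frame at 50 fps
    while not should_exit():
        t = time.perf_counter()
        step()
        remaining = delay - (time.perf_counter() - t)
        if remaining > 0:
            time.sleep(remaining)      # overruns skip the sleep entirely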

View File

@ -29,4 +29,8 @@ class AudioHandler(ABC):
logging.info(f'_handler is None')
def pause_talk(self):
pass
if self._handler is not None:
self._handler.pause_talk()
else:
logging.info(f'next_pause_talk _handler is None')

3
ipc/__init__.py Normal file
View File

@ -0,0 +1,3 @@
#encoding = utf8
from .ipc_util import IPCUtil

BIN
ipc/ipc.dll Normal file

Binary file not shown.

BIN
ipc/ipc.exp Normal file

Binary file not shown.

BIN
ipc/ipc.lib Normal file

Binary file not shown.

BIN
ipc/ipc.pdb Normal file

Binary file not shown.

62
ipc/ipc_mem.py Normal file
View File

@ -0,0 +1,62 @@
#encoding = utf8
import logging
import os
from ctypes import *
current = os.path.dirname(__file__)
dynamic_path = os.path.join(current, 'ipc.dll')
class IPCMem:
def __init__(self, sender, receiver):
self.__ipc_obj = WinDLL(dynamic_path)
print(self.__ipc_obj)
if self.__ipc_obj is not None:
self.__ipc_obj.initialize.argtypes = [c_char_p, c_char_p]
self.__ipc_obj.initialize.restype = c_bool
print('IPCUtil init', sender.encode('utf-8'), receiver.encode('utf-8'))
self.__init = self.__ipc_obj.initialize(sender.encode('utf-8'), receiver.encode('utf-8'))
print('IPCUtil init', self.__init)
def __del__(self):
print('IPCUtil __del__')
if self.__ipc_obj is None:
return
if self.__init:
self.__ipc_obj.uninitialize()
def listen(self):
if not self.__init:
return False
self.__ipc_obj.listen.restype = c_bool
return self.__ipc_obj.listen()
def send_text(self, data):
if not self.__init:
return False
self.__ipc_obj.send.argtypes = [c_char_p, c_uint]
self.__ipc_obj.send.restype = c_bool
send_data = data.encode('utf-8')
send_len = len(send_data) + 1
if not self.__ipc_obj.send(send_data, send_len):
self.__ipc_obj.reConnect()
return True
def send_binary(self, data, size):
if not self.__init:
return False
self.__ipc_obj.send.argtypes = [c_char_p, c_uint]
self.__ipc_obj.send.restype = c_bool
data_ptr = cast(data, c_char_p)
return self.__ipc_obj.send(data_ptr, size)
def set_reader_callback(self, callback):
if not self.__init:
return False
CALLBACK_TYPE = CFUNCTYPE(None, c_char_p, c_uint)
self.c_callback = CALLBACK_TYPE(callback) # Store the callback to prevent garbage collection
self.__ipc_obj.setReaderCallback.argtypes = [CALLBACK_TYPE]
self.__ipc_obj.setReaderCallback.restype = c_bool
return self.__ipc_obj.setReaderCallback(self.c_callback)

71
ipc/ipc_util.py Normal file
View File

@ -0,0 +1,71 @@
#encoding = utf8
import os
import time
from ctypes import *
current = os.path.dirname(__file__)
dynamic_path = os.path.join(current, 'ipc.dll')
class IPCUtil:
def __init__(self, sender, receiver):
self.__ipc_obj = WinDLL(dynamic_path)
print(self.__ipc_obj)
if self.__ipc_obj is not None:
self.__ipc_obj.initialize.argtypes = [c_char_p, c_char_p]
self.__ipc_obj.initialize.restype = c_bool
print('IPCUtil init', sender.encode('utf-8'), receiver.encode('utf-8'))
self.__init = self.__ipc_obj.initialize(sender.encode('utf-8'), receiver.encode('utf-8'))
print('IPCUtil init', self.__init)
def __del__(self):
print('IPCUtil __del__')
if self.__ipc_obj is None:
return
if self.__init:
self.__ipc_obj.uninitialize()
def listen(self):
if not self.__init:
return False
self.__ipc_obj.listen.restype = c_bool
return self.__ipc_obj.listen()
def send_text(self, data):
if not self.__init:
return False
self.__ipc_obj.trySend.argtypes = [c_char_p, c_uint]
self.__ipc_obj.trySend.restype = c_bool
send_data = data.encode('utf-8')
send_len = len(send_data) + 1
if not self.__ipc_obj.trySend(send_data, send_len):
self.__ipc_obj.reConnect()
return True
def send_binary(self, data, size):
if not self.__init:
return False
self.__ipc_obj.trySend.argtypes = [c_char_p, c_uint]
self.__ipc_obj.trySend.restype = c_bool
data_ptr = cast(data, c_char_p)
return self.__ipc_obj.trySend(data_ptr, size)
def set_reader_callback(self, callback):
if not self.__init:
return False
CALLBACK_TYPE = CFUNCTYPE(None, c_char_p, c_uint)
self.c_callback = CALLBACK_TYPE(callback) # Store the callback to prevent garbage collection
self.__ipc_obj.setReaderCallback.argtypes = [CALLBACK_TYPE]
self.__ipc_obj.setReaderCallback.restype = c_bool
return self.__ipc_obj.setReaderCallback(self.c_callback)
# def ipc_log_callback(log, size):
# print(f'log={log.decode("utf-8")}, len={size}')
#
#
# util = IPCUtil('ipc_sender', 'ipc_sender')
# util.set_reader_callback(ipc_log_callback)
# print(util.listen())
# print(util.send_text('hello'))
# time.sleep(200)

22
main.py Normal file
View File

@ -0,0 +1,22 @@
#encoding = utf8
import logging
import os
from human import HumanContext
from ui import IpcRender
from utils import config_logging
logger = logging.getLogger(__name__)
current_file_path = os.path.dirname(os.path.abspath(__file__))
if __name__ == '__main__':
config_logging('./logs/info.log', logging.INFO, logging.INFO)
logger.info('------------start------------')
context = HumanContext()
render = IpcRender(context)
context.build(render)
render.run()
render.stop()
logger.info('------------finish------------')

View File

@ -2,4 +2,5 @@
from .nlp_callback import NLPCallback
from .nlp_doubao import DouBao
from .nlp_kimi import Kimi
from .nlp_split import PunctuationSplit

View File

@ -28,7 +28,9 @@ class NLPBase(AsrObserver):
def on_clear_cache(self, *args, **kwargs):
logger.info('NLPBase clear_cache')
self._ask_queue.clear()
self.pause_talk()
if self._callback is not None:
self._callback.on_clear()
@property
def callback(self):
@ -74,6 +76,5 @@ class NLPBase(AsrObserver):
def pause_talk(self):
logger.info('NLPBase pause_talk')
self._is_running = False
self._ask_queue.clear()

View File

@ -7,3 +7,7 @@ class NLPCallback(ABC):
@abstractmethod
def on_message(self, txt: str):
pass
@abstractmethod
def on_clear(self):
pass

View File

@ -80,7 +80,7 @@ class DouBaoHttp:
}
data = {
"model": "ep-20241008152048-fsgzf",
"model": "ep-20241207182221-mhhzq",
"messages": question,
'stream': True
}
@ -93,7 +93,7 @@ class DouBaoHttp:
self._requesting = True
logger.info(f'-------dou_bao ask:{question}')
msg_list = [
{"role": "system", "content": "你是测试客服,是由字节跳动开发的 AI 人工智能助手"},
{"role": "system", "content": "你是库里,是一个学习小老师,擅长初中历史和数学。"},
{"role": "user", "content": question}
]
self._response = self.__request(msg_list)

117
nlp/nlp_kimi.py Normal file
View File

@ -0,0 +1,117 @@
#encoding = utf8
import json
import logging
import time
import requests
from nlp.nlp_base import NLPBase
logger = logging.getLogger(__name__)
class KimiHttp:
def __init__(self, token):
self.__token = token
self._response = None
self._requesting = False
def __request(self, question):
url = "https://api.moonshot.cn/v1/chat/completions"
headers = {
"Authorization": "Bearer " + self.__token,
"Content-Type": "application/json"
}
data = {
"model": "moonshot-v1-8k",
"messages": question,
'stream': True,
"temperature": 0.3
}
response = requests.post(url, headers=headers, json=data, stream=True)
return response
def request(self, question, handle, callback):
t = time.time()
self._requesting = True
logger.info(f'-------kimi ask:{question}')
msg_list = [
{"role": "system", "content": "你是 Kimi由 Moonshot AI 提供的人工智能助手,你更擅长中文和英文的对话。"
"你会为用户提供安全,有帮助,准确的回答。同时,你会拒绝一切涉及恐怖主义,种族歧视,"
"黄色暴力等问题的回答。Moonshot AI 为专有名词,不可翻译成其他语言。"},
{"role": "user", "content": question}
]
self._response = self.__request(msg_list)
if not self._response.ok:
logger.error(f"请求失败,状态码:{self._response.status_code}")
return
sec = ''
for chunk in self._response.iter_lines():
content = chunk.decode("utf-8").strip()
if len(content) < 1:
continue
content = content[5:]  # strip the SSE 'data:' prefix
content = content.strip()
if content == '[DONE]':
break
try:
content = json.loads(content)
except Exception as e:
logger.error(f"json解析失败错误信息{e, content}")
continue
sec = sec + content["choices"][0]["delta"]["content"]
sec, message = handle.handle(sec)
if len(message) > 0:
logger.info(f'-------kimi nlp time:{time.time() - t:.4f}s')
callback(message)
if len(sec) > 0:
callback(sec)
self._requesting = False
logger.info(f'-------kimi nlp time:{time.time() - t:.4f}s')
def close(self):
if self._response is not None and self._requesting:
self._response.close()
def aclose(self):
if self._response is not None and self._requesting:
self._response.close()
logger.info('KimiHttp close')
class Kimi(NLPBase):
def __init__(self, context, split, callback=None):
super().__init__(context, split, callback)
logger.info("DouBao init")
self.__token = 'sk-yCx0lZUmfGx0ECEQAp8jTnAisHwUIokoDXN7XNBuvMILxWnN'
self._kimi = KimiHttp(self.__token)
def _request(self, question):
self._kimi.request(question, self._split_handle, self._on_callback)
def _on_close(self):
if self._kimi is not None:
self._kimi.close()
logger.info('Kimi close')
def on_clear_cache(self, *args, **kwargs):
super().on_clear_cache(*args, **kwargs)
if self._kimi is not None:
self._kimi.aclose()
logger.info('Kimi clear_cache')

View File

@ -15,6 +15,7 @@ class PunctuationSplit(NLPSplit):
def handle(self, message: str):
message = message.replace('*', '')
message = message.replace('#', '')
match = re.search(self._pattern, message)
if match:
pos = match.start() + 1

View File

@ -1,5 +0,0 @@
#encoding = utf8
from .voice_render import VoiceRender
from .video_render import VideoRender
from .play_clock import PlayClock

View File

@ -1,25 +0,0 @@
#encoding = utf8
import logging
import time
from abc import ABC, abstractmethod
from queue import Queue
from threading import Event, Thread
from utils import SyncQueue
logger = logging.getLogger(__name__)
class BaseRender(ABC):
def __init__(self, play_clock, context, type_):
self._play_clock = play_clock
self._context = context
# self._queue = SyncQueue(context.batch_size, f'{type_}RenderQueue')
# self._exit_event = Event()
# self._thread = Thread(target=self._on_run, name=thread_name)
# self._exit_event.set()
# self._thread.start()
@abstractmethod
def render(self, frame, ps):
pass

View File

@ -1,37 +0,0 @@
#encoding = utf8
import time
class PlayClock:
def __init__(self):
self._start = time.time()
self._current_time = 0
self._display_time = self._start
self._audio_diff_threshold = 0.01
@property
def start_time(self):
return self._start
@property
def current_time(self):
return self._current_time
@current_time.setter
def current_time(self, v):
self._current_time = v
@property
def audio_diff_threshold(self):
return self._audio_diff_threshold
@property
def display_time(self):
return self._display_time
def update_display_time(self):
self._display_time = time.time()
def clock_time(self):
elapsed = time.time() - self._display_time
return self.current_time + elapsed

View File

@ -1,23 +0,0 @@
#encoding = utf8
import copy
import logging
import time
import cv2
import numpy as np
from .base_render import BaseRender
class VideoRender(BaseRender):
def __init__(self, play_clock, context, human_render):
super().__init__(play_clock, context, 'Video')
self._human_render = human_render
self.index = 0
def render(self, frame, ps):
if self._human_render is not None:
self._human_render.put_image(frame)
# image = cv2.cvtColor(image, cv2.COLOR_BGRA2RGBA)

View File

@ -1,39 +0,0 @@
#encoding = utf8
import logging
import time
from queue import Empty
import numpy as np
from audio_render import AudioRender
from human.message_type import MessageType
from .base_render import BaseRender
logger = logging.getLogger(__name__)
class VoiceRender(BaseRender):
def __init__(self, play_clock, context):
self._audio_render = AudioRender()
super().__init__(play_clock, context, 'Voice')
self._current_text = ''
def render(self, frame, ps):
self._play_clock.update_display_time()
self._play_clock.current_time = ps
for audio_frame in frame:
frame, type_ = audio_frame
chunk, txt = frame
if txt != self._current_text:
self._current_text = txt
logging.info(f'VoiceRender: {txt}')
chunk = (chunk * 32767).astype(np.int16)
if self._audio_render is not None:
try:
chunk_len = int(chunk.shape[0] * 2)
# print('audio frame:', frame.shape, chunk_len)
self._audio_render.write(chunk.tobytes(), chunk_len)
except Exception as e:
logging.error(f'Error writing audio frame: {e}')

View File

@ -20,18 +20,13 @@ class TTSAudioHandle(AudioHandler):
self._index = -1
EventBus().register('stop', self._on_stop)
EventBus().register('clear_cache', self.on_clear_cache)
def __del__(self):
EventBus().unregister('stop', self._on_stop)
EventBus().unregister('clear_cache', self.on_clear_cache)
def _on_stop(self, *args, **kwargs):
self.stop()
def on_clear_cache(self, *args, **kwargs):
self._index = -1
@property
def sample_rate(self):
return self._sample_rate
@ -51,7 +46,8 @@ class TTSAudioHandle(AudioHandler):
pass
def pause_talk(self):
pass
self._index = -1
super().pause_talk()
class TTSAudioSplitHandle(TTSAudioHandle):
@ -76,7 +72,7 @@ class TTSAudioSplitHandle(TTSAudioHandle):
if chunks is not None:
for chunk in chunks:
self.on_next_handle((chunk, txt), 0)
time.sleep(0.01) # Sleep briefly to prevent busy-waiting
time.sleep(0.001) # Sleep briefly to prevent busy-waiting
def on_handle(self, stream, index):
if not self._is_running:
@ -103,8 +99,8 @@ class TTSAudioSplitHandle(TTSAudioHandle):
self._is_running = False
self._thread.join()
def on_clear_cache(self, *args, **kwargs):
super().on_clear_cache()
def pause_talk(self):
super().pause_talk()
with self._lock:
self._current = 0
self._priority_queue.clear()

View File

@ -16,18 +16,18 @@ class TTSBase(NLPCallback):
self._message_queue = AsyncTaskQueue('TTSBaseQueue', 5)
self._is_running = True
EventBus().register('stop', self.on_stop)
EventBus().register('clear_cache', self.on_clear_cache)
def __del__(self):
EventBus().unregister('stop', self.on_stop)
EventBus().unregister('clear_cache', self.on_clear_cache)
def on_stop(self, *args, **kwargs):
self.stop()
def on_clear_cache(self, *args, **kwargs):
logger.info('TTSBase clear_cache')
self._message_queue.clear()
def on_clear(self):
self.pause_talk()
if self._handle is not None:
self._handle.pause_talk()
@property
def handle(self):

View File

@ -97,8 +97,8 @@ class TTSEdgeHttp(TTSBase):
# if self._byte_stream is not None and not self._byte_stream.closed:
# self._byte_stream.close()
def on_clear_cache(self, *args, **kwargs):
def on_clear(self):
logger.info('TTSEdgeHttp clear_cache')
super().on_clear_cache(*args, **kwargs)
for response in self._response_list:
response.close()
super().on_clear()

View File

@ -1,3 +1,3 @@
#encoding = utf8
from .ipc_render import IpcRender

79
ui/ipc_render.py Normal file
View File

@ -0,0 +1,79 @@
#encoding = utf8
import os
import logging
import time
import numpy as np
from human import HumanRender, RenderStatus
from ipc import IPCUtil
from utils import render_image
logger = logging.getLogger(__name__)
current_file_path = os.path.dirname(os.path.abspath(__file__))
class IpcRender(HumanRender):
def __init__(self, context):
super().__init__(context, None)
self._ipc = IPCUtil('human_product', 'human_render')
self._current_text = ''
def _send_image(self, image):
height, width, channels = image.shape
t = time.perf_counter()
width_bytes = width.to_bytes(4, byteorder='little')
height_bytes = height.to_bytes(4, byteorder='little')
bit_depth_bytes = channels.to_bytes(4, byteorder='little')
img_bytes = image.tobytes()
identifier = b'\x01'
data = identifier + width_bytes + height_bytes + bit_depth_bytes + img_bytes
self._ipc.send_binary(data, len(data))
def _send_voice(self, voice):
voice_identifier = b'\x02'
data = voice_identifier
for audio_frame in voice:
frame, type_ = audio_frame
chunk, txt = frame
if txt != self._current_text:
self._current_text = txt
logging.info(f'IpcRender voice: {txt}')
chunk = (chunk * 32767).astype(np.int16)
voice_bytes = chunk.tobytes()
data = data + voice_bytes
self._ipc.send_binary(data, len(data))
def _on_reader_callback(self, data_str, size):
data_str = data_str.decode('utf-8')
# print(f'on_reader_callback: {data_str}, size:{size}')
if 'quit' == data_str:
self._context.stop()
elif 'heartbeat' == data_str:
pass
elif 'full' == data_str:
if self._render_status != RenderStatus.E_Full:
# logger.info(f'change to E_Full status')
self._render_status = RenderStatus.E_Full
elif 'empty' == data_str:
if self._render_status != RenderStatus.E_Empty:
# logger.info(f'change to E_Empty status')
self._render_status = RenderStatus.E_Empty
elif 'normal' == data_str:
if self._render_status != RenderStatus.E_Normal:
# logger.info(f'change to E_Normal status')
self._render_status = RenderStatus.E_Normal
def run(self):
self._ipc.set_reader_callback(self._on_reader_callback)
logger.info(f'ipc listen:{self._ipc.listen()}')
super().run()
def _render(self, video_frame, voice_frame):
image = render_image(self._context, video_frame)
self._send_image(image)
self._send_voice(voice_frame)
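_send_image above frames each message as one identifier byte (0x01 image, 0x02 voice) followed, for images, by three little-endian uint32 fields (width, height, channels) and the raw pixel bytes. The consuming render process is not part of this compare; a hedged sketch of how such a frame could be parsed on the other side:

import struct

def parse_frame(data: bytes):
    identifier = data[0]                       # 0x01 = image, 0x02 = voice
    if identifier == 0x01:
        width, height, channels = struct.unpack_from('<III', data, 1)
        pixels = data[13:13 + width * height * channels]
        return 'image', (height, width, channels), pixels
    return 'voice', None, data[1:]             # raw int16 PCM chunks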

View File

@ -1,70 +1,51 @@
#encoding = utf8
import copy
import logging
import os
import time
from queue import Queue
import cv2
import numpy as np
import pygame
from pygame.locals import *
from human import HumanContext
from utils import config_logging
from ipc import IPCUtil
from utils import config_logging, render_image
logger = logging.getLogger(__name__)
current_file_path = os.path.dirname(os.path.abspath(__file__))
def img_warp_back_inv_m(img, img_to, inv_m):
h_up, w_up, c = img_to.shape
mask = np.ones_like(img).astype(np.float32)
inv_mask = cv2.warpAffine(mask, inv_m, (w_up, h_up))
inv_img = cv2.warpAffine(img, inv_m, (w_up, h_up))
mask_indices = inv_mask == 1
if 4 == c:
img_to[:, :, :3][mask_indices] = inv_img[mask_indices]
else:
img_to[inv_mask == 1] = inv_img[inv_mask == 1]
return img_to
ipc = IPCUtil('ipc_sender', 'ipc_sender')
def render_image(context, frame):
res_frame, idx, type_ = frame
def send_image(identifier, image):
height, width, channels = image.shape
if type_ == 0:
combine_frame = context.frame_list_cycle[idx]
else:
bbox = context.coord_list_cycle[idx]
combine_frame = copy.deepcopy(context.frame_list_cycle[idx])
af = context.align_frames[idx]
inv_m = context.inv_m_frames[idx]
y1, y2, x1, x2 = bbox
try:
t = time.perf_counter()
res_frame = cv2.resize(res_frame.astype(np.uint8), (x2 - x1, y2 - y1))
af[y1:y2, x1:x2] = res_frame
combine_frame = img_warp_back_inv_m(af, combine_frame, inv_m)
except Exception as e:
logging.error(f'resize error:{e}')
return
width_bytes = width.to_bytes(4, byteorder='little')
height_bytes = height.to_bytes(4, byteorder='little')
bit_depth_bytes = channels.to_bytes(4, byteorder='little')
image = combine_frame
return image
img_bytes = image.tobytes()
data = identifier + width_bytes + height_bytes + bit_depth_bytes + img_bytes
ipc.send_binary(data, len(data))
def cal_box(inv_m, p):
x = inv_m[0][0] * p[0] + inv_m[0][1] * p[1] + inv_m[0][2]
y = inv_m[1][0] * p[0] + inv_m[1][1] * p[1] + inv_m[1][2]
return x, y
class PyGameUI:
def __init__(self):
self._human_context = None
self._queue = None
self.screen_ = pygame.display.set_mode((1920, 1080), HWSURFACE | DOUBLEBUF | RESIZABLE)
self.screen_ = pygame.display.set_mode((920, 860), HWSURFACE | DOUBLEBUF | RESIZABLE)
self.clock = pygame.time.Clock()
background = os.path.join(current_file_path, '..', 'data', 'background', 'background.jpg')
logger.info(f'background: {background}')
self._background = pygame.image.load(background).convert()
self.background_display_ = pygame.transform.scale(self._background, (1920, 1080))
self.background_display_ = pygame.transform.scale(self._background, (920, 860))
self._human_image = None
self.running = True
@ -87,7 +68,7 @@ class PyGameUI:
self.screen_.blit(self.background_display_, (0, 0))
self._update_human()
if self._human_image is not None:
self.screen_.blit(self._human_image, (760, -300))
self.screen_.blit(self._human_image, (0, -300))
fps = self.clock.get_fps()
pygame.display.set_caption('fps:{:.2f}'.format(fps))

View File

@ -5,4 +5,5 @@ from .sync_queue import SyncQueue
from .utils import mirror_index, load_model, get_device, load_avatar, config_logging
from .utils import read_image, object_stop
from .utils import load_avatar_from_processed, load_avatar_from_256_processed
from .utils import render_image
from .audio_utils import melspectrogram, save_wav

View File

@ -1,4 +1,5 @@
#encoding = utf8
import copy
import glob
import logging
import os
@ -274,3 +275,38 @@ def config_logging(file_name: str, console_level: int = logging.INFO, file_level
def object_stop(obj):
if obj is not None:
obj.stop()
def img_warp_back_inv_m(img, img_to, inv_m):
h_up, w_up, c = img_to.shape
mask = np.ones_like(img).astype(np.float32)
inv_mask = cv2.warpAffine(mask, inv_m, (w_up, h_up))
inv_img = cv2.warpAffine(img, inv_m, (w_up, h_up))
mask_indices = inv_mask == 1
if 4 == c:
img_to[:, :, :3][mask_indices] = inv_img[mask_indices]
else:
img_to[inv_mask == 1] = inv_img[inv_mask == 1]
return img_to
def render_image(context, frame):
res_frame, idx, type_ = frame
if type_ == 0:
combine_frame = context.frame_list_cycle[idx]
else:
bbox = context.coord_list_cycle[idx]
combine_frame = copy.deepcopy(context.frame_list_cycle[idx])
af = context.align_frames[idx]
inv_m = context.inv_m_frames[idx]
y1, y2, x1, x2 = bbox
try:
res_frame = cv2.resize(res_frame.astype(np.uint8), (x2 - x1, y2 - y1))
af[y1:y2, x1:x2] = res_frame
combine_frame = img_warp_back_inv_m(af, combine_frame, inv_m)
except Exception as e:
logging.error(f'resize error:{e}')
return None
return combine_frame
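render_image and the inference handlers lean on mirror_index from utils, whose body is not shown in this compare; the common ping-pong implementation used by similar avatar pipelines looks like the sketch below (an assumption, not confirmed by this diff):

def mirror_index(length, index):
    # walk 0..length-1 forward, then backward, so the avatar loop
    # reverses direction instead of jumping at the wrap point
    turn = index // length
    res = index % length
    return res if turn % 2 == 0 else length - res - 1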