add audio play
121
Human.py
@ -19,6 +19,7 @@ import pyaudio
|
|||||||
import audio
|
import audio
|
||||||
import face_detection
|
import face_detection
|
||||||
import utils
|
import utils
|
||||||
|
from audio_render import AudioRender
|
||||||
from infer import Infer, read_images
|
from infer import Infer, read_images
|
||||||
from models import Wav2Lip
|
from models import Wav2Lip
|
||||||
from tts.Chunk2Mal import Chunk2Mal
|
from tts.Chunk2Mal import Chunk2Mal
|
||||||
@ -189,84 +190,6 @@ img_size = 96
|
|||||||
wav2lip_batch_size = 128
|
wav2lip_batch_size = 128
|
||||||
|
|
||||||
|
|
||||||
def datagen(frames, mels):
    """Yield model-ready batches built from the mel chunks queued in ``mels``.

    Face detection is run once over ``frames``. For each of the
    ``mels.qsize()`` items pending when iteration starts, the first detection
    result is resized to ``img_size`` x ``img_size`` and paired with the next
    chunk taken from ``mels.get()`` and with a copy of the frame selected by
    ``mirror_index(1, i)``.

    Args:
        frames: Indexable collection of frames; passed to ``face_detect`` and
            each selected frame must support ``.copy()``.
        mels: Queue-like object exposing ``qsize()`` and ``get()``.

    Yields:
        Tuples ``(img_batch, mel_batch, frame_batch, coords_batch)``. A batch
        is emitted whenever ``wav2lip_batch_size`` items have accumulated, and
        once more at the end for any remaining items. ``img_batch`` and
        ``mel_batch`` are arrays; the other two are plain lists.
    """
    img_batch, mel_batch, frame_batch, coords_batch = [], [], [], []

    face_det_results = face_detect(frames)  # BGR2RGB for CNN face detection

    # NOTE(review): qsize() is sampled once up front; this presumably assumes
    # no other consumer drains `mels` while the generator runs - confirm.
    for i in range(mels.qsize()):
        frame_to_save = frames[mirror_index(1, i)].copy()
        # Only the first detection result is ever used (the original index
        # expression was `0 if True else i % len(frames)`, i.e. always 0).
        face, coords = face_det_results[0].copy()

        face = cv2.resize(face, (img_size, img_size))
        m = mels.get()

        img_batch.append(face)
        mel_batch.append(m)
        frame_batch.append(frame_to_save)
        coords_batch.append(coords)

        if len(img_batch) >= wav2lip_batch_size:
            img_batch, mel_batch = _finalize_batch(img_batch, mel_batch)
            yield img_batch, mel_batch, frame_batch, coords_batch
            img_batch, mel_batch, frame_batch, coords_batch = [], [], [], []

    # Flush whatever is left over after the last full batch.
    if len(img_batch) > 0:
        img_batch, mel_batch = _finalize_batch(img_batch, mel_batch)
        yield img_batch, mel_batch, frame_batch, coords_batch


def _finalize_batch(img_batch, mel_batch):
    """Pack accumulated face crops and mel chunks into arrays for inference.

    Args:
        img_batch: List of face crops, each already resized to
            ``img_size`` x ``img_size``.
        mel_batch: List of mel chunks.

    Returns:
        ``(img_batch, mel_batch)`` where ``img_batch`` is the masked and
        unmasked crops concatenated along axis 3 and divided by 255, and
        ``mel_batch`` is reshaped to carry a trailing axis of length 1.
    """
    img_batch, mel_batch = np.asarray(img_batch), np.asarray(mel_batch)

    # Zero every image from index img_size // 2 onward along axis 1.
    img_masked = img_batch.copy()
    img_masked[:, img_size // 2:] = 0

    img_batch = np.concatenate((img_masked, img_batch), axis=3) / 255.
    mel_batch = np.reshape(mel_batch, [len(mel_batch), mel_batch.shape[1], mel_batch.shape[2], 1])
    return img_batch, mel_batch
|
|
||||||
|
|
||||||
|
|
||||||
def datagen_signal(frame, mel, face_det_results):
    """Build one inference batch that pairs a single frame with mel chunks.

    The first entry of ``face_det_results`` is unpacked into a face crop and
    its coordinates; the crop is resized to ``img_size`` x ``img_size`` and
    reused for every mel chunk.

    Args:
        frame: Frame to attach to every batch item; must support ``.copy()``.
        mel: Iterable of mel chunks.
        face_det_results: Indexable collection whose first element supports
            ``.copy()`` and unpacks into ``(face, coord)``.

    Returns:
        ``(img_batch, mel_batch, frame_batch, coord_batch)`` for at most
        ``wav2lip_batch_size`` chunks, or ``None`` when ``mel`` yields nothing.
        ``img_batch`` is the masked and unmasked crops concatenated along
        axis 3 and divided by 255; ``mel_batch`` carries a trailing axis of
        length 1.

    NOTE(review): chunks beyond the first ``wav2lip_batch_size`` are never
    consumed - confirm this truncation is intended.
    """
    saved_frame = frame.copy()
    crop, box = face_det_results[0].copy()
    crop = cv2.resize(crop, (img_size, img_size))

    faces, mel_chunks, frames_out, boxes = [], [], [], []
    for chunk in mel:
        faces.append(crop)
        mel_chunks.append(chunk)
        frames_out.append(saved_frame)
        boxes.append(box)

        # Stop as soon as one full batch has been collected.
        if len(faces) >= wav2lip_batch_size:
            break

    if not faces:
        return None

    faces = np.asarray(faces)
    mel_chunks = np.asarray(mel_chunks)

    # Zero every image from index img_size // 2 onward along axis 1.
    masked = faces.copy()
    masked[:, img_size // 2:] = 0

    img_batch = np.concatenate((masked, faces), axis=3) / 255.
    mel_batch = np.reshape(
        mel_chunks,
        [len(mel_chunks), mel_chunks.shape[1], mel_chunks.shape[2], 1],
    )
    return img_batch, mel_batch, frames_out, boxes
|
|
||||||
|
|
||||||
|
|
||||||
# Load audio data from a byte stream
|
# Load audio data from a byte stream
|
||||||
def load_audio_from_bytes(byte_data):
|
def load_audio_from_bytes(byte_data):
|
||||||
# 使用 BytesIO 创建一个字节流
|
# 使用 BytesIO 创建一个字节流
|
||||||
@ -288,20 +211,20 @@ class Human:
|
|||||||
self._output_queue = mp.Queue()
|
self._output_queue = mp.Queue()
|
||||||
self._res_frame_queue = mp.Queue(self._batch_size * 2)
|
self._res_frame_queue = mp.Queue(self._batch_size * 2)
|
||||||
|
|
||||||
# full_images, face_frames, coord_frames = self._avatar()
|
full_images, face_frames, coord_frames = self._avatar()
|
||||||
# self._frame_list_cycle = full_images
|
self._frame_list_cycle = full_images
|
||||||
# self._face_list_cycle = face_frames
|
self._face_list_cycle = face_frames
|
||||||
# self._coord_list_cycle = coord_frames
|
self._coord_list_cycle = coord_frames
|
||||||
# face_images_length = len(self._face_list_cycle)
|
face_images_length = len(self._face_list_cycle)
|
||||||
# logging.info(f'face images length: {face_images_length}')
|
logging.info(f'face images length: {face_images_length}')
|
||||||
# print(f'face images length: {face_images_length}')
|
print(f'face images length: {face_images_length}')
|
||||||
self.avatar_id = 'wav2lip_avatar1'
|
|
||||||
self.avatar_path = f"./data/{self.avatar_id}"
|
|
||||||
self.full_imgs_path = f"{self.avatar_path}/full_imgs"
|
|
||||||
self.face_imgs_path = f"{self.avatar_path}/face_imgs"
|
|
||||||
self.coords_path = f"{self.avatar_path}/coords.pkl"
|
|
||||||
|
|
||||||
self.__loadavatar()
|
# self.avatar_id = 'wav2lip_avatar1'
|
||||||
|
# self.avatar_path = f"./data/{self.avatar_id}"
|
||||||
|
# self.full_imgs_path = f"{self.avatar_path}/full_imgs"
|
||||||
|
# self.face_imgs_path = f"{self.avatar_path}/face_imgs"
|
||||||
|
# self.coords_path = f"{self.avatar_path}/coords.pkl"
|
||||||
|
# self.__loadavatar()
|
||||||
|
|
||||||
self.mel_chunks_queue_ = Queue()
|
self.mel_chunks_queue_ = Queue()
|
||||||
self.audio_chunks_queue_ = Queue()
|
self.audio_chunks_queue_ = Queue()
|
||||||
@ -315,6 +238,8 @@ class Human:
|
|||||||
self._infer = Infer(self)
|
self._infer = Infer(self)
|
||||||
self.chunk_2_mal.warm_up()
|
self.chunk_2_mal.warm_up()
|
||||||
|
|
||||||
|
self.audio_render = AudioRender()
|
||||||
|
|
||||||
#
|
#
|
||||||
# self._thread = None
|
# self._thread = None
|
||||||
# thread = threading.Thread(target=self.test)
|
# thread = threading.Thread(target=self.test)
|
||||||
@ -361,7 +286,8 @@ class Human:
|
|||||||
face_frames = []
|
face_frames = []
|
||||||
coord_frames = []
|
coord_frames = []
|
||||||
for face, coord in face_det_results:
|
for face, coord in face_det_results:
|
||||||
face_frames.append(face)
|
resized_crop_frame = cv2.resize(face, (img_size, img_size))
|
||||||
|
face_frames.append(resized_crop_frame)
|
||||||
coord_frames.append(coord)
|
coord_frames.append(coord)
|
||||||
|
|
||||||
return full_list_cycle, face_frames, coord_frames
|
return full_list_cycle, face_frames, coord_frames
|
||||||
@ -395,7 +321,8 @@ class Human:
|
|||||||
print("self.mel_chunks_queue_ len:", self.mel_chunks_queue_.qsize())
|
print("self.mel_chunks_queue_ len:", self.mel_chunks_queue_.qsize())
|
||||||
m = self.mel_chunks_queue_.get()
|
m = self.mel_chunks_queue_.get()
|
||||||
# mel_batch = np.reshape(m, [len(m), mel_batch.shape[1], mel_batch.shape[2], 1])
|
# mel_batch = np.reshape(m, [len(m), mel_batch.shape[1], mel_batch.shape[2], 1])
|
||||||
img_batch, mel_batch, frames, coords = datagen_signal(face_list_cycle[0], m, face_det_results)
|
img_batch, mel_batch, frames, coords = utils.datagen_signal(face_list_cycle[0],
|
||||||
|
m, face_det_results, img_size)
|
||||||
|
|
||||||
img_batch = torch.FloatTensor(np.transpose(img_batch, (0, 3, 1, 2))).to(device)
|
img_batch = torch.FloatTensor(np.transpose(img_batch, (0, 3, 1, 2))).to(device)
|
||||||
mel_batch = torch.FloatTensor(np.transpose(mel_batch, (0, 3, 1, 2))).to(device)
|
mel_batch = torch.FloatTensor(np.transpose(mel_batch, (0, 3, 1, 2))).to(device)
|
||||||
@ -561,6 +488,14 @@ class Human:
|
|||||||
|
|
||||||
image = combine_frame
|
image = combine_frame
|
||||||
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
|
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
|
||||||
|
|
||||||
|
for audio_frame in audio_frames:
|
||||||
|
frame, type_ = audio_frame
|
||||||
|
frame = (frame * 32767).astype(np.int16)
|
||||||
|
self.audio_render.write(frame.tobytes(), int(frame.shape[0]*2))
|
||||||
|
# new_frame = AudioFrame(format='s16', layout='mono', samples=frame.shape[0])
|
||||||
|
# new_frame.planes[0].update(frame.tobytes())
|
||||||
|
# new_frame.sample_rate = 16000
|
||||||
return image
|
return image
|
||||||
|
|
||||||
|
|
||||||
|
@ -2,6 +2,9 @@
|
|||||||
|
|
||||||
from ctypes import *
|
from ctypes import *
|
||||||
import os
|
import os
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
current = os.path.dirname(__file__)
|
current = os.path.dirname(__file__)
|
||||||
dynamic_path = os.path.join(current, 'AudioRender.dll')
|
dynamic_path = os.path.join(current, 'AudioRender.dll')
|
||||||
|
|
||||||
@ -31,5 +34,6 @@ class AudioRender:
|
|||||||
if not self.__init:
|
if not self.__init:
|
||||||
return False
|
return False
|
||||||
|
|
||||||
self.__audio_render_obj.argtypes = (POINTER(c_ubyte), c_uint)
|
self.__audio_render_obj.argtypes = (POINTER(c_uint8), c_uint)
|
||||||
return self.__audio_render_obj.Write(data.ctypes.data_as(POINTER(c_ubyte)), size)
|
byte_data = np.frombuffer(data, dtype=np.uint8)
|
||||||
|
return self.__audio_render_obj.Write(byte_data.ctypes.data_as(POINTER(c_uint8)), size)
|
||||||
|
Before Width: | Height: | Size: 14 KiB After Width: | Height: | Size: 16 KiB |
Before Width: | Height: | Size: 14 KiB |
Before Width: | Height: | Size: 14 KiB |
Before Width: | Height: | Size: 14 KiB |
Before Width: | Height: | Size: 14 KiB |
Before Width: | Height: | Size: 14 KiB |
Before Width: | Height: | Size: 14 KiB |
Before Width: | Height: | Size: 14 KiB |
Before Width: | Height: | Size: 14 KiB |
Before Width: | Height: | Size: 14 KiB |
Before Width: | Height: | Size: 14 KiB |
Before Width: | Height: | Size: 14 KiB |
Before Width: | Height: | Size: 14 KiB |
Before Width: | Height: | Size: 14 KiB |
Before Width: | Height: | Size: 14 KiB |
Before Width: | Height: | Size: 14 KiB |
Before Width: | Height: | Size: 15 KiB |
Before Width: | Height: | Size: 14 KiB |
Before Width: | Height: | Size: 14 KiB |
Before Width: | Height: | Size: 15 KiB |
Before Width: | Height: | Size: 15 KiB |
Before Width: | Height: | Size: 15 KiB |
Before Width: | Height: | Size: 15 KiB |
Before Width: | Height: | Size: 15 KiB |
Before Width: | Height: | Size: 15 KiB |
Before Width: | Height: | Size: 14 KiB |
Before Width: | Height: | Size: 15 KiB |
Before Width: | Height: | Size: 15 KiB |
Before Width: | Height: | Size: 15 KiB |
Before Width: | Height: | Size: 15 KiB |
Before Width: | Height: | Size: 15 KiB |
Before Width: | Height: | Size: 15 KiB |
Before Width: | Height: | Size: 15 KiB |
Before Width: | Height: | Size: 15 KiB |
Before Width: | Height: | Size: 15 KiB |
Before Width: | Height: | Size: 15 KiB |
Before Width: | Height: | Size: 15 KiB |
Before Width: | Height: | Size: 15 KiB |
Before Width: | Height: | Size: 15 KiB |
Before Width: | Height: | Size: 15 KiB |
Before Width: | Height: | Size: 15 KiB |
Before Width: | Height: | Size: 15 KiB |
Before Width: | Height: | Size: 15 KiB |
Before Width: | Height: | Size: 15 KiB |
Before Width: | Height: | Size: 15 KiB |
Before Width: | Height: | Size: 15 KiB |
Before Width: | Height: | Size: 15 KiB |
Before Width: | Height: | Size: 15 KiB |
Before Width: | Height: | Size: 15 KiB |
Before Width: | Height: | Size: 15 KiB |
Before Width: | Height: | Size: 15 KiB |
Before Width: | Height: | Size: 15 KiB |
Before Width: | Height: | Size: 15 KiB |
Before Width: | Height: | Size: 15 KiB |
Before Width: | Height: | Size: 15 KiB |
Before Width: | Height: | Size: 15 KiB |
Before Width: | Height: | Size: 15 KiB |
Before Width: | Height: | Size: 15 KiB |
Before Width: | Height: | Size: 15 KiB |
Before Width: | Height: | Size: 15 KiB |
Before Width: | Height: | Size: 15 KiB |
Before Width: | Height: | Size: 15 KiB |
Before Width: | Height: | Size: 15 KiB |
Before Width: | Height: | Size: 15 KiB |
Before Width: | Height: | Size: 15 KiB |
Before Width: | Height: | Size: 15 KiB |
Before Width: | Height: | Size: 15 KiB |
Before Width: | Height: | Size: 15 KiB |
Before Width: | Height: | Size: 15 KiB |
Before Width: | Height: | Size: 15 KiB |
Before Width: | Height: | Size: 15 KiB |
Before Width: | Height: | Size: 15 KiB |
Before Width: | Height: | Size: 15 KiB |
Before Width: | Height: | Size: 15 KiB |
Before Width: | Height: | Size: 15 KiB |
Before Width: | Height: | Size: 15 KiB |
Before Width: | Height: | Size: 15 KiB |
Before Width: | Height: | Size: 15 KiB |
Before Width: | Height: | Size: 15 KiB |
Before Width: | Height: | Size: 15 KiB |
Before Width: | Height: | Size: 15 KiB |
Before Width: | Height: | Size: 15 KiB |
Before Width: | Height: | Size: 15 KiB |
Before Width: | Height: | Size: 15 KiB |
Before Width: | Height: | Size: 15 KiB |
Before Width: | Height: | Size: 15 KiB |
Before Width: | Height: | Size: 15 KiB |
Before Width: | Height: | Size: 15 KiB |
Before Width: | Height: | Size: 15 KiB |
Before Width: | Height: | Size: 15 KiB |
Before Width: | Height: | Size: 15 KiB |
Before Width: | Height: | Size: 15 KiB |
Before Width: | Height: | Size: 15 KiB |
Before Width: | Height: | Size: 15 KiB |
Before Width: | Height: | Size: 15 KiB |
Before Width: | Height: | Size: 14 KiB |
Before Width: | Height: | Size: 14 KiB |
Before Width: | Height: | Size: 15 KiB |