添加chunk处理

2024-09-09 08:23:04 +08:00 · 2024-09-09 08:23:04 +08:00 · c9f8ff6541
commit c9f8ff6541
parent cf9fc3545d
3 changed files with 140 additions and 14 deletions
--- a/Human.py
+++ b/Human.py
@ -1,11 +1,131 @@
 #encoding = utf8
 import logging
 import multiprocessing as mp
 import queue
 import time
 import numpy as np
 from models import Wav2Lip
 from tts.Chunk2Mal import Chunk2Mal
 import torch
 import cv2
 from tqdm import tqdm
 logger = logging.getLogger(__name__)
 device = 'cuda' if torch.cuda.is_available() else 'cpu'
 print('Using {} for inference.'.format(device))
 def _load(checkpoint_path):
    if device == 'cuda':
        checkpoint = torch.load(checkpoint_path)
    else:
        checkpoint = torch.load(checkpoint_path,
                                map_location=lambda storage, loc: storage)
    return checkpoint
 def load_model(path):
    model = Wav2Lip()
    print("Load checkpoint from: {}".format(path))
    checkpoint = _load(path)
    s = checkpoint["state_dict"]
    new_s = {}
    for k, v in s.items():
        new_s[k.replace('module.', '')] = v
    model.load_state_dict(new_s)
    model = model.to(device)
    return model.eval()
 def read_images(img_list):
    frames = []
    print('reading images...')
    for img_path in tqdm(img_list):
        frame = cv2.imread(img_path)
        frames.append(frame)
    return frames
 def __mirror_index(size, index):
    # size = len(self.coord_list_cycle)
    turn = index // size
    res = index % size
    if turn % 2 == 0:
        return res
    else:
        return size - res - 1
 #  python.exe .\inference.py --checkpoint_path .\checkpoints\wav2lip.pth --face
 #  .\face\img00016.jpg --audio .\audio\audio1.wav
 def inference(render_event, batch_size, face_images_path, audio_feat_queue, audio_out_queue, res_frame_queue):
    model = load_model(r'.\checkpoints\wav2lip.pth')
    face_list_cycle = read_images(face_images_path)
    face_images_length = len(face_list_cycle)
    logger.info(f'face images length: {face_images_length}')
    length = len(face_list_cycle)
    index = 0
    count = 0
    count_time = 0
    logger.info('start inference')
    while render_event.is_set():
        try:
            mel_batch = audio_feat_queue.get(block=True, timeout=1)
        except queue.Empty:
            continue
        audio_frames = []
        is_all_silence = True
        for _ in range(batch_size * 2):
            frame, type = audio_feat_queue.get()
            audio_frames.append((frame, type))
            if type == 0:
                is_all_silence = False
        if is_all_silence:
            for i in range(batch_size):
                res_frame_queue.put((None, __mirror_index(length, index), audio_frames[i*2:i*2+2]))
                index = index + 1
        else:
            t = time.perf_counter()
            image_batch = []
            for i in range(batch_size):
                idx = __mirror_index(length, index + i)
                face = face_list_cycle[idx]
                image_batch.append(face)
            image_batch, mel_batch = np.asarray(image_batch), np.asarray(mel_batch)
            image_masked = image_batch.copy()
            image_masked[:, face.shape[0]//2:] = 0
            image_batch = np.concatenate((image_masked, image_batch), axis=3) / 255.
            mel_batch = np.reshape(mel_batch, [len(mel_batch), mel_batch.shape[1], mel_batch.shape[2], 1])
            image_batch = torch.FloatTensor(np.transpose(image_batch, (0, 3, 1, 2))).to(device)
            mel_batch = torch.FloatTensor(np.transpose(mel_batch, (0, 3, 1, 2))).to(device)
            with torch.no_grad():
                pred = model(mel_batch, image_batch)
            pred = pred.cpu().numpy().transpose(0, 2, 3, 1) * 255.
            count_time += (time.perf_counter() - t)
            count += batch_size
            if count >= 100:
                logger.info(f"------actual avg infer fps:{count/count_time:.4f}")
                count = 0
                count_time = 0
            for i, res_frame in enumerate(pred):
                res_frame_queue.put((res_frame, __mirror_index(length, index), audio_frames[i*2 : i*2+2]))
                index = index + 1
    logger.info('finish inference')
 class Human:
    def __init__(self):
@ -18,6 +138,14 @@ class Human:
        self._stride_left_size = 10
        self._stride_right_size = 10
        self._feat_queue = mp.Queue(2)
        self._output_queue = mp.Queue()
        self._res_frame_queue = mp.Queue(self._batch_size * 2)
        self.face_images_path = r'.\face'
        self.render_event = mp.Event()
        mp.Process(target=inference, args=(self.render_event, self._batch_size, self.face_images_path,
                                           self._feat_queue, self._output_queue, self._res_frame_queue,
                                           )).start()
    def get_fps(self):
        return self._fps
@ -35,6 +163,8 @@ class Human:
        return self._stride_right_size
    def on_destroy(self):
        self.render_event.set()
        self._chunk_2_mal.stop()
        if self._tts is not None:
@ -60,4 +190,6 @@ class Human:
        self._chunk_2_mal.push_chunk(chunk)
    def push_feat_queue(self, mel_chunks):
        print("21")
        self._feat_queue.put(mel_chunks)
        print("22")
--- a/tts/Chunk2Mal.py
+++ b/tts/Chunk2Mal.py
@ -22,6 +22,7 @@ class Chunk2Mal:
            try:
                chunk, type = self.pull_chunk()
                self._chunks.append(chunk)
                print("1")
            except queue.Empty:
                continue
@ -38,6 +39,7 @@ class Chunk2Mal:
            mel_chunks = []
            while i < (len(self._chunks) - self._human.get_stride_left_size()
                       - self._human.get_stride_right_size()) / 2:
                print("14")
                start_idx = int(left + i * mel_idx_multiplier)
                # print(start_idx)
                if start_idx + mel_step_size > len(mel[0]):
@ -45,10 +47,13 @@ class Chunk2Mal:
                else:
                    mel_chunks.append(mel[:, start_idx: start_idx + mel_step_size])
                i += 1
                print("13")
            self._human.push_feat_queue(mel_chunks)
            print("15")
            # discard the old part to save memory
            self._chunks = self._chunks[-(self._human.get_stride_left_size() + self._human.get_stride_right_size()):]
            print("12")
        logging.info('chunk2mal exit')
@ -65,6 +70,7 @@ class Chunk2Mal:
            return
        self._exit_event.set()
        if self._thread.is_alive():
            self._thread.join()
        logging.info('chunk2mal stop')
@ -73,7 +79,7 @@ class Chunk2Mal:
    def pull_chunk(self):
        try:
-            chunk = self._audio_chunk_queue.get(block=True, timeout=1.0)
+            chunk = self._audio_chunk_queue.get(block=True, timeout=1)
            type = 1
        except queue.Empty:
            chunk = np.zeros(self._human.get_chunk(), dtype=np.float32)
--- a/ui.py
+++ b/ui.py
@ -120,18 +120,6 @@ def config_logging(file_name: str, console_level: int=logging.INFO, file_level:
 if __name__ == "__main__":
    # logging.basicConfig(filename='./logs/info.log', level=logging.INFO)
    config_logging('./logs/info.log', logging.INFO, logging.INFO)
    # logger = logging.getLogger('manager')
    # # 输出到控制台, 级别为DEBUG
    # console = logging.StreamHandler()
    # console.setLevel(logging.DEBUG)
    # logger.addHandler(console)
    #
    # # 输出到文件, 级别为INFO, 文件按大小切分
    # filelog = logging.handlers.RotatingFileHandler(filename='./logs/info.log', level=logging.INFO,
    #                                                maxBytes=1024 * 1024, backupCount=5)
    # filelog.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
    # logger.setLevel(logging.INFO)
    # logger.addHandler(filelog)
    logger.info('------------start------------')
    app = App()
    app.mainloop()