diff --git a/Human.py b/Human.py
index ee5fa27..0f17b20 100644
--- a/Human.py
+++ b/Human.py
@@ -254,12 +254,12 @@ def datagen_signal(frame, mel, face_det_results):
         face, coord = face_det_results[idx].copy()
 
         face = cv2.resize(face, (img_size, img_size))
 
-        m = mel
-        img_batch.append(face)
-        mel_batch.append(m)
-        frame_batch.append(frame_to_save)
-        coord_batch.append(coord)
+        for i, m in enumerate(mel):
+            img_batch.append(face)
+            mel_batch.append(m)
+            frame_batch.append(frame_to_save)
+            coord_batch.append(coord)
 
         if len(img_batch) >= wav2lip_batch_size:
             img_batch, mel_batch = np.asarray(img_batch), np.asarray(mel_batch)
@@ -342,6 +342,7 @@ class Human:
     def inter(self, model, chunks, face_list_cycle, face_det_results, out, j):
         inputs = np.concatenate(chunks)  # [5 * chunk]
         mel = audio.melspectrogram(inputs)
+        print("inter", len(mel[0]))
         if np.isnan(mel.reshape(-1)).sum() > 0:
             raise ValueError(
                 'Mel contains nan! Using a TTS voice? Add a small epsilon noise to the wav file and try again')
@@ -352,19 +353,21 @@ class Human:
         mel_idx_multiplier = 80. / self._fps
         print('mel_idx_multiplier:', mel_idx_multiplier)
         i = 0
+        mel_chunks = []
         while 1:
             start_idx = int(i * mel_idx_multiplier)
             if start_idx + mel_step_size > len(mel[0]):
-                # mel_chunks.append(mel[:, len(mel[0]) - mel_step_size:])
-                self.mel_chunks_queue_.put(mel[:, len(mel[0]) - mel_step_size:])
+                mel_chunks.append(mel[:, len(mel[0]) - mel_step_size:])
+                # self.mel_chunks_queue_.put(mel[:, len(mel[0]) - mel_step_size:])
                 break
-            # mel_chunks.append(mel[:, start_idx: start_idx + mel_step_size])
-            self.mel_chunks_queue_.put(mel[:, start_idx: start_idx + mel_step_size])
+            mel_chunks.append(mel[:, start_idx: start_idx + mel_step_size])
+            # self.mel_chunks_queue_.put(mel[:, start_idx: start_idx + mel_step_size])
             i += 1
-
+        self.mel_chunks_queue_.put(mel_chunks)
         while not self.mel_chunks_queue_.empty():
             print("self.mel_chunks_queue_ len:", self.mel_chunks_queue_.qsize())
             m = self.mel_chunks_queue_.get()
+            # mel_batch = np.reshape(m, [len(m), mel_batch.shape[1], mel_batch.shape[2], 1])
             img_batch, mel_batch, frames, coords = datagen_signal(face_list_cycle[0], m, face_det_results)
 
             img_batch = torch.FloatTensor(np.transpose(img_batch, (0, 3, 1, 2))).to(device)
@@ -413,7 +416,8 @@ class Human:
         print('wav length:', stream_len)
         _audio_chunk_queue = queue.Queue()
         index = 0
-        chunk_len = 6400
+        chunk_len = 640  # // 200
+        print('chunk_len:', chunk_len)
         while stream_len >= chunk_len:
             audio_chunk = stream[index:index + chunk_len]
             _audio_chunk_queue.put(audio_chunk)
@@ -430,7 +434,7 @@ class Human:
 
         j = 0
         while not _audio_chunk_queue.empty():
             chunks = []
-            length = min(5, _audio_chunk_queue.qsize())
+            length = min(64, _audio_chunk_queue.qsize())
             for i in range(length):
                 chunks.append(_audio_chunk_queue.get())
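
Review note: the hunks above replace the per-window `self.mel_chunks_queue_.put(...)` calls with a single `mel_chunks` list that is enqueued once, and `datagen_signal` now iterates over that list so each mel window is paired with the same face crop. The sketch below isolates the window-slicing loop in standalone form. It is a minimal sketch, not code from Human.py: `slice_mel_chunks` is a hypothetical helper, and the `mel_step_size=16` default and the `80. / fps` multiplier are assumed from the Wav2Lip conventions visible in the diff.

```python
# Standalone sketch of the mel-window slicing implemented by the new
# `mel_chunks` list. Assumptions (not confirmed by Human.py): `mel` is a
# (n_mels, T) spectrogram with 80 mel frames per second of audio, and
# mel_step_size defaults to Wav2Lip's 16.
import numpy as np


def slice_mel_chunks(mel: np.ndarray, fps: float, mel_step_size: int = 16):
    """Cut a (n_mels, T) spectrogram into fixed-width windows, one per video frame."""
    mel_idx_multiplier = 80.0 / fps  # mel frames advanced per video frame
    chunks = []
    i = 0
    while True:
        start_idx = int(i * mel_idx_multiplier)
        if start_idx + mel_step_size > mel.shape[1]:
            # Last window: clamp to the end so every chunk keeps the same width.
            chunks.append(mel[:, mel.shape[1] - mel_step_size:])
            break
        chunks.append(mel[:, start_idx:start_idx + mel_step_size])
        i += 1
    return chunks


if __name__ == "__main__":
    mel = np.random.rand(80, 100).astype(np.float32)  # fake (n_mels, T) spectrogram
    chunks = slice_mel_chunks(mel, fps=25.0)
    print(len(chunks), chunks[0].shape)  # every chunk is (80, 16)
```

Collecting the windows and enqueuing them with one `put` means the consumer loop now dequeues a whole batch of windows per audio segment instead of one window at a time, which is what the new `for i, m in enumerate(mel)` loop in `datagen_signal` expects.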