From a68fbbc0dee0e0ee169dd1c00342b9a6e0baec66 Mon Sep 17 00:00:00 2001
From: brige <jiegeaiai@163.com>
Date: Mon, 30 Sep 2024 01:45:49 +0800
Subject: [PATCH] Add test code
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

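Batch the mel spectrogram windows into a single list and enqueue it as
one work item, replicate the face/frame/coords once per mel window in
datagen_signal, and shrink the audio chunk size from 6400 to 640
samples for testing.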
---
 Human.py | 29 +++++++++++++++++------------
 1 file changed, 17 insertions(+), 12 deletions(-)

diff --git a/Human.py b/Human.py
index ee5fa27..0f17b20 100644
--- a/Human.py
+++ b/Human.py
@@ -254,12 +254,13 @@ def datagen_signal(frame, mel, face_det_results):
     face, coord = face_det_results[idx].copy()
 
     face = cv2.resize(face, (img_size, img_size))
-    m = mel
 
-    img_batch.append(face)
-    mel_batch.append(m)
-    frame_batch.append(frame_to_save)
-    coord_batch.append(coord)
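+    # append the same face, frame, and coords once per mel window so the batches stay aligned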
+    for m in mel:
+        img_batch.append(face)
+        mel_batch.append(m)
+        frame_batch.append(frame_to_save)
+        coord_batch.append(coord)
 
     if len(img_batch) >= wav2lip_batch_size:
         img_batch, mel_batch = np.asarray(img_batch), np.asarray(mel_batch)
@@ -342,6 +343,7 @@ class Human:
     def inter(self, model, chunks, face_list_cycle, face_det_results, out, j):
         inputs = np.concatenate(chunks)  # [5 * chunk]
         mel = audio.melspectrogram(inputs)
+        print("inter", len(mel[0]))
         if np.isnan(mel.reshape(-1)).sum() > 0:
             raise ValueError(
                 'Mel contains nan! Using a TTS voice? Add a small epsilon noise to the wav file and try again')
@@ -352,19 +354,20 @@
         mel_idx_multiplier = 80. / self._fps
         print('mel_idx_multiplier:', mel_idx_multiplier)
         i = 0
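+        # collect all mel windows first instead of enqueueing them one at a time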
+        mel_chunks = []
         while 1:
             start_idx = int(i * mel_idx_multiplier)
             if start_idx + mel_step_size > len(mel[0]):
-                # mel_chunks.append(mel[:, len(mel[0]) - mel_step_size:])
-                self.mel_chunks_queue_.put(mel[:, len(mel[0]) - mel_step_size:])
+                mel_chunks.append(mel[:, len(mel[0]) - mel_step_size:])
                 break
-            # mel_chunks.append(mel[:, start_idx: start_idx + mel_step_size])
-            self.mel_chunks_queue_.put(mel[:, start_idx: start_idx + mel_step_size])
+            mel_chunks.append(mel[:, start_idx: start_idx + mel_step_size])
             i += 1
-
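+        # hand the whole list of chunks to the consumer as a single queue item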
+        self.mel_chunks_queue_.put(mel_chunks)
         while not self.mel_chunks_queue_.empty():
             print("self.mel_chunks_queue_ len:", self.mel_chunks_queue_.qsize())
             m = self.mel_chunks_queue_.get()
             img_batch, mel_batch, frames, coords = datagen_signal(face_list_cycle[0], m, face_det_results)
 
             img_batch = torch.FloatTensor(np.transpose(img_batch, (0, 3, 1, 2))).to(device)
@@ -413,7 +416,8 @@ class Human:
         print('wav length:', stream_len)
         _audio_chunk_queue = queue.Queue()
         index = 0
-        chunk_len = 6400
+        chunk_len = 640  # reduced from 6400 for testing
+        print('chunk_len:', chunk_len)
         while stream_len >= chunk_len:
             audio_chunk = stream[index:index + chunk_len]
             _audio_chunk_queue.put(audio_chunk)
@@ -430,7 +434,8 @@
         j = 0
         while not _audio_chunk_queue.empty():
             chunks = []
-            length = min(5, _audio_chunk_queue.qsize())
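+            # drain up to 64 of the smaller 640-sample chunks per inference batch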
+            length = min(64, _audio_chunk_queue.qsize())
             for i in range(length):
                 chunks.append(_audio_chunk_queue.get())