modify audio

2024-09-29 15:12:49 +08:00 · 2024-09-29 15:12:49 +08:00 · 3aec7b7103
commit 3aec7b7103
parent 472a17f896
4 changed files with 94 additions and 50 deletions
--- a/Human.py
+++ b/Human.py
@ -311,9 +311,9 @@ class Human:
        self.audio_chunks_queue_ = Queue()
        self._test_image_queue = Queue()
        #
-        # self._thread = None
-        # thread = threading.Thread(target=self.test)
-        # thread.start()
+        self._thread = None
+        thread = threading.Thread(target=self.test)
+        thread.start()
        # self.test()
        # self.play_pcm()

@ -339,16 +339,9 @@ class Human:
    #     stream.close()
    #     p.terminate()

-    def test(self):
-        wav = audio.load_wav(r'./audio/audio.wav', 16000)
-        # with open(r'./audio/test.wav', 'rb') as f:
-        #     byte_data = f.read()
-        #
-        # byte_data = byte_data[16:]
-        # inputs = np.concatenate(byte_data)  # [N * chunk]
-        # wav = load_audio_from_bytes(inputs)
-        print('wav length:', len(wav))
-        mel = audio.melspectrogram(wav)
+    def inter(self, model, chunks, face_list_cycle, face_det_results, out, j):
+        inputs = np.concatenate(chunks)  # [5 * chunk]
+        mel = audio.melspectrogram(inputs)
        if np.isnan(mel.reshape(-1)).sum() > 0:
            raise ValueError(
                'Mel contains nan! Using a TTS voice? Add a small epsilon noise to the wav file and try again')
@ -369,26 +362,6 @@ class Human:
            self.mel_chunks_queue_.put(mel[:, start_idx: start_idx + mel_step_size])
            i += 1

-        batch_size = 128
-        print('batch_size:', batch_size, ' mel_chunks len:', self.mel_chunks_queue_.qsize())
-
-        face_images_path = r'./face/'
-        face_images_path = utils.read_files_path(face_images_path)
-        face_list_cycle = read_images(face_images_path)
-        face_images_length = len(face_list_cycle)
-        logging.info(f'face images length: {face_images_length}')
-        print(f'face images length: {face_images_length}')
-
-        model = load_model(r'.\checkpoints\wav2lip.pth')
-        print("Model loaded")
-
-        frame_h, frame_w = face_list_cycle[0].shape[:-1]
-        # out = cv2.VideoWriter('temp/resul_tttt.avi',
-        #                       cv2.VideoWriter_fourcc(*'DIVX'), 25, (frame_w, frame_h))
-
-        face_det_results = face_detect(face_list_cycle)
-
-        j = 0
        while not self.mel_chunks_queue_.empty():
            print("self.mel_chunks_queue_ len:", self.mel_chunks_queue_.qsize())
            m = self.mel_chunks_queue_.get()
@ -411,12 +384,62 @@ class Human:
                j = j + 1
                p = cv2.cvtColor(f, cv2.COLOR_BGR2RGB)
                self._test_image_queue.put(p)
-                # out.write(f)
-        #
-        # out.release()
-        # command = 'ffmpeg -y -i {} -i {} -strict -2 -q:v 1 {}'.format('./audio/audio1.wav', 'temp/resul_tttt.avi',
-        #                                                               'temp/resul_tttt.mp4')
-        # subprocess.call(command, shell=platform.system() != 'Windows')
+                out.write(f)
+        return j
+
+    def test(self):
+        batch_size = 128
+        print('batch_size:', batch_size, ' mel_chunks len:', self.mel_chunks_queue_.qsize())
+
+        face_images_path = r'./face/'
+        face_images_path = utils.read_files_path(face_images_path)
+        face_list_cycle = read_images(face_images_path)
+        face_images_length = len(face_list_cycle)
+        logging.info(f'face images length: {face_images_length}')
+        print(f'face images length: {face_images_length}')
+
+        model = load_model(r'.\checkpoints\wav2lip.pth')
+        print("Model loaded")
+
+        frame_h, frame_w = face_list_cycle[0].shape[:-1]
+        out = cv2.VideoWriter('temp/resul_tttt.avi',
+                              cv2.VideoWriter_fourcc(*'DIVX'), 25, (frame_w, frame_h))
+
+        face_det_results = face_detect(face_list_cycle)
+
+        audio_path = r'./temp/audio/chunk_0.wav'
+        stream = audio.load_wav(audio_path, 16000)
+        stream_len = stream.shape[0]
+        print('wav length:', stream_len)
+        _audio_chunk_queue = queue.Queue()
+        index = 0
+        chunk_len = 6400
+        while stream_len >= chunk_len:
+            audio_chunk = stream[index:index + chunk_len]
+            _audio_chunk_queue.put(audio_chunk)
+            stream_len -= chunk_len
+            index += chunk_len
+        if stream_len > 0:
+            audio_chunk = stream[index:index + stream_len]
+            _audio_chunk_queue.put(audio_chunk)
+            index += stream_len
+            stream_len -= stream_len
+
+        print('_audio_chunk_queue:', _audio_chunk_queue.qsize())
+
+        j = 0
+        while not _audio_chunk_queue.empty():
+            chunks = []
+            length = min(5, _audio_chunk_queue.qsize())
+            for i in range(length):
+                chunks.append(_audio_chunk_queue.get())
+
+            j = self.inter(model, chunks, face_list_cycle, face_det_results, out, j)
+
+        out.release()
+        command = 'ffmpeg -y -i {} -i {} -strict -2 -q:v 1 {}'.format(audio_path, 'temp/resul_tttt.avi',
+                                                                      'temp/resul_tttt.mp4')
+        subprocess.call(command, shell=platform.system() != 'Windows')


        # gen = datagen(face_list_cycle, self.mel_chunks_queue_)
--- a/audio.py
+++ b/audio.py
@ -138,7 +138,6 @@ def _denormalize(D):
        return ((D * -hp.min_level_db / hp.max_abs_value) + hp.min_level_db)


-
 def load_audio(file_path, sr=16000):
    """加载音频文件并返回音频数据和采样率"""
    wav, sr = librosa.load(file_path, sr=sr)
--- a/tts/Chunk2Mal.py
+++ b/tts/Chunk2Mal.py
@ -27,10 +27,14 @@ class Chunk2Mal:
        self._exit_event.set()
        self._thread.start()
        self._audio_render = AudioRender()
+        self._stream_len = 0
        logging.info('chunk2mal start')

    def _concatenate(self):
        logging.info('np.concatenate')
+        if len(self._chunks) < 3:
+            logging.info(f'np.concatenate: {len(self._chunks)}')
+            return
        inputs = np.concatenate(self._chunks)  # [5 * chunk]
        self._chunks = []
        mel = audio.melspectrogram(inputs)
@ -54,10 +58,12 @@ class Chunk2Mal:
            self._human.push_mel_chunks_queue(mel[:, start_idx: start_idx + mel_step_size])
            i += 1

-        wav = np.concatenate(self._audio_chunks)  # [5 * chunk]self._audio_chunks
-        wav *= 32767 / max(0.01, np.max(np.abs(wav)))
-        wav = wav.astype(np.int16)
-        self._audio_render.write(wav, len(wav))
+        # wav = np.concatenate(self._audio_chunks)  # [5 * chunk]self._audio_chunks
+        # print('_concatenate', len(wav))
+        # audio.save_chunks([wav], 16000, "./temp/audio/")
+        # wav *= 32767 / max(0.01, np.max(np.abs(wav)))
+        # wav = wav.astype(np.int16)
+        # self._audio_render.write(wav, len(wav))
        self._audio_chunks = []

        print('mel_chunks count:', count)
@ -74,11 +80,12 @@ class Chunk2Mal:
            try:
                chunk = self._audio_chunk_queue.get(block=True, timeout=1)
                self._chunks.append(chunk)
+                self._stream_len = self._stream_len + len(chunk)
+                print('Chunk2Mal _stream_len:', self._stream_len)
                self._audio_chunks.append(chunk.copy())
-                # print(type(chunk))

                # self._human.push_audio_frames(chunk, 0)
-                if len(self._chunks) < 102: # 200ms
+                if len(self._chunks) < 10: # 200ms
                    continue
            except queue.Empty:
                # print('Chunk2Mal queue.Empty')
--- a/tts/TTSBase.py
+++ b/tts/TTSBase.py
@ -14,10 +14,9 @@ from io import BytesIO
 from queue import Queue
 from threading import Thread, Event

-from IPython.core.display_functions import display
-from pydub import AudioSegment

 import audio
+from audio_render import AudioRender

 logger = logging.getLogger(__name__)

@ -37,6 +36,7 @@ class TTSBase:
        # self._pcm_player = pyaudio.PyAudio()
        # self._pcm_stream = self._pcm_player.open(format=pyaudio.paInt16,
        #                                          channels=1, rate=24000, output=True)
+        # self._audio_render = AudioRender()
        logging.info('tts start')

    def _on_run(self):
@ -57,9 +57,18 @@ class TTSBase:

        self._io_stream.seek(0)
        stream = self.__create_bytes_stream(self._io_stream)
+        audio.save_chunks([stream], 16000, './temp/audio/')
+        # wav = audio.split_audio(stream, 16000, 0.04)
+
+        # audio.save_chunks(wav, 16000, './temp/audio/')
        # audio.save_wav(stream, "./temp/audio/test1.wav", 16000)
        stream_len = stream.shape[0]

+        # wav = stream #np.concatenate(stream)  # [5 * chunk]self._audio_chunks
+        # print('_concatenate', len(wav))
+
+        # self._audio_chunks = []
+
        print("stream_len:", stream_len, " _chunk_len:", self._chunk_len)
        index = 0
        segment = 0
@ -69,6 +78,12 @@ class TTSBase:
            stream_len -= self._chunk_len
            index += self._chunk_len
            segment = segment + 1
+
+        if stream_len > 0:
+            audio_chunk = stream[index:index + stream_len]
+            self._human.push_audio_chunk(audio_chunk)
+            segment = segment + 1
+
        print("segment:", segment)
        self._io_stream.seek(0)
        self._io_stream.truncate()
@ -84,7 +99,7 @@ class TTSBase:

        if sample_rate != self._human.get_audio_sample_rate() and stream.shape[0] > 0:
            logger.warning(f'tts audio sample rate is {sample_rate}, resample to {self._human.get_audio_sample_rate() }')
-            stream = resampy.resample(x=stream, sr_orig=sample_rate, sr_new=self._human.get_audio_sample_rate() )
+            stream = resampy.resample(x=stream, sr_orig=sample_rate, sr_new=self._human.get_audio_sample_rate())

        return stream