modify human mel

2024-09-27 19:31:36 +08:00 · 2024-09-27 19:31:36 +08:00 · f848529859
commit f848529859
parent e606fb6ef5
4 changed files with 58 additions and 7 deletions
--- a/Human.py
+++ b/Human.py
@@ -310,10 +310,10 @@ class Human:
        self.mel_chunks_queue_ = Queue()
        self.audio_chunks_queue_ = Queue()
        self._test_image_queue = Queue()
-
-        self._thread = None
-        thread = threading.Thread(target=self.test)
-        thread.start()
+        #
+        # self._thread = None
+        # thread = threading.Thread(target=self.test)
+        # thread.start()
        # self.test()
        # self.play_pcm()

--- a/audio.py
+++ b/audio.py
@@ -5,6 +5,8 @@ import numpy as np
 from scipy import signal
 from scipy.io import wavfile
 from hparams import hparams as hp
+import soundfile as sf
+from IPython.display import Audio

 def load_wav(path, sr):
    return librosa.core.load(path, sr=sr)[0]
@@ -134,3 +136,39 @@ def _denormalize(D):
        return (((D + hp.max_abs_value) * -hp.min_level_db / (2 * hp.max_abs_value)) + hp.min_level_db)
    else:
        return ((D * -hp.min_level_db / hp.max_abs_value) + hp.min_level_db)
+
+
+
+def load_audio(file_path, sr=16000):
+    """Load file_path via librosa.load(file_path, sr=sr) and return the resulting (wav, sr) pair."""
+    wav, sr = librosa.load(file_path, sr=sr)
+    return wav, sr
+
+
+def split_audio(wav, sr, chunk_duration):
+    """Split wav into consecutive chunks of int(chunk_duration * sr) samples; the final chunk may be shorter."""
+    # Number of samples in each chunk (chunk_duration is presumably in seconds -- confirm with callers)
+    chunk_size = int(chunk_duration * sr)
+    num_chunks = int(np.ceil(len(wav) / chunk_size))
+
+    audio_chunks = []
+    for i in range(num_chunks):
+        start_idx = i * chunk_size
+        end_idx = min((i + 1) * chunk_size, len(wav))
+        chunk = wav[start_idx:end_idx]
+        audio_chunks.append(chunk)
+
+    return audio_chunks
+
+
+def save_chunks(chunks, sr, output_folder, base_filename="chunk"):
+    """Write each chunk to <output_folder>/<base_filename>_<idx>.wav via sf.write and print each saved path."""
+    for idx, chunk in enumerate(chunks):
+        output_path = f"{output_folder}/{base_filename}_{idx}.wav"
+        sf.write(output_path, chunk, sr)
+        print(f"Saved {output_path}")
+
+
+def play_audio_chunk(chunk, sr):
+    """Return an IPython.display.Audio object wrapping chunk with rate=sr (nothing is played here directly)."""
+    return Audio(chunk, rate=sr)
--- a/infer.py
+++ b/infer.py
@@ -189,9 +189,9 @@ class Infer:
                p = cv2.resize(p.astype(np.uint8), (x2 - x1, y2 - y1))

                f[y1:y2, x1:x2] = p
-                # name = "%04d" % j
-                # cv2.imwrite(f'temp/images/{j}.jpg', p)
-                # j = j + 1
+                name = "%04d" % j
+                cv2.imwrite(f'temp/images/{j}.jpg', p)
+                j = j + 1
                p = cv2.cvtColor(f, cv2.COLOR_BGR2RGB)
                self._human.push_render_image(p)
                # out.write(f)
--- a/tts/TTSBase.py
+++ b/tts/TTSBase.py
@@ -14,8 +14,11 @@ from io import BytesIO
 from queue import Queue
 from threading import Thread, Event

+from IPython.core.display_functions import display
 from pydub import AudioSegment

+import audio
+
 logger = logging.getLogger(__name__)


@@ -55,6 +58,16 @@ class TTSBase:
        self._io_stream.seek(0)
        stream = self.__create_bytes_stream(self._io_stream)
        stream_len = stream.shape[0]
+
+        sr = 16000
+        soundfile.read('./temp/audio/audio.wav', stream, sr)
+        # audio_chunks = audio.split_audio(stream, sr, 4)
+
+        # display(audio.play_audio_chunk(audio_chunks[0], sr=sr))
+
+        # 保存切割后的片段
+        # audio.save_chunks(stream[0:-1], sr, './temp/audio/')
+        # audio.save_chunks(audio_chunks, sr, './temp/audio/')
        # try:
        #     sounddevice.play(stream, samplerate=self._human.get_audio_sample_rate())
        #     sounddevice.wait()  # 等待音频播放完毕