From f8485298594041c7f39284b5fe9c980dcc1d9895 Mon Sep 17 00:00:00 2001
From: brige
Date: Fri, 27 Sep 2024 19:31:36 +0800
Subject: [PATCH] modify human mel

---
 Human.py       |  8 ++++----
 audio.py       | 38 ++++++++++++++++++++++++++++++++++++++
 infer.py       |  6 +++---
 tts/TTSBase.py | 13 +++++++++++++
 4 files changed, 58 insertions(+), 7 deletions(-)

diff --git a/Human.py b/Human.py
index 80709fc..2450c20 100644
--- a/Human.py
+++ b/Human.py
@@ -310,10 +310,10 @@ class Human:
         self.mel_chunks_queue_ = Queue()
         self.audio_chunks_queue_ = Queue()
         self._test_image_queue = Queue()
-
-        self._thread = None
-        thread = threading.Thread(target=self.test)
-        thread.start()
+        #
+        # self._thread = None
+        # thread = threading.Thread(target=self.test)
+        # thread.start()
 
         # self.test()
         # self.play_pcm()
diff --git a/audio.py b/audio.py
index 32ab5fa..9892f65 100644
--- a/audio.py
+++ b/audio.py
@@ -5,6 +5,8 @@ import numpy as np
 from scipy import signal
 from scipy.io import wavfile
 from hparams import hparams as hp
+import soundfile as sf
+from IPython.display import Audio
 
 def load_wav(path, sr):
     return librosa.core.load(path, sr=sr)[0]
@@ -134,3 +136,39 @@ def _denormalize(D):
         return (((D + hp.max_abs_value) * -hp.min_level_db / (2 * hp.max_abs_value)) + hp.min_level_db)
     else:
         return ((D * -hp.min_level_db / hp.max_abs_value) + hp.min_level_db)
+
+
+
+def load_audio(file_path, sr=16000):
+    """加载音频文件并返回音频数据和采样率"""
+    wav, sr = librosa.load(file_path, sr=sr)
+    return wav, sr
+
+
+def split_audio(wav, sr, chunk_duration):
+    """将音频按指定时长切割"""
+    # 计算每个片段包含的采样点数量
+    chunk_size = int(chunk_duration * sr)
+    num_chunks = int(np.ceil(len(wav) / chunk_size))
+
+    audio_chunks = []
+    for i in range(num_chunks):
+        start_idx = i * chunk_size
+        end_idx = min((i + 1) * chunk_size, len(wav))
+        chunk = wav[start_idx:end_idx]
+        audio_chunks.append(chunk)
+
+    return audio_chunks
+
+
+def save_chunks(chunks, sr, output_folder, base_filename="chunk"):
+    """保存切割的音频块"""
+    for idx, chunk in enumerate(chunks):
+        output_path = f"{output_folder}/{base_filename}_{idx}.wav"
+        sf.write(output_path, chunk, sr)
+        print(f"Saved {output_path}")
+
+
+def play_audio_chunk(chunk, sr):
+    """播放指定音频块"""
+    return Audio(chunk, rate=sr)
diff --git a/infer.py b/infer.py
index b18f972..5415dd1 100644
--- a/infer.py
+++ b/infer.py
@@ -189,9 +189,9 @@ class Infer:
             p = cv2.resize(p.astype(np.uint8), (x2 - x1, y2 - y1))
             f[y1:y2, x1:x2] = p
-            # name = "%04d" % j
-            # cv2.imwrite(f'temp/images/{j}.jpg', p)
-            # j = j + 1
+            name = "%04d" % j
+            cv2.imwrite(f'temp/images/{j}.jpg', p)
+            j = j + 1
 
             p = cv2.cvtColor(f, cv2.COLOR_BGR2RGB)
             self._human.push_render_image(p)
             # out.write(f)
diff --git a/tts/TTSBase.py b/tts/TTSBase.py
index 9c93c1a..e747517 100644
--- a/tts/TTSBase.py
+++ b/tts/TTSBase.py
@@ -14,8 +14,11 @@ from io import BytesIO
 from queue import Queue
 from threading import Thread, Event
 
+from IPython.core.display_functions import display
 from pydub import AudioSegment
 
+import audio
+
 logger = logging.getLogger(__name__)
 
 
@@ -55,6 +58,16 @@ class TTSBase:
         self._io_stream.seek(0)
         stream = self.__create_bytes_stream(self._io_stream)
         stream_len = stream.shape[0]
+
+        sr = 16000
+        soundfile.read('./temp/audio/audio.wav', stream, sr)
+        # audio_chunks = audio.split_audio(stream, sr, 4)
+
+        # display(audio.play_audio_chunk(audio_chunks[0], sr=sr))
+
+        # 保存切割后的片段
+        # audio.save_chunks(stream[0:-1], sr, './temp/audio/')
+        # audio.save_chunks(audio_chunks, sr, './temp/audio/')
         # try:
         #     sounddevice.play(stream, samplerate=self._human.get_audio_sample_rate())
         #     sounddevice.wait()  # 等待音频播放完毕