modify audio

This commit is contained in:
brige 2024-09-29 15:12:49 +08:00
parent 472a17f896
commit 3aec7b7103
4 changed files with 94 additions and 50 deletions

101
Human.py
View File

@ -311,9 +311,9 @@ class Human:
self.audio_chunks_queue_ = Queue()
self._test_image_queue = Queue()
#
# self._thread = None
# thread = threading.Thread(target=self.test)
# thread.start()
self._thread = None
thread = threading.Thread(target=self.test)
thread.start()
# self.test()
# self.play_pcm()
@ -339,16 +339,9 @@ class Human:
# stream.close()
# p.terminate()
def test(self):
wav = audio.load_wav(r'./audio/audio.wav', 16000)
# with open(r'./audio/test.wav', 'rb') as f:
# byte_data = f.read()
#
# byte_data = byte_data[16:]
# inputs = np.concatenate(byte_data) # [N * chunk]
# wav = load_audio_from_bytes(inputs)
print('wav length:', len(wav))
mel = audio.melspectrogram(wav)
def inter(self, model, chunks, face_list_cycle, face_det_results, out, j):
inputs = np.concatenate(chunks) # [5 * chunk]
mel = audio.melspectrogram(inputs)
if np.isnan(mel.reshape(-1)).sum() > 0:
raise ValueError(
'Mel contains nan! Using a TTS voice? Add a small epsilon noise to the wav file and try again')
@ -369,26 +362,6 @@ class Human:
self.mel_chunks_queue_.put(mel[:, start_idx: start_idx + mel_step_size])
i += 1
batch_size = 128
print('batch_size:', batch_size, ' mel_chunks len:', self.mel_chunks_queue_.qsize())
face_images_path = r'./face/'
face_images_path = utils.read_files_path(face_images_path)
face_list_cycle = read_images(face_images_path)
face_images_length = len(face_list_cycle)
logging.info(f'face images length: {face_images_length}')
print(f'face images length: {face_images_length}')
model = load_model(r'.\checkpoints\wav2lip.pth')
print("Model loaded")
frame_h, frame_w = face_list_cycle[0].shape[:-1]
# out = cv2.VideoWriter('temp/resul_tttt.avi',
# cv2.VideoWriter_fourcc(*'DIVX'), 25, (frame_w, frame_h))
face_det_results = face_detect(face_list_cycle)
j = 0
while not self.mel_chunks_queue_.empty():
print("self.mel_chunks_queue_ len:", self.mel_chunks_queue_.qsize())
m = self.mel_chunks_queue_.get()
@ -411,12 +384,62 @@ class Human:
j = j + 1
p = cv2.cvtColor(f, cv2.COLOR_BGR2RGB)
self._test_image_queue.put(p)
# out.write(f)
#
# out.release()
# command = 'ffmpeg -y -i {} -i {} -strict -2 -q:v 1 {}'.format('./audio/audio1.wav', 'temp/resul_tttt.avi',
# 'temp/resul_tttt.mp4')
# subprocess.call(command, shell=platform.system() != 'Windows')
out.write(f)
return j
def test(self):
    """Run an offline lip-sync pass over a saved audio clip and mux the result.

    Steps, in order:
      1. Read the face images under ``./face/`` and load the Wav2Lip checkpoint.
      2. Open an AVI writer sized to the first face image.
      3. Load ``./temp/audio/chunk_0.wav``, cut it into fixed-size chunks and
         feed them to ``self.inter`` in groups of at most five.
      4. Release the writer and call ffmpeg to combine the audio file and the
         AVI into ``temp/resul_tttt.mp4``.

    Takes no arguments and returns nothing; all results are files on disk
    (plus whatever ``self.inter`` does with the chunks it receives).
    """
    batch_size = 128
    print('batch_size:', batch_size, ' mel_chunks len:', self.mel_chunks_queue_.qsize())

    face_images_path = r'./face/'
    face_images_path = utils.read_files_path(face_images_path)
    face_list_cycle = read_images(face_images_path)
    face_images_length = len(face_list_cycle)
    logging.info(f'face images length: {face_images_length}')
    print(f'face images length: {face_images_length}')

    # Forward slashes work on Windows as well as POSIX; the previous
    # backslash form ('.\checkpoints\wav2lip.pth') only resolved on Windows.
    model = load_model('./checkpoints/wav2lip.pth')
    print("Model loaded")

    # shape[:-1] drops the last axis -- presumably the colour channels,
    # leaving (height, width). VideoWriter wants (width, height).
    frame_h, frame_w = face_list_cycle[0].shape[:-1]
    video_path = 'temp/resul_tttt.avi'
    audio_path = r'./temp/audio/chunk_0.wav'

    out = cv2.VideoWriter(video_path,
                          cv2.VideoWriter_fourcc(*'DIVX'), 25, (frame_w, frame_h))
    try:
        face_det_results = face_detect(face_list_cycle)

        stream = audio.load_wav(audio_path, 16000)
        stream_len = stream.shape[0]
        print('wav length:', stream_len)

        # Fixed-size slices; slicing past the end simply yields the shorter
        # tail, so the final partial chunk needs no special case.
        # NOTE(review): 6400 samples would be 0.4 s if 16000 is the sample
        # rate -- confirm against audio.load_wav.
        chunk_len = 6400
        audio_chunks = [stream[start:start + chunk_len]
                        for start in range(0, stream_len, chunk_len)]
        print('_audio_chunk_queue:', len(audio_chunks))

        # Hand the chunks to inter() up to five at a time. j is threaded
        # through the calls: inter() receives the running value and returns
        # the updated one.
        group_size = 5
        j = 0
        for first in range(0, len(audio_chunks), group_size):
            chunks = audio_chunks[first:first + group_size]
            j = self.inter(model, chunks, face_list_cycle, face_det_results, out, j)
    finally:
        # Always close the container, even when inference fails part-way,
        # so the writer handle is not leaked.
        out.release()

    # An argument list with the default shell=False behaves the same on
    # every platform and avoids shell quoting of the paths.
    command = ['ffmpeg', '-y', '-i', audio_path, '-i', video_path,
               '-strict', '-2', '-q:v', '1', 'temp/resul_tttt.mp4']
    subprocess.call(command)
# gen = datagen(face_list_cycle, self.mel_chunks_queue_)

View File

@ -138,7 +138,6 @@ def _denormalize(D):
return ((D * -hp.min_level_db / hp.max_abs_value) + hp.min_level_db)
def load_audio(file_path, sr=16000):
"""加载音频文件并返回音频数据和采样率"""
wav, sr = librosa.load(file_path, sr=sr)

View File

@ -27,10 +27,14 @@ class Chunk2Mal:
self._exit_event.set()
self._thread.start()
self._audio_render = AudioRender()
self._stream_len = 0
logging.info('chunk2mal start')
def _concatenate(self):
logging.info('np.concatenate')
if len(self._chunks) < 3:
logging.info(f'np.concatenate: {len(self._chunks)}')
return
inputs = np.concatenate(self._chunks) # [5 * chunk]
self._chunks = []
mel = audio.melspectrogram(inputs)
@ -54,10 +58,12 @@ class Chunk2Mal:
self._human.push_mel_chunks_queue(mel[:, start_idx: start_idx + mel_step_size])
i += 1
wav = np.concatenate(self._audio_chunks) # [5 * chunk]self._audio_chunks
wav *= 32767 / max(0.01, np.max(np.abs(wav)))
wav = wav.astype(np.int16)
self._audio_render.write(wav, len(wav))
# wav = np.concatenate(self._audio_chunks) # [5 * chunk]self._audio_chunks
# print('_concatenate', len(wav))
# audio.save_chunks([wav], 16000, "./temp/audio/")
# wav *= 32767 / max(0.01, np.max(np.abs(wav)))
# wav = wav.astype(np.int16)
# self._audio_render.write(wav, len(wav))
self._audio_chunks = []
print('mel_chunks count:', count)
@ -74,11 +80,12 @@ class Chunk2Mal:
try:
chunk = self._audio_chunk_queue.get(block=True, timeout=1)
self._chunks.append(chunk)
self._stream_len = self._stream_len + len(chunk)
print('Chunk2Mal _stream_len:', self._stream_len)
self._audio_chunks.append(chunk.copy())
# print(type(chunk))
# self._human.push_audio_frames(chunk, 0)
if len(self._chunks) < 102: # 200ms
if len(self._chunks) < 10: # 200ms
continue
except queue.Empty:
# print('Chunk2Mal queue.Empty')

View File

@ -14,10 +14,9 @@ from io import BytesIO
from queue import Queue
from threading import Thread, Event
from IPython.core.display_functions import display
from pydub import AudioSegment
import audio
from audio_render import AudioRender
logger = logging.getLogger(__name__)
@ -37,6 +36,7 @@ class TTSBase:
# self._pcm_player = pyaudio.PyAudio()
# self._pcm_stream = self._pcm_player.open(format=pyaudio.paInt16,
# channels=1, rate=24000, output=True)
# self._audio_render = AudioRender()
logging.info('tts start')
def _on_run(self):
@ -57,9 +57,18 @@ class TTSBase:
self._io_stream.seek(0)
stream = self.__create_bytes_stream(self._io_stream)
audio.save_chunks([stream], 16000, './temp/audio/')
# wav = audio.split_audio(stream, 16000, 0.04)
# audio.save_chunks(wav, 16000, './temp/audio/')
# audio.save_wav(stream, "./temp/audio/test1.wav", 16000)
stream_len = stream.shape[0]
# wav = stream #np.concatenate(stream) # [5 * chunk]self._audio_chunks
# print('_concatenate', len(wav))
# self._audio_chunks = []
print("stream_len:", stream_len, " _chunk_len:", self._chunk_len)
index = 0
segment = 0
@ -69,6 +78,12 @@ class TTSBase:
stream_len -= self._chunk_len
index += self._chunk_len
segment = segment + 1
if stream_len > 0:
audio_chunk = stream[index:index + stream_len]
self._human.push_audio_chunk(audio_chunk)
segment = segment + 1
print("segment:", segment)
self._io_stream.seek(0)
self._io_stream.truncate()
@ -84,7 +99,7 @@ class TTSBase:
if sample_rate != self._human.get_audio_sample_rate() and stream.shape[0] > 0:
logger.warning(f'tts audio sample rate is {sample_rate}, resample to {self._human.get_audio_sample_rate() }')
stream = resampy.resample(x=stream, sr_orig=sample_rate, sr_new=self._human.get_audio_sample_rate() )
stream = resampy.resample(x=stream, sr_orig=sample_rate, sr_new=self._human.get_audio_sample_rate())
return stream