diff --git a/Human.py b/Human.py
index de253c4..ee5fa27 100644
--- a/Human.py
+++ b/Human.py
@@ -311,9 +311,9 @@ class Human:
         self.audio_chunks_queue_ = Queue()
         self._test_image_queue = Queue()
         #
-        # self._thread = None
-        # thread = threading.Thread(target=self.test)
-        # thread.start()
+        self._thread = None
+        thread = threading.Thread(target=self.test)
+        thread.start()
         # self.test()
         # self.play_pcm()
 
@@ -339,23 +339,16 @@ class Human:
         #     stream.close()
         # p.terminate()
 
-    def test(self):
-        wav = audio.load_wav(r'./audio/audio.wav', 16000)
-        # with open(r'./audio/test.wav', 'rb') as f:
-        #     byte_data = f.read()
-        #
-        # byte_data = byte_data[16:]
-        # inputs = np.concatenate(byte_data)  # [N * chunk]
-        # wav = load_audio_from_bytes(inputs)
-        print('wav length:', len(wav))
-        mel = audio.melspectrogram(wav)
+    def inter(self, model, chunks, face_list_cycle, face_det_results, out, j):
+        inputs = np.concatenate(chunks)  # [5 * chunk]
+        mel = audio.melspectrogram(inputs)
         if np.isnan(mel.reshape(-1)).sum() > 0:
             raise ValueError(
                 'Mel contains nan! Using a TTS voice? Add a small epsilon noise to the wav file and try again')
 
         mel_step_size = 16
-        print('fps:', self._fps)
+        print('fps:', self._fps)
         mel_idx_multiplier = 80. / self._fps
         print('mel_idx_multiplier:', mel_idx_multiplier)
         i = 0
@@ -369,26 +362,6 @@ class Human:
             self.mel_chunks_queue_.put(mel[:, start_idx: start_idx + mel_step_size])
             i += 1
 
-        batch_size = 128
-        print('batch_size:', batch_size, ' mel_chunks len:', self.mel_chunks_queue_.qsize())
-
-        face_images_path = r'./face/'
-        face_images_path = utils.read_files_path(face_images_path)
-        face_list_cycle = read_images(face_images_path)
-        face_images_length = len(face_list_cycle)
-        logging.info(f'face images length: {face_images_length}')
-        print(f'face images length: {face_images_length}')
-
-        model = load_model(r'.\checkpoints\wav2lip.pth')
-        print("Model loaded")
-
-        frame_h, frame_w = face_list_cycle[0].shape[:-1]
-        # out = cv2.VideoWriter('temp/resul_tttt.avi',
-        #                       cv2.VideoWriter_fourcc(*'DIVX'), 25, (frame_w, frame_h))
-
-        face_det_results = face_detect(face_list_cycle)
-
-        j = 0
         while not self.mel_chunks_queue_.empty():
             print("self.mel_chunks_queue_ len:", self.mel_chunks_queue_.qsize())
             m = self.mel_chunks_queue_.get()
@@ -411,12 +384,62 @@ class Human:
                 j = j + 1
                 p = cv2.cvtColor(f, cv2.COLOR_BGR2RGB)
                 self._test_image_queue.put(p)
-                # out.write(f)
-        #
-        # out.release()
-        # command = 'ffmpeg -y -i {} -i {} -strict -2 -q:v 1 {}'.format('./audio/audio1.wav', 'temp/resul_tttt.avi',
-        #                                                               'temp/resul_tttt.mp4')
-        # subprocess.call(command, shell=platform.system() != 'Windows')
+                out.write(f)
+        return j
+
+    def test(self):
+        batch_size = 128
+        print('batch_size:', batch_size, ' mel_chunks len:', self.mel_chunks_queue_.qsize())
+
+        face_images_path = r'./face/'
+        face_images_path = utils.read_files_path(face_images_path)
+        face_list_cycle = read_images(face_images_path)
+        face_images_length = len(face_list_cycle)
+        logging.info(f'face images length: {face_images_length}')
+        print(f'face images length: {face_images_length}')
+
+        model = load_model(r'.\checkpoints\wav2lip.pth')
+        print("Model loaded")
+
+        frame_h, frame_w = face_list_cycle[0].shape[:-1]
+        out = cv2.VideoWriter('temp/resul_tttt.avi',
+                              cv2.VideoWriter_fourcc(*'DIVX'), 25, (frame_w, frame_h))
+
+        face_det_results = face_detect(face_list_cycle)
+
+        audio_path = r'./temp/audio/chunk_0.wav'
+        stream = audio.load_wav(audio_path, 16000)
+        stream_len = stream.shape[0]
+        print('wav length:', stream_len)
+        _audio_chunk_queue = queue.Queue()
+
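+        # NOTE: chunk_len = 6400 samples at 16 kHz is a 0.4 s audio chunk.
+        # The loop below slices the wav into fixed-size chunks and queues any
+        # shorter remainder as a final chunk.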
+        index = 0
+        chunk_len = 6400
+        while stream_len >= chunk_len:
+            audio_chunk = stream[index:index + chunk_len]
+            _audio_chunk_queue.put(audio_chunk)
+            stream_len -= chunk_len
+            index += chunk_len
+        if stream_len > 0:
+            audio_chunk = stream[index:index + stream_len]
+            _audio_chunk_queue.put(audio_chunk)
+            index += stream_len
+            stream_len -= stream_len
+
+        print('_audio_chunk_queue:', _audio_chunk_queue.qsize())
+
+        j = 0
+        while not _audio_chunk_queue.empty():
+            chunks = []
+            length = min(5, _audio_chunk_queue.qsize())
+            for i in range(length):
+                chunks.append(_audio_chunk_queue.get())
+
+            j = self.inter(model, chunks, face_list_cycle, face_det_results, out, j)
+
+        out.release()
+        command = 'ffmpeg -y -i {} -i {} -strict -2 -q:v 1 {}'.format(audio_path, 'temp/resul_tttt.avi',
+                                                                      'temp/resul_tttt.mp4')
+        subprocess.call(command, shell=platform.system() != 'Windows')
 
         # gen = datagen(face_list_cycle, self.mel_chunks_queue_)
diff --git a/audio.py b/audio.py
index 9892f65..379e3a8 100644
--- a/audio.py
+++ b/audio.py
@@ -138,7 +138,6 @@ def _denormalize(D):
     return ((D * -hp.min_level_db / hp.max_abs_value) + hp.min_level_db)
 
-
 def load_audio(file_path, sr=16000):
     """Load an audio file and return the audio data and sample rate."""
     wav, sr = librosa.load(file_path, sr=sr)
diff --git a/tts/Chunk2Mal.py b/tts/Chunk2Mal.py
index 16b71c3..062ccfd 100644
--- a/tts/Chunk2Mal.py
+++ b/tts/Chunk2Mal.py
@@ -27,10 +27,14 @@ class Chunk2Mal:
         self._exit_event.set()
         self._thread.start()
         self._audio_render = AudioRender()
+        self._stream_len = 0
         logging.info('chunk2mal start')
 
     def _concatenate(self):
         logging.info('np.concatenate')
+        if len(self._chunks) < 3:
+            logging.info(f'np.concatenate: {len(self._chunks)}')
+            return
         inputs = np.concatenate(self._chunks)  # [5 * chunk]
         self._chunks = []
         mel = audio.melspectrogram(inputs)
@@ -54,10 +58,12 @@ class Chunk2Mal:
             self._human.push_mel_chunks_queue(mel[:, start_idx: start_idx + mel_step_size])
             i += 1
 
-        wav = np.concatenate(self._audio_chunks)  # [5 * chunk]self._audio_chunks
-        wav *= 32767 / max(0.01, np.max(np.abs(wav)))
-        wav = wav.astype(np.int16)
-        self._audio_render.write(wav, len(wav))
+        # wav = np.concatenate(self._audio_chunks)  # [5 * chunk]self._audio_chunks
+        # print('_concatenate', len(wav))
+        # audio.save_chunks([wav], 16000, "./temp/audio/")
+        # wav *= 32767 / max(0.01, np.max(np.abs(wav)))
+        # wav = wav.astype(np.int16)
+        # self._audio_render.write(wav, len(wav))
         self._audio_chunks = []
 
         print('mel_chunks count:', count)
@@ -74,11 +80,12 @@ class Chunk2Mal:
             try:
                 chunk = self._audio_chunk_queue.get(block=True, timeout=1)
                 self._chunks.append(chunk)
+                self._stream_len = self._stream_len + len(chunk)
+                print('Chunk2Mal _stream_len:', self._stream_len)
                 self._audio_chunks.append(chunk.copy())
-                # print(type(chunk))
                 # self._human.push_audio_frames(chunk, 0)
-                if len(self._chunks) < 102:  # 200ms
+                if len(self._chunks) < 10:  # 200ms
                     continue
             except queue.Empty:
                 # print('Chunk2Mal queue.Empty')
diff --git a/tts/TTSBase.py b/tts/TTSBase.py
index b5e7f8f..4fff9f3 100644
--- a/tts/TTSBase.py
+++ b/tts/TTSBase.py
@@ -14,10 +14,9 @@ from io import BytesIO
 from queue import Queue
 from threading import Thread, Event
 
-from IPython.core.display_functions import display
-from pydub import AudioSegment
+
 import audio
+from audio_render import AudioRender
 
 logger = logging.getLogger(__name__)
 
@@ -37,6 +36,7 @@ class TTSBase:
         # self._pcm_player = pyaudio.PyAudio()
         # self._pcm_stream = self._pcm_player.open(format=pyaudio.paInt16,
         #                                          channels=1, rate=24000, output=True)
+        # self._audio_render = AudioRender()
         logging.info('tts start')
 
     def _on_run(self):
@@ -57,9 +57,18 @@ class TTSBase:
             self._io_stream.seek(0)
             stream = self.__create_bytes_stream(self._io_stream)
 
+            audio.save_chunks([stream], 16000, './temp/audio/')
+            # wav = audio.split_audio(stream, 16000, 0.04)
+
+            # audio.save_chunks(wav, 16000, './temp/audio/')
             # audio.save_wav(stream, "./temp/audio/test1.wav", 16000)
             stream_len = stream.shape[0]
+            # wav = stream  # np.concatenate(stream)  # [5 * chunk]self._audio_chunks
+            # print('_concatenate', len(wav))
+
+            # self._audio_chunks = []
+
             print("stream_len:", stream_len, " _chunk_len:", self._chunk_len)
             index = 0
             segment = 0
@@ -69,6 +78,12 @@ class TTSBase:
                 stream_len -= self._chunk_len
                 index += self._chunk_len
                 segment = segment + 1
+
+            if stream_len > 0:
+                audio_chunk = stream[index:index + stream_len]
+                self._human.push_audio_chunk(audio_chunk)
+                segment = segment + 1
+
             print("segment:", segment)
             self._io_stream.seek(0)
             self._io_stream.truncate()
@@ -84,7 +99,7 @@ class TTSBase:
 
         if sample_rate != self._human.get_audio_sample_rate() and stream.shape[0] > 0:
             logger.warning(f'tts audio sample rate is {sample_rate}, resample to {self._human.get_audio_sample_rate()}')
-            stream = resampy.resample(x=stream, sr_orig=sample_rate, sr_new=self._human.get_audio_sample_rate() )
+            stream = resampy.resample(x=stream, sr_orig=sample_rate, sr_new=self._human.get_audio_sample_rate())
         return stream