diff --git a/Human.py b/Human.py
index 0f17b20..02a2479 100644
--- a/Human.py
+++ b/Human.py
@@ -1,4 +1,5 @@
 #encoding = utf8
+import copy
 
 import io
 import logging
@@ -25,6 +26,7 @@ from queue import Queue
 
 from tts.EdgeTTS import EdgeTTS
 from tts.TTSBase import TTSBase
+from utils import mirror_index
 
 device = 'cuda' if torch.cuda.is_available() else 'cpu'
 
@@ -62,16 +64,6 @@ def read_images(img_list):
     return frames
 
 
-def __mirror_index(size, index):
-    # size = len(self.coord_list_cycle)
-    turn = index // size
-    res = index % size
-    if turn % 2 == 0:
-        return res
-    else:
-        return size - res - 1
-
-
 # python.exe .\inference.py --checkpoint_path .\checkpoints\wav2lip.pth --face
 # .\face\img00016.jpg --audio .\audio\audio1.wav
 def inference(render_event, batch_size, face_images_path, audio_feat_queue, audio_out_queue, res_frame_queue):
@@ -111,13 +103,13 @@ def inference(render_event, batch_size, face_images_path, audio_feat_queue, audi
         print(f'is_all_silence {is_all_silence}')
         if is_all_silence:
             for i in range(batch_size):
-                res_frame_queue.put((None, __mirror_index(length, index), audio_frames[i*2:i*2+2]))
+                res_frame_queue.put((None, mirror_index(length, index), audio_frames[i*2:i*2+2]))
                 index = index + 1
         else:
             t = time.perf_counter()
             image_batch = []
             for i in range(batch_size):
-                idx = __mirror_index(length, index + i)
+                idx = mirror_index(length, index + i)
                 face = face_list_cycle[idx]
                 image_batch.append(face)
             image_batch, mel_batch = np.asarray(image_batch), np.asarray(mel_batch)
@@ -143,7 +135,7 @@
                 count_time = 0
 
             for i, res_frame in enumerate(pred):
-                res_frame_queue.put((res_frame, __mirror_index(length, index), audio_frames[i*2 : i*2+2]))
+                res_frame_queue.put((res_frame, mirror_index(length, index), audio_frames[i*2 : i*2+2]))
                 index = index + 1
 
     logging.info('finish inference')
@@ -212,7 +204,7 @@ def datagen(frames, mels):
     # for i, m in enumerate(mels):
     for i in range(mels.qsize()):
         idx = 0 if True else i%len(frames)
-        frame_to_save = frames[__mirror_index(1, i)].copy()
+        frame_to_save = frames[mirror_index(1, i)].copy()
         face, coords = face_det_results[idx].copy()
 
         face = cv2.resize(face, (img_size, img_size))
@@ -294,7 +286,7 @@ def load_audio_from_bytes(byte_data):
 
 class Human:
     def __init__(self):
-        self._fps = 25  # 40 ms per frame
+        self._fps = 50  # 20 ms per frame
         self._batch_size = 16
         self._sample_rate = 16000
         self._stride_left_size = 10
@@ -303,17 +295,27 @@
         self._output_queue = mp.Queue()
         self._res_frame_queue = mp.Queue(self._batch_size * 2)
 
-        self._chunk_2_mal = Chunk2Mal(self)
-        self._tts = TTSBase(self)
-        self._infer = Infer(self)
+        full_images, face_frames, coord_frames = self._avatar()
+        self._frame_list_cycle = full_images
+        self._face_list_cycle = face_frames
+        self._coord_list_cycle = coord_frames
+        face_images_length = len(self._face_list_cycle)
+        logging.info(f'face images length: {face_images_length}')
+        print(f'face images length: {face_images_length}')
 
         self.mel_chunks_queue_ = Queue()
         self.audio_chunks_queue_ = Queue()
         self._test_image_queue = Queue()
-        #
-        self._thread = None
-        thread = threading.Thread(target=self.test)
-        thread.start()
+        self._res_render_queue = Queue()
+
+        self._chunk_2_mal = Chunk2Mal(self)
+        self._tts = TTSBase(self)
+        self._infer = Infer(self)
+
+        # #
+        # self._thread = None
+        # thread = threading.Thread(target=self.test)
+        # thread.start()
 
         # self.test()
         # self.play_pcm()
@@ -339,6 +341,21 @@
         # stream.close()
         # p.terminate()
 
+    def _avatar(self):
+        face_images_path = r'./face/'
+        face_images_path = utils.read_files_path(face_images_path)
+        full_list_cycle = read_images(face_images_path)
+
+        face_det_results = face_detect(full_list_cycle)
+
+        face_frames = []
+        coord_frames = []
+        for face, coord in face_det_results:
+            face_frames.append(face)
+            coord_frames.append(coord)
+
+        return full_list_cycle, face_frames, coord_frames
+
     def inter(self, model, chunks, face_list_cycle, face_det_results, out, j):
         inputs = np.concatenate(chunks)  # [5 * chunk]
         mel = audio.melspectrogram(inputs)
@@ -445,9 +462,11 @@
                                    'temp/resul_tttt.mp4')
         subprocess.call(command, shell=platform.system() != 'Windows')
 
-        # gen = datagen(face_list_cycle, self.mel_chunks_queue_)
 
+    def get_face_list_cycle(self):
+        return self._face_list_cycle
+
     def get_fps(self):
         return self._fps
 
@@ -476,12 +495,20 @@
 
         return self._tts.push_txt(txt)
 
-    def push_audio_chunk(self, audio_chunk):
-        self._chunk_2_mal.push_chunk(audio_chunk)
+    def put_audio_frame(self, audio_chunk):
+        self._chunk_2_mal.put_audio_frame(audio_chunk)
 
-    def push_mel_chunks_queue(self, mel_chunk):
-        self._infer.push(mel_chunk)
-        # self.audio_chunks_queue_.put(audio_chunk)
+    # def push_audio_chunk(self, audio_chunk):
+    #     self._chunk_2_mal.push_chunk(audio_chunk)
+
+    def push_mel_chunks(self, mel_chunks):
+        self._infer.push(mel_chunks)
+
+    def push_out_put(self, frame, type_):
+        self._infer.push_out_queue(frame, type_)
+
+    def push_mel_chunks_queue(self, audio_chunk):
+        self.audio_chunks_queue_.put(audio_chunk)
 
     def push_feat_queue(self, mel_chunks):
         print("push_feat_queue")
@@ -493,14 +520,38 @@
     def push_render_image(self, image):
         self._test_image_queue.put(image)
 
+    def push_res_frame(self, res_frame, idx, audio_frames):
+        self._res_render_queue.put((res_frame, idx, audio_frames))
+
     def render(self):
         try:
             # img, aud = self._res_frame_queue.get(block=True, timeout=.3)
-            img = self._test_image_queue.get(block=True, timeout=.3)
+            # img = self._test_image_queue.get(block=True, timeout=.3)
+            res_frame, idx, audio_frames = self._res_render_queue.get(block=True, timeout=.3)
         except queue.Empty:
             # print('render queue.Empty:')
             return None
-        return img
+
+        if audio_frames[0][1] != 0 and audio_frames[1][1] != 0:
+            combine_frame = self._frame_list_cycle[idx]
+        else:
+            bbox = self._coord_list_cycle[idx]
+            combine_frame = copy.deepcopy(self._frame_list_cycle[idx])
+            y1, y2, x1, x2 = bbox
+            try:
+                res_frame = cv2.resize(res_frame.astype(np.uint8), (x2 - x1, y2 - y1))
+            except:
+                return None
+            # combine_frame = get_image(ori_frame,res_frame,bbox)
+            # t=time.perf_counter()
+            combine_frame[y1:y2, x1:x2] = res_frame
+
+        image = combine_frame
+        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+        return image
+
+
+        # print('blending time:',time.perf_counter()-t)
 
     # def pull_audio_chunk(self):
     #     try:
diff --git a/infer.py b/infer.py
index d6f34eb..420b51b 100644
--- a/infer.py
+++ b/infer.py
@@ -11,8 +11,8 @@ import torch
 from tqdm import tqdm
 
 import face_detection
-import utils
 from models import Wav2Lip
+from utils import mirror_index
 
 logger = logging.getLogger(__name__)
@@ -147,7 +148,8 @@ def datagen_signal(frame, mel, face_det_results):
 
 class Infer:
     def __init__(self, human):
         self._human = human
-        self._queue = Queue()
+        self._feat_queue = Queue()
+        self._audio_out_queue = Queue()
 
         self._exit_event = Event()
         self._run_thread = Thread(target=self.__on_run)
@@ -155,17 +156,17 @@ class Infer:
         self._run_thread.start()
 
     def __on_run(self):
-        face_images_path = r'./face/'
-        face_images_path = utils.read_files_path(face_images_path)
-        face_list_cycle = read_images(face_images_path)
-        face_images_length = len(face_list_cycle)
-        logging.info(f'face images length: {face_images_length}')
-        print(f'face images length: {face_images_length}')
-
         model = load_model(r'.\checkpoints\wav2lip.pth')
         print("Model loaded")
 
+        face_list_cycle = self._human.get_face_list_cycle()
+
+        # self.__do_run1(face_list_cycle, model)
+        self.__do_run2(face_list_cycle, model)
+        # frame_h, frame_w = face_list_cycle[0].shape[:-1]
+
+    def __do_run1(self, face_list_cycle, model):
         face_det_results = face_detect(face_list_cycle)
 
         j = 0
@@ -173,7 +174,7 @@
         count = 0
         while self._exit_event.is_set():
             try:
-                m = self._queue.get(block=True, timeout=1)
+                m = self._feat_queue.get(block=True, timeout=1)
             except queue.Empty:
                 continue
@@ -202,5 +203,77 @@
             # out.write(f)
             # print('infer count:', count)
 
-    def push(self, chunk):
-        self._queue.put(chunk)
\ No newline at end of file
+    def __do_run2(self, face_list_cycle, model):
+        length = len(face_list_cycle)
+        index = 0
+        count = 0
+        count_time = 0
+        print('start inference')
+        while True:
+            if self._exit_event.is_set():
+                start_time = time.perf_counter()
+                try:
+                    mel_batch = self._feat_queue.get(block=True, timeout=1)
+                except queue.Empty:
+                    continue
+
+                is_all_silence = True
+                audio_frames = []
+                for _ in range(self._human.get_batch_size() * 2):
+                    frame, type_ = self._audio_out_queue.get()
+                    audio_frames.append((frame, type_))
+                    if type_ == 0:
+                        is_all_silence = False
+
+                if is_all_silence:
+                    for i in range(self._human.get_batch_size()):
+                        # res_frame_queue.put((None, mirror_index(length, index), audio_frames[i * 2:i * 2 + 2]))
+                        self._human.push_res_frame(None, mirror_index(length, index), audio_frames[i * 2:i * 2 + 2])
+                        index = index + 1
+                else:
+                    print('infer=======')
+                    t = time.perf_counter()
+                    img_batch = []
+                    for i in range(self._human.get_batch_size()):
+                        idx = mirror_index(length, index + i)
+                        face = face_list_cycle[idx]
+                        img_batch.append(face)
+                    img_batch, mel_batch = np.asarray(img_batch), np.asarray(mel_batch)
+
+                    img_masked = img_batch.copy()
+                    img_masked[:, face.shape[0] // 2:] = 0
+
+                    img_batch = np.concatenate((img_masked, img_batch), axis=3) / 255.
+                    mel_batch = np.reshape(mel_batch, [len(mel_batch), mel_batch.shape[1], mel_batch.shape[2], 1])
+
+                    img_batch = torch.FloatTensor(np.transpose(img_batch, (0, 3, 1, 2))).to(device)
+                    mel_batch = torch.FloatTensor(np.transpose(mel_batch, (0, 3, 1, 2))).to(device)
+
+                    with torch.no_grad():
+                        pred = model(mel_batch, img_batch)
+                    pred = pred.cpu().numpy().transpose(0, 2, 3, 1) * 255.
+
+                    count_time += (time.perf_counter() - t)
+                    count += self._human.get_batch_size()
+                    # _totalframe += 1
+                    if count >= 100:
+                        print(f"------actual avg infer fps:{count / count_time:.4f}")
+                        count = 0
+                        count_time = 0
+                    for i, res_frame in enumerate(pred):
+                        # self.__pushmedia(res_frame,loop,audio_track,video_track)
+                        # res_frame_queue.put(
+                        #     (res_frame, __mirror_index(length, index), audio_frames[i * 2:i * 2 + 2]))
+                        self._human.push_res_frame(res_frame, mirror_index(length, index),
+                                                   audio_frames[i * 2:i * 2 + 2])
+                        index = index + 1
+                    # print('total batch time:',time.perf_counter()-start_time)
+            else:
+                time.sleep(1)
+        print('musereal inference processor stop')
+
+    def push(self, mel_chunks):
+        self._feat_queue.put(mel_chunks)
+
+    def push_out_queue(self, frame, type_):
+        self._audio_out_queue.put((frame, type_))
diff --git a/tts/Chunk2Mal.py b/tts/Chunk2Mal.py
index 062ccfd..21d2cc2 100644
--- a/tts/Chunk2Mal.py
+++ b/tts/Chunk2Mal.py
@@ -1,9 +1,10 @@
 #encoding = utf8
-import ctypes
+
 import logging
 import queue
 import time
 from queue import Queue
+import multiprocessing as mp
 from threading import Thread, Event
 
 import numpy as np
@@ -13,90 +14,69 @@
 from audio_render import AudioRender
 
 
 class Chunk2Mal:
     def __init__(self, human):
-        self._audio_chunk_queue = Queue()
+        # self._audio_chunk_queue = Queue()
         self._human = human
         self._thread = None
 
-        self._chunks = []
-        self._audio_chunks = []
+        self.frames = []
+        self.queue = Queue()
+        # self.output_queue = mp.Queue()
+        # self.feat_queue = mp.Queue(2)
+
         # 320 samples per chunk (20ms * 16000 / 1000)audio_chunk
-        self._chunk_len = self._human.get_audio_sample_rate() // self._human.get_fps()
+        self.chunk = self._human.get_audio_sample_rate() // self._human.get_fps()
 
         self._exit_event = Event()
         self._thread = Thread(target=self._on_run)
         self._exit_event.set()
         self._thread.start()
-        self._audio_render = AudioRender()
+        # self._audio_render = AudioRender()
         self._stream_len = 0
         logging.info('chunk2mal start')
 
-    def _concatenate(self):
-        logging.info('np.concatenate')
-        if len(self._chunks) < 3:
-            logging.info(f'np.concatenate: {len(self._chunks)}')
-            return
-        inputs = np.concatenate(self._chunks)  # [5 * chunk]
-        self._chunks = []
-        mel = audio.melspectrogram(inputs)
-        if np.isnan(mel.reshape(-1)).sum() > 0:
-            raise ValueError(
-                'Mel contains nan! Using a TTS voice? Add a small epsilon noise to the wav file and try again')
-
-        mel_step_size = 16
-        # print('fps:', self._human.get_fps())
-        mel_idx_multiplier = 80. / self._human.get_fps()
-        # print('mel_idx_multiplier:', mel_idx_multiplier)
-        count = 0
-        i = 0
-        while 1:
-            count = count + 1
-            start_idx = int(i * mel_idx_multiplier)
-            print('i', i, 'start_idx', start_idx, 'mel len:', len(mel[0]))
-            if start_idx + mel_step_size > len(mel[0]):
-                self._human.push_mel_chunks_queue(mel[:, len(mel[0]) - mel_step_size:])
-                break
-            self._human.push_mel_chunks_queue(mel[:, start_idx: start_idx + mel_step_size])
-            i += 1
-
-        # wav = np.concatenate(self._audio_chunks)  # [5 * chunk]self._audio_chunks
-        # print('_concatenate', len(wav))
-        # audio.save_chunks([wav], 16000, "./temp/audio/")
-        # wav *= 32767 / max(0.01, np.max(np.abs(wav)))
-        # wav = wav.astype(np.int16)
-        # self._audio_render.write(wav, len(wav))
-        self._audio_chunks = []
-
-        print('mel_chunks count:', count)
-
     def _on_run(self):
         logging.info('chunk2mal run')
         while self._exit_event.is_set():
-            if self._audio_chunk_queue.empty():
-                if len(self._chunks) > 0:
-                    self._concatenate()
-                else:
-                    time.sleep(0.5)
-                continue
-            try:
-                chunk = self._audio_chunk_queue.get(block=True, timeout=1)
-                self._chunks.append(chunk)
-                self._stream_len = self._stream_len + len(chunk)
-                print('Chunk2Mal _stream_len:', self._stream_len)
-                self._audio_chunks.append(chunk.copy())
-
-                # self._human.push_audio_frames(chunk, 0)
-                if len(self._chunks) < 10:  # 200ms
-                    continue
-            except queue.Empty:
-                # print('Chunk2Mal queue.Empty')
-                continue
-
-            print('len(self._chunks):', len(self._chunks))
-            self._concatenate()
-
+            self._run_step()
+            time.sleep(0.01)
         logging.info('chunk2mal exit')
 
+    def _run_step(self):
+        for _ in range(self._human.get_batch_size() * 2):
+            frame, _type = self.get_audio_frame()
+            self.frames.append(frame)
+            # put to output
+            self._human.push_out_put(frame, _type)
+            # self.output_queue.put((frame, _type))
+        # context not enough, do not run network.
+        if len(self.frames) <= self._human.get_stride_left_size() + self._human.get_stride_right_size():
+            return
+
+        inputs = np.concatenate(self.frames)  # [N * chunk]
+        mel = audio.melspectrogram(inputs)
+        # print(mel.shape[0],mel.shape,len(mel[0]),len(self.frames))
+        # cut off stride
+        left = max(0, self._human.get_stride_left_size() * 80 / 50)
+        right = min(len(mel[0]), len(mel[0]) - self._human.get_stride_right_size() * 80 / 50)
+        mel_idx_multiplier = 80. * 2 / self._human.get_fps()
+        mel_step_size = 16
+        i = 0
+        mel_chunks = []
+        while i < (len(self.frames) - self._human.get_stride_left_size() - self._human.get_stride_right_size()) / 2:
+            start_idx = int(left + i * mel_idx_multiplier)
+            # print(start_idx)
+            if start_idx + mel_step_size > len(mel[0]):
+                mel_chunks.append(mel[:, len(mel[0]) - mel_step_size:])
+            else:
+                mel_chunks.append(mel[:, start_idx: start_idx + mel_step_size])
+            i += 1
+        self._human.push_mel_chunks(mel_chunks)
+        # self.feat_queue.put(mel_chunks)
+
+        # discard the old part to save memory
+        self.frames = self.frames[-(self._human.get_stride_left_size() + self._human.get_stride_right_size()):]
+
     def stop(self):
         if self._exit_event is None:
             return
@@ -106,15 +86,34 @@
         self._thread.join()
         logging.info('chunk2mal stop')
 
-    def push_chunk(self, chunk):
-        self._audio_chunk_queue.put(chunk)
+    def pause_talk(self):
+        self.queue.queue.clear()
 
-    def pull_chunk(self):
+    def put_audio_frame(self, audio_chunk):  #16khz 20ms pcm
+        self.queue.put(audio_chunk)
+
+    def get_audio_frame(self):
         try:
-            chunk = self._audio_chunk_queue.get(block=True, timeout=1)
-            type = 1
-        except queue.Empty:
-            chunk = np.zeros(self._chunk_len, dtype=np.float32)
+            frame = self.queue.get(block=True, timeout=0.01)
             type = 0
-        return chunk, type
+            # print(f'[INFO] get frame {frame.shape}')
+        except queue.Empty:
+            frame = np.zeros(self.chunk, dtype=np.float32)
+            type = 1
+
+        return frame, type
+
+    def get_audio_out(self):  # get origin audio pcm to nerf
+        return self.output_queue.get()
+
+    def warm_up(self):
+        for _ in range(self._human.get_stride_left_size() + self._human.get_stride_right_size()):
+            audio_frame, _type = self.get_audio_frame()
+            self.frames.append(audio_frame)
+            self.output_queue.put((audio_frame, _type))
+        for _ in range(self._human.get_stride_right_size()):
+            self.output_queue.get()
+
+    def get_next_feat(self, block, timeout):
+        return self.feat_queue.get(block, timeout)
diff --git a/tts/EdgeTTS.py b/tts/EdgeTTS.py
index fdfa2ba..a57e06c 100644
--- a/tts/EdgeTTS.py
+++ b/tts/EdgeTTS.py
@@ -27,6 +27,7 @@ class EdgeTTS(TTSBase):
         stream = self.__create_bytes_stream(self._io_stream)
         stream_len = stream.shape[0]
         index = 0
+
         while stream_len >= self._chunk:
             self._human.push_audio_chunk(stream[index:index + self._chunk])
             stream_len -= self._chunk
@@ -56,4 +57,3 @@
 
             if chuck['type'] == 'audio':
                 self._io_stream.write(chuck['data'])
-
diff --git a/tts/TTSBase.py b/tts/TTSBase.py
index 4fff9f3..454839e 100644
--- a/tts/TTSBase.py
+++ b/tts/TTSBase.py
@@ -5,9 +5,7 @@ import time
 
 import edge_tts
 import numpy as np
-import pyaudio
-import soundfile
-import sounddevice
+import soundfile as sf
 import resampy
 import queue
 from io import BytesIO
@@ -16,7 +14,6 @@ from threading import Thread, Event
 
 import audio
 
-from audio_render import AudioRender
 
 logger = logging.getLogger(__name__)
@@ -26,17 +23,13 @@ class TTSBase:
     def __init__(self, human):
         self._human = human
         self._thread = None
         self._queue = Queue()
-        self._io_stream = BytesIO()
-        self._chunk_len = self._human.get_audio_sample_rate() // self._human.get_fps()
+        self.input_stream = BytesIO()
+        self.chunk = self._human.get_audio_sample_rate() // self._human.get_fps()
 
         self._exit_event = Event()
         self._thread = Thread(target=self._on_run)
         self._exit_event.set()
         self._thread.start()
-        # self._pcm_player = pyaudio.PyAudio()
-        # self._pcm_stream = self._pcm_player.open(format=pyaudio.paInt16,
-        #                                          channels=1, rate=24000, output=True)
-        # self._audio_render = AudioRender()
         logging.info('tts start')
 
     def _on_run(self):
@@ -52,94 +45,54 @@
     def _request(self, txt):
         voice = 'zh-CN-XiaoyiNeural'
         t = time.time()
-        asyncio.new_event_loop().run_until_complete(self.__on_request(voice, txt))
-        logger.info(f'edge tts time:{time.time() - t : 0.4f}s')
+        asyncio.new_event_loop().run_until_complete(self.__main(voice, txt))
+        print(f'-------edge tts time:{time.time() - t:.4f}s')
 
-        self._io_stream.seek(0)
-        stream = self.__create_bytes_stream(self._io_stream)
-        audio.save_chunks([stream], 16000, './temp/audio/')
-        # wav = audio.split_audio(stream, 16000, 0.04)
+        self.input_stream.seek(0)
+        stream = self.__create_bytes_stream(self.input_stream)
+        streamlen = stream.shape[0]
+        idx = 0
+        print('-------tts start push chunk')
+        while streamlen >= self.chunk:
+            self._human.put_audio_frame(stream[idx:idx + self.chunk])
+            streamlen -= self.chunk
+            idx += self.chunk
+        # if streamlen>0:  #skip last frame(not 20ms)
+        #     self.queue.put(stream[idx:])
+        self.input_stream.seek(0)
+        self.input_stream.truncate()
+        print('-------tts finish push chunk')
 
-        # audio.save_chunks(wav, 16000, './temp/audio/')
-        # audio.save_wav(stream, "./temp/audio/test1.wav", 16000)
-        stream_len = stream.shape[0]
+    def __create_bytes_stream(self, byte_stream):
+        # byte_stream=BytesIO(buffer)
+        stream, sample_rate = sf.read(byte_stream)  # [T*sample_rate,] float64
+        print(f'[INFO]tts audio stream {sample_rate}: {stream.shape}')
+        stream = stream.astype(np.float32)
 
-        # wav = stream  #np.concatenate(stream)  # [5 * chunk]self._audio_chunks
-        # print('_concatenate', len(wav))
+        if stream.ndim > 1:
+            print(f'[WARN] audio has {stream.shape[1]} channels, only use the first.')
+            stream = stream[:, 0]
 
-        # self._audio_chunks = []
+        if sample_rate != self._human.get_audio_sample_rate() and stream.shape[0] > 0:
+            print(f'[WARN] audio sample rate is {sample_rate}, resampling into {self._human.get_audio_sample_rate()}.')
+            stream = resampy.resample(x=stream, sr_orig=sample_rate, sr_new=self._human.get_audio_sample_rate())
 
-        print("stream_len:", stream_len, " _chunk_len:", self._chunk_len)
-        index = 0
-        segment = 0
-        while stream_len >= self._chunk_len:
-            audio_chunk = stream[index:index + self._chunk_len]
-            self._human.push_audio_chunk(audio_chunk)
-            stream_len -= self._chunk_len
-            index += self._chunk_len
-            segment = segment + 1
+        return stream
 
-        if stream_len > 0:
-            audio_chunk = stream[index:index + stream_len]
-            self._human.push_audio_chunk(audio_chunk)
-            segment = segment + 1
+    async def __main(self, voicename: str, text: str):
+        communicate = edge_tts.Communicate(text, voicename)
 
-        print("segment:", segment)
-        self._io_stream.seek(0)
-        self._io_stream.truncate()
-
-    def __create_bytes_stream(self, io_stream):
-        stream, sample_rate = soundfile.read(io_stream)
-        logger.info(f'tts audio stream {sample_rate} : {stream.shape}')
-        stream = stream.astype(np.float32)
-
-        if stream.ndim > 1:
-            logger.warning(f'tts audio has {stream.shape[1]} channels, only use the first')
-            stream = stream[:, 1]
-
-        if sample_rate != self._human.get_audio_sample_rate() and stream.shape[0] > 0:
-            logger.warning(f'tts audio sample rate is {sample_rate}, resample to {self._human.get_audio_sample_rate() }')
-            stream = resampy.resample(x=stream, sr_orig=sample_rate, sr_new=self._human.get_audio_sample_rate())
-
-        return stream
-
-    async def __on_request(self, voice, txt):
-        communicate = edge_tts.Communicate(txt, voice)
+        #with open(OUTPUT_FILE, "wb") as file:
         first = True
-        total_data = b''
-        CHUNK_SIZE = self._chunk_len
         async for chunk in communicate.stream():
-            if chunk["type"] == "audio" and chunk["data"]:
-                data = chunk['data']
-                self._io_stream.write(data)
+            if first:
+                first = False
+            if chunk["type"] == "audio":
+                #self.push_audio(chunk["data"])
+                self.input_stream.write(chunk["data"])
+                #file.write(chunk["data"])
             elif chunk["type"] == "WordBoundary":
                 pass
-            '''
-            total_data += chunk["data"]
-            if len(total_data) >= CHUNK_SIZE:
-                # print(f"Time elapsed: {time.time() - start_time:.2f} seconds")  # Print time
-                audio_data = AudioSegment.from_mp3(BytesIO(total_data[:CHUNK_SIZE]))  #.raw_data
-                audio_data = audio_data.set_frame_rate(self._human.get_audio_sample_rate())
-                # self._human.push_audio_chunk(audio_data)
-                self._pcm_stream.write(audio_data.raw_data)
-                # play_audio(total_data[:CHUNK_SIZE], stream)  # Play first CHUNK_SIZE bytes
-                total_data = total_data[CHUNK_SIZE:]  # Remove played data
-            '''
-
-        # if first:
-        #     first = False
-
-        # if chuck['type'] == 'audio':
-        #     # self._io_stream.write(chuck['data'])
-        #     self._io_stream.write(AudioSegment.from_mp3(BytesIO(total_data[:CHUNK_SIZE])).raw_data)
-
-        # if len(total_data) > 0:
-        #     self._pcm_stream.write(AudioSegment.from_mp3(BytesIO(total_data)).raw_data)
-        #     audio_data = AudioSegment.from_mp3(BytesIO(total_data))  # .raw_data
-        #     audio_data = audio_data.set_frame_rate(self._human.get_audio_sample_rate())
-        #     self._pcm_stream.write(audio_data.raw_data)
-        #     self._human.push_audio_chunk(audio_data)
-        #     self._io_stream.write(AudioSegment.from_mp3(BytesIO(total_data)).raw_data)
 
     def stop(self):
         self._pcm_stream.stop_stream()
diff --git a/ui.py b/ui.py
index 303453e..f7b7e86 100644
--- a/ui.py
+++ b/ui.py
@@ -44,7 +44,7 @@
         # self.logo_label.grid(row=0, column=0, padx=20, pady=(20, 10))
 
         self.entry = customtkinter.CTkEntry(self, placeholder_text="输入内容")
-        self.entry.insert(0, "基本信息,北京九零科技有限公司,成立于2015年,位于北京市,是一家以从事科技推广和应用服务业为主的企业。企业注册资本500万人民币。")
+        self.entry.insert(0, "你好,我是中国湘西人。")
         self.entry.grid(row=2, column=0, columnspan=2, padx=(20, 0), pady=(20, 20), sticky="nsew")
 
         self.main_button_1 = customtkinter.CTkButton(master=self, fg_color="transparent", border_width=2,
@@ -83,7 +83,7 @@
             self.after(100, self._render)
             return
 
-        self.play_audio()
+        # self.play_audio()
         iheight, iwidth = image.shape[0], image.shape[1]
         width = self.winfo_width()
         height = self.winfo_height()
diff --git a/utils.py b/utils.py
index f5b614f..e1bbb86 100644
--- a/utils.py
+++ b/utils.py
@@ -10,3 +10,13 @@ def read_files_path(path):
             file_paths.append(path + file)
 
     return file_paths
+
+def mirror_index(size, index):
+    # size = len(self.coord_list_cycle)
+    turn = index // size
+    res = index % size
+    if turn % 2 == 0:
+        return res
+    else:
+        return size - res - 1
+