modify human and tts

brige 2024-10-04 01:52:49 +08:00
parent a68fbbc0de
commit 1ef9225dda
7 changed files with 293 additions and 207 deletions

Human.py

@@ -1,4 +1,5 @@
 #encoding = utf8
+import copy
 import io
 import logging
@@ -25,6 +26,7 @@ from queue import Queue
 from tts.EdgeTTS import EdgeTTS
 from tts.TTSBase import TTSBase
+from utils import mirror_index

 device = 'cuda' if torch.cuda.is_available() else 'cpu'
@@ -62,16 +64,6 @@ def read_images(img_list):
     return frames

-def __mirror_index(size, index):
-    # size = len(self.coord_list_cycle)
-    turn = index // size
-    res = index % size
-    if turn % 2 == 0:
-        return res
-    else:
-        return size - res - 1

 # python.exe .\inference.py --checkpoint_path .\checkpoints\wav2lip.pth --face
 # .\face\img00016.jpg --audio .\audio\audio1.wav
 def inference(render_event, batch_size, face_images_path, audio_feat_queue, audio_out_queue, res_frame_queue):
@@ -111,13 +103,13 @@ def inference(render_event, batch_size, face_images_path, audio_feat_queue, audio_out_queue, res_frame_queue):
         print(f'is_all_silence {is_all_silence}')
         if is_all_silence:
             for i in range(batch_size):
-                res_frame_queue.put((None, __mirror_index(length, index), audio_frames[i*2:i*2+2]))
+                res_frame_queue.put((None, mirror_index(length, index), audio_frames[i*2:i*2+2]))
                 index = index + 1
         else:
             t = time.perf_counter()
             image_batch = []
             for i in range(batch_size):
-                idx = __mirror_index(length, index + i)
+                idx = mirror_index(length, index + i)
                 face = face_list_cycle[idx]
                 image_batch.append(face)
             image_batch, mel_batch = np.asarray(image_batch), np.asarray(mel_batch)
@@ -143,7 +135,7 @@ def inference(render_event, batch_size, face_images_path, audio_feat_queue, audio_out_queue, res_frame_queue):
                 count_time = 0
             for i, res_frame in enumerate(pred):
-                res_frame_queue.put((res_frame, __mirror_index(length, index), audio_frames[i*2 : i*2+2]))
+                res_frame_queue.put((res_frame, mirror_index(length, index), audio_frames[i*2 : i*2+2]))
                 index = index + 1
     logging.info('finish inference')
@@ -212,7 +204,7 @@ def datagen(frames, mels):
     # for i, m in enumerate(mels):
     for i in range(mels.qsize()):
         idx = 0 if True else i%len(frames)
-        frame_to_save = frames[__mirror_index(1, i)].copy()
+        frame_to_save = frames[mirror_index(1, i)].copy()
         face, coords = face_det_results[idx].copy()
         face = cv2.resize(face, (img_size, img_size))
@@ -294,7 +286,7 @@ def load_audio_from_bytes(byte_data):
 class Human:
     def __init__(self):
-        self._fps = 25  # 40 ms per frame
+        self._fps = 50  # 20 ms per frame
         self._batch_size = 16
         self._sample_rate = 16000
         self._stride_left_size = 10
@@ -303,17 +295,27 @@ class Human:
         self._output_queue = mp.Queue()
         self._res_frame_queue = mp.Queue(self._batch_size * 2)

-        self._chunk_2_mal = Chunk2Mal(self)
-        self._tts = TTSBase(self)
-        self._infer = Infer(self)
+        full_images, face_frames, coord_frames = self._avatar()
+        self._frame_list_cycle = full_images
+        self._face_list_cycle = face_frames
+        self._coord_list_cycle = coord_frames
+        face_images_length = len(self._face_list_cycle)
+        logging.info(f'face images length: {face_images_length}')
+        print(f'face images length: {face_images_length}')

         self.mel_chunks_queue_ = Queue()
         self.audio_chunks_queue_ = Queue()
         self._test_image_queue = Queue()
+        # self._res_render_queue = Queue()

-        self._thread = None
-        thread = threading.Thread(target=self.test)
-        thread.start()
+        self._chunk_2_mal = Chunk2Mal(self)
+        self._tts = TTSBase(self)
+        self._infer = Infer(self)
         #
+        # self._thread = None
+        # thread = threading.Thread(target=self.test)
+        # thread.start()
         # self.test()
         # self.play_pcm()
@@ -339,6 +341,21 @@ class Human:
     #     stream.close()
     #     p.terminate()

+    def _avatar(self):
+        face_images_path = r'./face/'
+        face_images_path = utils.read_files_path(face_images_path)
+        full_list_cycle = read_images(face_images_path)
+
+        face_det_results = face_detect(full_list_cycle)
+
+        face_frames = []
+        coord_frames = []
+        for face, coord in face_det_results:
+            face_frames.append(face)
+            coord_frames.append(coord)
+
+        return full_list_cycle, face_frames, coord_frames
+
     def inter(self, model, chunks, face_list_cycle, face_det_results, out, j):
         inputs = np.concatenate(chunks)  # [5 * chunk]
         mel = audio.melspectrogram(inputs)
@@ -445,9 +462,11 @@ class Human:
                                              'temp/resul_tttt.mp4')
         subprocess.call(command, shell=platform.system() != 'Windows')
         # gen = datagen(face_list_cycle, self.mel_chunks_queue_)

+    def get_face_list_cycle(self):
+        return self._face_list_cycle
+
     def get_fps(self):
         return self._fps
@@ -476,12 +495,20 @@ class Human:
             return
         self._tts.push_txt(txt)

-    def push_audio_chunk(self, audio_chunk):
-        self._chunk_2_mal.push_chunk(audio_chunk)
+    def put_audio_frame(self, audio_chunk):
+        self._chunk_2_mal.put_audio_frame(audio_chunk)

-    def push_mel_chunks_queue(self, mel_chunk):
-        self._infer.push(mel_chunk)
+    # def push_audio_chunk(self, audio_chunk):
+    #     self._chunk_2_mal.push_chunk(audio_chunk)
+    #     self.audio_chunks_queue_.put(audio_chunk)
+
+    def push_mel_chunks(self, mel_chunks):
+        self._infer.push(mel_chunks)
+
+    def push_out_put(self, frame, type_):
+        self._infer.push_out_queue(frame, type_)
+
+    def push_mel_chunks_queue(self, audio_chunk):
+        self.audio_chunks_queue_.put(audio_chunk)

     def push_feat_queue(self, mel_chunks):
         print("push_feat_queue")
@@ -493,14 +520,38 @@ class Human:
     def push_render_image(self, image):
         self._test_image_queue.put(image)

+    def push_res_frame(self, res_frame, idx, audio_frames):
+        self._res_render_queue.put((res_frame, idx, audio_frames))
+
     def render(self):
         try:
             # img, aud = self._res_frame_queue.get(block=True, timeout=.3)
-            img = self._test_image_queue.get(block=True, timeout=.3)
+            # img = self._test_image_queue.get(block=True, timeout=.3)
+            res_frame, idx, audio_frames = self._res_render_queue.get(block=True, timeout=.3)
         except queue.Empty:
             # print('render queue.Empty:')
             return None
-        return img
+
+        if audio_frames[0][1] != 0 and audio_frames[1][1] != 0:
+            combine_frame = self._frame_list_cycle[idx]
+        else:
+            bbox = self._coord_list_cycle[idx]
+            combine_frame = copy.deepcopy(self._frame_list_cycle[idx])
+            y1, y2, x1, x2 = bbox
+            try:
+                res_frame = cv2.resize(res_frame.astype(np.uint8), (x2 - x1, y2 - y1))
+            except:
+                return None
+            # combine_frame = get_image(ori_frame, res_frame, bbox)
+            # t = time.perf_counter()
+            combine_frame[y1:y2, x1:x2] = res_frame
+
+        image = combine_frame
+        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+        return image
+        # print('blending time:', time.perf_counter()-t)

     # def pull_audio_chunk(self):
     #     try:
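A note on the reworked render() path above: instead of returning a prepared test image, it now pulls (res_frame, idx, audio_frames) from the result queue and, for non-silent frames, pastes the wav2lip output back into the full avatar frame at its detected bbox. A minimal standalone sketch of that compositing step (the helper name composite_frame is illustrative, not part of the commit):

import copy

import cv2
import numpy as np

def composite_frame(full_frame, res_frame, bbox):
    # Resize the predicted face crop to the detected bbox, paste it into a
    # copy of the original frame, then convert BGR (OpenCV) to RGB for display.
    y1, y2, x1, x2 = bbox
    combined = copy.deepcopy(full_frame)
    patch = cv2.resize(res_frame.astype(np.uint8), (x2 - x1, y2 - y1))
    combined[y1:y2, x1:x2] = patch
    return cv2.cvtColor(combined, cv2.COLOR_BGR2RGB)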


@@ -11,8 +11,8 @@ import torch
 from tqdm import tqdm

 import face_detection
-import utils
 from models import Wav2Lip
+from utils import mirror_index

 logger = logging.getLogger(__name__)
@@ -147,7 +147,8 @@ def datagen_signal(frame, mel, face_det_results):
 class Infer:
     def __init__(self, human):
         self._human = human
-        self._queue = Queue()
+        self._feat_queue = Queue()
+        self._audio_out_queue = Queue()

         self._exit_event = Event()
         self._run_thread = Thread(target=self.__on_run)
@@ -155,17 +156,17 @@ class Infer:
         self._run_thread.start()

     def __on_run(self):
-        face_images_path = r'./face/'
-        face_images_path = utils.read_files_path(face_images_path)
-        face_list_cycle = read_images(face_images_path)
-        face_images_length = len(face_list_cycle)
-        logging.info(f'face images length: {face_images_length}')
-        print(f'face images length: {face_images_length}')
         model = load_model(r'.\checkpoints\wav2lip.pth')
         print("Model loaded")
+        face_list_cycle = self._human.get_face_list_cycle()
+
+        # self.__do_run1(face_list_cycle, model)
+        self.__do_run2(face_list_cycle, model)

         # frame_h, frame_w = face_list_cycle[0].shape[:-1]

+    def __do_run1(self, face_list_cycle, model):
         face_det_results = face_detect(face_list_cycle)
         j = 0
@@ -173,7 +174,7 @@ class Infer:
         count = 0
         while self._exit_event.is_set():
             try:
-                m = self._queue.get(block=True, timeout=1)
+                m = self._feat_queue.get(block=True, timeout=1)
             except queue.Empty:
                 continue
@@ -202,5 +203,77 @@ class Infer:
             #     out.write(f)
         # print('infer count:', count)

-    def push(self, chunk):
-        self._queue.put(chunk)
+    def __do_run2(self, face_list_cycle, model):
+        length = len(face_list_cycle)
+        index = 0
+        count = 0
+        count_time = 0
+        print('start inference')
+        while True:
+            if self._exit_event.is_set():
+                start_time = time.perf_counter()
+                try:
+                    mel_batch = self._feat_queue.get(block=True, timeout=1)
+                except queue.Empty:
+                    continue
+
+                is_all_silence = True
+                audio_frames = []
+                for _ in range(self._human.get_batch_size() * 2):
+                    frame, type_ = self._audio_out_queue.get()
+                    audio_frames.append((frame, type_))
+                    if type_ == 0:
+                        is_all_silence = False
+
+                if is_all_silence:
+                    for i in range(self._human.get_batch_size()):
+                        # res_frame_queue.put((None, mirror_index(length, index), audio_frames[i * 2:i * 2 + 2]))
+                        self._human.push_res_frame(None, mirror_index(length, index), audio_frames[i * 2:i * 2 + 2])
+                        index = index + 1
+                else:
+                    print('infer=======')
+                    t = time.perf_counter()
+                    img_batch = []
+                    for i in range(self._human.get_batch_size()):
+                        idx = mirror_index(length, index + i)
+                        face = face_list_cycle[idx]
+                        img_batch.append(face)
+
+                    img_batch, mel_batch = np.asarray(img_batch), np.asarray(mel_batch)
+                    img_masked = img_batch.copy()
+                    img_masked[:, face.shape[0] // 2:] = 0
+
+                    img_batch = np.concatenate((img_masked, img_batch), axis=3) / 255.
+                    mel_batch = np.reshape(mel_batch, [len(mel_batch), mel_batch.shape[1], mel_batch.shape[2], 1])
+
+                    img_batch = torch.FloatTensor(np.transpose(img_batch, (0, 3, 1, 2))).to(device)
+                    mel_batch = torch.FloatTensor(np.transpose(mel_batch, (0, 3, 1, 2))).to(device)
+
+                    with torch.no_grad():
+                        pred = model(mel_batch, img_batch)
+                    pred = pred.cpu().numpy().transpose(0, 2, 3, 1) * 255.
+
+                    count_time += (time.perf_counter() - t)
+                    count += self._human.batch_size()
+                    # _totalframe += 1
+                    if count >= 100:
+                        print(f"------actual avg infer fps:{count / count_time:.4f}")
+                        count = 0
+                        count_time = 0
+
+                    for i, res_frame in enumerate(pred):
+                        # self.__pushmedia(res_frame, loop, audio_track, video_track)
+                        # res_frame_queue.put(
+                        #     (res_frame, __mirror_index(length, index), audio_frames[i * 2:i * 2 + 2]))
+                        self._human.push_res_frame(res_frame, mirror_index(length, index),
+                                                   audio_frames[i * 2:i * 2 + 2])
+                        index = index + 1
+                    # print('total batch time:', time.perf_counter()-start_time)
+            else:
+                time.sleep(1)
+        print('musereal inference processor stop')
+
+    def push(self, mel_chunks):
+        self._feat_queue.put(mel_chunks)
+
+    def push_out_queue(self, frame, type_):
+        self._audio_out_queue.put((frame, type_))
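For context on __do_run2 above: each mel batch is paired with batch_size * 2 audio frames (two 20 ms audio frames per rendered video frame), and the type_ flag decides whether the wav2lip model runs at all. A rough free-standing sketch of that pairing, assuming type_ == 0 marks real TTS audio as in Chunk2Mal.get_audio_frame (the function name is illustrative):

def collect_audio_frames(audio_out_queue, batch_size):
    # Pull two audio frames per video frame; if none of them carry real
    # speech (type_ == 0), the caller can skip the model and reuse idle frames.
    audio_frames = []
    is_all_silence = True
    for _ in range(batch_size * 2):
        frame, type_ = audio_out_queue.get()
        audio_frames.append((frame, type_))
        if type_ == 0:
            is_all_silence = False
    return audio_frames, is_all_silence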


@@ -1,9 +1,10 @@
 #encoding = utf8
+import ctypes
 import logging
 import queue
 import time
 from queue import Queue
+import multiprocessing as mp
 from threading import Thread, Event

 import numpy as np
@@ -13,90 +14,69 @@ from audio_render import AudioRender

 class Chunk2Mal:
     def __init__(self, human):
-        self._audio_chunk_queue = Queue()
+        # self._audio_chunk_queue = Queue()
         self._human = human
         self._thread = None

-        self._chunks = []
-        self._audio_chunks = []
+        self.frames = []
+        self.queue = Queue()
+        # self.output_queue = mp.Queue()
+        # self.feat_queue = mp.Queue(2)

         # 320 samples per chunk (20ms * 16000 / 1000)audio_chunk
-        self._chunk_len = self._human.get_audio_sample_rate() // self._human.get_fps()
+        self.chunk = self._human.get_audio_sample_rate() // self._human.get_fps()

         self._exit_event = Event()
         self._thread = Thread(target=self._on_run)
         self._exit_event.set()
         self._thread.start()
-        self._audio_render = AudioRender()
+        # self._audio_render = AudioRender()
         self._stream_len = 0
         logging.info('chunk2mal start')

-    def _concatenate(self):
-        logging.info('np.concatenate')
-        if len(self._chunks) < 3:
-            logging.info(f'np.concatenate: {len(self._chunks)}')
-            return
-
-        inputs = np.concatenate(self._chunks)  # [5 * chunk]
-        self._chunks = []
-        mel = audio.melspectrogram(inputs)
-        if np.isnan(mel.reshape(-1)).sum() > 0:
-            raise ValueError(
-                'Mel contains nan! Using a TTS voice? Add a small epsilon noise to the wav file and try again')
-
-        mel_step_size = 16
-        # print('fps:', self._human.get_fps())
-        mel_idx_multiplier = 80. / self._human.get_fps()
-        # print('mel_idx_multiplier:', mel_idx_multiplier)
-        count = 0
-        i = 0
-        while 1:
-            count = count + 1
-            start_idx = int(i * mel_idx_multiplier)
-            print('i', i, 'start_idx', start_idx, 'mel len:', len(mel[0]))
-            if start_idx + mel_step_size > len(mel[0]):
-                self._human.push_mel_chunks_queue(mel[:, len(mel[0]) - mel_step_size:])
-                break
-            self._human.push_mel_chunks_queue(mel[:, start_idx: start_idx + mel_step_size])
-            i += 1
-
-        # wav = np.concatenate(self._audio_chunks)  # [5 * chunk]self._audio_chunks
-        # print('_concatenate', len(wav))
-        # audio.save_chunks([wav], 16000, "./temp/audio/")
-        # wav *= 32767 / max(0.01, np.max(np.abs(wav)))
-        # wav = wav.astype(np.int16)
-        # self._audio_render.write(wav, len(wav))
-        self._audio_chunks = []
-        print('mel_chunks count:', count)

     def _on_run(self):
         logging.info('chunk2mal run')
         while self._exit_event.is_set():
-            if self._audio_chunk_queue.empty():
-                if len(self._chunks) > 0:
-                    self._concatenate()
-                else:
-                    time.sleep(0.5)
-                continue
-            try:
-                chunk = self._audio_chunk_queue.get(block=True, timeout=1)
-                self._chunks.append(chunk)
-                self._stream_len = self._stream_len + len(chunk)
-                print('Chunk2Mal _stream_len:', self._stream_len)
-                self._audio_chunks.append(chunk.copy())
-                # self._human.push_audio_frames(chunk, 0)
-                if len(self._chunks) < 10:  # 200ms
-                    continue
-            except queue.Empty:
-                # print('Chunk2Mal queue.Empty')
-                continue
-            print('len(self._chunks):', len(self._chunks))
-            self._concatenate()
+            self._run_step()
+            time.sleep(0.01)

         logging.info('chunk2mal exit')

+    def _run_step(self):
+        for _ in range(self._human.get_batch_size() * 2):
+            frame, _type = self.get_audio_frame()
+            self.frames.append(frame)
+            # put to output
+            self._human.push_out_put(frame, _type)
+            # self.output_queue.put((frame, _type))
+
+        # context not enough, do not run network.
+        if len(self.frames) <= self._human.get_stride_left_size() + self._human.get_stride_right_size():
+            return
+
+        inputs = np.concatenate(self.frames)  # [N * chunk]
+        mel = audio.melspectrogram(inputs)
+        # print(mel.shape[0], mel.shape, len(mel[0]), len(self.frames))
+        # cut off stride
+        left = max(0, self._human.get_stride_left_size() * 80 / 50)
+        right = min(len(mel[0]), len(mel[0]) - self._human.get_stride_right_size() * 80 / 50)
+        mel_idx_multiplier = 80. * 2 / self._human.get_fps()
+        mel_step_size = 16
+        i = 0
+        mel_chunks = []
+        while i < (len(self.frames) - self._human.get_stride_left_size() - self._human.get_stride_right_size()) / 2:
+            start_idx = int(left + i * mel_idx_multiplier)
+            # print(start_idx)
+            if start_idx + mel_step_size > len(mel[0]):
+                mel_chunks.append(mel[:, len(mel[0]) - mel_step_size:])
+            else:
+                mel_chunks.append(mel[:, start_idx: start_idx + mel_step_size])
+            i += 1
+        self._human.push_mel_chunks(mel_chunks)
+        # self.feat_queue.put(mel_chunks)
+
+        # discard the old part to save memory
+        self.frames = self.frames[-(self._human.get_stride_left_size() + self._human.get_stride_right_size()):]

     def stop(self):
         if self._exit_event is None:
             return
@@ -106,15 +86,34 @@ class Chunk2Mal:
         self._thread.join()
         logging.info('chunk2mal stop')

-    def push_chunk(self, chunk):
-        self._audio_chunk_queue.put(chunk)
+    def pause_talk(self):
+        self.queue.queue.clear()

-    def pull_chunk(self):
-        try:
-            chunk = self._audio_chunk_queue.get(block=True, timeout=1)
-            type = 1
-        except queue.Empty:
-            chunk = np.zeros(self._chunk_len, dtype=np.float32)
-            type = 0
-        return chunk, type
+    def put_audio_frame(self, audio_chunk):  # 16khz 20ms pcm
+        self.queue.put(audio_chunk)
+
+    def get_audio_frame(self):
+        try:
+            frame = self.queue.get(block=True, timeout=0.01)
+            type = 0
+            # print(f'[INFO] get frame {frame.shape}')
+        except queue.Empty:
+            frame = np.zeros(self.chunk, dtype=np.float32)
+            type = 1
+        return frame, type
+
+    def get_audio_out(self):  # get origin audio pcm to nerf
+        return self.output_queue.get()
+
+    def warm_up(self):
+        for _ in range(self._human.get_stride_left_size() + self._human.get_stride_right_size()):
+            audio_frame, _type = self.get_audio_frame()
+            self.frames.append(audio_frame)
+            self.output_queue.put((audio_frame, type))
+        for _ in range(self._human.get_stride_right_size()):
+            self.output_queue.get()
+
+    def get_next_feat(self, block, timeout):
+        return self.feat_queue.get(block, timeout)
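The framing arithmetic behind Chunk2Mal, for reference: with a 16 kHz sample rate and the new 50 fps setting, each audio frame is 320 samples (20 ms), each _run_step pulls batch_size * 2 of them, and 16-frame mel windows are sliced with a stride of 80 * 2 / fps mel frames per video frame. A small worked sketch, assuming the wav2lip mel settings (~80 mel frames per second) used elsewhere in this repo:

sample_rate = 16000
fps = 50                                   # Human._fps after this commit
chunk = sample_rate // fps                 # 320 samples = 20 ms per audio frame
frames_per_step = 16 * 2                   # batch_size * 2 frames pulled per _run_step
audio_per_step_ms = frames_per_step * 20   # 640 ms of audio appended per step
mel_step_size = 16                         # mel frames fed to wav2lip per window
mel_idx_multiplier = 80.0 * 2 / fps        # 3.2 mel frames advanced per video frame
print(chunk, audio_per_step_ms, mel_idx_multiplier)  # 320 640 3.2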


@@ -27,6 +27,7 @@ class EdgeTTS(TTSBase):
         stream = self.__create_bytes_stream(self._io_stream)
         stream_len = stream.shape[0]
         index = 0
+        while stream_len >= self._chunk:
             self._human.push_audio_chunk(stream[index:index + self._chunk])
             stream_len -= self._chunk
@@ -56,4 +57,3 @@ class EdgeTTS(TTSBase):
             if chuck['type'] == 'audio':
                 self._io_stream.write(chuck['data'])


@@ -5,9 +5,7 @@ import time

 import edge_tts
 import numpy as np
-import pyaudio
-import soundfile
-import sounddevice
+import soundfile as sf
 import resampy
 import queue
 from io import BytesIO
@@ -16,7 +14,6 @@ from threading import Thread, Event

 import audio
-from audio_render import AudioRender

 logger = logging.getLogger(__name__)
@@ -26,17 +23,13 @@ class TTSBase:
         self._human = human
         self._thread = None
         self._queue = Queue()
-        self._io_stream = BytesIO()
-        self._chunk_len = self._human.get_audio_sample_rate() // self._human.get_fps()
+        self.input_stream = BytesIO()
+        self.chunk = self._human.get_audio_sample_rate() // self._human.get_fps()

         self._exit_event = Event()
         self._thread = Thread(target=self._on_run)
         self._exit_event.set()
         self._thread.start()
-        # self._pcm_player = pyaudio.PyAudio()
-        # self._pcm_stream = self._pcm_player.open(format=pyaudio.paInt16,
-        #                                          channels=1, rate=24000, output=True)
-        # self._audio_render = AudioRender()
         logging.info('tts start')

     def _on_run(self):
@@ -52,94 +45,54 @@ class TTSBase:
     def _request(self, txt):
         voice = 'zh-CN-XiaoyiNeural'
         t = time.time()
-        asyncio.new_event_loop().run_until_complete(self.__on_request(voice, txt))
-        logger.info(f'edge tts time:{time.time() - t : 0.4f}s')
-        self._io_stream.seek(0)
-        stream = self.__create_bytes_stream(self._io_stream)
-        audio.save_chunks([stream], 16000, './temp/audio/')
-        # wav = audio.split_audio(stream, 16000, 0.04)
-        # audio.save_chunks(wav, 16000, './temp/audio/')
-        # audio.save_wav(stream, "./temp/audio/test1.wav", 16000)
-        stream_len = stream.shape[0]
-        # wav = stream #np.concatenate(stream) # [5 * chunk]self._audio_chunks
-        # print('_concatenate', len(wav))
-        # self._audio_chunks = []
-        print("stream_len:", stream_len, " _chunk_len:", self._chunk_len)
-        index = 0
-        segment = 0
-        while stream_len >= self._chunk_len:
-            audio_chunk = stream[index:index + self._chunk_len]
-            self._human.push_audio_chunk(audio_chunk)
-            stream_len -= self._chunk_len
-            index += self._chunk_len
-            segment = segment + 1
-        if stream_len > 0:
-            audio_chunk = stream[index:index + stream_len]
-            self._human.push_audio_chunk(audio_chunk)
-            segment = segment + 1
-        print("segment:", segment)
-        self._io_stream.seek(0)
-        self._io_stream.truncate()
-
-    def __create_bytes_stream(self, io_stream):
-        stream, sample_rate = soundfile.read(io_stream)
-        logger.info(f'tts audio stream {sample_rate} : {stream.shape}')
-        stream = stream.astype(np.float32)
-
-        if stream.ndim > 1:
-            logger.warning(f'tts audio has {stream.shape[1]} channels, only use the first')
-            stream = stream[:, 1]
-
-        if sample_rate != self._human.get_audio_sample_rate() and stream.shape[0] > 0:
-            logger.warning(f'tts audio sample rate is {sample_rate}, resample to {self._human.get_audio_sample_rate() }')
-            stream = resampy.resample(x=stream, sr_orig=sample_rate, sr_new=self._human.get_audio_sample_rate())
-
-        return stream
-
-    async def __on_request(self, voice, txt):
-        communicate = edge_tts.Communicate(txt, voice)
+        asyncio.new_event_loop().run_until_complete(self.__main(voice, txt))
+        print(f'-------edge tts time:{time.time() - t:.4f}s')
+        self.input_stream.seek(0)
+        stream = self.__create_bytes_stream(self.input_stream)
+        streamlen = stream.shape[0]
+        idx = 0
+        print('-------tts start push chunk')
+        while streamlen >= self.chunk:
+            self._human.put_audio_frame(stream[idx:idx + self.chunk])
+            streamlen -= self.chunk
+            idx += self.chunk
+        # if streamlen>0:  #skip last frame(not 20ms)
+        #     self.queue.put(stream[idx:])
+        self.input_stream.seek(0)
+        self.input_stream.truncate()
+        print('-------tts finish push chunk')
+
+    def __create_bytes_stream(self, byte_stream):
+        # byte_stream=BytesIO(buffer)
+        stream, sample_rate = sf.read(byte_stream)  # [T*sample_rate,] float64
+        print(f'[INFO]tts audio stream {sample_rate}: {stream.shape}')
+        stream = stream.astype(np.float32)
+
+        if stream.ndim > 1:
+            print(f'[WARN] audio has {stream.shape[1]} channels, only use the first.')
+            stream = stream[:, 0]
+
+        if sample_rate != self._human.get_audio_sample_rate() and stream.shape[0] > 0:
+            print(f'[WARN] audio sample rate is {sample_rate}, resampling into {self._human.get_audio_sample_rate()}.')
+            stream = resampy.resample(x=stream, sr_orig=sample_rate, sr_new=self._human.get_audio_sample_rate())
+
+        return stream
+
+    async def __main(self, voicename: str, text: str):
+        communicate = edge_tts.Communicate(text, voicename)
+
+        #with open(OUTPUT_FILE, "wb") as file:
         first = True
-        total_data = b''
-        CHUNK_SIZE = self._chunk_len
         async for chunk in communicate.stream():
-            if chunk["type"] == "audio" and chunk["data"]:
-                data = chunk['data']
-                self._io_stream.write(data)
+            if first:
+                first = False
+            if chunk["type"] == "audio":
+                #self.push_audio(chunk["data"])
+                self.input_stream.write(chunk["data"])
+                #file.write(chunk["data"])
             elif chunk["type"] == "WordBoundary":
                 pass
-            '''
-            total_data += chunk["data"]
-            if len(total_data) >= CHUNK_SIZE:
-                # print(f"Time elapsed: {time.time() - start_time:.2f} seconds")  # Print time
-                audio_data = AudioSegment.from_mp3(BytesIO(total_data[:CHUNK_SIZE])) #.raw_data
-                audio_data = audio_data.set_frame_rate(self._human.get_audio_sample_rate())
-                # self._human.push_audio_chunk(audio_data)
-                self._pcm_stream.write(audio_data.raw_data)
-                # play_audio(total_data[:CHUNK_SIZE], stream)  # Play first CHUNK_SIZE bytes
-                total_data = total_data[CHUNK_SIZE:]  # Remove played data
-            '''
-        # if first:
-        #     first = False
-        # if chuck['type'] == 'audio':
-        #     # self._io_stream.write(chuck['data'])
-        #     self._io_stream.write(AudioSegment.from_mp3(BytesIO(total_data[:CHUNK_SIZE])).raw_data)
-        # if len(total_data) > 0:
-        #     self._pcm_stream.write(AudioSegment.from_mp3(BytesIO(total_data)).raw_data)
-        #     audio_data = AudioSegment.from_mp3(BytesIO(total_data))  # .raw_data
-        #     audio_data = audio_data.set_frame_rate(self._human.get_audio_sample_rate())
-        #     self._pcm_stream.write(audio_data.raw_data)
-        #     self._human.push_audio_chunk(audio_data)
-        #     self._io_stream.write(AudioSegment.from_mp3(BytesIO(total_data)).raw_data)

     def stop(self):
         self._pcm_stream.stop_stream()
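The rewritten __create_bytes_stream above decodes whatever edge-tts wrote into the BytesIO with soundfile, keeps only the first channel, and resamples to the Human sample rate with resampy. A self-contained sketch of the same steps (the function name decode_tts_stream is illustrative):

import numpy as np
import resampy
import soundfile as sf

def decode_tts_stream(byte_stream, target_sr=16000):
    # Decode to float PCM, downmix to mono, then resample to target_sr.
    stream, sample_rate = sf.read(byte_stream)   # shape [T] or [T, C]
    stream = stream.astype(np.float32)
    if stream.ndim > 1:
        stream = stream[:, 0]
    if sample_rate != target_sr and stream.shape[0] > 0:
        stream = resampy.resample(x=stream, sr_orig=sample_rate, sr_new=target_sr)
    return stream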

ui.py

@@ -44,7 +44,7 @@ class App(customtkinter.CTk):
         # self.logo_label.grid(row=0, column=0, padx=20, pady=(20, 10))

         self.entry = customtkinter.CTkEntry(self, placeholder_text="输入内容")
-        self.entry.insert(0, "基本信息,北京九零科技有限公司成立于2015年位于北京市是一家以从事科技推广和应用服务业为主的企业。企业注册资本500万人民币")
+        self.entry.insert(0, "你好,我是中国湘西人")
         self.entry.grid(row=2, column=0, columnspan=2, padx=(20, 0), pady=(20, 20), sticky="nsew")

         self.main_button_1 = customtkinter.CTkButton(master=self, fg_color="transparent", border_width=2,
@@ -83,7 +83,7 @@ class App(customtkinter.CTk):
             self.after(100, self._render)
             return

-        self.play_audio()
+        # self.play_audio()
         iheight, iwidth = image.shape[0], image.shape[1]
         width = self.winfo_width()
         height = self.winfo_height()


@@ -10,3 +10,13 @@ def read_files_path(path):
             file_paths.append(path + file)
     return file_paths
+
+
+def mirror_index(size, index):
+    # size = len(self.coord_list_cycle)
+    turn = index // size
+    res = index % size
+    if turn % 2 == 0:
+        return res
+    else:
+        return size - res - 1
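mirror_index walks the avatar frame list back and forth (ping-pong), so playback never jumps from the last frame straight back to the first. For example, over a 4-frame cycle:

print([mirror_index(4, i) for i in range(10)])
# -> [0, 1, 2, 3, 3, 2, 1, 0, 0, 1]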