add audio render

parent f848529859
commit 472a17f896

Human.py (22 lines changed)
@@ -246,12 +246,12 @@ def datagen(frames, mels):

def datagen_signal(frame, mel, face_det_results):
-    img_batch, mel_batch, frame_batch, coords_batch = [], [], [], []
+    img_batch, mel_batch, frame_batch, coord_batch = [], [], [], []

    # for i, m in enumerate(mels):
    idx = 0
    frame_to_save = frame.copy()
-    face, coords = face_det_results[idx].copy()
+    face, coord = face_det_results[idx].copy()

    face = cv2.resize(face, (img_size, img_size))
    m = mel

@@ -259,7 +259,7 @@ def datagen_signal(frame, mel, face_det_results):
    img_batch.append(face)
    mel_batch.append(m)
    frame_batch.append(frame_to_save)
-    coords_batch.append(coords)
+    coord_batch.append(coord)

    if len(img_batch) >= wav2lip_batch_size:
        img_batch, mel_batch = np.asarray(img_batch), np.asarray(mel_batch)

@@ -269,7 +269,7 @@ def datagen_signal(frame, mel, face_det_results):
        img_batch = np.concatenate((img_masked, img_batch), axis=3) / 255.
        mel_batch = np.reshape(mel_batch, [len(mel_batch), mel_batch.shape[1], mel_batch.shape[2], 1])

-        return img_batch, mel_batch, frame_batch, coords_batch
+        return img_batch, mel_batch, frame_batch, coord_batch

    if len(img_batch) > 0:
        img_batch, mel_batch = np.asarray(img_batch), np.asarray(mel_batch)

@@ -279,7 +279,7 @@ def datagen_signal(frame, mel, face_det_results):
        img_batch = np.concatenate((img_masked, img_batch), axis=3) / 255.
        mel_batch = np.reshape(mel_batch, [len(mel_batch), mel_batch.shape[1], mel_batch.shape[2], 1])

-        return img_batch, mel_batch, frame_batch, coords_batch
+        return img_batch, mel_batch, frame_batch, coord_batch


# Load audio data from a byte stream
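Note: the `img_masked` referenced in these hunks is the standard Wav2Lip input convention: the lower half of each face crop is zeroed so the model must synthesize the mouth region, and the masked copy is stacked with the reference crop along the channel axis, giving a 6-channel input. A minimal sketch of the shape logic, assuming the usual `img_size = 96` crop (the masking line itself is outside this hunk):

```python
import numpy as np

img_size = 96                                      # Wav2Lip face-crop resolution (assumption)
img_batch = np.zeros((16, img_size, img_size, 3))  # dummy NHWC batch

img_masked = img_batch.copy()
img_masked[:, img_size // 2:] = 0                  # zero the lower half (mouth region)

model_input = np.concatenate((img_masked, img_batch), axis=3) / 255.
print(model_input.shape)                           # (16, 96, 96, 6)
```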
@@ -294,7 +294,7 @@ def load_audio_from_bytes(byte_data):

class Human:
    def __init__(self):
-        self._fps = 25  # 20 ms per frame
+        self._fps = 25  # 40 ms per frame
        self._batch_size = 16
        self._sample_rate = 16000
        self._stride_left_size = 10
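The corrected comment now matches the arithmetic used throughout the pipeline: at 25 fps one video frame spans 1000 / 25 = 40 ms, which at the 16 kHz sample rate is 640 audio samples, the same `sample_rate // fps` value Chunk2Mal computes as its `_chunk_len`. A quick check:

```python
fps = 25
sample_rate = 16000

print(1000 / fps)          # 40.0 ms per video frame
print(sample_rate // fps)  # 640 audio samples per frame
```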
@@ -340,13 +340,14 @@ class Human:
        # p.terminate()

    def test(self):
-        wav = audio.load_wav(r'./audio/test.wav', 16000)
+        wav = audio.load_wav(r'./audio/audio.wav', 16000)
        # with open(r'./audio/test.wav', 'rb') as f:
        #     byte_data = f.read()
        #
        # byte_data = byte_data[16:]
        # inputs = np.concatenate(byte_data)  # [N * chunk]
        # wav = load_audio_from_bytes(inputs)
+        print('wav length:', len(wav))
        mel = audio.melspectrogram(wav)
        if np.isnan(mel.reshape(-1)).sum() > 0:
            raise ValueError(
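The NaN guard exists because silence produces all-zero analysis windows, and the log-mel computation then emits NaNs (log of zero); the error message's suggested workaround is to dither the waveform with a tiny amount of noise. A minimal sketch of that fix, assuming `wav` is a float NumPy array:

```python
import numpy as np

def dither(wav: np.ndarray, eps: float = 1e-6) -> np.ndarray:
    # Tiny Gaussian noise ensures no analysis window is exactly zero,
    # so the log-mel computation never sees log(0).
    return wav + eps * np.random.randn(len(wav)).astype(wav.dtype)
```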
@@ -405,9 +406,9 @@ class Human:
            p = cv2.resize(p.astype(np.uint8), (x2 - x1, y2 - y1))

            f[y1:y2, x1:x2] = p
-            # name = "%04d" % j
-            # cv2.imwrite(f'temp/images/{j}.jpg', p)
-            # j = j + 1
+            name = "%04d" % j
+            cv2.imwrite(f'temp/images/{j}.jpg', p)
+            j = j + 1
            p = cv2.cvtColor(f, cv2.COLOR_BGR2RGB)
            self._test_image_queue.put(p)
            # out.write(f)
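One observation on the re-enabled debug dump: `name` holds the zero-padded index but the f-string writes the raw `j`, so files come out as `0.jpg, 1.jpg, ..., 10.jpg` and sort out of frame order. If padded names are the intent, a one-line fix (hypothetical, not part of this commit) would be:

```python
name = "%04d" % j
cv2.imwrite(f'temp/images/{name}.jpg', p)  # use the padded name, not the raw index
j = j + 1
```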
@@ -460,7 +461,6 @@ class Human:
        self._feat_queue.put(mel_chunks)

    def push_audio_frames(self, chunk, type_):
-        print("push_audio_frames")
        self._output_queue.put((chunk, type_))

    def push_render_image(self, image):
BIN audio_render/AudioRender.dll (new file, binary not shown)
BIN audio_render/AudioRender.lib (new file, binary not shown)
BIN audio_render/AudioRender.pdb (new file, binary not shown)

audio_render/__init__.py (new file, 3 lines)
@@ -0,0 +1,3 @@
+#encoding = utf8
+
+from .audio_render import AudioRender
audio_render/audio_render.py (new file, 35 lines)

@@ -0,0 +1,35 @@
+#encoding = utf8
+
+from ctypes import *
+import os
+
+current = os.path.dirname(__file__)
+dynamic_path = os.path.join(current, 'AudioRender.dll')
+
+
+def audio_render_log_callback(level, log, size):
+    print(f'level={level}, log={log}, len={size}')
+
+
+class AudioRender:
+    def __init__(self):
+        self.__audio_render_obj = WinDLL(dynamic_path)
+        print(self.__audio_render_obj)
+        if self.__audio_render_obj is not None:
+            CALLBACK_TYPE = CFUNCTYPE(None, c_int, c_ubyte, c_uint)
+            c_callback = CALLBACK_TYPE(audio_render_log_callback)
+            self.__init = self.__audio_render_obj.Initialize(c_callback)
+            print('AudioRender init', self.__init)
+
+    def __del__(self):
+        print('AudioRender __del__')
+        if self.__audio_render_obj is None:
+            return
+        if self.__init:
+            self.__audio_render_obj.Uninitialize()
+
+    def write(self, data, size):
+        if not self.__init:
+            return False
+
+        self.__audio_render_obj.argtypes = (POINTER(c_ubyte), c_uint)
+        return self.__audio_render_obj.Write(data.ctypes.data_as(POINTER(c_ubyte)), size)
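Two ctypes details in this new wrapper are worth flagging. First, `self.__audio_render_obj.argtypes = ...` sets an attribute on the WinDLL handle rather than on the exported function, so the prototype is never applied; the conventional form is `lib.Write.argtypes = ...`. Second, `c_callback` is a local, and a ctypes callback must stay referenced for as long as the native side may invoke it, so it should be kept as an attribute. A sketch of the usual pattern, assuming the DLL's exports match what the class implies and that the log argument is a C string (both assumptions; the DLL's real header is not shown):

```python
from ctypes import CFUNCTYPE, POINTER, WinDLL, c_bool, c_char_p, c_int, c_ubyte, c_uint

LOG_CALLBACK = CFUNCTYPE(None, c_int, c_char_p, c_uint)  # (level, message, length), assumed

def log_cb(level, log, size):
    print(f'level={level}, log={log}, len={size}')

lib = WinDLL('AudioRender.dll')

# Keep a reference to the wrapped callback; if it is garbage-collected
# while the DLL still holds the pointer, the next log call crashes.
keep_alive_cb = LOG_CALLBACK(log_cb)
lib.Initialize(keep_alive_cb)

# Prototypes go on the exported function, not on the WinDLL handle.
lib.Write.argtypes = (POINTER(c_ubyte), c_uint)
lib.Write.restype = c_bool
```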
edge_tts_test.py (185 lines changed)

@@ -1,102 +1,105 @@
#encoding = utf8

import edge_tts
import asyncio
import pyaudio
from pydub import AudioSegment
from io import BytesIO

# When running inside a Jupyter Notebook, lift the event-loop restriction
try:
    import nest_asyncio
    nest_asyncio.apply()
except ImportError:
    pass

def play_audio(data: bytes, stream: pyaudio.Stream) -> None:
    stream.write(AudioSegment.from_mp3(BytesIO(data)).raw_data)

CHUNK_SIZE = 20 * 1024

async def play_tts(text, voice):
    communicate = edge_tts.Communicate(text, voice)

    # Set up PyAudio
    audio = pyaudio.PyAudio()
    stream = audio.open(format=pyaudio.paInt16, channels=1, rate=16000, output=True)

    # async for chunk in communicate.stream():  # use the stream method
    #     if chunk['type'] == 'audio':  # make sure the chunk is a byte stream
    #         stream.write(chunk['data'])

    total_data = b''
    for chunk in communicate.stream_sync():
        if chunk["type"] == "audio" and chunk["data"]:
            total_data += chunk["data"]
            if len(total_data) >= CHUNK_SIZE:
                # print(f"Time elapsed: {time.time() - start_time:.2f} seconds")  # Print time
                stream.write(AudioSegment.from_mp3(BytesIO(total_data[:CHUNK_SIZE])).raw_data)
                # play_audio(total_data[:CHUNK_SIZE], stream)  # Play first CHUNK_SIZE bytes
                total_data = total_data[CHUNK_SIZE:]  # Remove played data
    # play_audio(total_data, stream)
    # Stop and close the audio stream
    stream.stop_stream()
    stream.close()
    audio.terminate()


async def save_to_file(text, voice, filename):
    communicate = edge_tts.Communicate(text, voice)

    with open(filename, "wb") as f:
        async for chunk in communicate.stream():
            if chunk['type'] == 'audio':
                f.write(chunk['data'])

if __name__ == "__main__":
    text = "Hello, this is a test of the Edge TTS service."
    voice = "en-US-JessaNeural"

    # Run the async function with asyncio.run()
    asyncio.run(play_tts(text, voice))
    # asyncio.run(save_to_file(text, voice, "output.wav"))

#
# import edge_tts
# import asyncio
# import pyaudio
# from io import BytesIO
# from pydub import AudioSegment
# import time
# from io import BytesIO
#
# TEXT = 'Hello World! How are you guys doing? I hope great, cause I am having fun and honestly it has been a blast'
# VOICE = "en-US-AndrewMultilingualNeural"
# CHUNK_SIZE = 20 * 1024  # Assuming around 1024 bytes per chunk (adjust based on format)
#
# def main() -> None:
#     start_time = time.time()
#     communicator = edge_tts.Communicate(TEXT, VOICE)
#
#     pyaudio_instance = pyaudio.PyAudio()
#     audio_stream = pyaudio_instance.open(format=pyaudio.paInt16, channels=1, rate=16000, output=True)
#
#     total_data = b''  # Store audio data instead of chunks
#
#     for chunk in communicator.stream_sync():
#         if chunk["type"] == "audio" and chunk["data"]:
#             total_data += chunk["data"]
#             if len(total_data) >= CHUNK_SIZE:
#                 print(f"Time elapsed: {time.time() - start_time:.2f} seconds")  # Print time
#                 play_audio(total_data[:CHUNK_SIZE], audio_stream)  # Play first CHUNK_SIZE bytes
#                 total_data = total_data[CHUNK_SIZE:]  # Remove played data
#
#     # Play remaining audio
#     play_audio(total_data, audio_stream)
#
#     audio_stream.stop_stream()
#     audio_stream.close()
#     pyaudio_instance.terminate()
# # When running inside a Jupyter Notebook, lift the event-loop restriction
# try:
#     import nest_asyncio
#     nest_asyncio.apply()
# except ImportError:
#     pass
#
# def play_audio(data: bytes, stream: pyaudio.Stream) -> None:
#     stream.write(AudioSegment.from_mp3(BytesIO(data)).raw_data)
#
# CHUNK_SIZE = 20 * 1024
# async def play_tts(text, voice):
#     communicate = edge_tts.Communicate(text, voice)
#
#     # Set up PyAudio
#     audio = pyaudio.PyAudio()
#     stream = audio.open(format=pyaudio.paInt16, channels=1, rate=24000, output=True)
#
#     # async for chunk in communicate.stream():  # use the stream method
#     #     if chunk['type'] == 'audio':  # make sure the chunk is a byte stream
#     #         stream.write(chunk['data'])
#
#     total_data = b''
#     for chunk in communicate.stream_sync():
#         if chunk["type"] == "audio" and chunk["data"]:
#             total_data += chunk["data"]
#             if len(total_data) >= CHUNK_SIZE:
#                 # print(f"Time elapsed: {time.time() - start_time:.2f} seconds")  # Print time
#                 stream.write(AudioSegment.from_mp3(BytesIO(total_data[:CHUNK_SIZE])).raw_data)
#                 # play_audio(total_data[:CHUNK_SIZE], stream)  # Play first CHUNK_SIZE bytes
#                 total_data = total_data[CHUNK_SIZE:]  # Remove played data
#     # play_audio(total_data, stream)
#     # Stop and close the audio stream
#     stream.stop_stream()
#     stream.close()
#     audio.terminate()
#
#
# async def save_to_file(text, voice, filename):
#     communicate = edge_tts.Communicate(text, voice)
#
#     with open(filename, "wb") as f:
#         async for chunk in communicate.stream():
#             if chunk['type'] == 'audio':
#                 f.write(chunk['data'])
#
# if __name__ == "__main__":
#     main()
#     text = "Hello, this is a test of the Edge TTS service."
#     voice = "en-US-JessaNeural"
#
#     # Run the async function with asyncio.run()
#     asyncio.run(play_tts(text, voice))
#     # asyncio.run(save_to_file(text, voice, "output.wav"))


import edge_tts
import pyaudio
from io import BytesIO
from pydub import AudioSegment
import time

TEXT = 'Hello World! How are you guys doing? I hope great, cause I am having fun and honestly it has been a blast'
VOICE = "en-US-AndrewMultilingualNeural"
CHUNK_SIZE = 20 * 1024  # Assuming around 1024 bytes per chunk (adjust based on format)


def main() -> None:
    start_time = time.time()
    communicator = edge_tts.Communicate(TEXT, VOICE)

    pyaudio_instance = pyaudio.PyAudio()
    audio_stream = pyaudio_instance.open(format=pyaudio.paInt16, channels=1, rate=24000, output=True)

    total_data = b''  # Store audio data instead of chunks

    for chunk in communicator.stream_sync():
        if chunk["type"] == "audio" and chunk["data"]:
            total_data += chunk["data"]
            if len(total_data) >= CHUNK_SIZE:
                print(f"Time elapsed: {time.time() - start_time:.2f} seconds")  # Print time
                play_audio(total_data[:CHUNK_SIZE], audio_stream)  # Play first CHUNK_SIZE bytes
                total_data = total_data[CHUNK_SIZE:]  # Remove played data

    # Play remaining audio
    play_audio(total_data, audio_stream)

    audio_stream.stop_stream()
    audio_stream.close()
    pyaudio_instance.terminate()


def play_audio(data: bytes, stream: pyaudio.Stream) -> None:
    stream.write(AudioSegment.from_mp3(BytesIO(data)).raw_data)


if __name__ == "__main__":
    main()
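A caveat that applies to both versions of this script: edge-tts delivers an MP3 stream, and slicing it at a fixed `CHUNK_SIZE` byte boundary does not respect MP3 frame boundaries, so `AudioSegment.from_mp3` is handed fragments that may begin or end mid-frame; decoding often still works because the decoder resynchronizes, but clicks or decode errors are possible. The decoded PCM for these voices is also 24 kHz mono, so the 16000 Hz output stream in `play_tts` above would play slowed and pitch-shifted (the older `main()` uses 24000). A minimal sketch of the simpler, safer shape, buffering the whole utterance and playing at the decoded rate:

```python
import edge_tts
import pyaudio
from io import BytesIO
from pydub import AudioSegment

def speak(text: str, voice: str = "en-US-JessaNeural") -> None:
    # Buffer the complete MP3, decode once, then play at the decoded rate.
    mp3 = b''.join(c["data"] for c in edge_tts.Communicate(text, voice).stream_sync()
                   if c["type"] == "audio" and c["data"])
    seg = AudioSegment.from_mp3(BytesIO(mp3))
    pa = pyaudio.PyAudio()
    out = pa.open(format=pa.get_format_from_width(seg.sample_width),
                  channels=seg.channels, rate=seg.frame_rate, output=True)
    out.write(seg.raw_data)
    out.stop_stream(); out.close(); pa.terminate()
```

This trades latency for robustness; the chunked loops above are the low-latency variant.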
infer.py (8 lines changed)

@@ -1,5 +1,6 @@
#encoding = utf8
+import queue
import time
from queue import Queue
from threading import Thread, Event
import logging

@@ -169,6 +170,7 @@ class Infer:

        j = 0

+        count = 0
        while self._exit_event.is_set():
            try:
                m = self._queue.get(block=True, timeout=1)

@@ -180,6 +182,8 @@ class Infer:
                img_batch = torch.FloatTensor(np.transpose(img_batch, (0, 3, 1, 2))).to(device)
                mel_batch = torch.FloatTensor(np.transpose(mel_batch, (0, 3, 1, 2))).to(device)

+                time.sleep(0.01)
+
                with torch.no_grad():
                    pred = model(mel_batch, img_batch)

@@ -189,12 +193,14 @@ class Infer:
                    p = cv2.resize(p.astype(np.uint8), (x2 - x1, y2 - y1))

                    f[y1:y2, x1:x2] = p
-                    name = "%04d" % j
+                    # name = "%04d" % j
                    cv2.imwrite(f'temp/images/{j}.jpg', p)
                    j = j + 1
+                    # count = count + 1
                    p = cv2.cvtColor(f, cv2.COLOR_BGR2RGB)
                    self._human.push_render_image(p)
                    # out.write(f)
+        # print('infer count:', count)

    def push(self, chunk):
        self._queue.put(chunk)
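The `np.transpose(..., (0, 3, 1, 2))` calls convert batches from the NHWC layout that NumPy/OpenCV stacking produces to the NCHW layout PyTorch convolutions expect. A quick shape check, assuming the usual Wav2Lip shapes (96 px crops, 6 input channels after masking, (80, 16) mel windows):

```python
import numpy as np

img_batch = np.zeros((16, 96, 96, 6), dtype=np.float32)  # NHWC
mel_batch = np.zeros((16, 80, 16, 1), dtype=np.float32)  # mel windows + channel dim

print(np.transpose(img_batch, (0, 3, 1, 2)).shape)  # (16, 6, 96, 96) -> NCHW
print(np.transpose(mel_batch, (0, 3, 1, 2)).shape)  # (16, 1, 80, 16)
```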
@@ -1,5 +1,5 @@
#encoding = utf8

import ctypes
import logging
import queue
import time

@@ -8,6 +8,7 @@ from threading import Thread, Event

import numpy as np
import audio
+from audio_render import AudioRender


class Chunk2Mal:

@@ -17,6 +18,7 @@ class Chunk2Mal:
        self._thread = None

        self._chunks = []
+        self._audio_chunks = []
        # 640 samples per chunk (40 ms * 16000 / 1000)
        self._chunk_len = self._human.get_audio_sample_rate() // self._human.get_fps()

@@ -24,63 +26,67 @@ class Chunk2Mal:
        self._thread = Thread(target=self._on_run)
        self._exit_event.set()
        self._thread.start()
+        self._audio_render = AudioRender()
        logging.info('chunk2mal start')

+    def _concatenate(self):
+        logging.info('np.concatenate')
+        inputs = np.concatenate(self._chunks)  # [5 * chunk]
+        self._chunks = []
+        mel = audio.melspectrogram(inputs)
+        if np.isnan(mel.reshape(-1)).sum() > 0:
+            raise ValueError(
+                'Mel contains nan! Using a TTS voice? Add a small epsilon noise to the wav file and try again')
+
+        mel_step_size = 16
+        # print('fps:', self._human.get_fps())
+        mel_idx_multiplier = 80. / self._human.get_fps()
+        # print('mel_idx_multiplier:', mel_idx_multiplier)
+        count = 0
+        i = 0
+        while 1:
+            count = count + 1
+            start_idx = int(i * mel_idx_multiplier)
+            print('i', i, 'start_idx', start_idx, 'mel len:', len(mel[0]))
+            if start_idx + mel_step_size > len(mel[0]):
+                self._human.push_mel_chunks_queue(mel[:, len(mel[0]) - mel_step_size:])
+                break
+            self._human.push_mel_chunks_queue(mel[:, start_idx: start_idx + mel_step_size])
+            i += 1
+
+        wav = np.concatenate(self._audio_chunks)  # [5 * chunk]
+        wav *= 32767 / max(0.01, np.max(np.abs(wav)))
+        wav = wav.astype(np.int16)
+        self._audio_render.write(wav, len(wav))
+        self._audio_chunks = []
+
+        print('mel_chunks count:', count)

    def _on_run(self):
        logging.info('chunk2mal run')
        while self._exit_event.is_set():
            if self._audio_chunk_queue.empty():
-                time.sleep(0.5)
+                if len(self._chunks) > 0:
+                    self._concatenate()
+                else:
+                    time.sleep(0.5)
                continue
            try:
                chunk = self._audio_chunk_queue.get(block=True, timeout=1)
                self._chunks.append(chunk)
-                self._human.push_audio_frames(chunk, 0)
-                if len(self._chunks) < 10:
+                self._audio_chunks.append(chunk.copy())
                # print(type(chunk))

+                # self._human.push_audio_frames(chunk, 0)
+                if len(self._chunks) < 102:  # 200ms
                    continue
            except queue.Empty:
                # print('Chunk2Mal queue.Empty')
                continue

+        print('len(self._chunks):', len(self._chunks))
+        self._concatenate()

-        logging.info('np.concatenate')
-        inputs = np.concatenate(self._chunks)  # [N * chunk]
-        mel = audio.melspectrogram(inputs)
-        if np.isnan(mel.reshape(-1)).sum() > 0:
-            raise ValueError(
-                'Mel contains nan! Using a TTS voice? Add a small epsilon noise to the wav file and try again')
-
-        mel_step_size = 16
-        print('fps:', self._human.get_fps())
-        mel_idx_multiplier = 80. / self._human.get_fps()
-        print('mel_idx_multiplier:', mel_idx_multiplier)
-
-        i = 0
-        while 1:
-            start_idx = int(i * mel_idx_multiplier)
-            if start_idx + mel_step_size > len(mel[0]):
-                self._human.push_mel_chunks_queue(mel[:, len(mel[0]) - mel_step_size:])
-                break
-            self._human.push_mel_chunks_queue(mel[:, start_idx: start_idx + mel_step_size])
-            i += 1
-
-        batch_size = 128
-
-        '''
-        while i < (len(self._chunks) - self._human.get_stride_left_size()
-                   - self._human.get_stride_right_size()) / 2:
-            start_idx = int(left + i * mel_idx_multiplier)
-            # print(start_idx)
-            if start_idx + mel_step_size > len(mel[0]):
-                mel_chunks.append(mel[:, len(mel[0]) - mel_step_size:])
-            else:
-                mel_chunks.append(mel[:, start_idx: start_idx + mel_step_size])
-            i += 1
-        self._human.push_feat_queue(mel_chunks)
-
-        # discard the old part to save memory
-        self._chunks = self._chunks[-(self._human.get_stride_left_size() + self._human.get_stride_right_size()):]
-        '''

        logging.info('chunk2mal exit')
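For reference on the arithmetic in the new `_concatenate`: with the hop length the standard Wav2Lip audio config uses (200 samples at 16 kHz, an assumption here since the audio module is not shown), the mel spectrogram runs at 80 frames per second, so `mel_idx_multiplier = 80 / fps` is the number of mel frames per video frame (3.2 at 25 fps) and each `mel_step_size = 16` window covers 0.2 s of context. The `wav *= 32767 / max(0.01, np.max(np.abs(wav)))` line peak-normalizes the float chunks into int16 range before handing them to the AudioRender DLL. A sketch of the windowing:

```python
import numpy as np

fps = 25
mel_step_size = 16               # mel frames per window (~0.2 s)
mel_idx_multiplier = 80.0 / fps  # 3.2 mel frames per video frame
mel = np.zeros((80, 80))         # dummy 1-second mel spectrogram

chunks, i = [], 0
while True:
    start = int(i * mel_idx_multiplier)
    if start + mel_step_size > mel.shape[1]:
        chunks.append(mel[:, -mel_step_size:])  # final window, padded from the end
        break
    chunks.append(mel[:, start:start + mel_step_size])
    i += 1
print(len(chunks), chunks[0].shape)  # ~one window per video frame, each (80, 16)
```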
@@ -57,33 +57,19 @@ class TTSBase:

        self._io_stream.seek(0)
        stream = self.__create_bytes_stream(self._io_stream)
        # audio.save_wav(stream, "./temp/audio/test1.wav", 16000)
        stream_len = stream.shape[0]

-        sr = 16000
-        soundfile.read('./temp/audio/audio.wav', stream, sr)
-        # audio_chunks = audio.split_audio(stream, sr, 4)
-
-        # display(audio.play_audio_chunk(audio_chunks[0], sr=sr))
-
-        # Save the split segments
-        # audio.save_chunks(stream[0:-1], sr, './temp/audio/')
-        # audio.save_chunks(audio_chunks, sr, './temp/audio/')
-        # try:
-        #     sounddevice.play(stream, samplerate=self._human.get_audio_sample_rate())
-        #     sounddevice.wait()  # wait for playback to finish
-        # except Exception as e:
-        #     logger.error(f"Audio playback error: {e}")  playrec
        print("stream_len:", stream_len, " _chunk_len:", self._chunk_len)
        index = 0
        segment = 0
        while stream_len >= self._chunk_len:
            audio_chunk = stream[index:index + self._chunk_len]
            # sounddevice.play(audio_chunk, samplerate=self._human.get_audio_sample_rate())
            # self._pcm_stream.write(audio_chunk)
            # self._pcm_stream.write(audio_chunk.tobytes())
            # self._human.push_audio_chunk(audio_chunk)
            # self._human.push_mel_chunks_queue(audio_chunk)
            self._human.push_audio_chunk(audio_chunk)
            stream_len -= self._chunk_len
            index += self._chunk_len
            segment = segment + 1
        print("segment:", segment)
        self._io_stream.seek(0)
        self._io_stream.truncate()
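The loop above feeds the pipeline in fixed `_chunk_len` slices (640 samples, one 40 ms video frame at 16 kHz / 25 fps); any tail shorter than a full chunk is dropped. A minimal sketch of that slicing, assuming a 1-D float array:

```python
import numpy as np

sample_rate, fps = 16000, 25
chunk_len = sample_rate // fps  # 640 samples = one 40 ms video frame

def iter_chunks(stream: np.ndarray):
    # Yield only full frames; the trailing partial chunk is discarded,
    # mirroring the while-loop above.
    for index in range(0, len(stream) - chunk_len + 1, chunk_len):
        yield stream[index:index + chunk_len]

print(sum(1 for _ in iter_chunks(np.zeros(16000))))  # 25 chunks per second of audio
```

The `soundfile.read('./temp/audio/audio.wav', stream, sr)` line this commit deletes also looked suspect: soundfile.read takes (file, frames, start, ...) positionally, so passing the stream array and sample rate there would not have done what the names suggest.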
ui.py (19 lines changed)

@@ -44,6 +44,7 @@ class App(customtkinter.CTk):
        # self.logo_label.grid(row=0, column=0, padx=20, pady=(20, 10))

        self.entry = customtkinter.CTkEntry(self, placeholder_text="输入内容")
+        self.entry.insert(0, "基本信息,北京九零科技有限公司,成立于2015年,位于北京市,是一家以从事科技推广和应用服务业为主的企业。企业注册资本500万人民币。")
        self.entry.grid(row=2, column=0, columnspan=2, padx=(20, 0), pady=(20, 20), sticky="nsew")

        self.main_button_1 = customtkinter.CTkButton(master=self, fg_color="transparent", border_width=2,

@@ -63,13 +64,13 @@ class App(customtkinter.CTk):
        self._human.on_destroy()

    def play_audio(self):
-        # return
-        if self._is_play_audio:
-            return
-        self._is_play_audio = True
-        file = os.path.curdir + '/audio/test.wav'
-        print(file)
-        winsound.PlaySound(file, winsound.SND_ASYNC or winsound.SND_FILENAME)
+        return
+        # if self._is_play_audio:
+        #     return
+        # self._is_play_audio = True
+        # file = os.path.curdir + '/audio/test1.wav'
+        # print(file)
+        # winsound.PlaySound(file, winsound.SND_ASYNC or winsound.SND_FILENAME)
        # playsound(file)

    def _init_image_canvas(self):
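A side note on the playback call this hunk comments out: winsound flags are bit masks, so they combine with bitwise `|`; `winsound.SND_ASYNC or winsound.SND_FILENAME` is a boolean expression that simply evaluates to `SND_ASYNC`, leaving `SND_FILENAME` unset. If this path is revived, the conventional form is:

```python
import winsound

file = './audio/test.wav'
# SND_FILENAME marks the argument as a path; SND_ASYNC returns immediately
# instead of blocking until playback completes.
winsound.PlaySound(file, winsound.SND_FILENAME | winsound.SND_ASYNC)
```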
@@ -105,11 +106,11 @@ class App(customtkinter.CTk):
        height = self.winfo_height() * 0.5
        self._canvas.create_image(width, height, anchor=customtkinter.CENTER, image=imgtk)
        self._canvas.update()
-        self.after(33, self._render)
+        self.after(40, self._render)

    def request_tts(self):
        content = self.entry.get()
        content = 'Hello, this is a test of the Edge TTS service.'
        # content = ''
        print('content:', content)
        self.entry.delete(0, customtkinter.END)
        self._human.read(content)
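The render-timer change lines up with the fps fix in Human.py: `self.after(40, ...)` re-arms the canvas update every 40 ms, i.e. 1000 / 40 = 25 frames per second, matching `_fps = 25`, whereas the previous 33 ms corresponded to roughly 30 fps and would poll faster than frames are produced.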