human/tts/TTSBase.py

164 lines
5.9 KiB
Python
Raw Normal View History

2024-09-02 00:13:34 +00:00
# encoding: utf-8
2024-09-04 16:51:14 +00:00
import logging
2024-09-21 12:58:26 +00:00
import asyncio
import time
import edge_tts
import numpy as np
2024-09-26 12:28:49 +00:00
import pyaudio
2024-09-21 12:58:26 +00:00
import soundfile
2024-09-26 17:34:52 +00:00
import sounddevice
2024-09-21 12:58:26 +00:00
import resampy
2024-09-02 00:13:34 +00:00
import queue
from io import BytesIO
from queue import Queue
from threading import Thread, Event
2024-09-27 11:31:36 +00:00
from IPython.core.display_functions import display
2024-09-26 12:28:49 +00:00
from pydub import AudioSegment
2024-09-27 11:31:36 +00:00
import audio
2024-09-21 12:58:26 +00:00
# Module-level logger named after this module.
logger = logging.getLogger(__name__)
2024-09-02 00:13:34 +00:00
class TTSBase:
    """Background text-to-speech worker that feeds synthesized audio to ``human``.

    Text handed to :meth:`push_txt` is queued and consumed by a worker thread
    that is started in the constructor. Each text is synthesized with edge-tts
    into an in-memory buffer, decoded with soundfile, reduced to its first
    channel, resampled to ``human.get_audio_sample_rate()`` when the decoded
    rate differs, and delivered through ``human.push_audio_chunk`` in slices of
    ``get_audio_sample_rate() // get_fps()`` samples.
    """

    # Voice name passed to edge_tts.Communicate for every request.
    _VOICE = 'zh-CN-XiaoyiNeural'

    def __init__(self, human):
        """Start the worker thread.

        Args:
            human: object providing ``get_audio_sample_rate()``, ``get_fps()``
                and ``push_audio_chunk(chunk)``.
        """
        self._human = human
        self._queue = Queue()
        self._io_stream = BytesIO()
        self._chunk_len = self._human.get_audio_sample_rate() // self._human.get_fps()

        # NOTE: despite its name, the event is *set* while the worker should
        # keep running; stop() clears it to request shutdown.
        self._exit_event = Event()
        self._exit_event.set()
        self._thread = Thread(target=self._on_run)
        self._thread.start()
        logger.info('tts start')

    def _on_run(self):
        """Worker loop: pull queued texts and synthesize them until stopped."""
        logger.info('tts run')
        while self._exit_event.is_set():
            try:
                # The timeout lets the loop re-check the run flag periodically.
                txt = self._queue.get(block=True, timeout=1)
            except queue.Empty:
                continue
            try:
                self._request(txt)
            except Exception:
                # One failed request must not kill the worker thread.
                logger.exception('tts request failed')
        logger.info('tts exit')

    def _request(self, txt):
        """Synthesize ``txt`` and push the resulting audio to the human.

        The shared byte buffer is always rewound and emptied afterwards, so a
        failed request cannot leak stale audio into the next one.
        """
        t = time.time()
        try:
            # asyncio.run creates a fresh event loop and closes it when done.
            asyncio.run(self.__on_request(self._VOICE, txt))
            logger.info(f'edge tts time:{time.time() - t : 0.4f}s')

            if self._io_stream.tell() == 0:
                # Nothing was written, so there is nothing to decode.
                logger.warning('tts returned no audio')
                return

            self._io_stream.seek(0)
            stream = self.__create_bytes_stream(self._io_stream)
            self._push_chunks(stream)
        finally:
            self._io_stream.seek(0)
            self._io_stream.truncate()

    def _push_chunks(self, stream):
        """Deliver ``stream`` to the human in slices of ``self._chunk_len`` samples.

        Samples left over after the last full slice are not delivered.
        """
        chunk_len = self._chunk_len
        if chunk_len <= 0:
            # A zero step would never advance through the stream.
            logger.error(f'tts invalid chunk length {chunk_len}, audio dropped')
            return
        for start in range(0, len(stream) - chunk_len + 1, chunk_len):
            self._human.push_audio_chunk(stream[start:start + chunk_len])

    @staticmethod
    def _first_channel(stream):
        """Return ``stream`` unchanged if 1-D, otherwise its first channel (column 0)."""
        if stream.ndim > 1:
            logger.warning(f'tts audio has {stream.shape[1]} channels, only use the first')
            stream = stream[:, 0]
        return stream

    def __create_bytes_stream(self, io_stream):
        """Decode ``io_stream`` into a float32 array at the human's sample rate.

        Args:
            io_stream: file-like object positioned at the start of the encoded
                audio; it is read with ``soundfile.read``.

        Returns:
            A numpy float32 array holding the first channel, resampled with
            resampy when the decoded rate differs from the target rate.
        """
        stream, sample_rate = soundfile.read(io_stream)
        logger.info(f'tts audio stream {sample_rate} : {stream.shape}')
        stream = self._first_channel(stream.astype(np.float32))

        target_rate = self._human.get_audio_sample_rate()
        if sample_rate != target_rate and stream.shape[0] > 0:
            logger.warning(f'tts audio sample rate is {sample_rate}, resample to {target_rate}')
            stream = resampy.resample(x=stream, sr_orig=sample_rate, sr_new=target_rate)
        return stream

    async def __on_request(self, voice, txt):
        """Stream synthesis of ``txt`` with ``voice`` into ``self._io_stream``.

        Only non-empty chunks of type ``"audio"`` are written; every other
        chunk type is ignored.
        """
        communicate = edge_tts.Communicate(txt, voice)
        async for chunk in communicate.stream():
            if chunk["type"] == "audio" and chunk["data"]:
                self._io_stream.write(chunk["data"])

    def stop(self):
        """Ask the worker thread to finish and wait for it to exit."""
        if self._exit_event is None:
            return
        self._exit_event.clear()
        self._thread.join()

        # The PyAudio output is not created by this class; release it only if
        # something else attached one to the instance.
        pcm_stream = getattr(self, '_pcm_stream', None)
        pcm_player = getattr(self, '_pcm_player', None)
        if pcm_stream is not None and pcm_player is not None:
            pcm_stream.stop_stream()
            pcm_player.close(pcm_stream)
            pcm_player.terminate()
        logger.info('tts stop')

    def clear(self):
        """Drop every text that is still waiting in the queue."""
        # Hold the queue's own lock: the worker thread reads it concurrently.
        with self._queue.mutex:
            self._queue.queue.clear()

    def push_txt(self, txt):
        """Queue ``txt`` for synthesis by the worker thread."""
        self._queue.put(txt)