human/tts/TTSBase.py

121 lines
3.6 KiB
Python
Raw Normal View History

2024-09-02 00:13:34 +00:00
# encoding: utf-8
2024-09-04 16:51:14 +00:00
import logging
2024-09-21 12:58:26 +00:00
import asyncio
import time
import edge_tts
import numpy as np
2024-10-03 17:52:49 +00:00
import soundfile as sf
2024-09-21 12:58:26 +00:00
import resampy
2024-09-02 00:13:34 +00:00
import queue
from io import BytesIO
from queue import Queue
from threading import Thread, Event
2024-09-26 12:28:49 +00:00
2024-09-27 11:31:36 +00:00
import audio
2024-09-21 12:58:26 +00:00
logger = logging.getLogger(__name__)
2024-09-02 00:13:34 +00:00
class TTSBase:
    """Background text-to-speech worker.

    Text pushed with :meth:`push_txt` is queued and consumed by a dedicated
    thread. Each text is synthesized with edge-tts, decoded, resampled to the
    sample rate reported by ``human`` and handed to
    ``human.put_audio_frame`` in fixed-size frames.

    The ``human`` object must provide ``get_audio_sample_rate()``,
    ``get_fps()`` and ``put_audio_frame(frame)``.
    """

    def __init__(self, human):
        """Create the worker and start its thread immediately.

        Args:
            human: Consumer of the audio frames (see class docstring).
        """
        self._human = human
        self._thread = None
        self._queue = Queue()
        # Scratch buffer holding the encoded audio of the request in flight.
        self.input_stream = BytesIO()
        # Number of audio samples handed over per frame.
        self.chunk = self._human.get_audio_sample_rate() // self._human.get_fps()

        # The event is *set* while the worker should keep running.
        self._exit_event = Event()
        self._thread = Thread(target=self._on_run)
        self._exit_event.set()
        self._thread.start()
        logging.info('tts start')

    def _on_run(self):
        """Worker loop: pull queued texts and synthesize them until stopped."""
        logging.info('tts run')
        while self._exit_event.is_set():
            try:
                # The timeout lets the loop re-check the exit event regularly.
                txt = self._queue.get(block=True, timeout=1)
            except queue.Empty:
                continue
            try:
                self._request(txt)
            except Exception:
                # One failed request (network error, undecodable audio, ...)
                # must not kill the worker, otherwise every later push_txt()
                # would be silently ignored.
                logging.exception('tts request failed')
        logging.info('tts exit')

    def _request(self, txt):
        """Synthesize ``txt`` and push the resulting audio frames.

        Args:
            txt: Text to synthesize.

        Raises:
            Exception: Whatever edge-tts, soundfile or resampy raise; the
                scratch buffer is reset in every case.
        """
        voice = 'zh-CN-XiaoyiNeural'
        t = time.time()
        try:
            # asyncio.run creates a fresh loop and closes it afterwards, so
            # no event loop is leaked per request.
            asyncio.run(self.__main(voice, txt))
            print(f'-------edge tts time:{time.time() - t:.4f}s')

            # The buffer is empty before each request, so position 0 after
            # streaming means no audio chunk was received.
            if self.input_stream.tell() == 0:
                logging.warning('tts returned no audio')
                return

            self.input_stream.seek(0)
            stream = self.__create_bytes_stream(self.input_stream)
            print('-------tts start push chunk')
            self._push_frames(stream)
        finally:
            # Always reset the buffer so a failed request cannot leak stale
            # bytes into the next one.
            self.input_stream.seek(0)
            self.input_stream.truncate()
        print('-------tts finish push chunk')

    def _push_frames(self, stream):
        """Hand ``stream`` to the human in frames of ``self.chunk`` samples.

        A trailing partial frame (shorter than ``self.chunk``) is dropped.

        Args:
            stream: One-dimensional array of audio samples.
        """
        frame_len = self.chunk
        total = stream.shape[0]
        for start in range(0, total - frame_len + 1, frame_len):
            self._human.put_audio_frame(stream[start:start + frame_len])

    def __create_bytes_stream(self, byte_stream):
        """Decode an encoded audio stream into mono float32 samples.

        Args:
            byte_stream: File-like object readable by ``soundfile.read``.

        Returns:
            One-dimensional float32 array at the human's audio sample rate.
        """
        stream, sample_rate = sf.read(byte_stream)  # [T*sample_rate,] float64
        print(f'[INFO]tts audio stream {sample_rate}: {stream.shape}')
        stream = stream.astype(np.float32)

        if stream.ndim > 1:
            print(f'[WARN] audio has {stream.shape[1]} channels, only use the first.')
            stream = stream[:, 0]

        if sample_rate != self._human.get_audio_sample_rate() and stream.shape[0] > 0:
            print(f'[WARN] audio sample rate is {sample_rate}, resampling into {self._human.get_audio_sample_rate()}.')
            stream = resampy.resample(x=stream, sr_orig=sample_rate, sr_new=self._human.get_audio_sample_rate())

        return stream

    async def __main(self, voicename: str, text: str):
        """Stream synthesized audio from edge-tts into ``self.input_stream``.

        Args:
            voicename: edge-tts voice identifier.
            text: Text to synthesize.
        """
        communicate = edge_tts.Communicate(text, voicename)
        async for chunk in communicate.stream():
            if chunk["type"] == "audio":
                self.input_stream.write(chunk["data"])
            elif chunk["type"] == "WordBoundary":
                pass

    def stop(self):
        """Release optional playback resources and stop the worker thread."""
        # This class never creates the PCM player/stream itself; release them
        # only when something else (e.g. a subclass) attached them, instead
        # of failing with AttributeError before the worker is stopped.
        pcm_stream = getattr(self, '_pcm_stream', None)
        pcm_player = getattr(self, '_pcm_player', None)
        if pcm_stream is not None:
            pcm_stream.stop_stream()
        if pcm_player is not None:
            if pcm_stream is not None:
                pcm_player.close(pcm_stream)
            pcm_player.terminate()

        if self._exit_event is None:
            return

        # Clearing the event makes the worker loop exit on its next check.
        self._exit_event.clear()
        self._thread.join()
        logging.info('tts stop')

    def pause_talk(self):
        """Drop all pending texts."""
        self.clear()

    def clear(self):
        """Remove every queued text that has not been picked up yet."""
        # The worker thread may be inside Queue.get(); take the queue's own
        # lock before touching the underlying deque.
        with self._queue.mutex:
            self._queue.queue.clear()

    def push_txt(self, txt):
        """Queue ``txt`` for synthesis by the worker thread.

        Args:
            txt: Text to synthesize.
        """
        self._queue.put(txt)