2024-09-02 00:13:34 +00:00
|
|
|
#encoding = utf8
|
2024-09-04 16:51:14 +00:00
|
|
|
import logging
|
2024-09-21 12:58:26 +00:00
|
|
|
import asyncio
|
|
|
|
import time
|
|
|
|
|
|
|
|
import edge_tts
|
|
|
|
import numpy as np
|
2024-09-26 12:28:49 +00:00
|
|
|
import pyaudio
|
2024-09-21 12:58:26 +00:00
|
|
|
import soundfile
|
|
|
|
import resampy
|
2024-09-02 00:13:34 +00:00
|
|
|
import queue
|
|
|
|
from io import BytesIO
|
|
|
|
from queue import Queue
|
|
|
|
from threading import Thread, Event
|
|
|
|
|
2024-09-26 12:28:49 +00:00
|
|
|
from pydub import AudioSegment
|
|
|
|
|
2024-09-21 12:58:26 +00:00
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
2024-09-02 00:13:34 +00:00
|
|
|
|
|
|
|
class TTSBase:
    """Background text-to-speech worker built on ``edge_tts``.

    Text handed to :meth:`push_txt` is queued and consumed by a worker
    thread that is started in ``__init__``. For every queued text the worker
    synthesizes audio with ``edge_tts``, decodes it with ``soundfile``,
    reduces it to one float32 channel at ``self._sample_rate`` and forwards
    it to the ``human`` object in slices of ``self._chunk_len`` samples via
    ``human.push_audio_chunk``.
    """

    def __init__(self, human):
        """Open the PCM output stream and start the worker thread.

        Args:
            human: Consumer of the synthesized audio. This class calls
                ``human.get_fps()`` (to size the chunks) and
                ``human.push_audio_chunk(chunk)`` on it.
        """
        self._human = human
        self._queue = Queue()
        self._io_stream = BytesIO()
        self._sample_rate = 16000
        # Number of audio samples that correspond to one video frame.
        self._chunk_len = self._sample_rate // self._human.get_fps()

        # NOTE: despite its name, this event is *set* while the worker should
        # keep running and *cleared* to ask it to exit (see _on_run / stop).
        self._exit_event = Event()
        self._exit_event.set()

        # Open the audio device before starting the thread: if this raises,
        # no orphaned non-daemon worker thread is left running.
        self._pcm_player = pyaudio.PyAudio()
        self._pcm_stream = self._pcm_player.open(format=pyaudio.paInt16,
                                                 channels=1,
                                                 rate=self._sample_rate,
                                                 output=True)

        self._thread = Thread(target=self._on_run)
        self._thread.start()
        logger.info('tts start')

    def _on_run(self):
        """Worker loop: pull queued text and synthesize it until stopped."""
        logger.info('tts run')
        while self._exit_event.is_set():
            try:
                # The timeout lets the loop re-check the exit flag regularly.
                txt = self._queue.get(block=True, timeout=1)
            except queue.Empty:
                continue
            try:
                self._request(txt)
            except Exception:
                # A failed request (network, decoding, ...) must not kill the
                # worker, otherwise every later push_txt() is silently lost.
                logger.exception('tts request failed')
        logger.info('tts exit')

    def _request(self, txt):
        """Synthesize ``txt`` and push the decoded audio to the human.

        Args:
            txt: Text to synthesize.
        """
        voice = 'zh-CN-XiaoyiNeural'

        # Start from an empty buffer so audio left over from a previous
        # request is never decoded again.
        self._io_stream.seek(0)
        self._io_stream.truncate()

        t = time.time()
        loop = asyncio.new_event_loop()
        try:
            loop.run_until_complete(self.__on_request(voice, txt))
        finally:
            # Each request creates its own loop; close it to release it.
            loop.close()
        logger.info(f'edge tts time:{time.time() - t : 0.4f}s')

        if self._io_stream.tell() == 0:
            # Nothing was written, so there is nothing to decode.
            logger.warning('tts returned no audio data')
            return

        self._io_stream.seek(0)
        stream = self.__create_bytes_stream(self._io_stream)

        # Forward only whole chunks; a trailing partial chunk is dropped.
        chunk_len = self._chunk_len
        last_start = stream.shape[0] - chunk_len
        for start in range(0, last_start + 1, chunk_len):
            self._human.push_audio_chunk(stream[start:start + chunk_len])

    @staticmethod
    def _to_mono_float32(stream):
        """Return ``stream`` as float32, keeping only the first channel.

        Args:
            stream: numpy array of samples, either 1-D or 2-D with the
                channel index on the second axis.

        Returns:
            A 1-D float32 array for 1-D/2-D input.
        """
        stream = stream.astype(np.float32)
        if stream.ndim > 1:
            stream = stream[:, 0]
        return stream

    def __create_bytes_stream(self, io_stream):
        """Decode ``io_stream`` into mono float32 samples at the target rate.

        Args:
            io_stream: File-like object positioned at the start of encoded
                audio that ``soundfile.read`` can decode.

        Returns:
            numpy float32 array of samples at ``self._sample_rate``.
        """
        stream, sample_rate = soundfile.read(io_stream)
        logger.info(f'tts audio stream {sample_rate} : {stream.shape}')

        if stream.ndim > 1:
            logger.warning(f'tts audio has {stream.shape[1]} channels, only use the first')
        stream = self._to_mono_float32(stream)

        if sample_rate != self._sample_rate and stream.shape[0] > 0:
            logger.warning(f'tts audio sample rate is {sample_rate}, resample to {self._sample_rate}')
            stream = resampy.resample(x=stream, sr_orig=sample_rate, sr_new=self._sample_rate)

        return stream

    async def __on_request(self, voice, txt):
        """Stream synthesized audio for ``txt`` into ``self._io_stream``.

        Args:
            voice: edge-tts voice name.
            txt: Text to synthesize.
        """
        communicate = edge_tts.Communicate(txt, voice)
        async for chunk in communicate.stream():
            # Only non-empty audio chunks are stored; other chunk types
            # yielded by the stream are ignored.
            if chunk["type"] == "audio" and chunk["data"]:
                self._io_stream.write(chunk['data'])

    def stop(self):
        """Stop the worker thread, then release the PCM output stream."""
        if self._exit_event is not None:
            # Clearing the event makes the _on_run loop terminate.
            self._exit_event.clear()
            self._thread.join()

        # Release the audio device only after the worker has finished.
        self._pcm_stream.stop_stream()
        self._pcm_player.close(self._pcm_stream)
        self._pcm_player.terminate()
        logger.info('tts stop')

    def clear(self):
        """Discard all text that is queued but not yet synthesized."""
        # Hold the queue's own lock: the worker may be inside get().
        with self._queue.mutex:
            self._queue.queue.clear()

    def push_txt(self, txt):
        """Queue ``txt`` for synthesis by the worker thread.

        Args:
            txt: Text to synthesize.
        """
        self._queue.put(txt)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|