73 lines
2.6 KiB
Python
73 lines
2.6 KiB
Python
|
# -*- coding: utf-8 -*-
|
||
|
|
||
|
from io import BytesIO
|
||
|
|
||
|
import numpy as np
|
||
|
import soundfile as sf
|
||
|
import edge_tts
|
||
|
import resampy
|
||
|
|
||
|
from audio import save_chunks, save_wav
|
||
|
from .tts_base import TTSBase
|
||
|
|
||
|
|
||
|
class TTSEdge(TTSBase):
    """Text-to-speech backend that synthesizes speech through ``edge_tts``.

    ``_on_request`` streams the encoded audio for a piece of text into an
    in-memory buffer; ``_on_handle`` then decodes that buffer, converts it to
    mono float32 at ``self._sample_rate`` and writes it out as a numbered WAV
    file.

    NOTE(review): ``self._sample_rate`` is not assigned in this class; it is
    presumably provided by ``TTSBase`` -- confirm against the base class.
    """

    def __init__(self, voice='zh-CN-XiaoyiNeural'):
        """Create the backend.

        Args:
            voice: Voice identifier passed unchanged to
                ``edge_tts.Communicate``.
        """
        super().__init__()
        self._voice = voice
        # Collects the encoded audio bytes of the request being processed.
        self._byte_stream = BytesIO()
        # Counter used to name the WAV files written by _on_handle (1.wav,
        # 2.wav, ...).
        self._count = 1

    async def _on_request(self, txt: str):
        """Synthesize *txt* and append the resulting audio bytes to the buffer.

        Only chunks whose ``"type"`` is ``"audio"`` are stored; every other
        chunk type (e.g. ``"WordBoundary"``) is ignored.

        Args:
            txt: Text to synthesize.
        """
        communicate = edge_tts.Communicate(txt, self._voice)
        async for chunk in communicate.stream():
            if chunk["type"] == "audio":
                self._byte_stream.write(chunk["data"])

    async def _on_handle(self):
        """Decode the buffered audio, save it as a WAV file and reset the buffer.

        Errors raised while decoding or saving are reported on stdout and
        swallowed (best effort). The buffer is always emptied afterwards so a
        failed utterance cannot leak into the next one.
        """
        self._byte_stream.seek(0)
        try:
            stream = self.__create_bytes_stream(self._byte_stream)
            print('-------tts start push chunk')
            # The samples were just resampled to self._sample_rate, so the file
            # must be written with that rate rather than a hard-coded 16000.
            # NOTE(review): assumes the third argument of save_wav is the
            # sample rate -- confirm against audio.save_wav.
            save_wav(stream, '../temp/audio/' + str(self._count) + '.wav',
                     self._sample_rate)
            self._count = self._count + 1
            # TODO: pushing the decoded audio frame by frame to the consumer
            # was disabled in the original code; only the WAV dump is active.
            print('-------tts finish push chunk')
        except Exception as e:
            print('-------tts finish error:', e)
        finally:
            # Single cleanup path for success and failure alike.
            self._byte_stream.seek(0)
            self._byte_stream.truncate()

    def __create_bytes_stream(self, byte_stream):
        """Decode *byte_stream* into mono float32 samples at ``self._sample_rate``.

        Args:
            byte_stream: Seekable file-like object holding encoded audio,
                positioned at the start of the data.

        Returns:
            A 1-D ``numpy.float32`` array of samples. Multi-channel input is
            reduced to its first channel; non-empty input whose sample rate
            differs from ``self._sample_rate`` is resampled with ``resampy``.
        """
        stream, sample_rate = sf.read(byte_stream)  # [T*sample_rate,] float64
        print(f'[INFO]tts audio stream {sample_rate}: {stream.shape}')
        stream = stream.astype(np.float32)

        if stream.ndim > 1:
            print(f'[WARN] audio has {stream.shape[1]} channels, only use the first.')
            stream = stream[:, 0]

        # Empty input is returned as-is; there is nothing to resample.
        if sample_rate != self._sample_rate and stream.shape[0] > 0:
            print(f'[WARN] audio sample rate is {sample_rate}, resampling into {self._sample_rate}.')
            stream = resampy.resample(x=stream, sr_orig=sample_rate, sr_new=self._sample_rate)

        return stream
|