modfiy tts source

2024-10-19 18:47:34 +08:00 · 2024-10-19 18:47:34 +08:00 · 055d1733f3
commit 055d1733f3
parent 6b5361d91a
10 changed files with 188 additions and 6 deletions
--- a/human/human_context.py
+++ b/human/human_context.py
@ -7,7 +7,7 @@ from .audio_inference_handler import AudioInferenceHandler
 from .audio_mal_handler import AudioMalHandler
 from .human_render import HumanRender
 from nlp import PunctuationSplit, DouBao
-from tts import TTSEdge, TTSAudioSplitHandle
+from tts import TTSEdge, TTSAudioSplitHandle, TTSEdgeHttp
 from utils import load_avatar, get_device
 logger = logging.getLogger(__name__)
@ -102,10 +102,14 @@ class HumanContext:
        self._infer_handler = AudioInferenceHandler(self, self._render_handler)
        self._mal_handler = AudioMalHandler(self,  self._infer_handler)
        self._tts_handle = TTSAudioSplitHandle(self,  self._mal_handler)
-        self._tts = TTSEdge(self._tts_handle)
+        self._tts = TTSEdgeHttp(self._tts_handle)
        split = PunctuationSplit()
        self._nlp = DouBao(split,  self._tts)
        self._asr = SherpaNcnnAsr()
        self._asr.attach(self._nlp)
-
+    def pause_talk(self):
        self._nlp.pause_talk()
        self._tts.pause_talk()
        self._mal_handler.pause_talk()
        self._infer_handler.pause_talk()
--- a/human_handler/audio_handler.py
+++ b/human_handler/audio_handler.py
@ -23,3 +23,6 @@ class AudioHandler(ABC):
            self._handler.on_handle(stream, type_)
        else:
            logging.info(f'_handler is None')
    def pause_talk(self):
        pass
--- a/nlp/nlp_base.py
+++ b/nlp/nlp_base.py
@ -46,3 +46,6 @@ class NLPBase(AsrObserver):
        self._ask_queue.add_task(self._on_close)
        self._ask_queue.stop()
    def pause_talk(self):
        self._ask_queue.clear()
--- a/test/test_mzzsfy_tts.py
+++ b/test/test_mzzsfy_tts.py
@ -0,0 +1,79 @@
 #encoding = utf8
 import time
 import librosa
 import numpy as np
 import requests
 import resampy
 import soundfile as sf
 def download_tts(url):
    file_name = url[3:]
    print(file_name)
    download_url = url
    print('download tts', download_url)
    resp = requests.get(download_url)
    with open('./audio/mp3/' + file_name, 'wb') as mp3:
        mp3.write(resp.content)
    from pydub import AudioSegment
    sound = AudioSegment.from_mp3('./audio/mp3/' + file_name)
    sound.export('./audio/wav/' + file_name + '.wav', format="wav")
 def __create_bytes_stream(byte_stream):
    stream, sample_rate = sf.read(byte_stream)  # [T*sample_rate,] float64
    print(f'[INFO]tts audio stream {sample_rate}: {stream.shape}')
    stream = stream.astype(np.float32)
    if stream.ndim > 1:
        print(f'[WARN] audio has {stream.shape[1]} channels, only use the first.')
        stream = stream[:, 0]
    if sample_rate != 16000 and stream.shape[0] > 0:
        print(f'[WARN] audio sample rate is {sample_rate}, resampling into {16000}.')
        stream = resampy.resample(x=stream, sr_orig=sample_rate, sr_new=16000)
    return stream
 def main():
    import aiohttp
    import asyncio
    from io import BytesIO
    async def fetch_audio():
        url = "http://localhost:8082/v1/audio/speech"
        data = {
            "model": "tts-1",
            "input": "写了一个高性能tts(文本转声音)工具,5千字仅需5秒,免费使用",
            "voice": "alloy",
            "speed": 1.0
        }
        async with aiohttp.ClientSession() as session:
            async with session.post(url, json=data) as response:
                if response.status == 200:
                    audio_data = BytesIO(await response.read())
                    audio_stream = __create_bytes_stream(audio_data)
                    # 保存为新的音频文件
                    sf.write("output_audio.wav", audio_stream, 16000)
                    print("Audio data received and saved to output_audio.wav")
                else:
                    print("Error:", response.status, await response.text())
    # Run the async function
    asyncio.run(fetch_audio())
 if __name__ == "__main__":
    try:
        t = time.time()
        main()
        print(f'-------tts time:{time.time() - t:.4f}s')
    except KeyboardInterrupt:
        print("\nCaught Ctrl + C. Exiting")
    except Exception as e:
        print(e)
--- a/test/test_nlp_tts.py
+++ b/test/test_nlp_tts.py
@ -6,7 +6,7 @@ import time
 from asr import SherpaNcnnAsr
 from nlp import PunctuationSplit
 from nlp.nlp_doubao import DouBao
-from tts import TTSEdge, TTSAudioSaveHandle
+from tts import TTSEdge, TTSAudioSaveHandle, TTSEdgeHttp
 try:
    import sounddevice as sd
@ -21,12 +21,14 @@ except ImportError as e:
 def main():
    print("Started! Please speak")
-    handle = TTSAudioSaveHandle()
+    handle = TTSAudioSaveHandle(None, None)
-    tts = TTSEdge(handle)
+    # tts = TTSEdge(handle)
    tts = TTSEdgeHttp(handle)
    split = PunctuationSplit()
    nlp = DouBao(split, tts)
    nlp.ask('你好，你是谁？')
    nlp.ask('可以帮我做什么？')
    nlp.ask('背诵出师表')
    nlp.stop()
    tts.stop()
    print("Stop! ")
--- a/tts/init.py
+++ b/tts/init.py
@ -1,4 +1,5 @@
 #encoding = utf8
 from .tts_edge import TTSEdge
 from .tts_edge_http import TTSEdgeHttp
 from .tts_audio_handle import TTSAudioSplitHandle, TTSAudioSaveHandle
--- a/tts/tts_audio_handle.py
+++ b/tts/tts_audio_handle.py
@ -34,6 +34,9 @@ class TTSAudioHandle(AudioHandler):
    def stop(self):
        pass
    def pause_talk(self):
        pass
 class TTSAudioSplitHandle(TTSAudioHandle):
    def __init__(self, context, handler):
--- a/tts/tts_base.py
+++ b/tts/tts_base.py
@ -59,3 +59,6 @@ class TTSBase(NLPCallback):
    def stop(self):
        self._message_queue.add_task(self._on_close)
        self._message_queue.stop()
    def pause_talk(self):
        self._message_queue.clear()
--- a/tts/tts_edge_http.py
+++ b/tts/tts_edge_http.py
@ -0,0 +1,79 @@
 #encoding = utf8
 import logging
 from io import BytesIO
 import aiohttp
 import numpy as np
 import soundfile as sf
 import edge_tts
 import resampy
 from .tts_base import TTSBase
 logger = logging.getLogger(__name__)
 class TTSEdgeHttp(TTSBase):
    def __init__(self, handle, voice='zh-CN-XiaoyiNeural'):
        super().__init__(handle)
        self._voice = voice
        self._url = 'http://localhost:8082/v1/audio/speech'
        logger.info(f"TTSEdge init, {voice}")
    async def _on_request(self, txt: str):
        print('_on_request, txt')
        data = {
            "model": "tts-1",
            "input": txt,
            "voice": "alloy",
            "speed": 1.0,
            "thread": 10
        }
        async with aiohttp.ClientSession() as session:
            async with session.post(self._url, json=data) as response:
                if response.status == 200:
                    stream = BytesIO(await response.read())
                    print("Audio data received and saved to output_audio.wav")
                    return stream
                else:
                    byte_stream = None
                    return byte_stream
    async def _on_handle(self, stream, index):
        print('-------tts _on_handle')
        try:
            stream.seek(0)
            byte_stream = self.__create_bytes_stream(stream)
            print('-------tts start push chunk')
            self._handle.on_handle(byte_stream, index)
            stream.seek(0)
            stream.truncate()
            print('-------tts finish push chunk')
        except Exception as e:
            self._handle.on_handle(None, index)
            stream.seek(0)
            stream.truncate()
            print('-------tts finish error:', e)
        stream.close()
    def __create_bytes_stream(self, byte_stream):
        stream, sample_rate = sf.read(byte_stream)  # [T*sample_rate,] float64
        print(f'[INFO]tts audio stream {sample_rate}: {stream.shape}')
        stream = stream.astype(np.float32)
        if stream.ndim > 1:
            print(f'[WARN] audio has {stream.shape[1]} channels, only use the first.')
            stream = stream[:, 0]
        if sample_rate != self._handle.sample_rate and stream.shape[0] > 0:
            print(f'[WARN] audio sample rate is {sample_rate}, resampling into {self._handle.sample_rate}.')
            stream = resampy.resample(x=stream, sr_orig=sample_rate, sr_new=self._handle.sample_rate)
        return stream
    async def _on_close(self):
        print('TTSEdge close')
        # if self._byte_stream is not None and not self._byte_stream.closed:
        #     self._byte_stream.close()
--- a/utils/async_task_queue.py
+++ b/utils/async_task_queue.py
@ -53,6 +53,11 @@ class AsyncTaskQueue:
        for _ in range(self._worker_num):
            self.add_task(None)  # 发送结束信号
    def clear(self):
        while not self._queue.empty():
            self._queue.get()
            self._queue.task_done()
    def stop(self):
        self.stop_workers()
        self._thread.join()