human/tts/TTSBase.py

#encoding = utf8
import logging
import asyncio
import time

import edge_tts
import numpy as np
import pyaudio
import soundfile
import resampy
import queue
from io import BytesIO
from queue import Queue
from threading import Thread, Event

from pydub import AudioSegment

# Module-level logger, named after this module via ``__name__``.
logger = logging.getLogger(__name__)


class TTSBase:
    """Background text-to-speech worker built on edge-tts.

    Text pushed with :meth:`push_txt` is consumed by a worker thread, which
    synthesizes it with edge-tts, decodes the result with ``soundfile``,
    converts it to mono float32 at ``self._sample_rate`` and hands it to
    ``human.push_audio_chunk`` in pieces of ``self._chunk_len`` samples.
    """

    def __init__(self, human):
        """Open the audio output device and start the worker thread.

        Args:
            human: Object providing ``get_fps()`` (used to derive the chunk
                length) and ``push_audio_chunk(chunk)`` (receives the audio).
        """
        self._human = human
        self._queue = Queue()
        self._io_stream = BytesIO()
        self._sample_rate = 16000
        # Number of samples per pushed chunk: one chunk per frame of `human`.
        self._chunk_len = self._sample_rate // self._human.get_fps()

        # Open the audio device BEFORE starting the worker: if this raises,
        # no non-daemon thread is left running behind a half-built object.
        self._pcm_player = pyaudio.PyAudio()
        try:
            self._pcm_stream = self._pcm_player.open(format=pyaudio.paInt16,
                                                     channels=1,
                                                     rate=self._sample_rate,
                                                     output=True)
        except Exception:
            self._pcm_player.terminate()
            raise

        # NOTE: the event is used as a "keep running" flag (set == running).
        self._exit_event = Event()
        self._exit_event.set()
        self._thread = Thread(target=self._on_run)
        self._thread.start()
        logger.info('tts start')

    def _on_run(self):
        """Worker loop: pull text from the queue until :meth:`stop` is called."""
        logger.info('tts run')
        while self._exit_event.is_set():
            try:
                # Time out periodically so the exit flag is re-checked.
                txt = self._queue.get(block=True, timeout=1)
            except queue.Empty:
                continue
            try:
                self._request(txt)
            except Exception:
                # One failed synthesis must not kill the worker thread.
                logger.exception('tts request failed')
        logger.info('tts exit')

    def _request(self, txt, voice='zh-CN-XiaoyiNeural'):
        """Synthesize ``txt`` and push the decoded audio to ``human``.

        Args:
            txt: Text to synthesize.
            voice: edge-tts voice name.
        """
        # Fresh buffer per request; otherwise audio from earlier requests
        # would be decoded and pushed again.
        self._io_stream = BytesIO()

        t = time.time()
        loop = asyncio.new_event_loop()
        try:
            loop.run_until_complete(self.__on_request(voice, txt))
        finally:
            loop.close()
        logger.info(f'edge tts time:{time.time() - t : 0.4f}s')

        if self._io_stream.tell() == 0:
            logger.warning('tts returned no audio data')
            return

        self._io_stream.seek(0)
        stream = self.__create_bytes_stream(self._io_stream)
        for audio_chunk in self._iter_chunks(stream, self._chunk_len):
            self._human.push_audio_chunk(audio_chunk)

    @staticmethod
    def _first_channel(samples):
        """Return the first channel of a multi-channel array, else ``samples``."""
        if samples.ndim > 1:
            return samples[:, 0]
        return samples

    @staticmethod
    def _iter_chunks(samples, chunk_len):
        """Yield consecutive full ``chunk_len``-sized slices of ``samples``.

        A trailing remainder shorter than ``chunk_len`` is not yielded.
        Nothing is yielded when ``chunk_len`` is not positive.
        """
        if chunk_len <= 0:
            return
        for start in range(0, len(samples) - chunk_len + 1, chunk_len):
            yield samples[start:start + chunk_len]

    def __create_bytes_stream(self, io_stream):
        """Decode ``io_stream`` into a mono float32 array at the target rate.

        Args:
            io_stream: File-like object positioned at the start of the audio.

        Returns:
            1-D ``numpy.float32`` array, resampled to ``self._sample_rate``
            when the decoded rate differs.
        """
        stream, sample_rate = soundfile.read(io_stream)
        logger.info(f'tts audio stream {sample_rate} : {stream.shape}')
        stream = stream.astype(np.float32)

        if stream.ndim > 1:
            logger.warning(f'tts audio has {stream.shape[1]} channels, only use the first')
            stream = self._first_channel(stream)

        if sample_rate != self._sample_rate and stream.shape[0] > 0:
            logger.warning(f'tts audio sample rate is {sample_rate}, resample to {self._sample_rate}')
            stream = resampy.resample(x=stream, sr_orig=sample_rate, sr_new=self._sample_rate)

        return stream

    async def __on_request(self, voice, txt):
        """Stream synthesized audio from edge-tts into ``self._io_stream``."""
        communicate = edge_tts.Communicate(txt, voice)
        async for chunk in communicate.stream():
            # Only non-empty "audio" chunks carry encoded audio bytes.
            if chunk["type"] == "audio" and chunk["data"]:
                self._io_stream.write(chunk['data'])

    def stop(self):
        """Stop the worker thread and release the audio device.

        Safe to call more than once.
        """
        # Stop the worker first so nothing is running while audio is torn down.
        if self._exit_event is not None and self._exit_event.is_set():
            self._exit_event.clear()
            self._thread.join()

        if self._pcm_stream is not None:
            self._pcm_stream.stop_stream()
            self._pcm_player.close(self._pcm_stream)
            self._pcm_player.terminate()
            self._pcm_stream = None
        logger.info('tts stop')

    def clear(self):
        """Discard all text that is queued but not yet synthesized."""
        # Hold the queue's own lock: the worker may be inside get() right now.
        with self._queue.mutex:
            self._queue.queue.clear()

    def push_txt(self, txt):
        """Queue ``txt`` for synthesis by the worker thread."""
        self._queue.put(txt)
首次添加数字人 2024-09-02 00:13:34 +00:00			`#encoding = utf8`
添加chunk处理 2024-09-04 16:51:14 +00:00			`import logging`
modify human 2024-09-21 12:58:26 +00:00			`import asyncio`
			`import time`

			`import edge_tts`
			`import numpy as np`
修改tts录制文件 2024-09-26 12:28:49 +00:00			`import pyaudio`
modify human 2024-09-21 12:58:26 +00:00			`import soundfile`
			`import resampy`
首次添加数字人 2024-09-02 00:13:34 +00:00			`import queue`
			`from io import BytesIO`
			`from queue import Queue`
			`from threading import Thread, Event`

修改tts录制文件 2024-09-26 12:28:49 +00:00			`from pydub import AudioSegment`

modify human 2024-09-21 12:58:26 +00:00			`logger = logging.getLogger(__name__)`

首次添加数字人 2024-09-02 00:13:34 +00:00
			`class TTSBase:`
			`def __init__(self, human):`
			`self._human = human`
			`self._thread = None`
			`self._queue = Queue()`
			`self._exit_event = None`
			`self._io_stream = BytesIO()`
			`self._sample_rate = 16000`
修改tts录制文件 2024-09-26 12:28:49 +00:00			`self._chunk_len = self._sample_rate // self._human.get_fps()`
modify human 2024-09-21 12:58:26 +00:00
			`self._exit_event = Event()`
			`self._thread = Thread(target=self._on_run)`
			`self._exit_event.set()`
			`self._thread.start()`
修改tts录制文件 2024-09-26 12:28:49 +00:00			`self._pcm_player = pyaudio.PyAudio()`
			`self._pcm_stream = self._pcm_player.open(format=pyaudio.paInt16,`
			`channels=1, rate=16000, output=True)`
modify human 2024-09-21 12:58:26 +00:00			`logging.info('tts start')`
首次添加数字人 2024-09-02 00:13:34 +00:00
			`def _on_run(self):`
添加chunk处理 2024-09-04 16:51:14 +00:00			`logging.info('tts run')`
modify human 2024-09-21 12:58:26 +00:00			`while self._exit_event.is_set():`
首次添加数字人 2024-09-02 00:13:34 +00:00			`try:`
			`txt = self._queue.get(block=True, timeout=1)`
			`except queue.Empty:`
			`continue`
			`self._request(txt)`
添加chunk处理 2024-09-04 16:51:14 +00:00			`logging.info('tts exit')`
首次添加数字人 2024-09-02 00:13:34 +00:00
			`def _request(self, txt):`
modify human 2024-09-21 12:58:26 +00:00			`voice = 'zh-CN-XiaoyiNeural'`
			`t = time.time()`
			`asyncio.new_event_loop().run_until_complete(self.__on_request(voice, txt))`
			`logger.info(f'edge tts time:{time.time() - t : 0.4f}s')`
首次添加数字人 2024-09-02 00:13:34 +00:00
modify human 2024-09-21 12:58:26 +00:00			`self._io_stream.seek(0)`
			`stream = self.__create_bytes_stream(self._io_stream)`
			`stream_len = stream.shape[0]`
			`index = 0`
修改tts录制文件 2024-09-26 12:28:49 +00:00			`while stream_len >= self._chunk_len:`
			`audio_chunk = stream[index:index + self._chunk_len]`
			`# self._pcm_stream.write(audio_chunk)`
			`# self._pcm_stream.write(AudioSegment.from_mp3(audio_chunk))`
			`# self._human.push_audio_chunk(audio_chunk)`
			`# self._human.push_mel_chunks_queue(audio_chunk)`
			`self._human.push_audio_chunk(audio_chunk)`
			`stream_len -= self._chunk_len`
			`index += self._chunk_len`
modify human 2024-09-21 12:58:26 +00:00
			`def __create_bytes_stream(self, io_stream):`
			`stream, sample_rate = soundfile.read(io_stream)`
			`logger.info(f'tts audio stream {sample_rate} : {stream.shape}')`
			`stream = stream.astype(np.float32)`

			`if stream.ndim > 1:`
			`logger.warning(f'tts audio has {stream.shape[1]} channels, only use the first')`
			`stream = stream[:, 1]`

			`if sample_rate != self._sample_rate and stream.shape[0] > 0:`
			`logger.warning(f'tts audio sample rate is {sample_rate}, resample to {self._sample_rate}')`
			`stream = resampy.resample(x=stream, sr_orig=sample_rate, sr_new=self._sample_rate)`

			`return stream`

			`async def __on_request(self, voice, txt):`
			`communicate = edge_tts.Communicate(txt, voice)`
			`first = True`
修改tts录制文件 2024-09-26 12:28:49 +00:00			`# total_data = b''`
			`# CHUNK_SIZE = self._chunk_len`
			`async for chunk in communicate.stream():`
			`if chunk["type"] == "audio" and chunk["data"]:`
			`self._io_stream.write(chunk['data'])`
			`# total_data += chunk["data"]`
			`# if len(total_data) >= CHUNK_SIZE:`
			`# print(f"Time elapsed: {time.time() - start_time:.2f} seconds") # Print time`
			`# audio_data = AudioSegment.from_mp3(BytesIO(total_data[:CHUNK_SIZE])) #.raw_data`
			`# audio_data = audio_data.set_frame_rate(self._human.get_audio_sample_rate())`
			`# self._human.push_audio_chunk(audio_data)`
			`# self._pcm_stream.write(audio_data.raw_data)`
			`# play_audio(total_data[:CHUNK_SIZE], stream) # Play first CHUNK_SIZE bytes`
			`# total_data = total_data[CHUNK_SIZE:] # Remove played data`

			`# if first:`
			`# first = False`

			`# if chuck['type'] == 'audio':`
			`# # self._io_stream.write(chuck['data'])`
			`# self._io_stream.write(AudioSegment.from_mp3(BytesIO(total_data[:CHUNK_SIZE])).raw_data)`
			`# if len(total_data) > 0:`
			`# self._pcm_stream.write(AudioSegment.from_mp3(BytesIO(total_data)).raw_data)`
			`# audio_data = AudioSegment.from_mp3(BytesIO(total_data)) # .raw_data`
			`# audio_data = audio_data.set_frame_rate(self._human.get_audio_sample_rate())`
			`# self._human.push_audio_chunk(audio_data)`
			`# self._io_stream.write(AudioSegment.from_mp3(BytesIO(total_data)).raw_data)`
首次添加数字人 2024-09-02 00:13:34 +00:00
			`def stop(self):`
修改tts录制文件 2024-09-26 12:28:49 +00:00			`self._pcm_stream.stop_stream()`
			`self._pcm_player.close(self._pcm_stream)`
			`self._pcm_player.terminate()`
首次添加数字人 2024-09-02 00:13:34 +00:00			`if self._exit_event is None:`
			`return`

modify human 2024-09-21 12:58:26 +00:00			`self._exit_event.clear()`
首次添加数字人 2024-09-02 00:13:34 +00:00			`self._thread.join()`
添加chunk处理 2024-09-04 16:51:14 +00:00			`logging.info('tts stop')`
首次添加数字人 2024-09-02 00:13:34 +00:00
			`def clear(self):`
			`self._queue.queue.clear()`

			`def push_txt(self, txt):`
			`self._queue.put(txt)`