add audio index and handle

2024-10-15 08:31:43 +08:00 · 2024-10-15 08:31:43 +08:00 · 1bb6684416
commit 1bb6684416
parent 205c8f21fe
5 changed files with 38 additions and 11 deletions
--- a/human/__init__.py
+++ b/human/__init__.py
@@ -1,3 +1,4 @@
 #encoding = utf8

 from .human_context import HumanContext
+from .audio_handler import AudioHandler
--- a/human/audio_handler.py
+++ b/human/audio_handler.py
@@ -0,0 +1,8 @@
+#encoding = utf8
+from abc import ABC, abstractmethod
+
+
+class AudioHandler(ABC):
+    @abstractmethod
+    def on_handle(self,  stream, index):
+        pass
--- a/human/human_context.py
+++ b/human/human_context.py
@@ -1,4 +1,7 @@
 #encoding = utf8
+from asr import SherpaNcnnAsr
+from nlp import PunctuationSplit, DouBao
+from tts import TTSEdge, TTSAudioSplitHandle


 class HumanContext:
@@ -29,3 +32,12 @@ class HumanContext:
    def stride_right_size(self):
        return self._stride_right_size

+    def build(self):
+        tts_handle = TTSAudioSplitHandle(self)
+        tts = TTSEdge(tts_handle)
+        split = PunctuationSplit()
+        nlp = DouBao(split, tts)
+        asr = SherpaNcnnAsr()
+        asr.attach(nlp)
+
+
--- a/tts/tts_audio_handle.py
+++ b/tts/tts_audio_handle.py
@@ -1,12 +1,12 @@
 #encoding = utf8
 import os
 import shutil
-from abc import ABC, abstractmethod

 from audio import save_wav
+from human import AudioHandler


-class TTSAudioHandle(ABC):
+class TTSAudioHandle(AudioHandler):
    def __init__(self):
        self._sample_rate = 16000
        self._index = 1
@@ -19,28 +19,24 @@ class TTSAudioHandle(ABC):
    def sample_rate(self, value):
        self._sample_rate = value

-    @abstractmethod
-    def on_handle(self, stream, index):
-        pass
-
    def get_index(self):
        self._index = self._index + 1
        return self._index


 class TTSAudioSplitHandle(TTSAudioHandle):
-    def __init__(self, human):
+    def __init__(self, context):
        super().__init__()
-        self._human = human
-        self.sample_rate = self._human.get_audio_sample_rate()
-        self._chunk = self.sample_rate // self._human.get_fps()
+        self._context = context
+        self.sample_rate = self._context.get_audio_sample_rate()
+        self._chunk = self.sample_rate // self._context.get_fps()

    def on_handle(self, stream, index):
        stream_len = stream.shape[0]
        idx = 0

        while stream_len >= self._chunk:
-            self._human.put_audio_frame(stream[idx:idx + self._chunk])
+            self._context.put_audio_frame(stream[idx:idx + self._chunk])
            stream_len -= self._chunk
            idx += self._chunk

--- a/tts/tts_audio_index.py
+++ b/tts/tts_audio_index.py
@@ -0,0 +1,10 @@
+#encoding = utf8
+
+
+class TTSAudioIndex:
+    def __init__(self):
+        self._index = 0
+
+    def get_index(self):
+        self._index = self._index + 1
+        return self._index