feat(engine): optimize caption engine output format, prepare to merge the two caption engines

- Refactor caption-engine code
- Prepare to merge the two caption engines
himeditator
2025-07-27 17:15:12 +08:00
parent 3792eb88b6
commit b658ef5440
11 changed files with 205 additions and 45 deletions


@@ -15,18 +15,20 @@ class Callback(TranslationRecognizerCallback):
"""
def __init__(self):
super().__init__()
self.index = 0
self.usage = 0
self.cur_id = -1
self.index = 0
self.time_str = ''
def on_open(self) -> None:
self.usage = 0
self.cur_id = -1
self.time_str = ''
stdout_cmd('info', 'Gummy translator started.')
def on_close(self) -> None:
stdout_cmd('info', 'Gummy translator closed.')
stdout_cmd('usage', str(self.usage))
def on_event(
self,
@@ -46,7 +48,6 @@ class Callback(TranslationRecognizerCallback):
            caption['index'] = self.index
            caption['time_s'] = self.time_str
            caption['time_t'] = datetime.now().strftime('%H:%M:%S.%f')[:-3]
            caption['end'] = transcription_result.is_sentence_end
            caption['text'] = transcription_result.text
            caption['translation'] = ""
@@ -57,7 +58,8 @@ class Callback(TranslationRecognizerCallback):
        if usage:
            self.usage += usage['duration']
        if 'text' in caption:
            stdout_obj(caption)
class GummyTranslator:
@@ -88,7 +90,7 @@ class GummyTranslator:
        self.translator.start()

    def send_audio_frame(self, data):
        """Send an audio frame; the engine automatically recognizes it and writes the recognition result to standard output"""
        self.translator.send_audio_frame(data)

    def stop(self):
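The `stdout_cmd` and `stdout_obj` helpers imported from `utils` are not shown in this diff. A minimal sketch of what they plausibly do, assuming a JSON-lines protocol over stdout (only the call signatures visible above are confirmed; the `content` field name is an assumption):

import json
import sys

def stdout_obj(obj: dict) -> None:
    # Emit one JSON object per line so the reading side can parse stdout incrementally.
    sys.stdout.write(json.dumps(obj, ensure_ascii=False) + '\n')
    sys.stdout.flush()

def stdout_cmd(command: str, content: str) -> None:
    # Non-caption messages ('info', 'usage') reuse the same JSON-lines channel (assumed shape).
    stdout_obj({'command': command, 'content': content})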

engine/audio2text/vosk.py Normal file

@@ -0,0 +1,59 @@
import json
from datetime import datetime

from vosk import Model, KaldiRecognizer, SetLogLevel

from utils import stdout_obj


class VoskRecognizer:
    """
    Streams audio data through the Vosk engine and writes JSON strings that the
    Auto Caption app can read to standard output.

    Init parameters:
        model_path: path to the Vosk recognition model
    """
    def __init__(self, model_path: str):
        SetLogLevel(-1)
        if model_path.startswith('"'):
            model_path = model_path[1:]
        if model_path.endswith('"'):
            model_path = model_path[:-1]
        self.model_path = model_path
        self.time_str = ''
        self.cur_id = 0
        self.prev_content = ''
        self.model = Model(self.model_path)
        self.recognizer = KaldiRecognizer(self.model, 16000)
    def send_audio_frame(self, data: bytes):
        """
        Send an audio frame to the Vosk engine; the engine automatically recognizes it
        and writes the recognition result to standard output.

        Args:
            data: audio frame data; the sample rate must be 16000 Hz
        """
        caption = {}
        caption['command'] = 'caption'
        caption['translation'] = ''
        if self.recognizer.AcceptWaveform(data):
            content = json.loads(self.recognizer.Result()).get('text', '')
            caption['index'] = self.cur_id
            caption['text'] = content
            caption['time_s'] = self.time_str
            caption['time_t'] = datetime.now().strftime('%H:%M:%S.%f')[:-3]
            self.prev_content = ''
            self.cur_id += 1
        else:
            content = json.loads(self.recognizer.PartialResult()).get('partial', '')
            if content == '' or content == self.prev_content:
                return
            if self.prev_content == '':
                self.time_str = datetime.now().strftime('%H:%M:%S.%f')[:-3]
            caption['index'] = self.cur_id
            caption['text'] = content
            caption['time_s'] = self.time_str
            caption['time_t'] = datetime.now().strftime('%H:%M:%S.%f')[:-3]
            self.prev_content = content
        stdout_obj(caption)
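A hedged usage sketch for VoskRecognizer, assuming 16 kHz, 16-bit mono PCM frames; the real audio-capture layer lives elsewhere in the engine, so a raw PCM file and the chunk math below are illustrative stand-ins:

# Hypothetical driver loop, not part of this commit.
chunk_rate = 20                          # chunks per second, matching main.py's default
chunk_size = 16000 * 2 // chunk_rate     # 16 kHz sample rate, 2 bytes per sample

recognizer = VoskRecognizer('./models/vosk-model-small-en-us-0.15')
with open('capture.pcm', 'rb') as audio_stream:
    while True:
        data = audio_stream.read(chunk_size)
        if not data:
            break
        recognizer.send_audio_frame(data)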


@@ -49,6 +49,7 @@ def convert_audio_to_text(audio_type, chunk_rate, model_path):
            continue
        if prev_content == '':
            time_str = datetime.now().strftime('%H:%M:%S.%f')[:-3]
        caption['command'] = 'caption'
        caption['index'] = cur_id
        caption['text'] = content
        caption['time_s'] = time_str
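With the added 'command' field, the legacy script now emits the same envelope as the new VoskRecognizer above; a single emitted line might look like this (values illustrative):

{"command": "caption", "index": 3, "text": "hello world", "translation": "", "time_s": "17:15:01.120", "time_t": "17:15:02.480"}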

engine/main.py Normal file

@@ -0,0 +1,37 @@
import argparse


def gummy_engine(s, t, a, c, k):
    # Stub; per the commit message, the merge of the two caption engines will fill this in.
    pass


def vosk_engine(a, c, m):
    # Stub; to be wired to VoskRecognizer (engine/audio2text/vosk.py).
    pass


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Convert system audio stream to text')
    # both
    parser.add_argument('-e', '--caption_engine', default='gummy', help='Caption engine: gummy or vosk')
    parser.add_argument('-a', '--audio_type', default=0, help='Audio stream source: 0 for output, 1 for input')
    parser.add_argument('-c', '--chunk_rate', default=20, help='Number of audio stream chunks collected per second')
    # gummy
    parser.add_argument('-s', '--source_language', default='en', help='Source language code')
    parser.add_argument('-t', '--target_language', default='zh', help='Target language code')
    parser.add_argument('-k', '--api_key', default='', help='API KEY for Gummy model')
    # vosk
    parser.add_argument('-m', '--model_path', default='', help='The path to the vosk model.')

    args = parser.parse_args()
    if args.caption_engine == 'gummy':
        gummy_engine(
            args.source_language,
            args.target_language,
            int(args.audio_type),
            int(args.chunk_rate),
            args.api_key
        )
    elif args.caption_engine == 'vosk':
        vosk_engine(
            int(args.audio_type),
            int(args.chunk_rate),
            args.model_path
        )
    else:
        raise ValueError('Invalid caption engine specified.')
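Assuming the two stubs above are later wired to the Gummy and Vosk engines, invocation would presumably look like this (the API key and model path are placeholders):

# Gummy (cloud model, needs an API key)
python engine/main.py -e gummy -a 0 -c 20 -s en -t zh -k YOUR_API_KEY

# Vosk (local model)
python engine/main.py -e vosk -a 0 -c 20 -m ./models/vosk-model-small-en-us-0.15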