Mirror of https://github.com/HiMeditator/auto-caption.git, synced 2026-02-15 20:34:47 +08:00
feat(engine): add Vosk local offline engine support
- Add Vosk engine configuration and recognition logic
- Update the user interface with a Vosk engine option and a model-path setting
- Update dependencies to add the vosk library
caption-engine/main-vosk.py (new file, 83 lines)
@@ -0,0 +1,83 @@
import sys
import json
import argparse
from datetime import datetime
# Imported for its side effect only: referencing the module explicitly
# helps bundlers that miss hidden numpy imports pick it up.
import numpy.core.multiarray

# Select the platform-specific system audio capture backend.
if sys.platform == 'win32':
    from sysaudio.win import AudioStream
elif sys.platform == 'darwin':
    from sysaudio.darwin import AudioStream
elif sys.platform == 'linux':
    from sysaudio.linux import AudioStream
else:
    raise NotImplementedError(f"Unsupported platform: {sys.platform}")

from vosk import Model, KaldiRecognizer, SetLogLevel
from audioprcs import resampleRawChunk

# Silence Vosk/Kaldi logging so stdout carries only caption JSON.
SetLogLevel(-1)


def convert_audio_to_text(audio_type, chunk_rate, model_path):
    sys.stdout.reconfigure(line_buffering=True)  # type: ignore

    # Strip quotes that a shell or config UI may have left around the path.
    if model_path.startswith('"'):
        model_path = model_path[1:]
    if model_path.endswith('"'):
        model_path = model_path[:-1]

    model = Model(model_path)
    # Vosk expects 16 kHz mono PCM; the stream is resampled to match below.
    recognizer = KaldiRecognizer(model, 16000)

    stream = AudioStream(audio_type, chunk_rate)
    stream.openStream()

    time_str = ''      # start time of the caption currently being built
    cur_id = 0         # index of the current caption line
    prev_content = ''  # last partial result, used to skip duplicates

    while True:
        chunk = stream.read_chunk()
        chunk_mono = resampleRawChunk(chunk, stream.CHANNELS, stream.RATE, 16000)

        caption = {}
        if recognizer.AcceptWaveform(chunk_mono):
            # Final result: emit the finished caption and start a new one.
            content = json.loads(recognizer.Result()).get('text', '')
            caption['index'] = cur_id
            caption['text'] = content
            caption['time_s'] = time_str
            caption['time_t'] = datetime.now().strftime('%H:%M:%S.%f')[:-3]
            caption['translation'] = ''
            prev_content = ''
            cur_id += 1
        else:
            # Partial result: emit only when the text actually changed.
            content = json.loads(recognizer.PartialResult()).get('partial', '')
            if content == '' or content == prev_content:
                continue
            if prev_content == '':
                # First partial of a new caption marks its start time.
                time_str = datetime.now().strftime('%H:%M:%S.%f')[:-3]
            caption['index'] = cur_id
            caption['text'] = content
            caption['time_s'] = time_str
            caption['time_t'] = datetime.now().strftime('%H:%M:%S.%f')[:-3]
            caption['translation'] = ''
            prev_content = content
        try:
            # One JSON object per line; the parent process reads stdout.
            json_str = json.dumps(caption) + '\n'
            sys.stdout.write(json_str)
            sys.stdout.flush()
        except Exception as e:
            # Report errors on stderr so they cannot corrupt the JSON stream.
            print(e, file=sys.stderr)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Convert system audio stream to text')
    parser.add_argument('-a', '--audio_type', default=0,
                        help='Audio stream source: 0 for output audio stream, 1 for input audio stream')
    parser.add_argument('-c', '--chunk_rate', default=20,
                        help='The number of audio stream chunks collected per second.')
    parser.add_argument('-m', '--model_path', default='',
                        help='The path to the vosk model.')
    args = parser.parse_args()
    convert_audio_to_text(
        int(args.audio_type),
        int(args.chunk_rate),
        args.model_path
    )
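For reference, a minimal invocation sketch; the flags come straight from the argparse definition above, and the model directory is a placeholder for wherever a pretrained Vosk model (e.g. one downloaded from https://alphacephei.com/vosk/models) has been unpacked:

    # Capture the system output stream at 20 chunks per second
    # using a local Vosk model (path is illustrative).
    python caption-engine/main-vosk.py -a 0 -c 20 -m /path/to/vosk-model-small-en-us-0.15

Each caption is then written to stdout as one JSON object per line, with values illustrative here:

    {"index": 0, "text": "hello world", "time_s": "10:15:01.250", "time_t": "10:15:02.980", "translation": ""}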