feat(engine): 添加 Vosk 本地离线引擎支持

- 新增 Vosk 引擎配置和识别逻辑
- 更新用户界面，增加 Vosk 引擎选项和模型路径设置
- 更新依赖，添加 vosk 库
2026-02-04 04:14:42 +08:00 · 2025-07-09 19:53:30 +08:00
parent f97b885411
commit 1c29fd5adc
19 changed files with 389 additions and 41 deletions
--- a/engine-test/vosk.ipynb
+++ b/engine-test/vosk.ipynb
@@ -0,0 +1,124 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "6fb12704",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "d:\\Projects\\auto-caption\\caption-engine\\subenv\\Lib\\site-packages\\vosk\\__init__.py\n"
+     ]
+    }
+   ],
+   "source": [
+    "import vosk\n",
+    "print(vosk.__file__)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "63a06f5c",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "        采样设备：\n",
+      "            - 设备类型：音频输入\n",
+      "            - 序号：1\n",
+      "            - 名称：麦克风阵列 (Realtek(R) Audio)\n",
+      "            - 最大输入通道数：2\n",
+      "            - 默认低输入延迟：0.09s\n",
+      "            - 默认高输入延迟：0.18s\n",
+      "            - 默认采样率：44100.0Hz\n",
+      "            - 是否回环设备：False\n",
+      "\n",
+      "        音频样本块大小：2205\n",
+      "        样本位宽：2\n",
+      "        采样格式：8\n",
+      "        音频通道数：2\n",
+      "        音频采样率：44100\n",
+      "        \n"
+     ]
+    }
+   ],
+   "source": [
+    "import sys\n",
+    "import os\n",
+    "import json\n",
+    "from vosk import Model, KaldiRecognizer\n",
+    "\n",
+    "current_dir = os.getcwd() \n",
+    "sys.path.append(os.path.join(current_dir, '../caption-engine'))\n",
+    "\n",
+    "from sysaudio.win import AudioStream\n",
+    "from audioprcs import resampleRawChunk, mergeChunkChannels\n",
+    "\n",
+    "stream = AudioStream(1)\n",
+    "stream.printInfo()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "5d5a0afa",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model = Model(os.path.join(\n",
+    "    current_dir,\n",
+    "    '../caption-engine/models/vosk-model-small-cn-0.22'\n",
+    "))\n",
+    "recognizer = KaldiRecognizer(model, 16000)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7e9d1530",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "stream.openStream()\n",
+    "\n",
+    "for i in range(200):\n",
+    "    chunk = stream.read_chunk()\n",
+    "    chunk_mono = resampleRawChunk(chunk, stream.CHANNELS, stream.RATE, 16000)\n",
+    "    if recognizer.AcceptWaveform(chunk_mono):\n",
+    "        result = json.loads(recognizer.Result())\n",
+    "        print(\"acc:\", result.get(\"text\", \"\"))\n",
+    "    else:\n",
+    "        partial = json.loads(recognizer.PartialResult())\n",
+    "        print(\"else:\", partial.get(\"partial\", \"\"))"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "subenv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.1"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}