feat(engine): 优化 Vosk 字幕引擎支持

- 实现文件夹选择功能，用于选择 Vosk 模型路径 - 在 EngineControl 组件中添加模型路径选择按钮和相关提示 - 在 EngineStatus 组件中增加对空模型路径的检查和错误提示
feat(engine): 添加 Vosk 本地离线引擎支持
2026-02-23 10:14:42 +08:00 · 2025-07-10 11:22:39 +08:00 · 2025-07-09 19:53:30 +08:00 · 2025-07-09 02:34:15 +08:00
26 changed files with 496 additions and 46 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -7,5 +7,6 @@ out
 __pycache__
 subenv
 caption-engine/build
+caption-engine/models
 output.wav
-.venv
+.venv
--- a/README.md
+++ b/README.md
@@ -2,11 +2,13 @@
    <img src="./build/icon.png" width="100px" height="100px"/>
    <h1 align="center">auto-caption</h1>
    <p>Auto Caption 是一个跨平台的实时字幕显示软件。</p>
-    <img src="https://img.shields.io/badge/version-0.3.0-blue">
-    <img src="https://img.shields.io/github/issues/HiMeditator/auto-caption?color=orange">
-    <img src="https://img.shields.io/github/languages/top/HiMeditator/auto-caption?color=royalblue">
-    <img src="https://img.shields.io/github/repo-size/HiMeditator/auto-caption?color=green">
-    <img src="https://visitor-badge.laobi.icu/badge?page_id=himeditator.github.io">
+    <p>
+      <img src="https://img.shields.io/badge/version-0.3.0-blue">
+      <img src="https://img.shields.io/github/issues/HiMeditator/auto-caption?color=orange">
+      <img src="https://img.shields.io/github/languages/top/HiMeditator/auto-caption?color=royalblue">
+      <img src="https://img.shields.io/github/repo-size/HiMeditator/auto-caption?color=green">
+      <img src="https://visitor-badge.laobi.icu/badge?page_id=himeditator.auto-caption">
+    </p>
    <p>
        | <b>简体中文</b>
        | <a href="./README_en.md">English</a>
--- a/README_en.md
+++ b/README_en.md
@@ -2,11 +2,13 @@
    <img src="./build/icon.png" width="100px" height="100px"/>
    <h1 align="center">auto-caption</h1>
    <p>Auto Caption is a cross-platform real-time caption display software.</p>
-    <img src="https://img.shields.io/badge/version-0.3.0-blue">
-    <img src="https://img.shields.io/github/issues/HiMeditator/auto-caption?color=orange">
-    <img src="https://img.shields.io/github/languages/top/HiMeditator/auto-caption?color=royalblue">
-    <img src="https://img.shields.io/github/repo-size/HiMeditator/auto-caption?color=green">
-    <img src="https://visitor-badge.laobi.icu/badge?page_id=himeditator.github.io">
+    <p>
+      <img src="https://img.shields.io/badge/version-0.3.0-blue">
+      <img src="https://img.shields.io/github/issues/HiMeditator/auto-caption?color=orange">
+      <img src="https://img.shields.io/github/languages/top/HiMeditator/auto-caption?color=royalblue">
+      <img src="https://img.shields.io/github/repo-size/HiMeditator/auto-caption?color=green">
+      <img src="https://visitor-badge.laobi.icu/badge?page_id=himeditator.auto-caption">
+    </p>
    <p>
        | <a href="./README.md">简体中文</a>
        | <b>English</b>
--- a/README_ja.md
+++ b/README_ja.md
@@ -2,11 +2,13 @@
    <img src="./build/icon.png" width="100px" height="100px"/>
    <h1 align="center">auto-caption</h1>
    <p>Auto Caption はクロスプラットフォームのリアルタイム字幕表示ソフトウェアです。</p>
-    <img src="https://img.shields.io/badge/version-0.3.0-blue">
-    <img src="https://img.shields.io/github/issues/HiMeditator/auto-caption?color=orange">
-    <img src="https://img.shields.io/github/languages/top/HiMeditator/auto-caption?color=royalblue">
-    <img src="https://img.shields.io/github/repo-size/HiMeditator/auto-caption?color=green">
-    <img src="https://visitor-badge.laobi.icu/badge?page_id=himeditator.github.io">
+    <p>
+      <img src="https://img.shields.io/badge/version-0.3.0-blue">
+      <img src="https://img.shields.io/github/issues/HiMeditator/auto-caption?color=orange">
+      <img src="https://img.shields.io/github/languages/top/HiMeditator/auto-caption?color=royalblue">
+      <img src="https://img.shields.io/github/repo-size/HiMeditator/auto-caption?color=green">
+      <img src="https://visitor-badge.laobi.icu/badge?page_id=himeditator.auto-caption">
+    </p>
    <p>
        | <a href="./README.md">简体中文</a>
        | <a href="./README_en.md">English</a>
--- a/caption-engine/audioprcs/init.py
+++ b/caption-engine/audioprcs/init.py
@@ -1 +1 @@
-from .process import mergeChunkChannels, resampleRawChunk
+from .process import mergeChunkChannels, resampleRawChunk, resampleMonoChunk
--- a/caption-engine/audioprcs/process.py
+++ b/caption-engine/audioprcs/process.py
@@ -47,3 +47,22 @@ def resampleRawChunk(chunk, channels, orig_sr, target_sr, mode="sinc_best"):
    chunk_mono_r =  samplerate.resample(chunk_mono, ratio, converter_type=mode)
    chunk_mono_r = np.round(chunk_mono_r).astype(np.int16)
    return chunk_mono_r.tobytes()
+
+def resampleMonoChunk(chunk, orig_sr, target_sr, mode="sinc_best"):
+    """
+    Resample a single-channel 16-bit PCM audio chunk to a new sample rate.
+
+    Args:
+        chunk: (bytes) mono audio chunk, read as int16 samples
+        orig_sr: original sample rate
+        target_sr: target sample rate
+        mode: converter type, one of: 'sinc_best' | 'sinc_medium' | 'sinc_fastest' | 'zero_order_hold' | 'linear'
+
+    Return:
+        (bytes) resampled mono audio chunk (int16 samples, clipped to the int16 range)
+    """
+    chunk_np = np.frombuffer(chunk, dtype=np.int16)
+    ratio = target_sr / orig_sr
+    chunk_r = samplerate.resample(chunk_np, ratio, converter_type=mode)
+    chunk_r = np.clip(np.round(chunk_r), -32768, 32767).astype(np.int16)  # clip so resampler overshoot cannot wrap around in int16
+    return chunk_r.tobytes()
--- a/caption-engine/main-vosk.py
+++ b/caption-engine/main-vosk.py
@@ -0,0 +1,83 @@
+import sys
+import json
+import argparse
+from datetime import datetime
+import numpy.core.multiarray
+
+if sys.platform == 'win32':
+    from sysaudio.win import AudioStream
+elif sys.platform == 'darwin':
+    from sysaudio.darwin import AudioStream
+elif sys.platform == 'linux':
+    from sysaudio.linux import AudioStream
+else:
+    raise NotImplementedError(f"Unsupported platform: {sys.platform}")
+
+from vosk import Model, KaldiRecognizer, SetLogLevel
+from audioprcs import resampleRawChunk
+
+SetLogLevel(-1)
+
+def convert_audio_to_text(audio_type, chunk_rate, model_path):
+    """Recognize system audio with a Vosk model and write one JSON caption object per line to stdout."""
+    sys.stdout.reconfigure(line_buffering=True) # type: ignore
+    if model_path.startswith('"'):  # drop a literal opening quote around the path
+        model_path = model_path[1:]
+    if model_path.endswith('"'):  # drop a literal closing quote around the path
+        model_path = model_path[:-1]
+
+    model = Model(model_path)
+    recognizer = KaldiRecognizer(model, 16000)
+
+    stream = AudioStream(audio_type, chunk_rate)
+    stream.openStream()
+
+    time_str = ''  # start time of the caption currently being built
+    cur_id = 0  # index of the caption currently being built
+    prev_content = ''  # last partial text written out ('' when a new caption starts)
+
+    while True:
+        chunk = stream.read_chunk()
+        chunk_mono = resampleRawChunk(chunk, stream.CHANNELS, stream.RATE, 16000)
+
+        caption = {}
+        if recognizer.AcceptWaveform(chunk_mono):
+            content = json.loads(recognizer.Result()).get('text', '')
+            caption['index'] = cur_id
+            caption['text'] = content
+            caption['time_s'] = time_str
+            caption['time_t'] = datetime.now().strftime('%H:%M:%S.%f')[:-3]
+            caption['translation'] = ''
+            prev_content = ''  # the next partial result starts a new caption
+            cur_id += 1  # final result written: later captions get the next index
+        else:
+            content = json.loads(recognizer.PartialResult()).get('partial', '')
+            if content == '' or content == prev_content:
+                continue  # nothing new to report for this chunk
+            if prev_content == '':
+                time_str = datetime.now().strftime('%H:%M:%S.%f')[:-3]  # first partial of a caption fixes its start time
+            caption['index'] = cur_id
+            caption['text'] = content
+            caption['time_s'] = time_str
+            caption['time_t'] = datetime.now().strftime('%H:%M:%S.%f')[:-3]
+            caption['translation'] = ''
+            prev_content = content
+        try:
+            json_str = json.dumps(caption) + '\n'
+            sys.stdout.write(json_str)
+            sys.stdout.flush()
+        except Exception as e:
+            print(e, file=sys.stderr)  # stdout carries only JSON caption lines; report errors on stderr
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description='Convert system audio stream to text')
+    parser.add_argument('-a', '--audio_type', default=0, help='Audio stream source: 0 for output audio stream, 1 for input audio stream')
+    parser.add_argument('-c', '--chunk_rate', default=20, help='The number of audio stream chunks collected per second.')
+    parser.add_argument('-m', '--model_path', default='', help='The path to the vosk model.')
+    args = parser.parse_args()
+    convert_audio_to_text(
+        int(args.audio_type),
+        int(args.chunk_rate),
+        args.model_path
+    )
--- a/caption-engine/main-vosk.spec
+++ b/caption-engine/main-vosk.spec
@@ -0,0 +1,42 @@
+# -*- mode: python ; coding: utf-8 -*-
+
+from importlib.util import find_spec
+from pathlib import Path
+vosk_path = str(Path(find_spec('vosk').origin).resolve().parent)  # installed vosk package dir, independent of OS-specific venv layout
+
+a = Analysis(
+    ['main-vosk.py'],
+    pathex=[],
+    binaries=[],
+    datas=[(vosk_path, 'vosk')],
+    hiddenimports=[],
+    hookspath=[],
+    hooksconfig={},
+    runtime_hooks=[],
+    excludes=[],
+    noarchive=False,
+    optimize=0,
+)
+
+pyz = PYZ(a.pure)
+
+exe = EXE(
+    pyz,
+    a.scripts,
+    a.binaries,
+    a.datas,
+    [],
+    name='main-vosk',
+    debug=False,
+    bootloader_ignore_signals=False,
+    strip=False,
+    upx=True,
+    upx_exclude=[],
+    runtime_tmpdir=None,
+    console=True,
+    disable_windowed_traceback=False,
+    argv_emulation=False,
+    target_arch=None,
+    codesign_identity=None,
+    entitlements_file=None,
+)
--- a/caption-engine/requirements.txt
+++ b/caption-engine/requirements.txt
@@ -3,4 +3,5 @@ numpy
 samplerate
 PyAudio
 PyAudioWPatch # Windows only
+vosk
 pyinstaller
--- a/caption-engine/sysaudio/win.py
+++ b/caption-engine/sysaudio/win.py
@@ -57,7 +57,7 @@ class AudioStream:
        self.stream = None
        self.SAMP_WIDTH = pyaudio.get_sample_size(pyaudio.paInt16)
        self.FORMAT = pyaudio.paInt16
-        self.CHANNELS = self.device["maxInputChannels"]
+        self.CHANNELS = int(self.device["maxInputChannels"])
        self.RATE = int(self.device["defaultSampleRate"])
        self.CHUNK = self.RATE // chunk_rate
        self.INDEX = self.device["index"]
--- a/docs/TODO.md
+++ b/docs/TODO.md
@@ -9,11 +9,13 @@
 - [x] 添加复制字幕到剪贴板功能 *2025/07/08*
 - [x] 适配 macOS 平台 *2025/07/08*
 - [x] 添加字幕文字描边 *2025/07/09*
+- [x] 添加基于 Vosk 的字幕引擎 *2025/07/09*

 ## 待完成

+- [ ] 添加 Ollama 模型用于本地字幕引擎的翻译
+- [ ] 更新 README 和用户手册（字幕引擎构建、Vosk 模型获取和使用）
 - [ ] 添加本地字幕引擎
-  - [ ] 添加基于 Vosk 的字幕引擎
  - [ ] 验证 / 添加基于 FunASR 的字幕引擎
 - [ ] 减小软件不必要的体积

--- a/docs/api-docs/electron-ipc.md
+++ b/docs/api-docs/electron-ipc.md
@@ -44,6 +44,19 @@
 - 发送：无数据
 - 接收：`string`

+### `control.folder.select`
+
+**介绍：** 打开文件夹选择器，并将用户选择的文件夹路径返回给前端
+
+**发起方：** 前端控制窗口
+
+**接收方：** 后端控制窗口实例
+
+**数据类型：**
+
+- 发送：无数据
+- 接收：`string`
+
 ## 前端 ==> 后端

 ### `control.uiLanguage.change`
--- a/electron-builder.yml
+++ b/electron-builder.yml
@@ -10,8 +10,15 @@ files:
  - '!{.env,.env.*,.npmrc,pnpm-lock.yaml}'
  - '!{tsconfig.json,tsconfig.node.json,tsconfig.web.json}'
 extraResources:
-  from: ./caption-engine/dist/main-gummy
-  to: ./caption-engine/main-gummy
+  - from: ./caption-engine/dist/main-gummy.exe
+    to: ./caption-engine/main-gummy.exe
+  - from: ./caption-engine/dist/main-vosk.exe
+    to: ./caption-engine/main-vosk.exe
+  # For macOS and Linux
+  # - from: ./caption-engine/dist/main-gummy
+  #   to: ./caption-engine/main-gummy
+  # - from: ./caption-engine/dist/main-vosk
+  #   to: ./caption-engine/main-vosk
 asarUnpack:
  - resources/**
 win:
--- a/engine-test/vosk.ipynb
+++ b/engine-test/vosk.ipynb
@@ -0,0 +1,124 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "6fb12704",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "d:\\Projects\\auto-caption\\caption-engine\\subenv\\Lib\\site-packages\\vosk\\__init__.py\n"
+     ]
+    }
+   ],
+   "source": [
+    "import vosk\n",
+    "print(vosk.__file__)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "63a06f5c",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "        采样设备：\n",
+      "            - 设备类型：音频输入\n",
+      "            - 序号：1\n",
+      "            - 名称：麦克风阵列 (Realtek(R) Audio)\n",
+      "            - 最大输入通道数：2\n",
+      "            - 默认低输入延迟：0.09s\n",
+      "            - 默认高输入延迟：0.18s\n",
+      "            - 默认采样率：44100.0Hz\n",
+      "            - 是否回环设备：False\n",
+      "\n",
+      "        音频样本块大小：2205\n",
+      "        样本位宽：2\n",
+      "        采样格式：8\n",
+      "        音频通道数：2\n",
+      "        音频采样率：44100\n",
+      "        \n"
+     ]
+    }
+   ],
+   "source": [
+    "import sys\n",
+    "import os\n",
+    "import json\n",
+    "from vosk import Model, KaldiRecognizer\n",
+    "\n",
+    "current_dir = os.getcwd() \n",
+    "sys.path.append(os.path.join(current_dir, '../caption-engine'))\n",
+    "\n",
+    "from sysaudio.win import AudioStream\n",
+    "from audioprcs import resampleRawChunk, mergeChunkChannels\n",
+    "\n",
+    "stream = AudioStream(1)\n",
+    "stream.printInfo()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "5d5a0afa",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model = Model(os.path.join(\n",
+    "    current_dir,\n",
+    "    '../caption-engine/models/vosk-model-small-cn-0.22'\n",
+    "))\n",
+    "recognizer = KaldiRecognizer(model, 16000)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7e9d1530",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "stream.openStream()\n",
+    "\n",
+    "for i in range(200):\n",
+    "    chunk = stream.read_chunk()\n",
+    "    chunk_mono = resampleRawChunk(chunk, stream.CHANNELS, stream.RATE, 16000)\n",
+    "    if recognizer.AcceptWaveform(chunk_mono):\n",
+    "        result = json.loads(recognizer.Result())\n",
+    "        print(\"acc:\", result.get(\"text\", \"\"))\n",
+    "    else:\n",
+    "        partial = json.loads(recognizer.PartialResult())\n",
+    "        print(\"else:\", partial.get(\"partial\", \"\"))"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "subenv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.1"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
--- a/src/main/ControlWindow.ts
+++ b/src/main/ControlWindow.ts
@@ -1,4 +1,4 @@
-import { shell, BrowserWindow, ipcMain, nativeTheme } from 'electron'
+import { shell, BrowserWindow, ipcMain, nativeTheme, dialog } from 'electron'
 import path from 'path'
 import { is } from '@electron-toolkit/utils'
 import icon from '../../build/icon.png?asset'
@@ -72,6 +72,15 @@ class ControlWindow {
      return allConfig.uiTheme
    })

+    ipcMain.handle('control.folder.select', async () => {
+      const result = await dialog.showOpenDialog({
+        properties: ['openDirectory']
+      });
+
+      if (result.canceled) return "";
+      return result.filePaths[0];
+    })
+
    ipcMain.on('control.uiLanguage.change', (_, args) => {
      allConfig.uiLanguage = args
      if(captionWindow.window){
--- a/src/main/types/index.ts
+++ b/src/main/types/index.ts
@@ -6,10 +6,11 @@ export interface Controls {
  engineEnabled: boolean,
  sourceLang: string,
  targetLang: string,
-  engine: 'gummy',
+  engine: string,
  audio: 0 | 1,
  translation: boolean,
  API_KEY: string,
+  modelPath: string,
  customized: boolean,
  customizedApp: string,
  customizedCommand: string
--- a/src/main/utils/AllConfig.ts
+++ b/src/main/utils/AllConfig.ts
@@ -34,6 +34,7 @@ const defaultControls: Controls = {
  audio: 0,
  engineEnabled: false,
  API_KEY: '',
+  modelPath: '',
  translation: true,
  customized: false,
  customizedApp: '',
--- a/src/main/utils/CaptionEngine.ts
+++ b/src/main/utils/CaptionEngine.ts
@@ -13,26 +13,20 @@ export class CaptionEngine {
  processStatus: 'running' | 'stopping' | 'stopped' = 'stopped'

  private getApp(): boolean {
+    allConfig.controls.customized = false
    if (allConfig.controls.customized && allConfig.controls.customizedApp) {
      this.appPath = allConfig.controls.customizedApp
      this.command = [allConfig.controls.customizedCommand]
+      allConfig.controls.customized = true
    }
    else if (allConfig.controls.engine === 'gummy') {
-      allConfig.controls.customized = false
      if(!allConfig.controls.API_KEY && !process.env.DASHSCOPE_API_KEY) {
        controlWindow.sendErrorMessage(i18n('gummy.key.missing'))
        return false
      }
-      let gummyName = ''
+      let gummyName = 'main-gummy'
      if (process.platform === 'win32') {
-        gummyName = 'main-gummy.exe'
-      }
-      else if (process.platform === 'darwin' || process.platform === 'linux') {
-        gummyName = 'main-gummy'
-      }
-      else {
-        controlWindow.sendErrorMessage(i18n('platform.unsupported') + process.platform)
-        throw new Error(i18n('platform.unsupported'))
+        gummyName += '.exe'
      }
      if (is.dev) {
        this.appPath = path.join(
@@ -55,10 +49,29 @@ export class CaptionEngine {
      if(allConfig.controls.API_KEY) {
        this.command.push('-k', allConfig.controls.API_KEY)
      }
-
-      console.log('[INFO] Engine Path:', this.appPath)
-      console.log('[INFO] Engine Command:', this.command)
    }
+    else if(allConfig.controls.engine === 'vosk'){
+      let voskName = 'main-vosk'
+      if (process.platform === 'win32') {
+        voskName += '.exe'
+      }
+      if (is.dev) {
+        this.appPath = path.join(
+          app.getAppPath(),
+          'caption-engine', 'dist', voskName
+        )
+      }
+      else {
+        this.appPath = path.join(
+          process.resourcesPath, 'caption-engine', voskName
+        )
+      }
+      this.command = []
+      this.command.push('-a', allConfig.controls.audio ? '1' : '0')
+      this.command.push('-m', `"${allConfig.controls.modelPath}"`)
+    }
+    console.log('[INFO] Engine Path:', this.appPath)
+    console.log('[INFO] Engine Command:', this.command)
    return true
  }

--- a/src/renderer/src/components/EngineControl.vue
+++ b/src/renderer/src/components/EngineControl.vue
@@ -16,6 +16,7 @@
    <div class="input-item">
      <span class="input-label">{{ $t('engine.transLang') }}</span>
      <a-select
+        :disabled="currentEngine === 'vosk'"
        class="input-area"
        v-model:value="currentTargetLang"
        :options="langList.filter((item) => item.value !== 'auto')"
@@ -47,15 +48,38 @@
        <a-switch v-model:checked="showMore" />
      </div>
    </div>
-    <a-card size="small" :title="$t('engine.custom.title')" v-show="showMore">
+
+    <a-card size="small" :title="$t('engine.showMore')" v-show="showMore">
      <div class="input-item">
-        <span class="input-label">{{ $t('engine.apikey') }}</span>
+        <a-popover>
+          <template #content>
+            <p class="label-hover-info">{{ $t('engine.apikeyInfo') }}</p>
+          </template>
+          <span class="input-label info-label">{{ $t('engine.apikey') }}</span>
+        </a-popover>
        <a-input
          class="input-area"
          type="password"
          v-model:value="currentAPI_KEY"
        />
      </div>
+      <div class="input-item">
+        <a-popover>
+          <template #content>
+            <p class="label-hover-info">{{ $t('engine.modelPathInfo') }}</p>
+          </template>
+          <span class="input-label info-label">{{ $t('engine.modelPath') }}</span>
+        </a-popover>
+        <span
+          class="input-folder"
+          @click="selectFolderPath"
+        ><span><FolderOpenOutlined /></span></span>
+        <a-input
+          class="input-area"
+          style="width:calc(100% - 140px);"
+          v-model:value="currentModelPath"
+        />
+      </div>
      <div class="input-item">
        <span style="margin-right:5px;">{{ $t('engine.customEngine') }}</span>
        <a-switch v-model:checked="currentCustomized" />
@@ -85,9 +109,8 @@
            ></a-input>
          </div>
        </a-card>
-      </div>      
+      </div>
    </a-card>
-
  </a-card>
  <div style="height: 20px;"></div>
 </template>
@@ -95,9 +118,10 @@
 <script setup lang="ts">
 import { ref, computed, watch } from 'vue'
 import { storeToRefs } from 'pinia'
+import { useGeneralSettingStore } from '@renderer/stores/generalSetting'
 import { useEngineControlStore } from '@renderer/stores/engineControl'
 import { notification } from 'ant-design-vue'
-import { InfoCircleOutlined } from '@ant-design/icons-vue';
+import { FolderOpenOutlined ,InfoCircleOutlined } from '@ant-design/icons-vue';
 import { useI18n } from 'vue-i18n'

 const { t } = useI18n()
@@ -108,10 +132,11 @@ const { platform, captionEngine, audioType, changeSignal } = storeToRefs(engineC

 const currentSourceLang = ref('auto')
 const currentTargetLang = ref('zh')
-const currentEngine = ref<'gummy'>('gummy')
+const currentEngine = ref<string>('gummy')
 const currentAudio = ref<0 | 1>(0)
 const currentTranslation = ref<boolean>(false)
 const currentAPI_KEY = ref<string>('')
+const currentModelPath = ref<string>('')
 const currentCustomized = ref<boolean>(false)
 const currentCustomizedApp = ref('')
 const currentCustomizedCommand = ref('')
@@ -132,6 +157,7 @@ function applyChange(){
  engineControl.audio = currentAudio.value
  engineControl.translation = currentTranslation.value
  engineControl.API_KEY = currentAPI_KEY.value
+  engineControl.modelPath = currentModelPath.value
  engineControl.customized = currentCustomized.value
  engineControl.customizedApp = currentCustomizedApp.value
  engineControl.customizedCommand = currentCustomizedCommand.value
@@ -151,22 +177,70 @@ function cancelChange(){
  currentAudio.value = engineControl.audio
  currentTranslation.value = engineControl.translation
  currentAPI_KEY.value = engineControl.API_KEY
+  currentModelPath.value = engineControl.modelPath
  currentCustomized.value = engineControl.customized
  currentCustomizedApp.value = engineControl.customizedApp
  currentCustomizedCommand.value = engineControl.customizedCommand
 }

+function selectFolderPath() {
+  window.electron.ipcRenderer.invoke('control.folder.select').then((folderPath) => {
+    if(!folderPath) return
+    currentModelPath.value = folderPath
+  })
+}
+
 watch(changeSignal, (val) => {
  if(val == true) {
    cancelChange();
    engineControl.changeSignal = false;
  }
 })
+
+watch(currentEngine, (val) => {
+  if(val == 'vosk'){
+    currentSourceLang.value = 'auto'
+    currentTargetLang.value = ''
+  }
+  else if(val == 'gummy'){
+    currentSourceLang.value = 'auto'
+    currentTargetLang.value = useGeneralSettingStore().uiLanguage
+  }
+})
 </script>

 <style scoped>
@import url(../assets/input.css);

+.label-hover-info {
+  margin-top: 10px;
+  max-width: min(36vw, 380px);
+}
+
+.info-label {
+  color: #1677ff;
+  cursor: pointer;
+}
+
+.input-folder {
+  display:inline-block;
+  width: 40px;
+  font-size:1.38em;
+  cursor: pointer;
+  transition: all 0.25s;
+}
+
+.input-folder>span {
+  padding: 0 4px;
+  border: 2px solid #1677ff;
+  color: #1677ff;
+  border-radius: 30%;
+}
+
+.input-folder:hover {
+  transform: scale(1.1);
+}
+
 .customize-note {
  padding: 10px 10px 0;
  color: red;
--- a/src/renderer/src/components/EngineStatus.vue
+++ b/src/renderer/src/components/EngineStatus.vue
@@ -106,6 +106,11 @@ function openCaptionWindow() {
 }

 function startEngine() {
+  // A model path is only required by the built-in Vosk engine; Gummy and custom engines must start without one.
+  if(!(engineControl.customized && engineControl.customizedApp) && engineControl.engine === 'vosk' && engineControl.modelPath.trim() === '') {
+    engineControl.emptyModelPathErr()
+    return
+  }
   window.electron.ipcRenderer.send('control.engine.start')
 }

--- a/src/renderer/src/i18n/config/engine.ts
+++ b/src/renderer/src/i18n/config/engine.ts
@@ -16,6 +16,13 @@ export const engines = {
        { value: 'it', label: '意大利语' },
      ]
    },
+    {
+      value: 'vosk',
+      label: '本地 -  Vosk',
+      languages: [
+        { value: 'auto', label: '需要自行配置模型' },
+      ]
+    }
  ],
  en: [
    {
@@ -34,6 +41,13 @@ export const engines = {
        { value: 'it', label: 'Italian' },
      ]
    },
+    {
+      value: 'vosk',
+      label: 'Local - Vosk',
+      languages: [
+        { value: 'auto', label: 'Model needs to be configured manually' },
+      ]
+    }
  ],
  ja: [
    {
@@ -52,6 +66,13 @@ export const engines = {
        { value: 'it', label: 'イタリア語' },
      ]
    },
+    {
+      value: 'vosk',
+      label: 'ローカル - Vosk',
+      languages: [
+        { value: 'auto', label: 'モデルを手動で設定する必要があります' },
+      ]
+    }
  ]
 }

--- a/src/renderer/src/i18n/lang/en.ts
+++ b/src/renderer/src/i18n/lang/en.ts
@@ -17,6 +17,8 @@ export default {
    "custom": "Type: Custom engine, engine path: ",
    "args": ", command arguments: ",
    "pidInfo": ", caption engine process PID: ",
+    "empty": "Model Path is Empty",
+    "emptyInfo": "The Vosk model path is empty. Please set the Vosk model path in the additional settings of the subtitle engine settings.",
    "stopped": "Caption Engine Stopped",
    "stoppedInfo": "The caption engine has stopped. You can click the 'Start Caption Engine' button to restart it.",
    "error": "An error occurred",
@@ -48,6 +50,9 @@ export default {
    "enableTranslation": "Translation",
    "showMore": "More Settings",
    "apikey": "API KEY",
+    "modelPath": "Model Path",
+    "apikeyInfo": "API KEY required for the Gummy subtitle engine, which needs to be obtained from the Alibaba Cloud Bailian platform. For more details, see the project user manual.",
+    "modelPathInfo": "The folder path of the model required by the Vosk subtitle engine. You need to download the required model to your local machine in advance. For more details, see the project user manual.",
    "customEngine": "Custom Engine",
    custom: {
      "title": "Custom Caption Engine",
--- a/src/renderer/src/i18n/lang/ja.ts
+++ b/src/renderer/src/i18n/lang/ja.ts
@@ -17,6 +17,8 @@ export default {
    "custom": "タイプ：カスタムエンジン、エンジンパス：",
    "args": "、コマンド引数：",
    "pidInfo": "、字幕エンジンプロセス PID：",
+    "empty": "モデルパスが空です",
+    "emptyInfo": "Vosk モデルのパスが空です。字幕エンジン設定の追加設定で Vosk モデルのパスを設定してください。",
    "stopped": "字幕エンジンが停止しました",
    "stoppedInfo": "字幕エンジンが停止しました。再起動するには「字幕エンジンを開始」ボタンをクリックしてください。",
    "error": "エラーが発生しました",
@@ -48,6 +50,9 @@ export default {
    "enableTranslation": "翻訳",
    "showMore": "詳細設定",
    "apikey": "API KEY",
+    "modelPath": "モデルパス",
+    "apikeyInfo": "Gummy 字幕エンジンに必要な API KEY は、アリババクラウド百煉プラットフォームから取得する必要があります。詳細情報はプロジェクトのユーザーマニュアルをご覧ください。",
+    "modelPathInfo": "Vosk 字幕エンジンに必要なモデルのフォルダパスです。必要なモデルを事前にローカルマシンにダウンロードする必要があります。詳細情報はプロジェクトのユーザーマニュアルをご覧ください。",
    "customEngine": "カスタムエンジン",
    custom: {
      "title": "カスタムキャプションエンジン",
--- a/src/renderer/src/i18n/lang/zh.ts
+++ b/src/renderer/src/i18n/lang/zh.ts
@@ -17,6 +17,8 @@ export default {
    "custom": "类型：自定义引擎，引擎路径：",
    "args": "，命令参数：",
    "pidInfo": "，字幕引擎进程 PID：",
+    "empty": "模型路径为空",
+    "emptyInfo": "Vosk 模型路径为空，请在字幕引擎设置的更多设置中设置 Vosk 模型的路径。",
    "stopped": "字幕引擎停止",
    "stoppedInfo": "字幕引擎已经停止，可点击“启动字幕引擎”按钮重新启动",
    "error": "发生错误",
@@ -48,6 +50,9 @@ export default {
    "enableTranslation": "启用翻译",
    "showMore": "更多设置",
    "apikey": "API KEY",
+    "modelPath": "模型路径",
+    "apikeyInfo": "Gummy 字幕引擎需要的 API KEY，需要在阿里云百炼平台获取。详细信息见项目用户手册。",
+    "modelPathInfo": "Vosk 字幕引擎需要的模型的文件夹路径，需要提前下载需要的模型到本地。信息详情见项目用户手册。",
    "customEngine": "自定义引擎",
    custom: {
      "title": "自定义字幕引擎",
--- a/src/renderer/src/stores/engineControl.ts
+++ b/src/renderer/src/stores/engineControl.ts
@@ -16,13 +16,14 @@ export const useEngineControlStore = defineStore('engineControl', () => {

  const captionEngine = ref(engines[useGeneralSettingStore().uiLanguage])
  const audioType = ref(audioTypes[useGeneralSettingStore().uiLanguage])
-  const API_KEY = ref<string>('')
  const engineEnabled = ref(false)
  const sourceLang = ref<string>('en')
  const targetLang = ref<string>('zh')
-  const engine = ref<'gummy'>('gummy')
+  const engine = ref<string>('gummy')
  const audio = ref<0 | 1>(0)
  const translation = ref<boolean>(true)
+  const API_KEY = ref<string>('')
+  const modelPath = ref<string>('')
  const customized = ref<boolean>(false)
  const customizedApp = ref<string>('')
  const customizedCommand = ref<string>('')
@@ -38,6 +39,7 @@ export const useEngineControlStore = defineStore('engineControl', () => {
      audio: audio.value,
      translation: translation.value,
      API_KEY: API_KEY.value,
+      modelPath: modelPath.value,
      customized: customized.value,
      customizedApp: customizedApp.value,
      customizedCommand: customizedCommand.value
@@ -53,12 +55,20 @@ export const useEngineControlStore = defineStore('engineControl', () => {
    engineEnabled.value = controls.engineEnabled
    translation.value = controls.translation
    API_KEY.value = controls.API_KEY
+    modelPath.value = controls.modelPath
    customized.value = controls.customized
    customizedApp.value = controls.customizedApp
    customizedCommand.value = controls.customizedCommand
    changeSignal.value = true
  }

+  function emptyModelPathErr() {
+    notification.open({
+      message: t('noti.empty'),
+      description: t('noti.emptyInfo')
+    });
+  }
+
  window.electron.ipcRenderer.on('control.controls.set', (_, controls: Controls) => {
    setControls(controls)
  })
@@ -102,7 +112,7 @@ export const useEngineControlStore = defineStore('engineControl', () => {

  return {
    platform,           // 系统平台
-    captionEngine,      // 字幕引擎
+    captionEngine,      // 字幕引擎列表
    audioType,          // 音频类型
    engineEnabled,      // 字幕引擎是否启用
    sourceLang,         // 源语言
@@ -111,11 +121,13 @@ export const useEngineControlStore = defineStore('engineControl', () => {
    audio,              // 选择音频
    translation,        // 是否启用翻译
    API_KEY,            // API KEY
+    modelPath,          // vosk 模型路径
    customized,         // 是否使用自定义字幕引擎
    customizedApp,      // 自定义字幕引擎的应用程序
    customizedCommand,  // 自定义字幕引擎的命令
    setControls,        // 设置引擎配置
    sendControlsChange, // 发送最新控制消息到后端
+    emptyModelPathErr,  // 模型路径为空时显示警告
    changeSignal,       // 配置改变信号
  }
 })
--- a/src/renderer/src/types/index.ts
+++ b/src/renderer/src/types/index.ts
@@ -6,10 +6,11 @@ export interface Controls {
  engineEnabled: boolean,
  sourceLang: string,
  targetLang: string,
-  engine: 'gummy',
+  engine: string,
  audio: 0 | 1,
  translation: boolean,
  API_KEY: string,
+  modelPath: string,
  customized: boolean,
  customizedApp: string,
  customizedCommand: string