feat(vosk): 为 Vosk 模型添加非实时翻译功能 (#14)

- 添加 Ollama 大模型翻译和 Google 翻译（非实时），支持多种语言
- 为 Vosk 引擎添加非实时翻译
- 为新增的翻译功能添加和修改接口
- 修改 Electron 构建配置，之后不同平台构建无需修改构建文件
2026-02-15 04:14:46 +08:00 · 2025-09-02 23:19:53 +08:00
parent 56fdc348f8
commit 14987cbfc5
16 changed files with 176 additions and 61 deletions
--- a/README.md
+++ b/README.md
@@ -188,15 +188,3 @@ npm run build:mac
 # For Linux
 npm run build:linux
 ```
 注意，根据不同的平台需要修改项目根目录下 `electron-builder.yml` 文件中的配置内容：
 ```yml
 extraResources:
  # For Windows
  - from: ./engine/dist/main.exe
    to: ./engine/main.exe
  # For macOS and Linux
  # - from: ./engine/dist/main
  #   to: ./engine/main
 ```
--- a/README_en.md
+++ b/README_en.md
@@ -188,15 +188,3 @@ npm run build:mac
 # For Linux
 npm run build:linux
 ```
 Note: You need to modify the configuration content in the `electron-builder.yml` file in the project root directory according to different platforms:
 ```yml
 extraResources:
  # For Windows
  - from: ./engine/dist/main.exe
    to: ./engine/main.exe
  # For macOS and Linux
  # - from: ./engine/dist/main
  #   to: ./engine/main
 ```
--- a/README_ja.md
+++ b/README_ja.md
@@ -188,15 +188,3 @@ npm run build:mac
 # Linux 用
 npm run build:linux
 ```
 注意: プラットフォームに応じて、プロジェクトルートディレクトリにある `electron-builder.yml` ファイルの設定内容を変更する必要があります:
 ```yml
 extraResources:
  # Windows 用
  - from: ./engine/dist/main.exe
    to: ./engine/main.exe
  # macOS と Linux 用
  # - from: ./engine/dist/main
  #   to: ./engine/main
 ```
--- a/docs/api-docs/caption-engine.md
+++ b/docs/api-docs/caption-engine.md
@@ -58,6 +58,18 @@ Electron 主进程通过 TCP Socket 向 Python 进程发送数据。发送的数
 Python 端监听到的音频流转换为的字幕数据。
 ### `translation`
 ```js
 {
  command: "translation",
  time_s: string,
  translation: string
 }
 ```
 语音识别内容的翻译结果，可根据起始时间 `time_s` 确定其对应的字幕。
 ### `print`
 ```js
@@ -67,7 +79,7 @@ Python 端监听到的音频流转换为的字幕数据。
 }
 ```
-输出 Python 端打印的内容。
+输出 Python 端打印的内容，不计入日志。
 ### `info`
@@ -78,7 +90,7 @@ Python 端监听到的音频流转换为的字幕数据。
 }
 ```
-Python 端打印的提示信息，比起 `print`，该信息更希望 Electron 端的关注。
+Python 端打印的提示信息，会计入日志。
 ### `error`
--- a/electron-builder.yml
+++ b/electron-builder.yml
@@ -15,14 +15,13 @@ files:
  - '!assets/*'
  - '!.repomap/*'
  - '!.virtualme/*'
 extraResources:
  # For Windows
  - from: ./engine/dist/main.exe
    to: ./engine/main.exe
  # For macOS and Linux
-  # - from: ./engine/dist/main
+  - from: ./engine/dist/main
-  #   to: ./engine/main
+    to: ./engine/main
 win:
  executableName: auto-caption
  icon: build/icon.png
--- a/engine/audio2text/vosk.py
+++ b/engine/audio2text/vosk.py
@@ -1,8 +1,10 @@
 import json
 import threading
 import time
 from datetime import datetime
 from vosk import Model, KaldiRecognizer, SetLogLevel
-from utils import stdout_cmd, stdout_obj
+from utils import stdout_cmd, stdout_obj, google_translate
 class VoskRecognizer:
@@ -11,15 +13,18 @@ class VoskRecognizer:
    初始化参数：
        model_path: Vosk 识别模型路径
        target: 翻译目标语言
    """
-    def __init__(self, model_path: str):
+    def __init__(self, model_path: str, target: str | None):
        SetLogLevel(-1)
        if model_path.startswith('"'):
            model_path = model_path[1:]
        if model_path.endswith('"'):
            model_path = model_path[:-1]
        self.model_path = model_path
        self.target = target
        self.time_str = ''
        self.trans_time = time.time()
        self.cur_id = 0
        self.prev_content = ''
@@ -48,7 +53,15 @@ class VoskRecognizer:
            caption['time_s'] = self.time_str
            caption['time_t'] = datetime.now().strftime('%H:%M:%S.%f')[:-3]
            self.prev_content = ''
            if content == '': return
            self.cur_id += 1
            if self.target:
                self.trans_time = time.time()
                th = threading.Thread(
                    target=google_translate,
                    args=(caption['text'], self.target, self.time_str)
                )
                th.start()
        else:
            content = json.loads(self.recognizer.PartialResult()).get('partial', '')
            if content == '' or content == self.prev_content:
@@ -62,6 +75,13 @@ class VoskRecognizer:
            self.prev_content = content
        stdout_obj(caption)
        if self.target and time.time() - self.trans_time > 2.0:
            self.trans_time = time.time()
            th = threading.Thread(
                target=google_translate,
                args=(caption['text'], self.target, self.time_str)
            )
            th.start()
    def stop(self):
        """停止 Vosk 引擎"""
--- a/engine/main.py
+++ b/engine/main.py
@@ -44,10 +44,10 @@ def main_gummy(s: str, t: str, a: int, c: int, k: str):
    engine.stop()
-def main_vosk(a: int, c: int, m: str):
+def main_vosk(a: int, c: int, m: str, t: str):
    global thread_data
    stream = AudioStream(a, c)
-    engine = VoskRecognizer(m)
+    engine = VoskRecognizer(m, None if t == 'none' else t)
    stream.open_stream()
    engine.start()
@@ -72,9 +72,9 @@ if __name__ == "__main__":
    parser.add_argument('-a', '--audio_type', default=0, help='Audio stream source: 0 for output, 1 for input')
    parser.add_argument('-c', '--chunk_rate', default=10, help='Number of audio stream chunks collected per second')
    parser.add_argument('-p', '--port', default=8080, help='The port to run the server on, 0 for no server')
    parser.add_argument('-t', '--target_language', default='zh', help='Target language code, "none" for no translation')
    # gummy only
    parser.add_argument('-s', '--source_language', default='en', help='Source language code')
    parser.add_argument('-t', '--target_language', default='zh', help='Target language code')
    parser.add_argument('-k', '--api_key', default='', help='API KEY for Gummy model')
    # vosk only
    parser.add_argument('-m', '--model_path', default='', help='The path to the vosk model.')
@@ -97,7 +97,8 @@ if __name__ == "__main__":
        main_vosk(
            int(args.audio_type),
            int(args.chunk_rate),
-            args.model_path
+            args.model_path,
            args.target_language
        )
    else:
        raise ValueError('Invalid caption engine specified.')
--- a/engine/requirements.txt
+++ b/engine/requirements.txt
@@ -5,3 +5,5 @@ vosk
 pyinstaller
 pyaudio; sys_platform == 'darwin'
 pyaudiowpatch; sys_platform == 'win32'
 googletrans
 ollama
--- a/engine/utils/init.py
+++ b/engine/utils/init.py
@@ -6,4 +6,5 @@ from .audioprcs import (
 )
 from .sysout import stdout, stdout_err, stdout_cmd, stdout_obj, stderr
 from .thdata import thread_data
-from .server import start_server
+from .server import start_server
 from .translation import ollama_translate, google_translate
--- a/engine/utils/translation.py
+++ b/engine/utils/translation.py
@@ -0,0 +1,57 @@
 from ollama import chat
 from ollama import ChatResponse
 import asyncio
 from googletrans import Translator
 from .sysout import stdout, stdout_obj
 # Maps a target language code to the English language name that
 # ollama_translate interpolates into its system prompt.
 lang_map = {
    'en': 'English',
    'es': 'Spanish',
    'fr': 'French',
    'de': 'German',
    'it': 'Italian',
    'ru': 'Russian',
    'ja': 'Japanese',
    'ko': 'Korean',
    'zh': 'Chinese'
 }
 def ollama_translate(model: str, target: str, text: str, chunk_size: int = 3):
    """Stream a translation of ``text`` from an Ollama chat model and print it.

    The model's reply is consumed as a stream. A leading ``<think>...</think>``
    section is skipped, newlines in the remaining output are replaced by
    spaces, and the text is printed every ``chunk_size`` streamed chunks.
    Whatever is still buffered when the stream ends is printed with a
    trailing newline.

    Args:
        model: Name of the Ollama model passed to ``chat``.
        target: Target language code. Codes present in ``lang_map`` are
            expanded to their English language name; any other code is put
            into the prompt unchanged instead of raising ``KeyError``.
        text: The content to translate.
        chunk_size: Number of streamed chunks to buffer before printing.
            Values below 1 flush after every chunk.
    """
    # Fall back to the raw code so an unmapped target does not abort the call.
    language = lang_map.get(target, target)
    stream = chat(
        model=model,
        messages=[
            {"role": "system", "content": f"/no_think Translate the following content into {language}, and do not output any additional information."},
            {"role": "user", "content": text}
        ],
        stream=True
    )
    pending = ""
    pending_count = 0
    in_thinking = False
    # Tracked separately from the flush counter: the counter is reset after
    # every flush and must not re-arm the "<think>" detection mid-stream.
    first_chunk = True
    for chunk in stream:
        content = chunk['message']['content']
        if first_chunk:
            first_chunk = False
            in_thinking = content.startswith("<think>")
        if in_thinking:
            # NOTE: the chunk that carries "</think>" is dropped as a whole,
            # including any text that follows the closing tag in that chunk.
            if "</think>" in content:
                in_thinking = False
            continue
        pending += content.replace('\n', ' ')
        pending_count += 1
        # ">=" rather than "%" so chunk_size == 0 cannot divide by zero.
        if pending_count >= chunk_size:
            print(pending, end='')
            pending = ""
            pending_count = 0
    if pending:
        print(pending)
 def google_translate(text: str, target: str, time_s: str):
    """Translate ``text`` with googletrans and emit it as a ``translation`` command.

    On success an object with ``command``, ``time_s`` and ``translation`` keys
    is written through ``stdout_obj``. Any exception raised while translating
    or emitting is reported through ``stdout`` instead of being propagated.

    Args:
        text: The content to translate.
        target: Destination language code passed as ``dest``.
        time_s: Start time of the caption this translation belongs to; it is
            echoed back unchanged in the emitted object.
    """
    client = Translator()
    try:
        # The translate() call is driven to completion with asyncio.run.
        result = asyncio.run(client.translate(text, dest=target))
        payload = {
            "command": "translation",
            "time_s": time_s,
            "translation": result.text
        }
        stdout_obj(payload)
    except Exception as err:
        stdout(f"Google Translation Request failed: {err!s}")
--- a/src/main/types/index.ts
+++ b/src/main/types/index.ts
@@ -46,6 +46,11 @@ export interface CaptionItem {
  translation: string
 }
 export interface CaptionTranslation {
  time_s: string,
  translation: string
 }
 export interface SoftwareLogItem {
  type: "INFO" | "WARN" | "ERROR",
  index: number,
--- a/src/main/utils/AllConfig.ts
+++ b/src/main/utils/AllConfig.ts
@@ -1,6 +1,7 @@
 import {
  UILanguage, UITheme, Styles, Controls,
-  CaptionItem, FullConfig, SoftwareLogItem
+  CaptionItem, CaptionTranslation,
  FullConfig, SoftwareLogItem
 } from '../types'
 import { Log } from './Log'
 import { app, BrowserWindow } from 'electron'
@@ -158,12 +159,28 @@ class AllConfig {
    }
  }
-  public sendCaptionLog(window: BrowserWindow, command: 'add' | 'upd' | 'set') {
+  public updateCaptionTranslation(trans: CaptionTranslation){
    // Attach a translation to the caption it belongs to, matched by start time.
    // The log is scanned from the newest entry backwards and only the first
    // entry whose time_s equals trans.time_s is updated; if none matches,
    // nothing is changed and nothing is sent.
    for(let i = this.captionLog.length - 1; i >= 0; i--){
      if(this.captionLog[i].time_s === trans.time_s){
        this.captionLog[i].translation = trans.translation
        // Send an 'upd' for the modified index to every open window.
        for(const window of BrowserWindow.getAllWindows()){
          this.sendCaptionLog(window, 'upd', i)
        }
        break
      }
    }
  }
  public sendCaptionLog(
    window: BrowserWindow,
    command: 'add' | 'upd' | 'set',
    index: number | undefined = undefined
  ) {
    if(command === 'add'){
-      window.webContents.send(`both.captionLog.add`, this.captionLog[this.captionLog.length - 1])
+      window.webContents.send(`both.captionLog.add`, this.captionLog.at(-1))
    }
    else if(command === 'upd'){
-      window.webContents.send(`both.captionLog.upd`, this.captionLog[this.captionLog.length - 1])
+      if(index !== undefined) window.webContents.send(`both.captionLog.upd`, this.captionLog[index])
      else window.webContents.send(`both.captionLog.upd`, this.captionLog.at(-1))
    }
    else if(command === 'set'){
      window.webContents.send(`both.captionLog.set`, this.captionLog)
--- a/src/main/utils/CaptionEngine.ts
+++ b/src/main/utils/CaptionEngine.ts
@@ -67,21 +67,20 @@ export class CaptionEngine {
      this.command.push('-a', allConfig.controls.audio ? '1' : '0')
      this.port = Math.floor(Math.random() * (65535 - 1024 + 1)) + 1024
      this.command.push('-p', this.port.toString())
      this.command.push(
        '-t', allConfig.controls.translation ?
        allConfig.controls.targetLang : 'none'
      )
      if(allConfig.controls.engine === 'gummy') {
        this.command.push('-e', 'gummy')
        this.command.push('-s', allConfig.controls.sourceLang)
        this.command.push(
          '-t', allConfig.controls.translation ?
          allConfig.controls.targetLang : 'none'
        )
        if(allConfig.controls.API_KEY) {
          this.command.push('-k', allConfig.controls.API_KEY)
        }
      }
      else if(allConfig.controls.engine === 'vosk'){
        this.command.push('-e', 'vosk')
        this.command.push('-m', `"${allConfig.controls.modelPath}"`)        
      }
    }
@@ -249,8 +248,11 @@ function handleEngineData(data: any) {
  else if(data.command === 'caption') {
    allConfig.updateCaptionLog(data);
  }
  else if(data.command === 'translation') {
    allConfig.updateCaptionTranslation(data);
  }
  else if(data.command === 'print') {
-    Log.info('Engine Print:', data.content)
+    console.log(data.content)
  }
  else if(data.command === 'info') {
    Log.info('Engine Info:', data.content)
--- a/src/renderer/src/components/EngineControl.vue
+++ b/src/renderer/src/components/EngineControl.vue
@@ -8,6 +8,7 @@
    <div class="input-item">
      <span class="input-label">{{ $t('engine.sourceLang') }}</span>
      <a-select
        :disabled="currentEngine === 'vosk'"
        class="input-area"
        v-model:value="currentSourceLang"
        :options="langList"
@@ -16,7 +17,6 @@
    <div class="input-item">
      <span class="input-label">{{ $t('engine.transLang') }}</span>
      <a-select
        :disabled="currentEngine === 'vosk'"
        class="input-area"
        v-model:value="currentTargetLang"
        :options="langList.filter((item) => item.value !== 'auto')"
@@ -222,7 +222,10 @@ watch(changeSignal, (val) => {
 watch(currentEngine, (val) => {
  if(val == 'vosk'){
    currentSourceLang.value = 'auto'
-    currentTargetLang.value = ''
+    currentTargetLang.value = useGeneralSettingStore().uiLanguage
    if(currentTargetLang.value === 'zh') {
      currentTargetLang.value = 'zh-cn'
    }
  }
  else if(val == 'gummy'){
    currentSourceLang.value = 'auto'
--- a/src/renderer/src/i18n/config/engine.ts
+++ b/src/renderer/src/i18n/config/engine.ts
@@ -21,6 +21,15 @@ export const engines = {
      label: '本地 -  Vosk',
      languages: [
        { value: 'auto', label: '需要自行配置模型' },
        { value: 'en', label: '英语' },
        { value: 'zh-cn', label: '中文' },
        { value: 'ja', label: '日语' },
        { value: 'ko', label: '韩语' },
        { value: 'de', label: '德语' },
        { value: 'fr', label: '法语' },
        { value: 'ru', label: '俄语' },
        { value: 'es', label: '西班牙语' },
        { value: 'it', label: '意大利语' },
      ]
    }
  ],
@@ -46,6 +55,15 @@ export const engines = {
      label: 'Local - Vosk',
      languages: [
        { value: 'auto', label: 'Model needs to be configured manually' },
        { value: 'en', label: 'English' },
        { value: 'zh-cn', label: 'Chinese' },
        { value: 'ja', label: 'Japanese' },
        { value: 'ko', label: 'Korean' },
        { value: 'de', label: 'German' },
        { value: 'fr', label: 'French' },
        { value: 'ru', label: 'Russian' },
        { value: 'es', label: 'Spanish' },
        { value: 'it', label: 'Italian' },
      ]
    }
  ],
@@ -71,6 +89,15 @@ export const engines = {
      label: 'ローカル - Vosk',
      languages: [
        { value: 'auto', label: 'モデルを手動で設定する必要があります' },
        { value: 'en', label: '英語' },
        { value: 'zh-cn', label: '中国語' },
        { value: 'ja', label: '日本語' },
        { value: 'ko', label: '韓国語' },
        { value: 'de', label: 'ドイツ語' },
        { value: 'fr', label: 'フランス語' },
        { value: 'ru', label: 'ロシア語' },
        { value: 'es', label: 'スペイン語' },
        { value: 'it', label: 'イタリア語' },
      ]
    }
  ]
--- a/src/renderer/src/stores/captionLog.ts
+++ b/src/renderer/src/stores/captionLog.ts
@@ -15,7 +15,12 @@ export const useCaptionLogStore = defineStore('captionLog', () => {
  })
  window.electron.ipcRenderer.on('both.captionLog.upd', (_, log) => {
-    captionData.value.splice(captionData.value.length - 1, 1, log)
+    for(let i = captionData.value.length - 1; i >= 0; i--) {
      if(captionData.value[i].time_s === log.time_s){
        captionData.value.splice(i, 1, log)
        break
      }
    }
  })
  window.electron.ipcRenderer.on('both.captionLog.set', (_, logs) => {