feat(vosk): 为 Vosk 模型添加非实时翻译功能 (#14)

- 添加 Ollama 大模型翻译和 Google 翻译（非实时），支持多种语言
- 为 Vosk 引擎添加非实时翻译
- 为新增的翻译功能添加和修改接口
- 修改 Electron 构建配置，之后不同平台构建无需修改构建文件
2026-02-03 19:24:43 +08:00 · 2025-09-02 23:19:53 +08:00
parent 56fdc348f8
commit 14987cbfc5
16 changed files with 176 additions and 61 deletions
--- a/README.md
+++ b/README.md
@@ -188,15 +188,3 @@ npm run build:mac
 # For Linux
 npm run build:linux
 ```
-
-注意，根据不同的平台需要修改项目根目录下 `electron-builder.yml` 文件中的配置内容：
-
-```yml
-extraResources:
-  # For Windows
-  - from: ./engine/dist/main.exe
-    to: ./engine/main.exe
-  # For macOS and Linux
-  # - from: ./engine/dist/main
-  #   to: ./engine/main
-```
--- a/README_en.md
+++ b/README_en.md
@@ -188,15 +188,3 @@ npm run build:mac
 # For Linux
 npm run build:linux
 ```
-
-Note: You need to modify the configuration content in the `electron-builder.yml` file in the project root directory according to different platforms:
-
-```yml
-extraResources:
-  # For Windows
-  - from: ./engine/dist/main.exe
-    to: ./engine/main.exe
-  # For macOS and Linux
-  # - from: ./engine/dist/main
-  #   to: ./engine/main
-```
--- a/README_ja.md
+++ b/README_ja.md
@@ -188,15 +188,3 @@ npm run build:mac
 # Linux 用
 npm run build:linux
 ```
-
-注意: プラットフォームに応じて、プロジェクトルートディレクトリにある `electron-builder.yml` ファイルの設定内容を変更する必要があります:
-
-```yml
-extraResources:
-  # Windows 用
-  - from: ./engine/dist/main.exe
-    to: ./engine/main.exe
-  # macOS と Linux 用
-  # - from: ./engine/dist/main
-  #   to: ./engine/main
-```
--- a/docs/api-docs/caption-engine.md
+++ b/docs/api-docs/caption-engine.md
@@ -58,6 +58,18 @@ Electron 主进程通过 TCP Socket 向 Python 进程发送数据。发送的数

 Python 端监听到的音频流转换为的字幕数据。

+### `translation`
+
+```js
+{
+  command: "translation",
+  time_s: string,
+  translation: string
+}
+```
+
+语音识别内容的翻译结果，可以根据起始时间 `time_s` 确定其对应的字幕。
+
 ### `print`

 ```js
@@ -67,7 +79,7 @@ Python 端监听到的音频流转换为的字幕数据。
 }
 ```

-输出 Python 端打印的内容。
+输出 Python 端打印的内容，不计入日志。

 ### `info`

@@ -78,7 +90,7 @@ Python 端监听到的音频流转换为的字幕数据。
 }
 ```

-Python 端打印的提示信息，比起 `print`，该信息更希望 Electron 端的关注。
+Python 端打印的提示信息，会计入日志。

 ### `error`

--- a/electron-builder.yml
+++ b/electron-builder.yml
@@ -15,14 +15,13 @@ files:
  - '!assets/*'
  - '!.repomap/*'
  - '!.virtualme/*'
-
 extraResources:
  # For Windows
  - from: ./engine/dist/main.exe
    to: ./engine/main.exe
  # For macOS and Linux
-  # - from: ./engine/dist/main
-  #   to: ./engine/main
+  - from: ./engine/dist/main
+    to: ./engine/main
 win:
  executableName: auto-caption
  icon: build/icon.png
--- a/engine/audio2text/vosk.py
+++ b/engine/audio2text/vosk.py
@@ -1,8 +1,10 @@
 import json
+import threading
+import time
 from datetime import datetime

 from vosk import Model, KaldiRecognizer, SetLogLevel
-from utils import stdout_cmd, stdout_obj
+from utils import stdout_cmd, stdout_obj, google_translate


 class VoskRecognizer:
@@ -11,15 +13,18 @@ class VoskRecognizer:

    初始化参数：
        model_path: Vosk 识别模型路径
+        target: 翻译目标语言
    """
-    def __init__(self, model_path: str):
+    def __init__(self, model_path: str, target: str | None):
        SetLogLevel(-1)
        if model_path.startswith('"'):
            model_path = model_path[1:]
        if model_path.endswith('"'):
            model_path = model_path[:-1]
        self.model_path = model_path
+        self.target = target
        self.time_str = ''
+        self.trans_time = time.time()
        self.cur_id = 0
        self.prev_content = ''

@@ -48,7 +53,15 @@ class VoskRecognizer:
            caption['time_s'] = self.time_str
            caption['time_t'] = datetime.now().strftime('%H:%M:%S.%f')[:-3]
            self.prev_content = ''
+            if content == '': return
            self.cur_id += 1
+            if self.target:
+                self.trans_time = time.time()
+                th = threading.Thread(
+                    target=google_translate,
+                    args=(caption['text'], self.target, self.time_str)
+                )
+                th.start()
        else:
            content = json.loads(self.recognizer.PartialResult()).get('partial', '')
            if content == '' or content == self.prev_content:
@@ -62,6 +75,13 @@ class VoskRecognizer:
            self.prev_content = content
        
        stdout_obj(caption)
+        if self.target and time.time() - self.trans_time > 2.0:
+            self.trans_time = time.time()
+            th = threading.Thread(
+                target=google_translate,
+                args=(caption['text'], self.target, self.time_str)
+            )
+            th.start()

    def stop(self):
        """停止 Vosk 引擎"""
--- a/engine/main.py
+++ b/engine/main.py
@@ -44,10 +44,10 @@ def main_gummy(s: str, t: str, a: int, c: int, k: str):
    engine.stop()


-def main_vosk(a: int, c: int, m: str):
+def main_vosk(a: int, c: int, m: str, t: str):
    global thread_data
    stream = AudioStream(a, c)
-    engine = VoskRecognizer(m)
+    engine = VoskRecognizer(m, None if t == 'none' else t)

    stream.open_stream()
    engine.start()
@@ -72,9 +72,9 @@ if __name__ == "__main__":
    parser.add_argument('-a', '--audio_type', default=0, help='Audio stream source: 0 for output, 1 for input')
    parser.add_argument('-c', '--chunk_rate', default=10, help='Number of audio stream chunks collected per second')
    parser.add_argument('-p', '--port', default=8080, help='The port to run the server on, 0 for no server')
+    parser.add_argument('-t', '--target_language', default='zh', help='Target language code, "none" for no translation')
    # gummy only
    parser.add_argument('-s', '--source_language', default='en', help='Source language code')
-    parser.add_argument('-t', '--target_language', default='zh', help='Target language code')
    parser.add_argument('-k', '--api_key', default='', help='API KEY for Gummy model')
    # vosk only
    parser.add_argument('-m', '--model_path', default='', help='The path to the vosk model.')
@@ -97,7 +97,8 @@ if __name__ == "__main__":
        main_vosk(
            int(args.audio_type),
            int(args.chunk_rate),
-            args.model_path
+            args.model_path,
+            args.target_language
        )
    else:
        raise ValueError('Invalid caption engine specified.')
--- a/engine/requirements.txt
+++ b/engine/requirements.txt
@@ -5,3 +5,5 @@ vosk
 pyinstaller
 pyaudio; sys_platform == 'darwin'
 pyaudiowpatch; sys_platform == 'win32'
+googletrans
+ollama
--- a/engine/utils/init.py
+++ b/engine/utils/init.py
@@ -6,4 +6,5 @@ from .audioprcs import (
 )
 from .sysout import stdout, stdout_err, stdout_cmd, stdout_obj, stderr
 from .thdata import thread_data
-from .server import start_server
+from .server import start_server
+from .translation import ollama_translate, google_translate
--- a/engine/utils/translation.py
+++ b/engine/utils/translation.py
@@ -0,0 +1,57 @@
+from ollama import chat
+from ollama import ChatResponse
+import asyncio
+from googletrans import Translator
+from .sysout import stdout, stdout_obj
+
+lang_map = {
+    'en': 'English',
+    'es': 'Spanish',
+    'fr': 'French',
+    'de': 'German',
+    'it': 'Italian',
+    'ru': 'Russian',
+    'ja': 'Japanese',
+    'ko': 'Korean',
+    'zh': 'Chinese'
+}
+
+def ollama_translate(model: str, target: str, text: str, chunk_size = 3):
+    stream = chat(
+        model=model,
+        messages=[
+            {"role": "system", "content": f"/no_think Translate the following content into {lang_map[target]}, and do not output any additional information."},
+            {"role": "user", "content": text}
+        ],
+        stream=True
+    )
+    chunk_content = ""
+    in_thinking = False
+    count = 0
+    for chunk in stream:
+        if count == 0 and chunk['message']['content'].startswith("<think>"):
+            in_thinking = True
+        if in_thinking:
+            if "</think>" in chunk['message']['content']:
+                in_thinking = False
+            continue
+        chunk_content += ' '.join(chunk['message']['content'].split('\n'))
+        count += 1
+        if count % chunk_size == 0:
+            print(chunk_content, end='')
+            chunk_content = ""
+            count = 0
+    if chunk_content:
+        print(chunk_content)
+
+def google_translate(text: str, target: str, time_s: str):
+    translator = Translator()
+    try:
+        res = asyncio.run(translator.translate(text, dest=target))
+        stdout_obj({
+            "command": "translation",
+            "time_s": time_s,
+            "translation": res.text
+        })
+    except Exception as e:
+        stdout(f"Google Translation Request failed: {str(e)}")
--- a/src/main/types/index.ts
+++ b/src/main/types/index.ts
@@ -46,6 +46,11 @@ export interface CaptionItem {
  translation: string
 }

+/** Payload of the engine's `translation` command: translated text for one caption. */
+export interface CaptionTranslation {
+  /** Start time of the caption this translation belongs to; used as the match key. */
+  time_s: string,
+  /** Translated text to store on the matching caption. */
+  translation: string
+}
+
 export interface SoftwareLogItem {
  type: "INFO" | "WARN" | "ERROR",
  index: number,
--- a/src/main/utils/AllConfig.ts
+++ b/src/main/utils/AllConfig.ts
@@ -1,6 +1,7 @@
 import {
  UILanguage, UITheme, Styles, Controls,
-  CaptionItem, FullConfig, SoftwareLogItem
+  CaptionItem, CaptionTranslation,
+  FullConfig, SoftwareLogItem
 } from '../types'
 import { Log } from './Log'
 import { app, BrowserWindow } from 'electron'
@@ -158,12 +159,28 @@ class AllConfig {
    }
  }

-  public sendCaptionLog(window: BrowserWindow, command: 'add' | 'upd' | 'set') {
+  /**
+   * Store a translation on the newest caption whose start time matches
+   * `trans.time_s`, then send the updated entry to every open window.
+   * Does nothing when no caption matches.
+   */
+  public updateCaptionTranslation(trans: CaptionTranslation){
+    // Scan newest-first and stop at the first caption with the same start time.
+    let idx = this.captionLog.length - 1
+    while(idx >= 0 && this.captionLog[idx].time_s !== trans.time_s) idx--
+    if(idx < 0) return
+    this.captionLog[idx].translation = trans.translation
+    BrowserWindow.getAllWindows().forEach((win) => {
+      this.sendCaptionLog(win, 'upd', idx)
+    })
+  }
+  public sendCaptionLog(
+    window: BrowserWindow,
+    command: 'add' | 'upd' | 'set',
+    index: number | undefined = undefined
+  ) {
    if(command === 'add'){
-      window.webContents.send(`both.captionLog.add`, this.captionLog[this.captionLog.length - 1])
+      window.webContents.send(`both.captionLog.add`, this.captionLog.at(-1))
    }
    else if(command === 'upd'){
-      window.webContents.send(`both.captionLog.upd`, this.captionLog[this.captionLog.length - 1])
+      if(index !== undefined) window.webContents.send(`both.captionLog.upd`, this.captionLog[index])
+      else window.webContents.send(`both.captionLog.upd`, this.captionLog.at(-1))
    }
    else if(command === 'set'){
      window.webContents.send(`both.captionLog.set`, this.captionLog)
--- a/src/main/utils/CaptionEngine.ts
+++ b/src/main/utils/CaptionEngine.ts
@@ -67,21 +67,20 @@ export class CaptionEngine {
      this.command.push('-a', allConfig.controls.audio ? '1' : '0')
      this.port = Math.floor(Math.random() * (65535 - 1024 + 1)) + 1024
      this.command.push('-p', this.port.toString())
+      this.command.push(
+        '-t', allConfig.controls.translation ?
+        allConfig.controls.targetLang : 'none'
+      )

      if(allConfig.controls.engine === 'gummy') {
        this.command.push('-e', 'gummy')
        this.command.push('-s', allConfig.controls.sourceLang)
-        this.command.push(
-          '-t', allConfig.controls.translation ?
-          allConfig.controls.targetLang : 'none'
-        )
        if(allConfig.controls.API_KEY) {
          this.command.push('-k', allConfig.controls.API_KEY)
        }
      }
      else if(allConfig.controls.engine === 'vosk'){
        this.command.push('-e', 'vosk')
-        
        this.command.push('-m', `"${allConfig.controls.modelPath}"`)        
      }
    }
@@ -249,8 +248,11 @@ function handleEngineData(data: any) {
  else if(data.command === 'caption') {
    allConfig.updateCaptionLog(data);
  }
+  else if(data.command === 'translation') {
+    allConfig.updateCaptionTranslation(data);
+  }
  else if(data.command === 'print') {
-    Log.info('Engine Print:', data.content)
+    console.log(data.content)
  }
  else if(data.command === 'info') {
    Log.info('Engine Info:', data.content)
--- a/src/renderer/src/components/EngineControl.vue
+++ b/src/renderer/src/components/EngineControl.vue
@@ -8,6 +8,7 @@
    <div class="input-item">
      <span class="input-label">{{ $t('engine.sourceLang') }}</span>
      <a-select
+        :disabled="currentEngine === 'vosk'"
        class="input-area"
        v-model:value="currentSourceLang"
        :options="langList"
@@ -16,7 +17,6 @@
    <div class="input-item">
      <span class="input-label">{{ $t('engine.transLang') }}</span>
      <a-select
-        :disabled="currentEngine === 'vosk'"
        class="input-area"
        v-model:value="currentTargetLang"
        :options="langList.filter((item) => item.value !== 'auto')"
@@ -222,7 +222,10 @@ watch(changeSignal, (val) => {
 watch(currentEngine, (val) => {
  if(val == 'vosk'){
    currentSourceLang.value = 'auto'
-    currentTargetLang.value = ''
+    currentTargetLang.value = useGeneralSettingStore().uiLanguage
+    if(currentTargetLang.value === 'zh') {
+      currentTargetLang.value = 'zh-cn'
+    }
  }
  else if(val == 'gummy'){
    currentSourceLang.value = 'auto'
--- a/src/renderer/src/i18n/config/engine.ts
+++ b/src/renderer/src/i18n/config/engine.ts
@@ -21,6 +21,15 @@ export const engines = {
      label: '本地 -  Vosk',
      languages: [
        { value: 'auto', label: '需要自行配置模型' },
+        { value: 'en', label: '英语' },
+        { value: 'zh-cn', label: '中文' },
+        { value: 'ja', label: '日语' },
+        { value: 'ko', label: '韩语' },
+        { value: 'de', label: '德语' },
+        { value: 'fr', label: '法语' },
+        { value: 'ru', label: '俄语' },
+        { value: 'es', label: '西班牙语' },
+        { value: 'it', label: '意大利语' },
      ]
    }
  ],
@@ -46,6 +55,15 @@ export const engines = {
      label: 'Local - Vosk',
      languages: [
        { value: 'auto', label: 'Model needs to be configured manually' },
+        { value: 'en', label: 'English' },
+        { value: 'zh-cn', label: 'Chinese' },
+        { value: 'ja', label: 'Japanese' },
+        { value: 'ko', label: 'Korean' },
+        { value: 'de', label: 'German' },
+        { value: 'fr', label: 'French' },
+        { value: 'ru', label: 'Russian' },
+        { value: 'es', label: 'Spanish' },
+        { value: 'it', label: 'Italian' },
      ]
    }
  ],
@@ -71,6 +89,15 @@ export const engines = {
      label: 'ローカル - Vosk',
      languages: [
        { value: 'auto', label: 'モデルを手動で設定する必要があります' },
+        { value: 'en', label: '英語' },
+        { value: 'zh-cn', label: '中国語' },
+        { value: 'ja', label: '日本語' },
+        { value: 'ko', label: '韓国語' },
+        { value: 'de', label: 'ドイツ語' },
+        { value: 'fr', label: 'フランス語' },
+        { value: 'ru', label: 'ロシア語' },
+        { value: 'es', label: 'スペイン語' },
+        { value: 'it', label: 'イタリア語' },
      ]
    }
  ]
--- a/src/renderer/src/stores/captionLog.ts
+++ b/src/renderer/src/stores/captionLog.ts
@@ -15,7 +15,12 @@ export const useCaptionLogStore = defineStore('captionLog', () => {
  })

  window.electron.ipcRenderer.on('both.captionLog.upd', (_, log) => {
-    captionData.value.splice(captionData.value.length - 1, 1, log)
+    for(let i = captionData.value.length - 1; i >= 0; i--) {
+      if(captionData.value[i].time_s === log.time_s){
+        captionData.value.splice(i, 1, log)
+        break
+      }
+    }
  })

  window.electron.ipcRenderer.on('both.captionLog.set', (_, logs) => {