feat(vosk): 为 Vosk 模型添加非实时翻译功能 (#14)

- 添加 Ollama 大模型翻译和 Google 翻译(非实时),支持多种语言
- 为 Vosk 引擎添加非实时翻译
- 为新增的翻译功能添加和修改接口
- 修改 Electron 构建配置,之后不同平台构建无需修改构建文件
This commit is contained in:
himeditator
2025-09-02 23:19:53 +08:00
parent 56fdc348f8
commit 14987cbfc5
16 changed files with 176 additions and 61 deletions

View File

@@ -188,15 +188,3 @@ npm run build:mac
# For Linux # For Linux
npm run build:linux npm run build:linux
``` ```
注意,根据不同的平台需要修改项目根目录下 `electron-builder.yml` 文件中的配置内容:
```yml
extraResources:
# For Windows
- from: ./engine/dist/main.exe
to: ./engine/main.exe
# For macOS and Linux
# - from: ./engine/dist/main
# to: ./engine/main
```

View File

@@ -188,15 +188,3 @@ npm run build:mac
# For Linux # For Linux
npm run build:linux npm run build:linux
``` ```
Note: You need to modify the configuration content in the `electron-builder.yml` file in the project root directory according to different platforms:
```yml
extraResources:
# For Windows
- from: ./engine/dist/main.exe
to: ./engine/main.exe
# For macOS and Linux
# - from: ./engine/dist/main
# to: ./engine/main
```

View File

@@ -188,15 +188,3 @@ npm run build:mac
# Linux 用 # Linux 用
npm run build:linux npm run build:linux
``` ```
注意: プラットフォームに応じて、プロジェクトルートディレクトリにある `electron-builder.yml` ファイルの設定内容を変更する必要があります:
```yml
extraResources:
# Windows 用
- from: ./engine/dist/main.exe
to: ./engine/main.exe
# macOS と Linux 用
# - from: ./engine/dist/main
# to: ./engine/main
```

View File

@@ -58,6 +58,18 @@ Electron 主进程通过 TCP Socket 向 Python 进程发送数据。发送的数
Python 端监听到的音频流转换为的字幕数据。 Python 端监听到的音频流转换为的字幕数据。
### `translation`
```js
{
command: "translation",
time_s: string,
translation: string
}
```
语音识别内容的翻译结果,可以根据起始时间 `time_s` 确定对应的字幕。
### `print` ### `print`
```js ```js
@@ -67,7 +79,7 @@ Python 端监听到的音频流转换为的字幕数据。
} }
``` ```
输出 Python 端打印的内容。 输出 Python 端打印的内容,不计入日志。
### `info` ### `info`
@@ -78,7 +90,7 @@ Python 端监听到的音频流转换为的字幕数据。
} }
``` ```
Python 端打印的提示信息,比起 `print`,该信息更希望 Electron 端的关注 Python 端打印的提示信息,会计入日志。
### `error` ### `error`

View File

@@ -15,14 +15,13 @@ files:
- '!assets/*' - '!assets/*'
- '!.repomap/*' - '!.repomap/*'
- '!.virtualme/*' - '!.virtualme/*'
extraResources: extraResources:
# For Windows # For Windows
- from: ./engine/dist/main.exe - from: ./engine/dist/main.exe
to: ./engine/main.exe to: ./engine/main.exe
# For macOS and Linux # For macOS and Linux
# - from: ./engine/dist/main - from: ./engine/dist/main
# to: ./engine/main to: ./engine/main
win: win:
executableName: auto-caption executableName: auto-caption
icon: build/icon.png icon: build/icon.png

View File

@@ -1,8 +1,10 @@
import json import json
import threading
import time
from datetime import datetime from datetime import datetime
from vosk import Model, KaldiRecognizer, SetLogLevel from vosk import Model, KaldiRecognizer, SetLogLevel
from utils import stdout_cmd, stdout_obj from utils import stdout_cmd, stdout_obj, google_translate
class VoskRecognizer: class VoskRecognizer:
@@ -11,15 +13,18 @@ class VoskRecognizer:
初始化参数: 初始化参数:
model_path: Vosk 识别模型路径 model_path: Vosk 识别模型路径
target: 翻译目标语言
""" """
def __init__(self, model_path: str): def __init__(self, model_path: str, target: str | None):
SetLogLevel(-1) SetLogLevel(-1)
if model_path.startswith('"'): if model_path.startswith('"'):
model_path = model_path[1:] model_path = model_path[1:]
if model_path.endswith('"'): if model_path.endswith('"'):
model_path = model_path[:-1] model_path = model_path[:-1]
self.model_path = model_path self.model_path = model_path
self.target = target
self.time_str = '' self.time_str = ''
self.trans_time = time.time()
self.cur_id = 0 self.cur_id = 0
self.prev_content = '' self.prev_content = ''
@@ -48,7 +53,15 @@ class VoskRecognizer:
caption['time_s'] = self.time_str caption['time_s'] = self.time_str
caption['time_t'] = datetime.now().strftime('%H:%M:%S.%f')[:-3] caption['time_t'] = datetime.now().strftime('%H:%M:%S.%f')[:-3]
self.prev_content = '' self.prev_content = ''
if content == '': return
self.cur_id += 1 self.cur_id += 1
if self.target:
self.trans_time = time.time()
th = threading.Thread(
target=google_translate,
args=(caption['text'], self.target, self.time_str)
)
th.start()
else: else:
content = json.loads(self.recognizer.PartialResult()).get('partial', '') content = json.loads(self.recognizer.PartialResult()).get('partial', '')
if content == '' or content == self.prev_content: if content == '' or content == self.prev_content:
@@ -62,6 +75,13 @@ class VoskRecognizer:
self.prev_content = content self.prev_content = content
stdout_obj(caption) stdout_obj(caption)
if self.target and time.time() - self.trans_time > 2.0:
self.trans_time = time.time()
th = threading.Thread(
target=google_translate,
args=(caption['text'], self.target, self.time_str)
)
th.start()
def stop(self): def stop(self):
"""停止 Vosk 引擎""" """停止 Vosk 引擎"""

View File

@@ -44,10 +44,10 @@ def main_gummy(s: str, t: str, a: int, c: int, k: str):
engine.stop() engine.stop()
def main_vosk(a: int, c: int, m: str): def main_vosk(a: int, c: int, m: str, t: str):
global thread_data global thread_data
stream = AudioStream(a, c) stream = AudioStream(a, c)
engine = VoskRecognizer(m) engine = VoskRecognizer(m, None if t == 'none' else t)
stream.open_stream() stream.open_stream()
engine.start() engine.start()
@@ -72,9 +72,9 @@ if __name__ == "__main__":
parser.add_argument('-a', '--audio_type', default=0, help='Audio stream source: 0 for output, 1 for input') parser.add_argument('-a', '--audio_type', default=0, help='Audio stream source: 0 for output, 1 for input')
parser.add_argument('-c', '--chunk_rate', default=10, help='Number of audio stream chunks collected per second') parser.add_argument('-c', '--chunk_rate', default=10, help='Number of audio stream chunks collected per second')
parser.add_argument('-p', '--port', default=8080, help='The port to run the server on, 0 for no server') parser.add_argument('-p', '--port', default=8080, help='The port to run the server on, 0 for no server')
parser.add_argument('-t', '--target_language', default='zh', help='Target language code, "none" for no translation')
# gummy only # gummy only
parser.add_argument('-s', '--source_language', default='en', help='Source language code') parser.add_argument('-s', '--source_language', default='en', help='Source language code')
parser.add_argument('-t', '--target_language', default='zh', help='Target language code')
parser.add_argument('-k', '--api_key', default='', help='API KEY for Gummy model') parser.add_argument('-k', '--api_key', default='', help='API KEY for Gummy model')
# vosk only # vosk only
parser.add_argument('-m', '--model_path', default='', help='The path to the vosk model.') parser.add_argument('-m', '--model_path', default='', help='The path to the vosk model.')
@@ -97,7 +97,8 @@ if __name__ == "__main__":
main_vosk( main_vosk(
int(args.audio_type), int(args.audio_type),
int(args.chunk_rate), int(args.chunk_rate),
args.model_path args.model_path,
args.target_language
) )
else: else:
raise ValueError('Invalid caption engine specified.') raise ValueError('Invalid caption engine specified.')

View File

@@ -5,3 +5,5 @@ vosk
pyinstaller pyinstaller
pyaudio; sys_platform == 'darwin' pyaudio; sys_platform == 'darwin'
pyaudiowpatch; sys_platform == 'win32' pyaudiowpatch; sys_platform == 'win32'
googletrans
ollama

View File

@@ -6,4 +6,5 @@ from .audioprcs import (
) )
from .sysout import stdout, stdout_err, stdout_cmd, stdout_obj, stderr from .sysout import stdout, stdout_err, stdout_cmd, stdout_obj, stderr
from .thdata import thread_data from .thdata import thread_data
from .server import start_server from .server import start_server
from .translation import ollama_translate, google_translate

View File

@@ -0,0 +1,57 @@
from ollama import chat
from ollama import ChatResponse
import asyncio
from googletrans import Translator
from .sysout import stdout, stdout_obj
# Maps a language code to the English language name that is interpolated into
# the Ollama system prompt. 'zh-cn' is listed next to 'zh' because the engine
# language selector emits 'zh-cn' for Chinese; without it a lookup with that
# code raises KeyError.
lang_map = {
    'en': 'English',
    'es': 'Spanish',
    'fr': 'French',
    'de': 'German',
    'it': 'Italian',
    'ru': 'Russian',
    'ja': 'Japanese',
    'ko': 'Korean',
    'zh': 'Chinese',
    'zh-cn': 'Chinese',
}
def ollama_translate(model: str, target: str, text: str, chunk_size: int = 3):
    """Stream a translation of ``text`` from an Ollama chat model and print it.

    The model is asked (with a ``/no_think`` hint) to translate ``text`` into
    the target language. Streamed chunks are buffered and printed every
    ``chunk_size`` chunks; newlines inside a chunk are replaced by spaces.
    A leading ``<think>...</think>`` section is skipped.

    Args:
        model: Model name passed to ``chat``.
        target: Target language code. Codes present in ``lang_map`` are
            expanded to their English language name; any other code is put
            into the prompt unchanged instead of raising ``KeyError``.
        text: Content to translate.
        chunk_size: Number of streamed chunks to accumulate before printing.
    """
    # Fall back to the raw code so an unmapped language does not crash.
    language = lang_map.get(target, target)
    stream = chat(
        model=model,
        messages=[
            {"role": "system", "content": f"/no_think Translate the following content into {language}, and do not output any additional information."},
            {"role": "user", "content": text}
        ],
        stream=True
    )
    chunk_content = ""
    in_thinking = False
    is_first_chunk = True
    count = 0
    for chunk in stream:
        content = chunk['message']['content']
        # Only the very first chunk may open a <think> section. This is
        # tracked separately from `count`, which is reset after every flush
        # and would otherwise re-arm the check in the middle of the answer.
        if is_first_chunk and content.startswith("<think>"):
            in_thinking = True
        is_first_chunk = False
        if in_thinking:
            # The chunk containing the closing tag is discarded as a whole.
            if "</think>" in content:
                in_thinking = False
            continue
        chunk_content += ' '.join(content.split('\n'))
        count += 1
        if count % chunk_size == 0:
            print(chunk_content, end='')
            chunk_content = ""
            count = 0
    # Flush whatever is left over from an incomplete final group.
    if chunk_content:
        print(chunk_content)
def google_translate(text: str, target: str, time_s: str):
    """Translate ``text`` with googletrans and emit the result as a command.

    On success a ``translation`` command object carrying ``time_s`` and the
    translated text is written through ``stdout_obj``. Any exception raised
    by the request is reported through ``stdout`` and not re-raised.

    Args:
        text: Content to translate.
        target: Destination language code passed to the translator as ``dest``.
        time_s: Start-time string of the caption, echoed back in the result.
    """
    client = Translator()
    try:
        # translate() is driven to completion with asyncio.run().
        result = asyncio.run(client.translate(text, dest=target))
        payload = {
            "command": "translation",
            "time_s": time_s,
            "translation": result.text
        }
        stdout_obj(payload)
    except Exception as err:
        stdout(f"Google Translation Request failed: {str(err)}")

View File

@@ -46,6 +46,11 @@ export interface CaptionItem {
translation: string translation: string
} }
export interface CaptionTranslation {
time_s: string,
translation: string
}
export interface SoftwareLogItem { export interface SoftwareLogItem {
type: "INFO" | "WARN" | "ERROR", type: "INFO" | "WARN" | "ERROR",
index: number, index: number,

View File

@@ -1,6 +1,7 @@
import { import {
UILanguage, UITheme, Styles, Controls, UILanguage, UITheme, Styles, Controls,
CaptionItem, FullConfig, SoftwareLogItem CaptionItem, CaptionTranslation,
FullConfig, SoftwareLogItem
} from '../types' } from '../types'
import { Log } from './Log' import { Log } from './Log'
import { app, BrowserWindow } from 'electron' import { app, BrowserWindow } from 'electron'
@@ -158,12 +159,28 @@ class AllConfig {
} }
} }
public sendCaptionLog(window: BrowserWindow, command: 'add' | 'upd' | 'set') { public updateCaptionTranslation(trans: CaptionTranslation){
for(let i = this.captionLog.length - 1; i >= 0; i--){
if(this.captionLog[i].time_s === trans.time_s){
this.captionLog[i].translation = trans.translation
for(const window of BrowserWindow.getAllWindows()){
this.sendCaptionLog(window, 'upd', i)
}
break
}
}
}
public sendCaptionLog(
window: BrowserWindow,
command: 'add' | 'upd' | 'set',
index: number | undefined = undefined
) {
if(command === 'add'){ if(command === 'add'){
window.webContents.send(`both.captionLog.add`, this.captionLog[this.captionLog.length - 1]) window.webContents.send(`both.captionLog.add`, this.captionLog.at(-1))
} }
else if(command === 'upd'){ else if(command === 'upd'){
window.webContents.send(`both.captionLog.upd`, this.captionLog[this.captionLog.length - 1]) if(index !== undefined) window.webContents.send(`both.captionLog.upd`, this.captionLog[index])
else window.webContents.send(`both.captionLog.upd`, this.captionLog.at(-1))
} }
else if(command === 'set'){ else if(command === 'set'){
window.webContents.send(`both.captionLog.set`, this.captionLog) window.webContents.send(`both.captionLog.set`, this.captionLog)

View File

@@ -67,21 +67,20 @@ export class CaptionEngine {
this.command.push('-a', allConfig.controls.audio ? '1' : '0') this.command.push('-a', allConfig.controls.audio ? '1' : '0')
this.port = Math.floor(Math.random() * (65535 - 1024 + 1)) + 1024 this.port = Math.floor(Math.random() * (65535 - 1024 + 1)) + 1024
this.command.push('-p', this.port.toString()) this.command.push('-p', this.port.toString())
this.command.push(
'-t', allConfig.controls.translation ?
allConfig.controls.targetLang : 'none'
)
if(allConfig.controls.engine === 'gummy') { if(allConfig.controls.engine === 'gummy') {
this.command.push('-e', 'gummy') this.command.push('-e', 'gummy')
this.command.push('-s', allConfig.controls.sourceLang) this.command.push('-s', allConfig.controls.sourceLang)
this.command.push(
'-t', allConfig.controls.translation ?
allConfig.controls.targetLang : 'none'
)
if(allConfig.controls.API_KEY) { if(allConfig.controls.API_KEY) {
this.command.push('-k', allConfig.controls.API_KEY) this.command.push('-k', allConfig.controls.API_KEY)
} }
} }
else if(allConfig.controls.engine === 'vosk'){ else if(allConfig.controls.engine === 'vosk'){
this.command.push('-e', 'vosk') this.command.push('-e', 'vosk')
this.command.push('-m', `"${allConfig.controls.modelPath}"`) this.command.push('-m', `"${allConfig.controls.modelPath}"`)
} }
} }
@@ -249,8 +248,11 @@ function handleEngineData(data: any) {
else if(data.command === 'caption') { else if(data.command === 'caption') {
allConfig.updateCaptionLog(data); allConfig.updateCaptionLog(data);
} }
else if(data.command === 'translation') {
allConfig.updateCaptionTranslation(data);
}
else if(data.command === 'print') { else if(data.command === 'print') {
Log.info('Engine Print:', data.content) console.log(data.content)
} }
else if(data.command === 'info') { else if(data.command === 'info') {
Log.info('Engine Info:', data.content) Log.info('Engine Info:', data.content)

View File

@@ -8,6 +8,7 @@
<div class="input-item"> <div class="input-item">
<span class="input-label">{{ $t('engine.sourceLang') }}</span> <span class="input-label">{{ $t('engine.sourceLang') }}</span>
<a-select <a-select
:disabled="currentEngine === 'vosk'"
class="input-area" class="input-area"
v-model:value="currentSourceLang" v-model:value="currentSourceLang"
:options="langList" :options="langList"
@@ -16,7 +17,6 @@
<div class="input-item"> <div class="input-item">
<span class="input-label">{{ $t('engine.transLang') }}</span> <span class="input-label">{{ $t('engine.transLang') }}</span>
<a-select <a-select
:disabled="currentEngine === 'vosk'"
class="input-area" class="input-area"
v-model:value="currentTargetLang" v-model:value="currentTargetLang"
:options="langList.filter((item) => item.value !== 'auto')" :options="langList.filter((item) => item.value !== 'auto')"
@@ -222,7 +222,10 @@ watch(changeSignal, (val) => {
watch(currentEngine, (val) => { watch(currentEngine, (val) => {
if(val == 'vosk'){ if(val == 'vosk'){
currentSourceLang.value = 'auto' currentSourceLang.value = 'auto'
currentTargetLang.value = '' currentTargetLang.value = useGeneralSettingStore().uiLanguage
if(currentTargetLang.value === 'zh') {
currentTargetLang.value = 'zh-cn'
}
} }
else if(val == 'gummy'){ else if(val == 'gummy'){
currentSourceLang.value = 'auto' currentSourceLang.value = 'auto'

View File

@@ -21,6 +21,15 @@ export const engines = {
label: '本地 - Vosk', label: '本地 - Vosk',
languages: [ languages: [
{ value: 'auto', label: '需要自行配置模型' }, { value: 'auto', label: '需要自行配置模型' },
{ value: 'en', label: '英语' },
{ value: 'zh-cn', label: '中文' },
{ value: 'ja', label: '日语' },
{ value: 'ko', label: '韩语' },
{ value: 'de', label: '德语' },
{ value: 'fr', label: '法语' },
{ value: 'ru', label: '俄语' },
{ value: 'es', label: '西班牙语' },
{ value: 'it', label: '意大利语' },
] ]
} }
], ],
@@ -46,6 +55,15 @@ export const engines = {
label: 'Local - Vosk', label: 'Local - Vosk',
languages: [ languages: [
{ value: 'auto', label: 'Model needs to be configured manually' }, { value: 'auto', label: 'Model needs to be configured manually' },
{ value: 'en', label: 'English' },
{ value: 'zh-cn', label: 'Chinese' },
{ value: 'ja', label: 'Japanese' },
{ value: 'ko', label: 'Korean' },
{ value: 'de', label: 'German' },
{ value: 'fr', label: 'French' },
{ value: 'ru', label: 'Russian' },
{ value: 'es', label: 'Spanish' },
{ value: 'it', label: 'Italian' },
] ]
} }
], ],
@@ -71,6 +89,15 @@ export const engines = {
label: 'ローカル - Vosk', label: 'ローカル - Vosk',
languages: [ languages: [
{ value: 'auto', label: 'モデルを手動で設定する必要があります' }, { value: 'auto', label: 'モデルを手動で設定する必要があります' },
{ value: 'en', label: '英語' },
{ value: 'zh-cn', label: '中国語' },
{ value: 'ja', label: '日本語' },
{ value: 'ko', label: '韓国語' },
{ value: 'de', label: 'ドイツ語' },
{ value: 'fr', label: 'フランス語' },
{ value: 'ru', label: 'ロシア語' },
{ value: 'es', label: 'スペイン語' },
{ value: 'it', label: 'イタリア語' },
] ]
} }
] ]

View File

@@ -15,7 +15,12 @@ export const useCaptionLogStore = defineStore('captionLog', () => {
}) })
window.electron.ipcRenderer.on('both.captionLog.upd', (_, log) => { window.electron.ipcRenderer.on('both.captionLog.upd', (_, log) => {
captionData.value.splice(captionData.value.length - 1, 1, log) for(let i = captionData.value.length - 1; i >= 0; i--) {
if(captionData.value[i].time_s === log.time_s){
captionData.value.splice(i, 1, log)
break
}
}
}) })
window.electron.ipcRenderer.on('both.captionLog.set', (_, logs) => { window.electron.ipcRenderer.on('both.captionLog.set', (_, logs) => {