mirror of
https://github.com/HiMeditator/auto-caption.git
synced 2026-02-03 19:24:43 +08:00
feat(vosk): 为 Vosk 模型添加非实时翻译功能 (#14)
- 添加 Ollama 大模型翻译和 Google 翻译(非实时),支持多种语言 - 为 Vosk 引擎添加非实时翻译 - 为新增的翻译功能添加和修改接口 - 修改 Electron 构建配置,之后不同平台构建无需修改构建文件
This commit is contained in:
12
README.md
12
README.md
@@ -188,15 +188,3 @@ npm run build:mac
|
||||
# For Linux
|
||||
npm run build:linux
|
||||
```
|
||||
|
||||
注意,根据不同的平台需要修改项目根目录下 `electron-builder.yml` 文件中的配置内容:
|
||||
|
||||
```yml
|
||||
extraResources:
|
||||
# For Windows
|
||||
- from: ./engine/dist/main.exe
|
||||
to: ./engine/main.exe
|
||||
# For macOS and Linux
|
||||
# - from: ./engine/dist/main
|
||||
# to: ./engine/main
|
||||
```
|
||||
|
||||
12
README_en.md
12
README_en.md
@@ -188,15 +188,3 @@ npm run build:mac
|
||||
# For Linux
|
||||
npm run build:linux
|
||||
```
|
||||
|
||||
Note: Depending on the target platform, you need to adjust the `extraResources` configuration in the `electron-builder.yml` file in the project root directory:
|
||||
|
||||
```yml
|
||||
extraResources:
|
||||
# For Windows
|
||||
- from: ./engine/dist/main.exe
|
||||
to: ./engine/main.exe
|
||||
# For macOS and Linux
|
||||
# - from: ./engine/dist/main
|
||||
# to: ./engine/main
|
||||
```
|
||||
12
README_ja.md
12
README_ja.md
@@ -188,15 +188,3 @@ npm run build:mac
|
||||
# Linux 用
|
||||
npm run build:linux
|
||||
```
|
||||
|
||||
注意: プラットフォームに応じて、プロジェクトルートディレクトリにある `electron-builder.yml` ファイルの設定内容を変更する必要があります:
|
||||
|
||||
```yml
|
||||
extraResources:
|
||||
# Windows 用
|
||||
- from: ./engine/dist/main.exe
|
||||
to: ./engine/main.exe
|
||||
# macOS と Linux 用
|
||||
# - from: ./engine/dist/main
|
||||
# to: ./engine/main
|
||||
```
|
||||
|
||||
@@ -58,6 +58,18 @@ Electron 主进程通过 TCP Socket 向 Python 进程发送数据。发送的数
|
||||
|
||||
Python 端将监听到的音频流转换后得到的字幕数据。
|
||||
|
||||
### `translation`
|
||||
|
||||
```js
|
||||
{
|
||||
command: "translation",
|
||||
time_s: string,
|
||||
translation: string
|
||||
}
|
||||
```
|
||||
|
||||
语音识别内容的翻译,可根据起始时间 `time_s` 确定其对应的字幕。
|
||||
|
||||
### `print`
|
||||
|
||||
```js
|
||||
@@ -67,7 +79,7 @@ Python 端监听到的音频流转换为的字幕数据。
|
||||
}
|
||||
```
|
||||
|
||||
输出 Python 端打印的内容。
|
||||
输出 Python 端打印的内容,不计入日志。
|
||||
|
||||
### `info`
|
||||
|
||||
@@ -78,7 +90,7 @@ Python 端监听到的音频流转换为的字幕数据。
|
||||
}
|
||||
```
|
||||
|
||||
Python 端打印的提示信息,比起 `print`,该信息更希望 Electron 端的关注。
|
||||
Python 端打印的提示信息,会计入日志。
|
||||
|
||||
### `error`
|
||||
|
||||
|
||||
@@ -15,14 +15,13 @@ files:
|
||||
- '!assets/*'
|
||||
- '!.repomap/*'
|
||||
- '!.virtualme/*'
|
||||
|
||||
extraResources:
|
||||
# For Windows
|
||||
- from: ./engine/dist/main.exe
|
||||
to: ./engine/main.exe
|
||||
# For macOS and Linux
|
||||
# - from: ./engine/dist/main
|
||||
# to: ./engine/main
|
||||
- from: ./engine/dist/main
|
||||
to: ./engine/main
|
||||
win:
|
||||
executableName: auto-caption
|
||||
icon: build/icon.png
|
||||
|
||||
@@ -1,8 +1,10 @@
|
||||
import json
|
||||
import threading
|
||||
import time
|
||||
from datetime import datetime
|
||||
|
||||
from vosk import Model, KaldiRecognizer, SetLogLevel
|
||||
from utils import stdout_cmd, stdout_obj
|
||||
from utils import stdout_cmd, stdout_obj, google_translate
|
||||
|
||||
|
||||
class VoskRecognizer:
|
||||
@@ -11,15 +13,18 @@ class VoskRecognizer:
|
||||
|
||||
初始化参数:
|
||||
model_path: Vosk 识别模型路径
|
||||
target: 翻译目标语言
|
||||
"""
|
||||
def __init__(self, model_path: str):
|
||||
def __init__(self, model_path: str, target: str | None):
|
||||
SetLogLevel(-1)
|
||||
if model_path.startswith('"'):
|
||||
model_path = model_path[1:]
|
||||
if model_path.endswith('"'):
|
||||
model_path = model_path[:-1]
|
||||
self.model_path = model_path
|
||||
self.target = target
|
||||
self.time_str = ''
|
||||
self.trans_time = time.time()
|
||||
self.cur_id = 0
|
||||
self.prev_content = ''
|
||||
|
||||
@@ -48,7 +53,15 @@ class VoskRecognizer:
|
||||
caption['time_s'] = self.time_str
|
||||
caption['time_t'] = datetime.now().strftime('%H:%M:%S.%f')[:-3]
|
||||
self.prev_content = ''
|
||||
if content == '': return
|
||||
self.cur_id += 1
|
||||
if self.target:
|
||||
self.trans_time = time.time()
|
||||
th = threading.Thread(
|
||||
target=google_translate,
|
||||
args=(caption['text'], self.target, self.time_str)
|
||||
)
|
||||
th.start()
|
||||
else:
|
||||
content = json.loads(self.recognizer.PartialResult()).get('partial', '')
|
||||
if content == '' or content == self.prev_content:
|
||||
@@ -62,6 +75,13 @@ class VoskRecognizer:
|
||||
self.prev_content = content
|
||||
|
||||
stdout_obj(caption)
|
||||
if self.target and time.time() - self.trans_time > 2.0:
|
||||
self.trans_time = time.time()
|
||||
th = threading.Thread(
|
||||
target=google_translate,
|
||||
args=(caption['text'], self.target, self.time_str)
|
||||
)
|
||||
th.start()
|
||||
|
||||
def stop(self):
|
||||
"""停止 Vosk 引擎"""
|
||||
|
||||
@@ -44,10 +44,10 @@ def main_gummy(s: str, t: str, a: int, c: int, k: str):
|
||||
engine.stop()
|
||||
|
||||
|
||||
def main_vosk(a: int, c: int, m: str):
|
||||
def main_vosk(a: int, c: int, m: str, t: str):
|
||||
global thread_data
|
||||
stream = AudioStream(a, c)
|
||||
engine = VoskRecognizer(m)
|
||||
engine = VoskRecognizer(m, None if t == 'none' else t)
|
||||
|
||||
stream.open_stream()
|
||||
engine.start()
|
||||
@@ -72,9 +72,9 @@ if __name__ == "__main__":
|
||||
parser.add_argument('-a', '--audio_type', default=0, help='Audio stream source: 0 for output, 1 for input')
|
||||
parser.add_argument('-c', '--chunk_rate', default=10, help='Number of audio stream chunks collected per second')
|
||||
parser.add_argument('-p', '--port', default=8080, help='The port to run the server on, 0 for no server')
|
||||
parser.add_argument('-t', '--target_language', default='zh', help='Target language code, "none" for no translation')
|
||||
# gummy only
|
||||
parser.add_argument('-s', '--source_language', default='en', help='Source language code')
|
||||
parser.add_argument('-t', '--target_language', default='zh', help='Target language code')
|
||||
parser.add_argument('-k', '--api_key', default='', help='API KEY for Gummy model')
|
||||
# vosk only
|
||||
parser.add_argument('-m', '--model_path', default='', help='The path to the vosk model.')
|
||||
@@ -97,7 +97,8 @@ if __name__ == "__main__":
|
||||
main_vosk(
|
||||
int(args.audio_type),
|
||||
int(args.chunk_rate),
|
||||
args.model_path
|
||||
args.model_path,
|
||||
args.target_language
|
||||
)
|
||||
else:
|
||||
raise ValueError('Invalid caption engine specified.')
|
||||
|
||||
@@ -5,3 +5,5 @@ vosk
|
||||
pyinstaller
|
||||
pyaudio; sys_platform == 'darwin'
|
||||
pyaudiowpatch; sys_platform == 'win32'
|
||||
googletrans
|
||||
ollama
|
||||
|
||||
@@ -6,4 +6,5 @@ from .audioprcs import (
|
||||
)
|
||||
from .sysout import stdout, stdout_err, stdout_cmd, stdout_obj, stderr
|
||||
from .thdata import thread_data
|
||||
from .server import start_server
|
||||
from .server import start_server
|
||||
from .translation import ollama_translate, google_translate
|
||||
57
engine/utils/translation.py
Normal file
57
engine/utils/translation.py
Normal file
@@ -0,0 +1,57 @@
|
||||
from ollama import chat
|
||||
from ollama import ChatResponse
|
||||
import asyncio
|
||||
from googletrans import Translator
|
||||
from .sysout import stdout, stdout_obj
|
||||
|
||||
# Language code -> English language name, interpolated into the system
# prompt that ollama_translate sends to the model.
lang_map = dict(
    en='English',
    es='Spanish',
    fr='French',
    de='German',
    it='Italian',
    ru='Russian',
    ja='Japanese',
    ko='Korean',
    zh='Chinese',
)
|
||||
|
||||
def ollama_translate(model: str, target: str, text: str, chunk_size: int = 3):
    """Stream a translation of ``text`` from an Ollama model to standard output.

    The model is asked to translate ``text`` into the target language; the
    streamed answer is printed in groups of ``chunk_size`` chunks, with line
    breaks inside the answer replaced by spaces. A ``<think>...</think>``
    section at the start of the answer is dropped.

    Args:
        model: Name of the Ollama model passed to ``chat``.
        target: Target language code. It is looked up in ``lang_map``; a code
            without an entry is used verbatim in the prompt.
        text: Content to translate.
        chunk_size: Number of streamed chunks buffered before each print.
    """
    # Fall back to the raw code rather than raising KeyError for codes that
    # are not listed in lang_map (for example 'zh-cn').
    language = lang_map.get(target, target)
    stream = chat(
        model=model,
        messages=[
            {"role": "system", "content": f"/no_think Translate the following content into {language}, and do not output any additional information."},
            {"role": "user", "content": text}
        ],
        stream=True
    )

    buffered = []
    in_thinking = False
    started = False  # becomes True once the first non-blank chunk was seen
    for chunk in stream:
        content = chunk['message']['content']

        # Only look for an opening <think> tag before the answer has begun,
        # so a later chunk that happens to start with "<think>" is not
        # mistaken for a reasoning section.
        if not started:
            if content.lstrip().startswith("<think>"):
                in_thinking = True
            if content.strip():
                started = True

        if in_thinking:
            _, closing_tag, content = content.partition("</think>")
            if not closing_tag:
                continue
            in_thinking = False
            # Keep answer text that arrives in the same chunk as the
            # closing tag instead of discarding it.
            content = content.lstrip()
            if not content:
                continue

        buffered.append(content.replace('\n', ' '))
        if len(buffered) >= chunk_size:
            # Flush so partial output reaches the consumer while streaming.
            print(''.join(buffered), end='', flush=True)
            buffered.clear()

    if buffered:
        print(''.join(buffered), flush=True)
|
||||
|
||||
def google_translate(text: str, target: str, time_s: str):
    """Translate ``text`` with Google Translate and emit the result.

    On success a ``translation`` command object carrying ``time_s`` and the
    translated text is sent through ``stdout_obj``. Any failure is reported
    through ``stdout`` instead of being raised.

    Args:
        text: Content to translate.
        target: Destination language code passed to the translator.
        time_s: Value echoed back in the ``time_s`` field of the result.
    """
    client = Translator()
    try:
        # translate() is awaited via asyncio.run, giving each call its own
        # event loop.
        result = asyncio.run(client.translate(text, dest=target))
        payload = {
            "command": "translation",
            "time_s": time_s,
            "translation": result.text
        }
        stdout_obj(payload)
    except Exception as err:
        stdout(f"Google Translation Request failed: {str(err)}")
|
||||
@@ -46,6 +46,11 @@ export interface CaptionItem {
|
||||
translation: string
|
||||
}
|
||||
|
||||
/** Translation of one caption, matched to its caption through `time_s`. */
export interface CaptionTranslation {
  /** Start time of the caption this translation belongs to. */
  time_s: string;
  /** Translated text. */
  translation: string;
}
|
||||
|
||||
export interface SoftwareLogItem {
|
||||
type: "INFO" | "WARN" | "ERROR",
|
||||
index: number,
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
import {
|
||||
UILanguage, UITheme, Styles, Controls,
|
||||
CaptionItem, FullConfig, SoftwareLogItem
|
||||
CaptionItem, CaptionTranslation,
|
||||
FullConfig, SoftwareLogItem
|
||||
} from '../types'
|
||||
import { Log } from './Log'
|
||||
import { app, BrowserWindow } from 'electron'
|
||||
@@ -158,12 +159,28 @@ class AllConfig {
|
||||
}
|
||||
}
|
||||
|
||||
public sendCaptionLog(window: BrowserWindow, command: 'add' | 'upd' | 'set') {
|
||||
/**
 * Store a translation on the caption whose `time_s` equals `trans.time_s`.
 *
 * The caption log is searched from the newest entry backwards; the first
 * match is updated and an `upd` message for that entry is sent to every
 * open window. Nothing happens when no entry matches.
 */
public updateCaptionTranslation(trans: CaptionTranslation){
  let idx = this.captionLog.length
  while(idx-- > 0){
    const entry = this.captionLog[idx]
    if(entry.time_s !== trans.time_s) continue

    entry.translation = trans.translation
    BrowserWindow.getAllWindows().forEach((win) => {
      this.sendCaptionLog(win, 'upd', idx)
    })
    return
  }
}
|
||||
public sendCaptionLog(
|
||||
window: BrowserWindow,
|
||||
command: 'add' | 'upd' | 'set',
|
||||
index: number | undefined = undefined
|
||||
) {
|
||||
if(command === 'add'){
|
||||
window.webContents.send(`both.captionLog.add`, this.captionLog[this.captionLog.length - 1])
|
||||
window.webContents.send(`both.captionLog.add`, this.captionLog.at(-1))
|
||||
}
|
||||
else if(command === 'upd'){
|
||||
window.webContents.send(`both.captionLog.upd`, this.captionLog[this.captionLog.length - 1])
|
||||
if(index !== undefined) window.webContents.send(`both.captionLog.upd`, this.captionLog[index])
|
||||
else window.webContents.send(`both.captionLog.upd`, this.captionLog.at(-1))
|
||||
}
|
||||
else if(command === 'set'){
|
||||
window.webContents.send(`both.captionLog.set`, this.captionLog)
|
||||
|
||||
@@ -67,21 +67,20 @@ export class CaptionEngine {
|
||||
this.command.push('-a', allConfig.controls.audio ? '1' : '0')
|
||||
this.port = Math.floor(Math.random() * (65535 - 1024 + 1)) + 1024
|
||||
this.command.push('-p', this.port.toString())
|
||||
this.command.push(
|
||||
'-t', allConfig.controls.translation ?
|
||||
allConfig.controls.targetLang : 'none'
|
||||
)
|
||||
|
||||
if(allConfig.controls.engine === 'gummy') {
|
||||
this.command.push('-e', 'gummy')
|
||||
this.command.push('-s', allConfig.controls.sourceLang)
|
||||
this.command.push(
|
||||
'-t', allConfig.controls.translation ?
|
||||
allConfig.controls.targetLang : 'none'
|
||||
)
|
||||
if(allConfig.controls.API_KEY) {
|
||||
this.command.push('-k', allConfig.controls.API_KEY)
|
||||
}
|
||||
}
|
||||
else if(allConfig.controls.engine === 'vosk'){
|
||||
this.command.push('-e', 'vosk')
|
||||
|
||||
this.command.push('-m', `"${allConfig.controls.modelPath}"`)
|
||||
}
|
||||
}
|
||||
@@ -249,8 +248,11 @@ function handleEngineData(data: any) {
|
||||
else if(data.command === 'caption') {
|
||||
allConfig.updateCaptionLog(data);
|
||||
}
|
||||
else if(data.command === 'translation') {
|
||||
allConfig.updateCaptionTranslation(data);
|
||||
}
|
||||
else if(data.command === 'print') {
|
||||
Log.info('Engine Print:', data.content)
|
||||
console.log(data.content)
|
||||
}
|
||||
else if(data.command === 'info') {
|
||||
Log.info('Engine Info:', data.content)
|
||||
|
||||
@@ -8,6 +8,7 @@
|
||||
<div class="input-item">
|
||||
<span class="input-label">{{ $t('engine.sourceLang') }}</span>
|
||||
<a-select
|
||||
:disabled="currentEngine === 'vosk'"
|
||||
class="input-area"
|
||||
v-model:value="currentSourceLang"
|
||||
:options="langList"
|
||||
@@ -16,7 +17,6 @@
|
||||
<div class="input-item">
|
||||
<span class="input-label">{{ $t('engine.transLang') }}</span>
|
||||
<a-select
|
||||
:disabled="currentEngine === 'vosk'"
|
||||
class="input-area"
|
||||
v-model:value="currentTargetLang"
|
||||
:options="langList.filter((item) => item.value !== 'auto')"
|
||||
@@ -222,7 +222,10 @@ watch(changeSignal, (val) => {
|
||||
watch(currentEngine, (val) => {
|
||||
if(val == 'vosk'){
|
||||
currentSourceLang.value = 'auto'
|
||||
currentTargetLang.value = ''
|
||||
currentTargetLang.value = useGeneralSettingStore().uiLanguage
|
||||
if(currentTargetLang.value === 'zh') {
|
||||
currentTargetLang.value = 'zh-cn'
|
||||
}
|
||||
}
|
||||
else if(val == 'gummy'){
|
||||
currentSourceLang.value = 'auto'
|
||||
|
||||
@@ -21,6 +21,15 @@ export const engines = {
|
||||
label: '本地 - Vosk',
|
||||
languages: [
|
||||
{ value: 'auto', label: '需要自行配置模型' },
|
||||
{ value: 'en', label: '英语' },
|
||||
{ value: 'zh-cn', label: '中文' },
|
||||
{ value: 'ja', label: '日语' },
|
||||
{ value: 'ko', label: '韩语' },
|
||||
{ value: 'de', label: '德语' },
|
||||
{ value: 'fr', label: '法语' },
|
||||
{ value: 'ru', label: '俄语' },
|
||||
{ value: 'es', label: '西班牙语' },
|
||||
{ value: 'it', label: '意大利语' },
|
||||
]
|
||||
}
|
||||
],
|
||||
@@ -46,6 +55,15 @@ export const engines = {
|
||||
label: 'Local - Vosk',
|
||||
languages: [
|
||||
{ value: 'auto', label: 'Model needs to be configured manually' },
|
||||
{ value: 'en', label: 'English' },
|
||||
{ value: 'zh-cn', label: 'Chinese' },
|
||||
{ value: 'ja', label: 'Japanese' },
|
||||
{ value: 'ko', label: 'Korean' },
|
||||
{ value: 'de', label: 'German' },
|
||||
{ value: 'fr', label: 'French' },
|
||||
{ value: 'ru', label: 'Russian' },
|
||||
{ value: 'es', label: 'Spanish' },
|
||||
{ value: 'it', label: 'Italian' },
|
||||
]
|
||||
}
|
||||
],
|
||||
@@ -71,6 +89,15 @@ export const engines = {
|
||||
label: 'ローカル - Vosk',
|
||||
languages: [
|
||||
{ value: 'auto', label: 'モデルを手動で設定する必要があります' },
|
||||
{ value: 'en', label: '英語' },
|
||||
{ value: 'zh-cn', label: '中国語' },
|
||||
{ value: 'ja', label: '日本語' },
|
||||
{ value: 'ko', label: '韓国語' },
|
||||
{ value: 'de', label: 'ドイツ語' },
|
||||
{ value: 'fr', label: 'フランス語' },
|
||||
{ value: 'ru', label: 'ロシア語' },
|
||||
{ value: 'es', label: 'スペイン語' },
|
||||
{ value: 'it', label: 'イタリア語' },
|
||||
]
|
||||
}
|
||||
]
|
||||
|
||||
@@ -15,7 +15,12 @@ export const useCaptionLogStore = defineStore('captionLog', () => {
|
||||
})
|
||||
|
||||
window.electron.ipcRenderer.on('both.captionLog.upd', (_, log) => {
|
||||
captionData.value.splice(captionData.value.length - 1, 1, log)
|
||||
for(let i = captionData.value.length - 1; i >= 0; i--) {
|
||||
if(captionData.value[i].time_s === log.time_s){
|
||||
captionData.value.splice(i, 1, log)
|
||||
break
|
||||
}
|
||||
}
|
||||
})
|
||||
|
||||
window.electron.ipcRenderer.on('both.captionLog.set', (_, logs) => {
|
||||
|
||||
Reference in New Issue
Block a user