feat(vosk): 为 Vosk 模型添加非实时翻译功能 (#14)

- 添加 Ollama 大模型翻译和 Google 翻译(非实时),支持多种语言
- 为 Vosk 引擎添加非实时翻译
- 为新增的翻译功能添加和修改接口
- 修改 Electron 构建配置,之后不同平台构建无需修改构建文件
This commit is contained in:
himeditator
2025-09-02 23:19:53 +08:00
parent 56fdc348f8
commit 14987cbfc5
16 changed files with 176 additions and 61 deletions

View File

@@ -1,8 +1,10 @@
import json
import threading
import time
from datetime import datetime
from vosk import Model, KaldiRecognizer, SetLogLevel
from utils import stdout_cmd, stdout_obj
from utils import stdout_cmd, stdout_obj, google_translate
class VoskRecognizer:
@@ -11,15 +13,18 @@ class VoskRecognizer:
初始化参数:
model_path: Vosk 识别模型路径
target: 翻译目标语言
"""
def __init__(self, model_path: str):
def __init__(self, model_path: str, target: str | None):
SetLogLevel(-1)
if model_path.startswith('"'):
model_path = model_path[1:]
if model_path.endswith('"'):
model_path = model_path[:-1]
self.model_path = model_path
self.target = target
self.time_str = ''
self.trans_time = time.time()
self.cur_id = 0
self.prev_content = ''
@@ -48,7 +53,15 @@ class VoskRecognizer:
caption['time_s'] = self.time_str
caption['time_t'] = datetime.now().strftime('%H:%M:%S.%f')[:-3]
self.prev_content = ''
if content == '': return
self.cur_id += 1
if self.target:
self.trans_time = time.time()
th = threading.Thread(
target=google_translate,
args=(caption['text'], self.target, self.time_str)
)
th.start()
else:
content = json.loads(self.recognizer.PartialResult()).get('partial', '')
if content == '' or content == self.prev_content:
@@ -62,6 +75,13 @@ class VoskRecognizer:
self.prev_content = content
stdout_obj(caption)
if self.target and time.time() - self.trans_time > 2.0:
self.trans_time = time.time()
th = threading.Thread(
target=google_translate,
args=(caption['text'], self.target, self.time_str)
)
th.start()
def stop(self):
"""停止 Vosk 引擎"""

View File

@@ -44,10 +44,10 @@ def main_gummy(s: str, t: str, a: int, c: int, k: str):
engine.stop()
def main_vosk(a: int, c: int, m: str):
def main_vosk(a: int, c: int, m: str, t: str):
global thread_data
stream = AudioStream(a, c)
engine = VoskRecognizer(m)
engine = VoskRecognizer(m, None if t == 'none' else t)
stream.open_stream()
engine.start()
@@ -72,9 +72,9 @@ if __name__ == "__main__":
parser.add_argument('-a', '--audio_type', default=0, help='Audio stream source: 0 for output, 1 for input')
parser.add_argument('-c', '--chunk_rate', default=10, help='Number of audio stream chunks collected per second')
parser.add_argument('-p', '--port', default=8080, help='The port to run the server on, 0 for no server')
parser.add_argument('-t', '--target_language', default='zh', help='Target language code, "none" for no translation')
# gummy only
parser.add_argument('-s', '--source_language', default='en', help='Source language code')
parser.add_argument('-t', '--target_language', default='zh', help='Target language code')
parser.add_argument('-k', '--api_key', default='', help='API KEY for Gummy model')
# vosk only
parser.add_argument('-m', '--model_path', default='', help='The path to the vosk model.')
@@ -97,7 +97,8 @@ if __name__ == "__main__":
main_vosk(
int(args.audio_type),
int(args.chunk_rate),
args.model_path
args.model_path,
args.target_language
)
else:
raise ValueError('Invalid caption engine specified.')

View File

@@ -5,3 +5,5 @@ vosk
pyinstaller
pyaudio; sys_platform == 'darwin'
pyaudiowpatch; sys_platform == 'win32'
googletrans
ollama

View File

@@ -6,4 +6,5 @@ from .audioprcs import (
)
from .sysout import stdout, stdout_err, stdout_cmd, stdout_obj, stderr
from .thdata import thread_data
from .server import start_server
from .server import start_server
from .translation import ollama_translate, google_translate

View File

@@ -0,0 +1,57 @@
from ollama import chat
from ollama import ChatResponse
import asyncio
from googletrans import Translator
from .sysout import stdout, stdout_obj
# Maps ISO-639-1 language codes (as passed on the command line) to the
# English language names that are interpolated into the Ollama system prompt.
lang_map = {
    'en': 'English',
    'es': 'Spanish',
    'fr': 'French',
    'de': 'German',
    'it': 'Italian',
    'ru': 'Russian',
    'ja': 'Japanese',
    'ko': 'Korean',
    'zh': 'Chinese'
}
def ollama_translate(model: str, target: str, text: str, chunk_size = 3):
    """Stream a translation of *text* into the target language via a local Ollama model.

    The translated text is written to stdout incrementally — roughly every
    *chunk_size* streamed chunks — so a downstream consumer sees partial
    results while the model is still generating.

    Args:
        model: Name of the Ollama model to run (passed to ollama.chat).
        target: ISO-639-1 target language code; looked up in lang_map, and
            used verbatim in the prompt when not one of the known codes
            (previously an unknown code raised KeyError).
        text: Source text to translate.
        chunk_size: Number of streamed chunks to buffer before emitting.
    """
    stream = chat(
        model=model,
        messages=[
            # "/no_think" asks reasoning-capable models to skip their thinking phase.
            {"role": "system", "content": f"/no_think Translate the following content into {lang_map.get(target, target)}, and do not output any additional information."},
            {"role": "user", "content": text}
        ],
        stream=True
    )
    chunk_content = ""
    in_thinking = False
    count = 0
    for chunk in stream:
        piece = chunk['message']['content']
        # Some models still emit a <think>...</think> block; drop it entirely.
        if count == 0 and piece.startswith("<think>"):
            in_thinking = True
        if in_thinking:
            if "</think>" in piece:
                # NOTE(review): any translated text in the same chunk after
                # </think> is discarded — confirm chunks never mix both.
                in_thinking = False
            continue
        # Collapse embedded newlines so the translation stays on one line.
        chunk_content += ' '.join(piece.split('\n'))
        count += 1
        if count % chunk_size == 0:
            # flush=True: without it a piped stdout buffers the partial output,
            # defeating the purpose of streaming the translation.
            print(chunk_content, end='', flush=True)
            chunk_content = ""
            count = 0
    if chunk_content:
        print(chunk_content, flush=True)
def google_translate(text: str, target: str, time_s: str):
    """Translate *text* into *target* with Google Translate and emit the result on stdout.

    On success a {"command": "translation", ...} object tagged with the
    caption start time *time_s* is written via stdout_obj; on any failure
    the error is logged and the function returns silently (best-effort:
    a failed translation must not kill the caption stream).

    Args:
        text: Source text to translate.
        target: Target language code understood by googletrans (e.g. "zh").
        time_s: Start timestamp of the caption this translation belongs to.
    """
    async def _translate() -> str:
        # Use the async context manager so the Translator's underlying HTTP
        # client is closed — the previous code leaked one client per call.
        async with Translator() as translator:
            res = await translator.translate(text, dest=target)
            return res.text

    try:
        translation = asyncio.run(_translate())
        stdout_obj({
            "command": "translation",
            "time_s": time_s,
            "translation": translation
        })
    except Exception as e:
        stdout(f"Google Translation Request failed: {str(e)}")