feat(translation): 添加非实时翻译功能用户界面组件

This commit is contained in:
himeditator
2025-09-04 23:41:22 +08:00
parent 14987cbfc5
commit 2b7ce06f04
15 changed files with 193 additions and 77 deletions

View File

@@ -4,7 +4,7 @@ import time
from datetime import datetime
from vosk import Model, KaldiRecognizer, SetLogLevel
from utils import stdout_cmd, stdout_obj, google_translate
from utils import stdout_cmd, stdout_obj, google_translate, ollama_translate
class VoskRecognizer:
@@ -14,8 +14,10 @@ class VoskRecognizer:
初始化参数:
model_path: Vosk 识别模型路径
target: 翻译目标语言
trans_model: 翻译模型名称
ollama_name: Ollama 模型名称
"""
def __init__(self, model_path: str, target: str | None):
def __init__(self, model_path: str, target: str | None, trans_model: str, ollama_name: str):
SetLogLevel(-1)
if model_path.startswith('"'):
model_path = model_path[1:]
@@ -23,8 +25,12 @@ class VoskRecognizer:
model_path = model_path[:-1]
self.model_path = model_path
self.target = target
if trans_model == 'google':
self.trans_func = google_translate
else:
self.trans_func = ollama_translate
self.ollama_name = ollama_name
self.time_str = ''
self.trans_time = time.time()
self.cur_id = 0
self.prev_content = ''
@@ -58,8 +64,8 @@ class VoskRecognizer:
if self.target:
self.trans_time = time.time()
th = threading.Thread(
target=google_translate,
args=(caption['text'], self.target, self.time_str)
target=self.trans_func,
args=(self.ollama_name, self.target, caption['text'], self.time_str)
)
th.start()
else:
@@ -75,13 +81,6 @@ class VoskRecognizer:
self.prev_content = content
stdout_obj(caption)
if self.target and time.time() - self.trans_time > 2.0:
self.trans_time = time.time()
th = threading.Thread(
target=google_translate,
args=(caption['text'], self.target, self.time_str)
)
th.start()
def stop(self):
"""停止 Vosk 引擎"""

View File

@@ -44,10 +44,13 @@ def main_gummy(s: str, t: str, a: int, c: int, k: str):
engine.stop()
def main_vosk(a: int, c: int, m: str, t: str):
def main_vosk(a: int, c: int, m: str, t: str, tm: str, on: str):
global thread_data
stream = AudioStream(a, c)
engine = VoskRecognizer(m, None if t == 'none' else t)
engine = VoskRecognizer(
m, None if t == 'none' else t,
tm, on
)
stream.open_stream()
engine.start()
@@ -78,6 +81,8 @@ if __name__ == "__main__":
parser.add_argument('-k', '--api_key', default='', help='API KEY for Gummy model')
# vosk only
parser.add_argument('-m', '--model_path', default='', help='The path to the vosk model.')
parser.add_argument('-tm', '--translation_model', default='', help='Google translate API KEY')
parser.add_argument('-on', '--ollama_name', default='', help='Ollama model name for translation')
args = parser.parse_args()
if int(args.port) == 0:
@@ -98,7 +103,9 @@ if __name__ == "__main__":
int(args.audio_type),
int(args.chunk_rate),
args.model_path,
args.target_language
args.target_language,
args.translation_model,
args.ollama_name
)
else:
raise ValueError('Invalid caption engine specified.')

View File

@@ -2,7 +2,7 @@ from ollama import chat
from ollama import ChatResponse
import asyncio
from googletrans import Translator
from .sysout import stdout, stdout_obj
from .sysout import stdout_cmd, stdout_obj
lang_map = {
'en': 'English',
@@ -13,38 +13,29 @@ lang_map = {
'ru': 'Russian',
'ja': 'Japanese',
'ko': 'Korean',
'zh': 'Chinese'
'zh-cn': 'Chinese'
}
def ollama_translate(model: str, target: str, text: str, time_s: str):
    """Translate *text* with a local Ollama model and emit the result.

    Sends a single (non-streaming) chat request to the Ollama model named
    *model*, strips any leading ``<think>…</think>`` reasoning block from
    the reply, and writes the translation to stdout via ``stdout_obj`` as a
    ``{"command": "translation", ...}`` object tagged with *time_s*.

    Args:
        model: Ollama model name to use for the chat request.
        target: Target-language code; looked up in ``lang_map`` to build
            the system prompt. Unknown codes fall back to the raw code
            instead of raising ``KeyError`` — this runs inside a background
            thread (see VoskRecognizer), where an exception would otherwise
            be swallowed silently.
        text: Source text to translate.
        time_s: Timestamp string used to correlate the translation with
            its caption on the consumer side.
    """
    # lang_map.get(...) instead of lang_map[...]: a missing key must not
    # crash the translation worker thread.
    target_language = lang_map.get(target, target)
    response: ChatResponse = chat(
        model=model,
        messages=[
            {"role": "system", "content": f"/no_think Translate the following content into {target_language}, and do not output any additional information."},
            {"role": "user", "content": text}
        ]
    )
    content = response.message.content or ""
    # Some reasoning models ignore /no_think and still emit a
    # <think>…</think> preamble; drop it before reporting the translation.
    if content.startswith('<think>'):
        end_tag = '</think>'
        index = content.find(end_tag)
        if index != -1:
            content = content[index + len(end_tag):]
    stdout_obj({
        "command": "translation",
        "time_s": time_s,
        "translation": content.strip()
    })
def google_translate(text: str, target: str, time_s: str):
def google_translate(model: str, target: str, text: str, time_s: str):
translator = Translator()
try:
res = asyncio.run(translator.translate(text, dest=target))
@@ -54,4 +45,4 @@ def google_translate(text: str, target: str, time_s: str):
"translation": res.text
})
except Exception as e:
stdout(f"Google Translation Request failed: {str(e)}")
stdout_cmd("warn", f"Google translation request failed, please check your network connection...")