diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md
index cbdcae4..09f264c 100644
--- a/docs/CHANGELOG.md
+++ b/docs/CHANGELOG.md
@@ -153,4 +153,18 @@
### 优化体验
- 优化软件用户界面的部分组件
-- 更清晰的日志输出
\ No newline at end of file
+- 更清晰的日志输出
+
+
+## v0.8.0
+
+2025-09-??
+
+### 新增功能
+
+- 字幕引擎添加超时关闭功能:如果在规定时间字幕引擎没有启动成功会自动关闭、在字幕引擎启动过程中也可选择关闭字幕引擎
+- 添加非实时翻译功能:支持调用 Ollama 本地模型进行翻译、支持调用 Google 翻译 API 进行翻译
+
+### 优化体验
+
+- 带有额外信息的标签颜色改为与主题色一致
\ No newline at end of file
diff --git a/docs/api-docs/caption-engine.md b/docs/api-docs/caption-engine.md
index 3c03335..be2f0ad 100644
--- a/docs/api-docs/caption-engine.md
+++ b/docs/api-docs/caption-engine.md
@@ -92,6 +92,17 @@ Python 端监听到的音频流转换为的字幕数据。
Python 端打印的提示信息,会计入日志。
+### `warn`
+
+```js
+{
+ command: "warn",
+ content: string
+}
+```
+
+Python 端打印的警告信息,会计入日志。
+
### `error`
```js
@@ -101,7 +112,7 @@ Python 端打印的提示信息,会计入日志。
}
```
-Python 端打印的错误信息,该错误信息需要在前端弹窗显示。
+Python 端打印的错误信息,该错误信息会在前端弹窗显示。
### `usage`
diff --git a/engine/audio2text/vosk.py b/engine/audio2text/vosk.py
index 0355adb..850a85d 100644
--- a/engine/audio2text/vosk.py
+++ b/engine/audio2text/vosk.py
@@ -4,7 +4,7 @@ import time
from datetime import datetime
from vosk import Model, KaldiRecognizer, SetLogLevel
-from utils import stdout_cmd, stdout_obj, google_translate
+from utils import stdout_cmd, stdout_obj, google_translate, ollama_translate
class VoskRecognizer:
@@ -14,8 +14,10 @@ class VoskRecognizer:
初始化参数:
model_path: Vosk 识别模型路径
target: 翻译目标语言
+ trans_model: 翻译模型名称
+ ollama_name: Ollama 模型名称
"""
- def __init__(self, model_path: str, target: str | None):
+ def __init__(self, model_path: str, target: str | None, trans_model: str, ollama_name: str):
SetLogLevel(-1)
if model_path.startswith('"'):
model_path = model_path[1:]
@@ -23,8 +25,12 @@ class VoskRecognizer:
model_path = model_path[:-1]
self.model_path = model_path
self.target = target
+ if trans_model == 'google':
+ self.trans_func = google_translate
+ else:
+ self.trans_func = ollama_translate
+ self.ollama_name = ollama_name
self.time_str = ''
- self.trans_time = time.time()
self.cur_id = 0
self.prev_content = ''
@@ -58,8 +64,8 @@ class VoskRecognizer:
if self.target:
self.trans_time = time.time()
th = threading.Thread(
- target=google_translate,
- args=(caption['text'], self.target, self.time_str)
+ target=self.trans_func,
+ args=(self.ollama_name, self.target, caption['text'], self.time_str)
)
th.start()
else:
@@ -75,13 +81,6 @@ class VoskRecognizer:
self.prev_content = content
stdout_obj(caption)
- if self.target and time.time() - self.trans_time > 2.0:
- self.trans_time = time.time()
- th = threading.Thread(
- target=google_translate,
- args=(caption['text'], self.target, self.time_str)
- )
- th.start()
def stop(self):
"""停止 Vosk 引擎"""
diff --git a/engine/main.py b/engine/main.py
index 836f5b6..dd01e7c 100644
--- a/engine/main.py
+++ b/engine/main.py
@@ -44,10 +44,13 @@ def main_gummy(s: str, t: str, a: int, c: int, k: str):
engine.stop()
-def main_vosk(a: int, c: int, m: str, t: str):
+def main_vosk(a: int, c: int, m: str, t: str, tm: str, on: str):
global thread_data
stream = AudioStream(a, c)
- engine = VoskRecognizer(m, None if t == 'none' else t)
+ engine = VoskRecognizer(
+ m, None if t == 'none' else t,
+ tm, on
+ )
stream.open_stream()
engine.start()
@@ -78,6 +81,8 @@ if __name__ == "__main__":
parser.add_argument('-k', '--api_key', default='', help='API KEY for Gummy model')
# vosk only
parser.add_argument('-m', '--model_path', default='', help='The path to the vosk model.')
+ parser.add_argument('-tm', '--translation_model', default='', help='Translation backend for vosk: "google" or "ollama"')
+ parser.add_argument('-on', '--ollama_name', default='', help='Ollama model name for translation')
args = parser.parse_args()
if int(args.port) == 0:
@@ -98,7 +103,9 @@ if __name__ == "__main__":
int(args.audio_type),
int(args.chunk_rate),
args.model_path,
- args.target_language
+ args.target_language,
+ args.translation_model,
+ args.ollama_name
)
else:
raise ValueError('Invalid caption engine specified.')
diff --git a/engine/utils/translation.py b/engine/utils/translation.py
index d45e2bf..33d0c6f 100644
--- a/engine/utils/translation.py
+++ b/engine/utils/translation.py
@@ -2,7 +2,7 @@ from ollama import chat
from ollama import ChatResponse
import asyncio
from googletrans import Translator
-from .sysout import stdout, stdout_obj
+from .sysout import stdout_cmd, stdout_obj
lang_map = {
'en': 'English',
@@ -13,38 +13,29 @@ lang_map = {
'ru': 'Russian',
'ja': 'Japanese',
'ko': 'Korean',
- 'zh': 'Chinese'
+ 'zh-cn': 'Chinese'
}
-def ollama_translate(model: str, target: str, text: str, chunk_size = 3):
- stream = chat(
+def ollama_translate(model: str, target: str, text: str, time_s: str):
+ response: ChatResponse = chat(
model=model,
messages=[
{"role": "system", "content": f"/no_think Translate the following content into {lang_map[target]}, and do not output any additional information."},
{"role": "user", "content": text}
- ],
- stream=True
+ ]
)
- chunk_content = ""
- in_thinking = False
- count = 0
- for chunk in stream:
- if count == 0 and chunk['message']['content'].startswith("
{{ $t('engine.ollamaNote') }}
+ + {{ $t('engine.ollama') }} +{{ $t('engine.apikeyInfo') }}
- {{ $t('engine.apikey') }} + {{ $t('engine.apikey') }}{{ $t('engine.modelPathInfo') }}
- {{ $t('engine.modelPath') }} + {{ $t('engine.modelPath') }}{{ $t('engine.startTimeoutInfo') }}
{{ $t('engine.startTimeout') }}