From 3792eb88b6ac90374495ad3a9f64752b1abaeed6 Mon Sep 17 00:00:00 2001 From: himeditator Date: Sat, 26 Jul 2025 23:37:24 +0800 Subject: [PATCH 1/7] =?UTF-8?q?refactor(engine):=20=E9=87=8D=E6=9E=84?= =?UTF-8?q?=E5=AD=97=E5=B9=95=E5=BC=95=E6=93=8E?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 更新 GummyTranslator 类,优化字幕生成逻辑 - 移除 audioprcs 模块,音频处理功能转移到 utils 模块 - 重构 sysaudio 模块,提高音频流管理的灵活性和稳定性 - 修改 TODO.md,完成按时间降序排列字幕记录的功能 - 更新文档,说明因资源限制将不再维护英文和日文文档 --- .gitignore | 2 +- docs/TODO.md | 2 +- docs/api-docs/caption-engine.md | 51 +++++++++++++++++ docs/engine-manual/en.md | 2 + docs/engine-manual/ja.md | 2 + docs/user-manual/en.md | 2 + docs/user-manual/ja.md | 2 + engine/audio2text/gummy.py | 47 +++++++--------- engine/audioprcs/__init__.py | 1 - engine/main-gummy.py | 21 ++----- engine/main-vosk.py | 17 ++---- engine/sysaudio/__init__.py | 10 ++++ engine/sysaudio/darwin.py | 77 ++++++++++++++++---------- engine/sysaudio/linux.py | 46 +++++++++------ engine/sysaudio/win.py | 57 +++++++++++-------- engine/utils/__init__.py | 2 + engine/{audioprcs => utils}/process.py | 22 ++++---- engine/utils/sysout.py | 18 ++++++ 18 files changed, 245 insertions(+), 136 deletions(-) create mode 100644 docs/api-docs/caption-engine.md delete mode 100644 engine/audioprcs/__init__.py create mode 100644 engine/utils/__init__.py rename engine/{audioprcs => utils}/process.py (75%) create mode 100644 engine/utils/sysout.py diff --git a/.gitignore b/.gitignore index 374c927..81ae53f 100644 --- a/.gitignore +++ b/.gitignore @@ -9,4 +9,4 @@ __pycache__ subenv engine/build engine/models -engine/test +engine/notebook diff --git a/docs/TODO.md b/docs/TODO.md index 030c1c3..71bad11 100644 --- a/docs/TODO.md +++ b/docs/TODO.md @@ -15,10 +15,10 @@ - [x] 可以调整字幕时间轴 *2025/07/14* - [x] 可以导出 srt 格式的字幕记录 *2025/07/14* - [x] 可以获取字幕引擎的系统资源消耗情况 *2025/07/15* +- [x] 添加字幕记录按时间降序排列选择 *2025/07/26* ## 待完成 -- [ ] 修改字幕记录展示逻辑 - [ ] 重构字幕引擎 - [ ] 验证 / 添加基于 sherpa-onnx 的字幕引擎 
diff --git a/docs/api-docs/caption-engine.md b/docs/api-docs/caption-engine.md new file mode 100644 index 0000000..461db42 --- /dev/null +++ b/docs/api-docs/caption-engine.md @@ -0,0 +1,51 @@ +# caption engine api-doc + +本文档主要 Electron 主进程和字幕引擎进程的通信约定。 + +## 原理说明 + +本项目的 Python 进程通过标准输出向 Electron 主进程发送数据。 + +Python 进程标准输出 (`sys.stdout`) 的内容一定为一行一行的字符串。且每行字符串均可以解释为一个 JSON 对象。每个 JSON 对象一定有 `command` 参数。 + +## 输出约定 + +当 JSON 对象的 `command` 参数为下列值时,表示的对应的含义: + +### `print` + +```js +{ + command: "print", + content: string +} +``` + +输出 Python 端打印的内容。 + +### `info` + +```js +{ + command: "info", + content: string +} +``` + +Python 端打印的提示信息。 + +### `caption` + +```js +{ + command: "caption", + index: number, + time_s: string, + time_t: string, + end: boolean, + text: string, + translation: string +} +``` + +Python 端监听到的音频流转换为的字幕数据。 \ No newline at end of file diff --git a/docs/engine-manual/en.md b/docs/engine-manual/en.md index b060855..78d6e27 100644 --- a/docs/engine-manual/en.md +++ b/docs/engine-manual/en.md @@ -2,6 +2,8 @@ Corresponding Version: v0.5.1 +**Note: Due to limited personal resources, the English and Japanese documentation files for this project (except for the README document) will no longer be maintained. The content of this document may not be consistent with the latest version of the project. 
If you are willing to help with translation, please submit relevant Pull Requests.** + ![](../../assets/media/structure_en.png) ## Introduction to the Caption Engine diff --git a/docs/engine-manual/ja.md b/docs/engine-manual/ja.md index f1d9abe..4f9efcc 100644 --- a/docs/engine-manual/ja.md +++ b/docs/engine-manual/ja.md @@ -4,6 +4,8 @@ この文書は大規模モデルを使用して翻訳されていますので、内容に正確でない部分があるかもしれません。 +**注意:個人のリソースが限られているため、このプロジェクトの英語および日本語のドキュメント(README ドキュメントを除く)のメンテナンスは行われません。このドキュメントの内容は最新版のプロジェクトと一致しない場合があります。翻訳のお手伝いをしていただける場合は、関連するプルリクエストを提出してください。** + ![](../../assets/media/structure_ja.png) ## 字幕エンジンの紹介 diff --git a/docs/user-manual/en.md b/docs/user-manual/en.md index 5972c60..bbac9d5 100644 --- a/docs/user-manual/en.md +++ b/docs/user-manual/en.md @@ -2,6 +2,8 @@ Corresponding Version: v0.5.1 +**Note: Due to limited personal resources, the English and Japanese documentation files for this project (except for the README document) will no longer be maintained. The content of this document may not be consistent with the latest version of the project. If you are willing to help with translation, please submit relevant Pull Requests.** + ## Software Introduction Auto Caption is a cross-platform caption display software that can real-time capture system audio input (recording) or output (playback) streaming data and use an audio-to-text model to generate captions for the corresponding audio. The default caption engine provided by the software (using Alibaba Cloud Gummy model) supports recognition and translation in nine languages (Chinese, English, Japanese, Korean, German, French, Russian, Spanish, Italian). 
diff --git a/docs/user-manual/ja.md b/docs/user-manual/ja.md index aee9e47..2350c9b 100644 --- a/docs/user-manual/ja.md +++ b/docs/user-manual/ja.md @@ -4,6 +4,8 @@ この文書は大規模モデルを使用して翻訳されていますので、内容に正確でない部分があるかもしれません。 +**注意:個人のリソースが限られているため、このプロジェクトの英語および日本語のドキュメント(README ドキュメントを除く)のメンテナンスは行われません。このドキュメントの内容は最新版のプロジェクトと一致しない場合があります。翻訳のお手伝いをしていただける場合は、関連するプルリクエストを提出してください。** + ## ソフトウェアの概要 Auto Caption は、クロスプラットフォームの字幕表示ソフトウェアで、システムの音声入力(録音)または出力(音声再生)のストリーミングデータをリアルタイムで取得し、音声からテキストに変換するモデルを利用して対応する音声の字幕を生成します。このソフトウェアが提供するデフォルトの字幕エンジン(アリババクラウド Gummy モデルを使用)は、9つの言語(中国語、英語、日本語、韓国語、ドイツ語、フランス語、ロシア語、スペイン語、イタリア語)の認識と翻訳をサポートしています。 diff --git a/engine/audio2text/gummy.py b/engine/audio2text/gummy.py index ceca937..072f5a2 100644 --- a/engine/audio2text/gummy.py +++ b/engine/audio2text/gummy.py @@ -6,8 +6,8 @@ from dashscope.audio.asr import ( ) import dashscope from datetime import datetime -import json -import sys +from utils import stdout_cmd, stdout_obj + class Callback(TranslationRecognizerCallback): """ @@ -17,15 +17,16 @@ class Callback(TranslationRecognizerCallback): super().__init__() self.usage = 0 self.cur_id = -1 + self.index = 0 self.time_str = '' def on_open(self) -> None: - # print("on_open") - pass + self.cur_id = -1 + self.time_str = '' + stdout_cmd('info', 'Gummy translator started.') def on_close(self) -> None: - # print("on_close") - pass + stdout_cmd('info', 'Gummy translator closed.') def on_event( self, @@ -35,17 +36,18 @@ class Callback(TranslationRecognizerCallback): usage ) -> None: caption = {} + if transcription_result is not None: - caption['index'] = transcription_result.sentence_id - caption['text'] = transcription_result.text - if caption['index'] != self.cur_id: - self.cur_id = caption['index'] - cur_time = datetime.now().strftime('%H:%M:%S.%f')[:-3] - caption['time_s'] = cur_time - self.time_str = cur_time - else: - caption['time_s'] = self.time_str + if self.cur_id != transcription_result.sentence_id: + self.time_str = 
datetime.now().strftime('%H:%M:%S.%f')[:-3] + self.cur_id = transcription_result.sentence_id + self.index += 1 + caption['command'] = 'caption' + caption['index'] = self.index + caption['time_s'] = self.time_str caption['time_t'] = datetime.now().strftime('%H:%M:%S.%f')[:-3] + caption['end'] = transcription_result.is_sentence_end + caption['text'] = transcription_result.text caption['translation'] = "" if translation_result is not None: @@ -55,19 +57,8 @@ class Callback(TranslationRecognizerCallback): if usage: self.usage += usage['duration'] - # print(caption) - self.send_to_node(caption) + stdout_obj(caption) - def send_to_node(self, data): - """ - 将数据发送到 Node.js 进程 - """ - try: - json_data = json.dumps(data) + '\n' - sys.stdout.write(json_data) - sys.stdout.flush() - except Exception as e: - print(f"Error sending data to Node.js: {e}", file=sys.stderr) class GummyTranslator: """ @@ -78,7 +69,7 @@ class GummyTranslator: source: 源语言代码字符串(zh, en, ja 等) target: 目标语言代码字符串(zh, en, ja 等) """ - def __init__(self, rate, source, target, api_key): + def __init__(self, rate: int, source: str, target: str | None, api_key: str | None): if api_key: dashscope.api_key = api_key self.translator = TranslationRecognizerRealtime( diff --git a/engine/audioprcs/__init__.py b/engine/audioprcs/__init__.py deleted file mode 100644 index 0d542f8..0000000 --- a/engine/audioprcs/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .process import mergeChunkChannels, resampleRawChunk, resampleMonoChunk diff --git a/engine/main-gummy.py b/engine/main-gummy.py index d4f0ea9..690faae 100644 --- a/engine/main-gummy.py +++ b/engine/main-gummy.py @@ -1,21 +1,11 @@ import sys import argparse - -if sys.platform == 'win32': - from sysaudio.win import AudioStream -elif sys.platform == 'darwin': - from sysaudio.darwin import AudioStream -elif sys.platform == 'linux': - from sysaudio.linux import AudioStream -else: - raise NotImplementedError(f"Unsupported platform: {sys.platform}") - -from audioprcs import 
mergeChunkChannels +from sysaudio import AudioStream +from utils import merge_chunk_channels from audio2text import InvalidParameter, GummyTranslator def convert_audio_to_text(s_lang, t_lang, audio_type, chunk_rate, api_key): - sys.stdout.reconfigure(line_buffering=True) # type: ignore stream = AudioStream(audio_type, chunk_rate) if t_lang == 'none': @@ -23,20 +13,21 @@ def convert_audio_to_text(s_lang, t_lang, audio_type, chunk_rate, api_key): else: gummy = GummyTranslator(stream.RATE, s_lang, t_lang, api_key) - stream.openStream() + stream.open_stream() gummy.start() while True: try: chunk = stream.read_chunk() - chunk_mono = mergeChunkChannels(chunk, stream.CHANNELS) + if chunk is None: continue + chunk_mono = merge_chunk_channels(chunk, stream.CHANNELS) try: gummy.send_audio_frame(chunk_mono) except InvalidParameter: gummy.start() gummy.send_audio_frame(chunk_mono) except KeyboardInterrupt: - stream.closeStream() + stream.close_stream() gummy.stop() break diff --git a/engine/main-vosk.py b/engine/main-vosk.py index cf407f7..a095909 100644 --- a/engine/main-vosk.py +++ b/engine/main-vosk.py @@ -4,17 +4,9 @@ import argparse from datetime import datetime import numpy.core.multiarray -if sys.platform == 'win32': - from sysaudio.win import AudioStream -elif sys.platform == 'darwin': - from sysaudio.darwin import AudioStream -elif sys.platform == 'linux': - from sysaudio.linux import AudioStream -else: - raise NotImplementedError(f"Unsupported platform: {sys.platform}") - +from sysaudio import AudioStream from vosk import Model, KaldiRecognizer, SetLogLevel -from audioprcs import resampleRawChunk +from utils import resample_chunk_mono SetLogLevel(-1) @@ -30,7 +22,7 @@ def convert_audio_to_text(audio_type, chunk_rate, model_path): recognizer = KaldiRecognizer(model, 16000) stream = AudioStream(audio_type, chunk_rate) - stream.openStream() + stream.open_stream() time_str = '' cur_id = 0 @@ -38,7 +30,8 @@ def convert_audio_to_text(audio_type, chunk_rate, model_path): 
while True: chunk = stream.read_chunk() - chunk_mono = resampleRawChunk(chunk, stream.CHANNELS, stream.RATE, 16000) + if chunk is None: continue + chunk_mono = resample_chunk_mono(chunk, stream.CHANNELS, stream.RATE, 16000) caption = {} if recognizer.AcceptWaveform(chunk_mono): diff --git a/engine/sysaudio/__init__.py b/engine/sysaudio/__init__.py index e69de29..0bd8205 100644 --- a/engine/sysaudio/__init__.py +++ b/engine/sysaudio/__init__.py @@ -0,0 +1,10 @@ +import sys + +if sys.platform == "win32": + from .win import AudioStream +elif sys.platform == "darwin": + from .darwin import AudioStream +elif sys.platform == "linux": + from .linux import AudioStream +else: + raise NotImplementedError(f"Unsupported platform: {sys.platform}") \ No newline at end of file diff --git a/engine/sysaudio/darwin.py b/engine/sysaudio/darwin.py index 4c5d6dd..a3a8d6e 100644 --- a/engine/sysaudio/darwin.py +++ b/engine/sysaudio/darwin.py @@ -1,11 +1,24 @@ """获取 MacOS 系统音频输入/输出流""" import pyaudio +from textwrap import dedent + + +def get_blackhole_device(mic: pyaudio.PyAudio): + """ + 获取 BlackHole 设备 + """ + device_count = mic.get_device_count() + for i in range(device_count): + dev_info = mic.get_device_info_by_index(i) + if 'blackhole' in str(dev_info["name"]).lower(): + return dev_info + raise Exception("The device containing BlackHole was not found.") class AudioStream: """ - 获取系统音频流(支持 BlackHole 作为系统音频输出捕获) + 获取系统音频流(如果要捕获输出音频,仅支持 BlackHole 作为系统音频输出捕获) 初始化参数: audio_type: 0-系统音频输出流(需配合 BlackHole),1-系统音频输入流 @@ -15,46 +28,40 @@ class AudioStream: self.audio_type = audio_type self.mic = pyaudio.PyAudio() if self.audio_type == 0: - self.device = self.getOutputDeviceInfo() + self.device = get_blackhole_device(self.mic) else: self.device = self.mic.get_default_input_device_info() + self.stop_signal = False self.stream = None - self.SAMP_WIDTH = pyaudio.get_sample_size(pyaudio.paInt16) + self.INDEX = self.device["index"] self.FORMAT = pyaudio.paInt16 - self.CHANNELS = 
self.device["maxInputChannels"] + self.SAMP_WIDTH = pyaudio.get_sample_size(self.FORMAT) + self.CHANNELS = int(self.device["maxInputChannels"]) self.RATE = int(self.device["defaultSampleRate"]) self.CHUNK = self.RATE // chunk_rate - self.INDEX = self.device["index"] - def getOutputDeviceInfo(self): - """查找指定关键词的输入设备""" - device_count = self.mic.get_device_count() - for i in range(device_count): - dev_info = self.mic.get_device_info_by_index(i) - if 'blackhole' in dev_info["name"].lower(): - return dev_info - raise Exception("The device containing BlackHole was not found.") - - def printInfo(self): + def get_info(self): dev_info = f""" - 采样输入设备: + 采样设备: - 设备类型:{ "音频输出" if self.audio_type == 0 else "音频输入" } - - 序号:{self.device['index']} - - 名称:{self.device['name']} + - 设备序号:{self.device['index']} + - 设备名称:{self.device['name']} - 最大输入通道数:{self.device['maxInputChannels']} - 默认低输入延迟:{self.device['defaultLowInputLatency']}s - 默认高输入延迟:{self.device['defaultHighInputLatency']}s - 默认采样率:{self.device['defaultSampleRate']}Hz + - 是否回环设备:{self.device['isLoopbackDevice']} - 音频样本块大小:{self.CHUNK} + 设备序号:{self.INDEX} + 样本格式:{self.FORMAT} 样本位宽:{self.SAMP_WIDTH} - 采样格式:{self.FORMAT} - 音频通道数:{self.CHANNELS} - 音频采样率:{self.RATE} + 样本通道数:{self.CHANNELS} + 样本采样率:{self.RATE} + 样本块大小:{self.CHUNK} """ - print(dev_info) + return dedent(dev_info).strip() - def openStream(self): + def open_stream(self): """ 打开并返回系统音频输出流 """ @@ -72,14 +79,24 @@ class AudioStream: """ 读取音频数据 """ + if self.stop_signal: + self.close_stream() + return None if not self.stream: return None return self.stream.read(self.CHUNK, exception_on_overflow=False) - def closeStream(self): + def close_stream_signal(self): """ - 关闭系统音频输出流 + 线程安全的关闭系统音频输入流,不一定会立即关闭 """ - if self.stream is None: return - self.stream.stop_stream() - self.stream.close() - self.stream = None + self.stop_signal = True + + def close_stream(self): + """ + 立即关闭系统音频输入流 + """ + if self.stream is not None: + self.stream.stop_stream() + self.stream.close() + 
self.stream = None + self.stop_signal = False diff --git a/engine/sysaudio/linux.py b/engine/sysaudio/linux.py index 58be353..0a5644a 100644 --- a/engine/sysaudio/linux.py +++ b/engine/sysaudio/linux.py @@ -1,8 +1,10 @@ """获取 Linux 系统音频输入流""" import subprocess +from textwrap import dedent -def findMonitorSource(): + +def find_monitor_source(): result = subprocess.run( ["pactl", "list", "short", "sources"], stdout=subprocess.PIPE, text=True @@ -16,7 +18,8 @@ def findMonitorSource(): raise RuntimeError("System output monitor device not found") -def findInputSource(): + +def find_input_source(): result = subprocess.run( ["pactl", "list", "short", "sources"], stdout=subprocess.PIPE, text=True @@ -28,8 +31,10 @@ def findInputSource(): name = parts[1] if ".monitor" not in name: return name + raise RuntimeError("Microphone input device not found") + class AudioStream: """ 获取系统音频流 @@ -42,34 +47,33 @@ class AudioStream: self.audio_type = audio_type if self.audio_type == 0: - self.source = findMonitorSource() + self.source = find_monitor_source() else: - self.source = findInputSource() - + self.source = find_input_source() + self.stop_signal = False self.process = None - - self.SAMP_WIDTH = 2 self.FORMAT = 16 + self.SAMP_WIDTH = 2 self.CHANNELS = 2 self.RATE = 48000 self.CHUNK = self.RATE // chunk_rate - def printInfo(self): + def get_info(self): dev_info = f""" 音频捕获进程: - 捕获类型:{"音频输出" if self.audio_type == 0 else "音频输入"} - 设备源:{self.source} - - 捕获进程PID:{self.process.pid if self.process else "None"} + - 捕获进程 PID:{self.process.pid if self.process else "None"} - 音频样本块大小:{self.CHUNK} + 样本格式:{self.FORMAT} 样本位宽:{self.SAMP_WIDTH} - 采样格式:{self.FORMAT} - 音频通道数:{self.CHANNELS} - 音频采样率:{self.RATE} + 样本通道数:{self.CHANNELS} + 样本采样率:{self.RATE} + 样本块大小:{self.CHUNK} """ print(dev_info) - def openStream(self): + def open_stream(self): """ 启动音频捕获进程 """ @@ -82,13 +86,23 @@ class AudioStream: """ 读取音频数据 """ - if self.process: + if self.stop_signal: + self.close_stream() + return None + if 
self.process and self.process.stdout: return self.process.stdout.read(self.CHUNK) return None - def closeStream(self): + def close_stream_signal(self): + """ + 线程安全的关闭系统音频输入流,不一定会立即关闭 + """ + self.stop_signal = True + + def close_stream(self): """ 关闭系统音频捕获进程 """ if self.process: self.process.terminate() + self.stop_signal = False diff --git a/engine/sysaudio/win.py b/engine/sysaudio/win.py index c6765ce..247b434 100644 --- a/engine/sysaudio/win.py +++ b/engine/sysaudio/win.py @@ -1,14 +1,15 @@ """获取 Windows 系统音频输入/输出流""" import pyaudiowpatch as pyaudio +from textwrap import dedent -def getDefaultLoopbackDevice(mic: pyaudio.PyAudio, info = True)->dict: +def get_default_loopback_device(mic: pyaudio.PyAudio, info = True)->dict: """ 获取默认的系统音频输出的回环设备 Args: - mic (pyaudio.PyAudio): pyaudio对象 - info (bool, optional): 是否打印设备信息 + mic: pyaudio对象 + info: 是否打印设备信息 Returns: dict: 系统音频输出的回环设备 @@ -51,38 +52,40 @@ class AudioStream: self.audio_type = audio_type self.mic = pyaudio.PyAudio() if self.audio_type == 0: - self.device = getDefaultLoopbackDevice(self.mic, False) + self.device = get_default_loopback_device(self.mic, False) else: self.device = self.mic.get_default_input_device_info() + self.stop_signal = False self.stream = None - self.SAMP_WIDTH = pyaudio.get_sample_size(pyaudio.paInt16) + self.INDEX = self.device["index"] self.FORMAT = pyaudio.paInt16 + self.SAMP_WIDTH = pyaudio.get_sample_size(self.FORMAT) self.CHANNELS = int(self.device["maxInputChannels"]) self.RATE = int(self.device["defaultSampleRate"]) self.CHUNK = self.RATE // chunk_rate - self.INDEX = self.device["index"] - def printInfo(self): + def get_info(self): dev_info = f""" 采样设备: - 设备类型:{ "音频输出" if self.audio_type == 0 else "音频输入" } - - 序号:{self.device['index']} - - 名称:{self.device['name']} + - 设备序号:{self.device['index']} + - 设备名称:{self.device['name']} - 最大输入通道数:{self.device['maxInputChannels']} - 默认低输入延迟:{self.device['defaultLowInputLatency']}s - 默认高输入延迟:{self.device['defaultHighInputLatency']}s - 
默认采样率:{self.device['defaultSampleRate']}Hz - 是否回环设备:{self.device['isLoopbackDevice']} - 音频样本块大小:{self.CHUNK} + 设备序号:{self.INDEX} + 样本格式:{self.FORMAT} 样本位宽:{self.SAMP_WIDTH} - 采样格式:{self.FORMAT} - 音频通道数:{self.CHANNELS} - 音频采样率:{self.RATE} + 样本通道数:{self.CHANNELS} + 样本采样率:{self.RATE} + 样本块大小:{self.CHUNK} """ - print(dev_info) + return dedent(dev_info).strip() - def openStream(self): + def open_stream(self): """ 打开并返回系统音频输出流 """ @@ -96,18 +99,28 @@ class AudioStream: ) return self.stream - def read_chunk(self): + def read_chunk(self) -> bytes | None: """ 读取音频数据 """ + if self.stop_signal: + self.close_stream() + return None if not self.stream: return None return self.stream.read(self.CHUNK, exception_on_overflow=False) - def closeStream(self): + def close_stream_signal(self): """ - 关闭系统音频输出流 + 线程安全的关闭系统音频输入流,不一定会立即关闭 """ - if self.stream is None: return - self.stream.stop_stream() - self.stream.close() - self.stream = None + self.stop_signal = True + + def close_stream(self): + """ + 关闭系统音频输入流 + """ + if self.stream is not None: + self.stream.stop_stream() + self.stream.close() + self.stream = None + self.stop_signal = False diff --git a/engine/utils/__init__.py b/engine/utils/__init__.py new file mode 100644 index 0000000..2589863 --- /dev/null +++ b/engine/utils/__init__.py @@ -0,0 +1,2 @@ +from .process import merge_chunk_channels, resample_chunk_mono, resample_mono_chunk +from .sysout import stdout, stdout_cmd, stdout_obj, stderr \ No newline at end of file diff --git a/engine/audioprcs/process.py b/engine/utils/process.py similarity index 75% rename from engine/audioprcs/process.py rename to engine/utils/process.py index 9298593..01c854d 100644 --- a/engine/audioprcs/process.py +++ b/engine/utils/process.py @@ -1,16 +1,17 @@ import samplerate import numpy as np -def mergeChunkChannels(chunk, channels): + +def merge_chunk_channels(chunk: bytes, channels: int) -> bytes: """ 将当前多通道音频数据块转换为单通道音频数据块 Args: - chunk: (bytes)多通道音频数据块 + chunk: 多通道音频数据块 channels: 通道数 Returns: 
- (bytes)单通道音频数据块 + 单通道音频数据块 """ # (length * channels,) chunk_np = np.frombuffer(chunk, dtype=np.int16) @@ -22,19 +23,19 @@ def mergeChunkChannels(chunk, channels): return chunk_mono.tobytes() -def resampleRawChunk(chunk, channels, orig_sr, target_sr, mode="sinc_best"): +def resample_chunk_mono(chunk: bytes, channels: int, orig_sr: int, target_sr: int, mode="sinc_best") -> bytes: """ 将当前多通道音频数据块转换成单通道音频数据块,然后进行重采样 Args: - chunk: (bytes)多通道音频数据块 + chunk: 多通道音频数据块 channels: 通道数 orig_sr: 原始采样率 target_sr: 目标采样率 mode: 重采样模式,可选:'sinc_best' | 'sinc_medium' | 'sinc_fastest' | 'zero_order_hold' | 'linear' Return: - (bytes)单通道音频数据块 + 单通道音频数据块 """ # (length * channels,) chunk_np = np.frombuffer(chunk, dtype=np.int16) @@ -44,22 +45,23 @@ def resampleRawChunk(chunk, channels, orig_sr, target_sr, mode="sinc_best"): chunk_mono_f = np.mean(chunk_np.astype(np.float32), axis=1) chunk_mono = chunk_mono_f.astype(np.int16) ratio = target_sr / orig_sr - chunk_mono_r = samplerate.resample(chunk_mono, ratio, converter_type=mode) + chunk_mono_r = samplerate.resample(chunk_mono, ratio, converter_type=mode) chunk_mono_r = np.round(chunk_mono_r).astype(np.int16) return chunk_mono_r.tobytes() -def resampleMonoChunk(chunk, orig_sr, target_sr, mode="sinc_best"): + +def resample_mono_chunk(chunk: bytes, orig_sr: int, target_sr: int, mode="sinc_best") -> bytes: """ 将当前单通道音频块进行重采样 Args: - chunk: (bytes)单通道音频数据块 + chunk: 单通道音频数据块 orig_sr: 原始采样率 target_sr: 目标采样率 mode: 重采样模式,可选:'sinc_best' | 'sinc_medium' | 'sinc_fastest' | 'zero_order_hold' | 'linear' Return: - (bytes)单通道音频数据块 + 单通道音频数据块 """ chunk_np = np.frombuffer(chunk, dtype=np.int16) ratio = target_sr / orig_sr diff --git a/engine/utils/sysout.py b/engine/utils/sysout.py new file mode 100644 index 0000000..574b2cf --- /dev/null +++ b/engine/utils/sysout.py @@ -0,0 +1,18 @@ +import sys +import json + +def stdout(text: str): + stdout_cmd("print", text) + +def stdout_cmd(command: str, content = ""): + msg = { "command": command, "content": content 
} + sys.stdout.write(json.dumps(msg) + "\n") + sys.stdout.flush() + +def stdout_obj(obj): + sys.stdout.write(json.dumps(obj) + "\n") + sys.stdout.flush() + +def stderr(text: str): + sys.stderr.write(text + "\n") + sys.stderr.flush() From b658ef5440a8aa04a412eb388296cc35b40281e8 Mon Sep 17 00:00:00 2001 From: himeditator Date: Sun, 27 Jul 2025 17:15:12 +0800 Subject: [PATCH 2/7] =?UTF-8?q?feat(engine):=20=E4=BC=98=E5=8C=96=E5=AD=97?= =?UTF-8?q?=E5=B9=95=E5=BC=95=E6=93=8E=E8=BE=93=E5=87=BA=E6=A0=BC=E5=BC=8F?= =?UTF-8?q?=E3=80=81=E5=87=86=E5=A4=87=E5=90=88=E5=B9=B6=E4=B8=A4=E4=B8=AA?= =?UTF-8?q?=E5=AD=97=E5=B9=95=E5=BC=95=E6=93=8E?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 重构字幕引擎相关代码 - 准备合并两个字幕引擎 --- docs/api-docs/caption-engine.md | 15 ++++- engine/audio2text/gummy.py | 10 ++-- engine/audio2text/vosk.py | 59 +++++++++++++++++++ engine/main-vosk.py | 1 + engine/main.py | 37 ++++++++++++ package-lock.json | 8 +++ package.json | 3 +- src/main/utils/AllConfig.ts | 19 +++--- src/main/utils/CaptionEngine.ts | 68 +++++++++++++--------- src/main/utils/Log.ts | 21 +++++++ src/renderer/src/components/CaptionLog.vue | 9 ++- 11 files changed, 205 insertions(+), 45 deletions(-) create mode 100644 engine/audio2text/vosk.py create mode 100644 engine/main.py create mode 100644 src/main/utils/Log.ts diff --git a/docs/api-docs/caption-engine.md b/docs/api-docs/caption-engine.md index 461db42..d284b5a 100644 --- a/docs/api-docs/caption-engine.md +++ b/docs/api-docs/caption-engine.md @@ -32,7 +32,19 @@ Python 进程标准输出 (`sys.stdout`) 的内容一定为一行一行的字符 } ``` -Python 端打印的提示信息。 +Python 端打印的提示信息,比起 `print`,该信息更希望 Electron 端的关注。 + +### `usage` + +```js +{ + command: "usage", + content: string +} +``` + +Gummy 字幕引擎结束时打印计费消耗信息。 + ### `caption` @@ -42,7 +54,6 @@ Python 端打印的提示信息。 index: number, time_s: string, time_t: string, - end: boolean, text: string, translation: string } diff --git a/engine/audio2text/gummy.py b/engine/audio2text/gummy.py index 
072f5a2..e49f47b 100644 --- a/engine/audio2text/gummy.py +++ b/engine/audio2text/gummy.py @@ -15,18 +15,20 @@ class Callback(TranslationRecognizerCallback): """ def __init__(self): super().__init__() + self.index = 0 self.usage = 0 self.cur_id = -1 - self.index = 0 self.time_str = '' def on_open(self) -> None: + self.usage = 0 self.cur_id = -1 self.time_str = '' stdout_cmd('info', 'Gummy translator started.') def on_close(self) -> None: stdout_cmd('info', 'Gummy translator closed.') + stdout_cmd('usage', str(self.usage)) def on_event( self, @@ -46,7 +48,6 @@ class Callback(TranslationRecognizerCallback): caption['index'] = self.index caption['time_s'] = self.time_str caption['time_t'] = datetime.now().strftime('%H:%M:%S.%f')[:-3] - caption['end'] = transcription_result.is_sentence_end caption['text'] = transcription_result.text caption['translation'] = "" @@ -57,7 +58,8 @@ class Callback(TranslationRecognizerCallback): if usage: self.usage += usage['duration'] - stdout_obj(caption) + if 'text' in caption: + stdout_obj(caption) class GummyTranslator: @@ -88,7 +90,7 @@ class GummyTranslator: self.translator.start() def send_audio_frame(self, data): - """发送音频帧""" + """发送音频帧,擎将自动识别并将识别结果输出到标准输出中""" self.translator.send_audio_frame(data) def stop(self): diff --git a/engine/audio2text/vosk.py b/engine/audio2text/vosk.py new file mode 100644 index 0000000..7c34459 --- /dev/null +++ b/engine/audio2text/vosk.py @@ -0,0 +1,59 @@ +import json +from datetime import datetime + +from vosk import Model, KaldiRecognizer, SetLogLevel +from utils import stdout_obj + +class VoskRecognizer: + """ + 使用 Vosk 引擎流式处理的音频数据,并在标准输出中输出与 Auto Caption 软件可读取的 JSON 字符串数据 + + 初始化参数: + model_path: Vosk 识别模型路径 + """ + def __int__(self, model_path: str): + SetLogLevel(-1) + if model_path.startswith('"'): + model_path = model_path[1:] + if model_path.endswith('"'): + model_path = model_path[:-1] + self.model_path = model_path + self.time_str = '' + self.cur_id = 0 + self.prev_content = '' + + 
self.model = Model(self.model_path) + self.recognizer = KaldiRecognizer(self.model, 16000) + + def send_audio_frame(self, data: bytes): + """ + 发送音频帧给 Vosk 引擎,引擎将自动识别并将识别结果输出到标准输出中 + + Args: + data: 音频帧数据,采样率必须为 16000Hz + """ + caption = {} + caption['command'] = 'caption' + caption['translation'] = '' + + if self.recognizer.AcceptWaveform(data): + content = json.loads(self.recognizer.Result()).get('text', '') + caption['index'] = self.cur_id + caption['text'] = content + caption['time_s'] = self.time_str + caption['time_t'] = datetime.now().strftime('%H:%M:%S.%f')[:-3] + self.prev_content = '' + self.cur_id += 1 + else: + content = json.loads(self.recognizer.PartialResult()).get('partial', '') + if content == '' or content == self.prev_content: + return + if self.prev_content == '': + self.time_str = datetime.now().strftime('%H:%M:%S.%f')[:-3] + caption['index'] = self.cur_id + caption['text'] = content + caption['time_s'] = self.time_str + caption['time_t'] = datetime.now().strftime('%H:%M:%S.%f')[:-3] + self.prev_content = content + + stdout_obj(caption) diff --git a/engine/main-vosk.py b/engine/main-vosk.py index a095909..5a3fb32 100644 --- a/engine/main-vosk.py +++ b/engine/main-vosk.py @@ -49,6 +49,7 @@ def convert_audio_to_text(audio_type, chunk_rate, model_path): continue if prev_content == '': time_str = datetime.now().strftime('%H:%M:%S.%f')[:-3] + caption['command'] = 'caption' caption['index'] = cur_id caption['text'] = content caption['time_s'] = time_str diff --git a/engine/main.py b/engine/main.py new file mode 100644 index 0000000..6ea3d53 --- /dev/null +++ b/engine/main.py @@ -0,0 +1,37 @@ +import argparse + +def gummy_engine(s, t, a, c, k): + pass + +def vosk_engine(a, c, m): + pass + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='Convert system audio stream to text') + # both + parser.add_argument('-e', '--caption_engine', default='gummy', help='Caption engine: gummy or vosk') + parser.add_argument('-a', 
'--audio_type', default=0, help='Audio stream source: 0 for output, 1 for input') + parser.add_argument('-c', '--chunk_rate', default=20, help='Number of audio stream chunks collected per second') + # gummy + parser.add_argument('-s', '--source_language', default='en', help='Source language code') + parser.add_argument('-t', '--target_language', default='zh', help='Target language code') + parser.add_argument('-k', '--api_key', default='', help='API KEY for Gummy model') + # vosk + parser.add_argument('-m', '--model_path', default='', help='The path to the vosk model.') + args = parser.parse_args() + if args.caption_engine == 'gummy': + gummy_engine( + args.source_language, + args.target_language, + int(args.audio_type), + int(args.chunk_rate), + args.api_key + ) + elif args.caption_engine == 'vosk': + vosk_engine( + int(args.audio_type), + int(args.chunk_rate), + args.model_path + ) + else: + raise ValueError('Invalid caption engine specified.') \ No newline at end of file diff --git a/package-lock.json b/package-lock.json index 7bb8f63..8a9850e 100644 --- a/package-lock.json +++ b/package-lock.json @@ -22,6 +22,7 @@ "@electron-toolkit/eslint-config-ts": "^3.0.0", "@electron-toolkit/tsconfig": "^1.0.1", "@types/node": "^22.14.1", + "@types/pidusage": "^2.0.5", "@vitejs/plugin-vue": "^5.2.3", "electron": "^35.1.5", "electron-builder": "^25.1.8", @@ -2296,6 +2297,13 @@ "undici-types": "~6.21.0" } }, + "node_modules/@types/pidusage": { + "version": "2.0.5", + "resolved": "https://registry.npmmirror.com/@types/pidusage/-/pidusage-2.0.5.tgz", + "integrity": "sha512-MIiyZI4/MK9UGUXWt0jJcCZhVw7YdhBuTOuqP/BjuLDLZ2PmmViMIQgZiWxtaMicQfAz/kMrZ5T7PKxFSkTeUA==", + "dev": true, + "license": "MIT" + }, "node_modules/@types/plist": { "version": "3.0.5", "resolved": "https://registry.npmmirror.com/@types/plist/-/plist-3.0.5.tgz", diff --git a/package.json b/package.json index f2ed497..7a8cef4 100644 --- a/package.json +++ b/package.json @@ -13,7 +13,7 @@ "typecheck:web": "vue-tsc 
--noEmit -p tsconfig.web.json --composite false", "typecheck": "npm run typecheck:node && npm run typecheck:web", "start": "electron-vite preview", - "dev": "electron-vite dev", + "dev": "chcp 65001 && electron-vite dev", "build": "npm run typecheck && electron-vite build", "postinstall": "electron-builder install-app-deps", "build:unpack": "npm run build && electron-builder --dir", @@ -35,6 +35,7 @@ "@electron-toolkit/eslint-config-ts": "^3.0.0", "@electron-toolkit/tsconfig": "^1.0.1", "@types/node": "^22.14.1", + "@types/pidusage": "^2.0.5", "@vitejs/plugin-vue": "^5.2.3", "electron": "^35.1.5", "electron-builder": "^25.1.8", diff --git a/src/main/utils/AllConfig.ts b/src/main/utils/AllConfig.ts index 8bfd015..d041f1d 100644 --- a/src/main/utils/AllConfig.ts +++ b/src/main/utils/AllConfig.ts @@ -2,6 +2,7 @@ import { UILanguage, UITheme, Styles, Controls, CaptionItem, FullConfig } from '../types' +import { Log } from './Log' import { app, BrowserWindow } from 'electron' import * as path from 'path' import * as fs from 'fs' @@ -48,6 +49,7 @@ class AllConfig { uiTheme: UITheme = 'system'; styles: Styles = {...defaultStyles}; controls: Controls = {...defaultControls}; + lastLogIndex: number = -1; captionLog: CaptionItem[] = []; constructor() {} @@ -61,7 +63,7 @@ class AllConfig { if(config.leftBarWidth) this.leftBarWidth = config.leftBarWidth if(config.styles) this.setStyles(config.styles) if(config.controls) this.setControls(config.controls) - console.log('[INFO] Read Config from:', configPath) + Log.info('Read Config from:', configPath) } } @@ -75,7 +77,7 @@ class AllConfig { } const configPath = path.join(app.getPath('userData'), 'config.json') fs.writeFileSync(configPath, JSON.stringify(config, null, 2)) - console.log('[INFO] Write Config to:', configPath) + Log.info('Write Config to:', configPath) } public getFullConfig(): FullConfig { @@ -96,7 +98,7 @@ class AllConfig { this.styles[key] = args[key] } } - console.log('[INFO] Set Styles:', this.styles) + 
Log.info('Set Styles:', this.styles) } public resetStyles() { @@ -105,7 +107,7 @@ class AllConfig { public sendStyles(window: BrowserWindow) { window.webContents.send('both.styles.set', this.styles) - console.log(`[INFO] Send Styles to #${window.id}:`, this.styles) + Log.info(`Send Styles to #${window.id}:`, this.styles) } public setControls(args: Object) { @@ -116,27 +118,28 @@ class AllConfig { } } this.controls.engineEnabled = engineEnabled - console.log('[INFO] Set Controls:', this.controls) + Log.info('Set Controls:', this.controls) } public sendControls(window: BrowserWindow) { window.webContents.send('control.controls.set', this.controls) - console.log(`[INFO] Send Controls to #${window.id}:`, this.controls) + Log.info(`Send Controls to #${window.id}:`, this.controls) } public updateCaptionLog(log: CaptionItem) { let command: 'add' | 'upd' = 'add' if( this.captionLog.length && - this.captionLog[this.captionLog.length - 1].index === log.index && - this.captionLog[this.captionLog.length - 1].time_s === log.time_s + this.lastLogIndex === log.index ) { this.captionLog.splice(this.captionLog.length - 1, 1, log) command = 'upd' } else { this.captionLog.push(log) + this.lastLogIndex = log.index } + this.captionLog[this.captionLog.length - 1].index = this.captionLog.length for(const window of BrowserWindow.getAllWindows()){ this.sendCaptionLog(window, command) } diff --git a/src/main/utils/CaptionEngine.ts b/src/main/utils/CaptionEngine.ts index 0c03160..8f102b5 100644 --- a/src/main/utils/CaptionEngine.ts +++ b/src/main/utils/CaptionEngine.ts @@ -5,6 +5,7 @@ import path from 'path' import { controlWindow } from '../ControlWindow' import { allConfig } from './AllConfig' import { i18n } from '../i18n' +import { Log } from './Log' export class CaptionEngine { appPath: string = '' @@ -14,7 +15,7 @@ export class CaptionEngine { private getApp(): boolean { if (allConfig.controls.customized && allConfig.controls.customizedApp) { - console.log('[INFO] Using customized 
engine') + Log.info('Using customized engine') this.appPath = allConfig.controls.customizedApp this.command = allConfig.controls.customizedCommand.split(' ') } @@ -25,9 +26,7 @@ export class CaptionEngine { return false } let gummyName = 'main-gummy' - if (process.platform === 'win32') { - gummyName += '.exe' - } + if (process.platform === 'win32') { gummyName += '.exe' } this.command = [] if (is.dev) { this.appPath = path.join( @@ -56,31 +55,33 @@ export class CaptionEngine { else if(allConfig.controls.engine === 'vosk'){ allConfig.controls.customized = false let voskName = 'main-vosk' - if (process.platform === 'win32') { - voskName += '.exe' - } + if (process.platform === 'win32') { voskName += '.exe' } + this.command = [] if (is.dev) { this.appPath = path.join( - app.getAppPath(), - 'engine', 'dist', voskName + app.getAppPath(), 'engine', + 'subenv', 'Scripts', 'python.exe' ) + this.command.push(path.join( + app.getAppPath(), 'engine', 'main-vosk.py' + )) } else { this.appPath = path.join( process.resourcesPath, 'engine', voskName ) } - this.command = [] this.command.push('-a', allConfig.controls.audio ? 
'1' : '0') this.command.push('-m', `"${allConfig.controls.modelPath}"`) } - console.log('[INFO] Engine Path:', this.appPath) - console.log('[INFO] Engine Command:', this.command) + Log.info('Engine Path:', this.appPath) + Log.info('Engine Command:', this.command) return true } public start() { if (this.processStatus !== 'stopped') { + Log.warn('Caption engine status is not stopped, cannot start') return } if(!this.getApp()){ return } @@ -90,12 +91,12 @@ export class CaptionEngine { } catch (e) { controlWindow.sendErrorMessage(i18n('engine.start.error') + e) - console.error('[ERROR] Error starting subprocess:', e) + Log.error('Error starting engine:', e) return } this.processStatus = 'running' - console.log('[INFO] Caption Engine Started, PID:', this.process.pid) + Log.info('Caption Engine Started, PID:', this.process.pid) allConfig.controls.engineEnabled = true if(controlWindow.window){ @@ -111,27 +112,23 @@ export class CaptionEngine { lines.forEach((line: string) => { if (line.trim()) { try { - const caption = JSON.parse(line); - if(caption.index === undefined) { - console.log('[INFO] Engine Bad Output:', caption); - } - else allConfig.updateCaptionLog(caption); + const data_obj = JSON.parse(line) + handleEngineData(data_obj) } catch (e) { controlWindow.sendErrorMessage(i18n('engine.output.parse.error') + e) - console.error('[ERROR] Error parsing JSON:', e); + Log.error('Error parsing JSON:', e) } } }); }); - this.process.stderr.on('data', (data) => { + this.process.stderr.on('data', (data: any) => { if(this.processStatus === 'stopping') return controlWindow.sendErrorMessage(i18n('engine.error') + data) - console.error(`[ERROR] Subprocess Error: ${data}`); + Log.error(`Engine Error: ${data}`); }); this.process.on('close', (code: any) => { - console.log(`[INFO] Subprocess exited with code ${code}`); this.process = undefined; allConfig.controls.engineEnabled = false if(controlWindow.window){ @@ -139,14 +136,14 @@ export class CaptionEngine { 
controlWindow.window.webContents.send('control.engine.stopped') } this.processStatus = 'stopped' - console.log('[INFO] Caption engine process stopped') + Log.info(`Engine exited with code ${code}`) }); } public stop() { if(this.processStatus !== 'running') return if (this.process.pid) { - console.log('[INFO] Trying to stop process, PID:', this.process.pid) + Log.info('Trying to stop process, PID:', this.process.pid) let cmd = `kill ${this.process.pid}`; if (process.platform === "win32") { cmd = `taskkill /pid ${this.process.pid} /t /f` @@ -154,7 +151,7 @@ export class CaptionEngine { exec(cmd, (error) => { if (error) { controlWindow.sendErrorMessage(i18n('engine.shutdown.error') + error) - console.error(`[ERROR] Failed to kill process: ${error}`) + Log.error(`Failed to kill process: ${error}`) } }) } @@ -166,11 +163,26 @@ export class CaptionEngine { controlWindow.window.webContents.send('control.engine.stopped') } this.processStatus = 'stopped' - console.log('[INFO] Process PID undefined, caption engine process stopped') + Log.info('Process PID undefined, caption engine process stopped') return } this.processStatus = 'stopping' - console.log('[INFO] Caption engine process stopping') + Log.info('Caption engine process stopping') + } +} + +function handleEngineData(data: any) { + if(data.command === 'caption') { + allConfig.updateCaptionLog(data); + } + else if(data.command === 'print') { + Log.info('Engine print:', data.content) + } + else if(data.command === 'info') { + Log.info('Engine info:', data.content) + } + else if(data.command === 'usage') { + Log.info('Caption engine usage: ', data.content) } } diff --git a/src/main/utils/Log.ts b/src/main/utils/Log.ts new file mode 100644 index 0000000..f2568ce --- /dev/null +++ b/src/main/utils/Log.ts @@ -0,0 +1,21 @@ +function getTimeString() { + const now = new Date() + const HH = String(now.getHours()).padStart(2, '0') + const MM = String(now.getMinutes()).padStart(2, '0') + const SS = 
String(now.getSeconds()).padStart(2, '0') + return `${HH}:${MM}:${SS}` +} + +export class Log { + static info(...msg: any[]){ + console.log(`[INFO ${getTimeString()}]`, ...msg) + } + + static warn(...msg: any[]){ + console.log(`[WARN ${getTimeString()}]`, ...msg) + } + + static error(...msg: any[]){ + console.log(`[ERROR ${getTimeString()}]`, ...msg) + } +} diff --git a/src/renderer/src/components/CaptionLog.vue b/src/renderer/src/components/CaptionLog.vue index ada58b7..ac416d6 100644 --- a/src/renderer/src/components/CaptionLog.vue +++ b/src/renderer/src/components/CaptionLog.vue @@ -174,6 +174,12 @@ const columns = [ dataIndex: 'index', key: 'index', width: 80, + sorter: (a: CaptionItem, b: CaptionItem) => { + if(a.index <= b.index) return -1 + return 1 + }, + sortDirections: ['descend'], + defaultSortOrder: 'descend', }, { title: 'time', @@ -184,8 +190,7 @@ const columns = [ if(a.time_s <= b.time_s) return -1 return 1 }, - sortDirections: ['descend'], - defaultSortOrder: 'descend', + sortDirections: ['descend', 'ascend'], }, { title: 'content', From cd9f3a847de3800bd8cafb2d6615f6f6301044ad Mon Sep 17 00:00:00 2001 From: himeditator Date: Mon, 28 Jul 2025 15:49:52 +0800 Subject: [PATCH 3/7] =?UTF-8?q?feat(engine):=20=E9=87=8D=E6=9E=84=E5=AD=97?= =?UTF-8?q?=E5=B9=95=E5=BC=95=E6=93=8E=E5=B9=B6=E5=AE=9E=E7=8E=B0=20WebSoc?= =?UTF-8?q?ket=20=E9=80=9A=E4=BF=A1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 重构了 Gummy 和 Vosk 字幕引擎的代码,提高了可扩展性和可读性 - 合并 Gummy 和 Vosk 引擎为单个可执行文件 - 实现了字幕引擎和主程序之间的 WebSocket 通信,避免了孤儿进程问题 --- docs/CHANGELOG.md | 15 ++ docs/TODO.md | 3 +- electron-builder.yml | 13 +- engine/audio2text/__init__.py | 3 +- engine/audio2text/gummy.py | 3 +- engine/audio2text/vosk.py | 15 +- engine/main-gummy.py | 49 ------ engine/main-gummy.spec | 39 ----- engine/main-vosk.py | 77 --------- engine/main.py | 69 +++++++- engine/{main-vosk.spec => main.spec} | 4 +- engine/utils/__init__.py | 6 +- engine/utils/{process.py => 
audioprcs.py} | 21 ++- engine/utils/server.py | 37 +++++ engine/utils/thdata.py | 5 + src/main/utils/CaptionEngine.ts | 166 +++++++++---------- src/main/utils/Log.ts | 4 +- src/renderer/src/components/EngineStatus.vue | 4 +- src/renderer/src/stores/engineControl.ts | 2 +- 19 files changed, 242 insertions(+), 293 deletions(-) delete mode 100644 engine/main-gummy.py delete mode 100644 engine/main-gummy.spec delete mode 100644 engine/main-vosk.py rename engine/{main-vosk.spec => main.spec} (95%) rename engine/utils/{process.py => audioprcs.py} (82%) create mode 100644 engine/utils/server.py create mode 100644 engine/utils/thdata.py diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index d6fcad9..a1afb67 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -114,3 +114,18 @@ - 修复无法调用自定义字幕引擎的 bug - 修复自定义字幕引擎的参数失效 bug + +## v0.6.0 + +2025-07-xx + +### 新增功能 + +- 新增字幕记录排序功能,可选择字幕记录正序或倒叙显示 + +### 项目优化 + +- 重构字幕引擎,提示字幕引擎代码的可扩展性和可读性 +- 合并 Gummy 和 Vosk 引擎为单个可执行文件 +- 字幕引擎和主程序添加 WebScoket 通信,完全避免字幕引擎成为孤儿进程 + diff --git a/docs/TODO.md b/docs/TODO.md index 71bad11..a725c93 100644 --- a/docs/TODO.md +++ b/docs/TODO.md @@ -16,10 +16,11 @@ - [x] 可以导出 srt 格式的字幕记录 *2025/07/14* - [x] 可以获取字幕引擎的系统资源消耗情况 *2025/07/15* - [x] 添加字幕记录按时间降序排列选择 *2025/07/26* +- [x] 重构字幕引擎 *2025/07/28* ## 待完成 -- [ ] 重构字幕引擎 +- [ ] 优化前端界面提示消息 - [ ] 验证 / 添加基于 sherpa-onnx 的字幕引擎 ## 后续计划 diff --git a/electron-builder.yml b/electron-builder.yml index 4dce5d8..4cfad9d 100644 --- a/electron-builder.yml +++ b/electron-builder.yml @@ -11,20 +11,15 @@ files: - '!{.env,.env.*,.npmrc,pnpm-lock.yaml}' - '!{tsconfig.json,tsconfig.node.json,tsconfig.web.json}' - '!engine/*' - - '!engine-test/*' - '!docs/*' - '!assets/*' extraResources: # For Windows - - from: ./engine/dist/main-gummy.exe - to: ./engine/main-gummy.exe - - from: ./engine/dist/main-vosk.exe - to: ./engine/main-vosk.exe + - from: ./engine/dist/main.exe + to: ./engine/main.exe # For macOS and Linux - # - from: ./engine/dist/main-gummy - # to: ./engine/main-gummy - 
# - from: ./engine/dist/main-vosk - # to: ./engine/main-vosk + # - from: ./engine/dist/main + # to: ./engine/main win: executableName: auto-caption icon: build/icon.png diff --git a/engine/audio2text/__init__.py b/engine/audio2text/__init__.py index 6192084..988d76e 100644 --- a/engine/audio2text/__init__.py +++ b/engine/audio2text/__init__.py @@ -1,2 +1,3 @@ from dashscope.common.error import InvalidParameter -from .gummy import GummyTranslator +from .gummy import GummyRecognizer +from .vosk import VoskRecognizer \ No newline at end of file diff --git a/engine/audio2text/gummy.py b/engine/audio2text/gummy.py index e49f47b..1f503b8 100644 --- a/engine/audio2text/gummy.py +++ b/engine/audio2text/gummy.py @@ -62,7 +62,7 @@ class Callback(TranslationRecognizerCallback): stdout_obj(caption) -class GummyTranslator: +class GummyRecognizer: """ 使用 Gummy 引擎流式处理的音频数据,并在标准输出中输出与 Auto Caption 软件可读取的 JSON 字符串数据 @@ -70,6 +70,7 @@ class GummyTranslator: rate: 音频采样率 source: 源语言代码字符串(zh, en, ja 等) target: 目标语言代码字符串(zh, en, ja 等) + api_key: 阿里云百炼平台 API KEY """ def __init__(self, rate: int, source: str, target: str | None, api_key: str | None): if api_key: diff --git a/engine/audio2text/vosk.py b/engine/audio2text/vosk.py index 7c34459..402b7fd 100644 --- a/engine/audio2text/vosk.py +++ b/engine/audio2text/vosk.py @@ -2,7 +2,8 @@ import json from datetime import datetime from vosk import Model, KaldiRecognizer, SetLogLevel -from utils import stdout_obj +from utils import stdout_cmd, stdout_obj + class VoskRecognizer: """ @@ -11,7 +12,7 @@ class VoskRecognizer: 初始化参数: model_path: Vosk 识别模型路径 """ - def __int__(self, model_path: str): + def __init__(self, model_path: str): SetLogLevel(-1) if model_path.startswith('"'): model_path = model_path[1:] @@ -24,7 +25,11 @@ class VoskRecognizer: self.model = Model(self.model_path) self.recognizer = KaldiRecognizer(self.model, 16000) - + + def start(self): + """启动 Vosk 引擎""" + stdout_cmd('info', 'Vosk recognizer started.') + def 
send_audio_frame(self, data: bytes): """ 发送音频帧给 Vosk 引擎,引擎将自动识别并将识别结果输出到标准输出中 @@ -57,3 +62,7 @@ class VoskRecognizer: self.prev_content = content stdout_obj(caption) + + def stop(self): + """停止 Vosk 引擎""" + stdout_cmd('info', 'Vosk recognizer closed.') \ No newline at end of file diff --git a/engine/main-gummy.py b/engine/main-gummy.py deleted file mode 100644 index 690faae..0000000 --- a/engine/main-gummy.py +++ /dev/null @@ -1,49 +0,0 @@ -import sys -import argparse -from sysaudio import AudioStream -from utils import merge_chunk_channels -from audio2text import InvalidParameter, GummyTranslator - - -def convert_audio_to_text(s_lang, t_lang, audio_type, chunk_rate, api_key): - stream = AudioStream(audio_type, chunk_rate) - - if t_lang == 'none': - gummy = GummyTranslator(stream.RATE, s_lang, None, api_key) - else: - gummy = GummyTranslator(stream.RATE, s_lang, t_lang, api_key) - - stream.open_stream() - gummy.start() - - while True: - try: - chunk = stream.read_chunk() - if chunk is None: continue - chunk_mono = merge_chunk_channels(chunk, stream.CHANNELS) - try: - gummy.send_audio_frame(chunk_mono) - except InvalidParameter: - gummy.start() - gummy.send_audio_frame(chunk_mono) - except KeyboardInterrupt: - stream.close_stream() - gummy.stop() - break - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description='Convert system audio stream to text') - parser.add_argument('-s', '--source_language', default='en', help='Source language code') - parser.add_argument('-t', '--target_language', default='zh', help='Target language code') - parser.add_argument('-a', '--audio_type', default=0, help='Audio stream source: 0 for output audio stream, 1 for input audio stream') - parser.add_argument('-c', '--chunk_rate', default=20, help='The number of audio stream chunks collected per second.') - parser.add_argument('-k', '--api_key', default='', help='API KEY for Gummy model') - args = parser.parse_args() - convert_audio_to_text( - args.source_language, - 
args.target_language, - int(args.audio_type), - int(args.chunk_rate), - args.api_key - ) diff --git a/engine/main-gummy.spec b/engine/main-gummy.spec deleted file mode 100644 index cada0d1..0000000 --- a/engine/main-gummy.spec +++ /dev/null @@ -1,39 +0,0 @@ -# -*- mode: python ; coding: utf-8 -*- - - -a = Analysis( - ['main-gummy.py'], - pathex=[], - binaries=[], - datas=[], - hiddenimports=[], - hookspath=[], - hooksconfig={}, - runtime_hooks=[], - excludes=[], - noarchive=False, - optimize=0, -) -pyz = PYZ(a.pure) - -exe = EXE( - pyz, - a.scripts, - a.binaries, - a.datas, - [], - name='main-gummy', - debug=False, - bootloader_ignore_signals=False, - strip=False, - upx=True, - upx_exclude=[], - runtime_tmpdir=None, - console=True, - disable_windowed_traceback=False, - argv_emulation=False, - target_arch=None, - codesign_identity=None, - entitlements_file=None, - onefile=True, -) diff --git a/engine/main-vosk.py b/engine/main-vosk.py deleted file mode 100644 index 5a3fb32..0000000 --- a/engine/main-vosk.py +++ /dev/null @@ -1,77 +0,0 @@ -import sys -import json -import argparse -from datetime import datetime -import numpy.core.multiarray - -from sysaudio import AudioStream -from vosk import Model, KaldiRecognizer, SetLogLevel -from utils import resample_chunk_mono - -SetLogLevel(-1) - -def convert_audio_to_text(audio_type, chunk_rate, model_path): - sys.stdout.reconfigure(line_buffering=True) # type: ignore - - if model_path.startswith('"'): - model_path = model_path[1:] - if model_path.endswith('"'): - model_path = model_path[:-1] - - model = Model(model_path) - recognizer = KaldiRecognizer(model, 16000) - - stream = AudioStream(audio_type, chunk_rate) - stream.open_stream() - - time_str = '' - cur_id = 0 - prev_content = '' - - while True: - chunk = stream.read_chunk() - if chunk is None: continue - chunk_mono = resample_chunk_mono(chunk, stream.CHANNELS, stream.RATE, 16000) - - caption = {} - if recognizer.AcceptWaveform(chunk_mono): - content = 
json.loads(recognizer.Result()).get('text', '') - caption['index'] = cur_id - caption['text'] = content - caption['time_s'] = time_str - caption['time_t'] = datetime.now().strftime('%H:%M:%S.%f')[:-3] - caption['translation'] = '' - prev_content = '' - cur_id += 1 - else: - content = json.loads(recognizer.PartialResult()).get('partial', '') - if content == '' or content == prev_content: - continue - if prev_content == '': - time_str = datetime.now().strftime('%H:%M:%S.%f')[:-3] - caption['command'] = 'caption' - caption['index'] = cur_id - caption['text'] = content - caption['time_s'] = time_str - caption['time_t'] = datetime.now().strftime('%H:%M:%S.%f')[:-3] - caption['translation'] = '' - prev_content = content - try: - json_str = json.dumps(caption) + '\n' - sys.stdout.write(json_str) - sys.stdout.flush() - except Exception as e: - print(e) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description='Convert system audio stream to text') - parser.add_argument('-a', '--audio_type', default=0, help='Audio stream source: 0 for output audio stream, 1 for input audio stream') - parser.add_argument('-c', '--chunk_rate', default=20, help='The number of audio stream chunks collected per second.') - parser.add_argument('-m', '--model_path', default='', help='The path to the vosk model.') - args = parser.parse_args() - convert_audio_to_text( - int(args.audio_type), - int(args.chunk_rate), - args.model_path - ) diff --git a/engine/main.py b/engine/main.py index 6ea3d53..09583e5 100644 --- a/engine/main.py +++ b/engine/main.py @@ -1,10 +1,59 @@ import argparse +from utils import stdout_cmd +from utils import thread_data, start_server +from utils import merge_chunk_channels, resample_chunk_mono +from audio2text import InvalidParameter, GummyRecognizer +from audio2text import VoskRecognizer +from sysaudio import AudioStream -def gummy_engine(s, t, a, c, k): - pass -def vosk_engine(a, c, m): - pass +def main_gummy(s: str, t: str, a: int, c: int, k: str): + 
stream = AudioStream(a, c) + if t == 'none': + engine = GummyRecognizer(stream.RATE, s, None, k) + else: + engine = GummyRecognizer(stream.RATE, s, t, k) + + stream.open_stream() + engine.start() + + while thread_data.status == "running": + try: + chunk = stream.read_chunk() + if chunk is None: continue + chunk_mono = merge_chunk_channels(chunk, stream.CHANNELS) + try: + engine.send_audio_frame(chunk_mono) + except InvalidParameter: + stdout_cmd('info', 'Gummy engine stopped, restart engine') + engine.start() + engine.send_audio_frame(chunk_mono) + except KeyboardInterrupt: + break + + stream.close_stream() + engine.stop() + + +def main_vosk(a: int, c: int, m: str): + stream = AudioStream(a, c) + engine = VoskRecognizer(m) + + stream.open_stream() + engine.start() + + while thread_data.status == "running": + try: + chunk = stream.read_chunk() + if chunk is None: continue + chunk_mono = resample_chunk_mono(chunk, stream.CHANNELS, stream.RATE, 16000) + engine.send_audio_frame(chunk_mono) + except KeyboardInterrupt: + break + + stream.close_stream() + engine.stop() + if __name__ == "__main__": parser = argparse.ArgumentParser(description='Convert system audio stream to text') @@ -12,15 +61,23 @@ if __name__ == "__main__": parser.add_argument('-e', '--caption_engine', default='gummy', help='Caption engine: gummy or vosk') parser.add_argument('-a', '--audio_type', default=0, help='Audio stream source: 0 for output, 1 for input') parser.add_argument('-c', '--chunk_rate', default=20, help='Number of audio stream chunks collected per second') + parser.add_argument('-p', '--port', default=7070, help='The port to run the server on, 0 for no server') # gummy parser.add_argument('-s', '--source_language', default='en', help='Source language code') parser.add_argument('-t', '--target_language', default='zh', help='Target language code') parser.add_argument('-k', '--api_key', default='', help='API KEY for Gummy model') # vosk parser.add_argument('-m', '--model_path', default='', 
help='The path to the vosk model.') + # for test args = parser.parse_args() + + if int(args.port) == 0: + thread_data.status = "running" + else: + start_server(int(args.port)) + if args.caption_engine == 'gummy': - gummy_engine( + main_gummy( args.source_language, args.target_language, int(args.audio_type), @@ -28,7 +85,7 @@ if __name__ == "__main__": args.api_key ) elif args.caption_engine == 'vosk': - vosk_engine( + main_vosk( int(args.audio_type), int(args.chunk_rate), args.model_path diff --git a/engine/main-vosk.spec b/engine/main.spec similarity index 95% rename from engine/main-vosk.spec rename to engine/main.spec index 0324833..81fdd2e 100644 --- a/engine/main-vosk.spec +++ b/engine/main.spec @@ -9,7 +9,7 @@ else: vosk_path = str(Path('./subenv/lib/python3.12/site-packages/vosk').resolve()) a = Analysis( - ['main-vosk.py'], + ['main.py'], pathex=[], binaries=[], datas=[(vosk_path, 'vosk')], @@ -30,7 +30,7 @@ exe = EXE( a.binaries, a.datas, [], - name='main-vosk', + name='main', debug=False, bootloader_ignore_signals=False, strip=False, diff --git a/engine/utils/__init__.py b/engine/utils/__init__.py index 2589863..5de3464 100644 --- a/engine/utils/__init__.py +++ b/engine/utils/__init__.py @@ -1,2 +1,4 @@ -from .process import merge_chunk_channels, resample_chunk_mono, resample_mono_chunk -from .sysout import stdout, stdout_cmd, stdout_obj, stderr \ No newline at end of file +from .audioprcs import merge_chunk_channels, resample_chunk_mono, resample_mono_chunk +from .sysout import stdout, stdout_cmd, stdout_obj, stderr +from .thdata import thread_data +from .server import start_server \ No newline at end of file diff --git a/engine/utils/process.py b/engine/utils/audioprcs.py similarity index 82% rename from engine/utils/process.py rename to engine/utils/audioprcs.py index 01c854d..7f24563 100644 --- a/engine/utils/process.py +++ b/engine/utils/audioprcs.py @@ -1,6 +1,6 @@ import samplerate import numpy as np - +import numpy.core.multiarray def 
merge_chunk_channels(chunk: bytes, channels: int) -> bytes: """ @@ -13,6 +13,7 @@ def merge_chunk_channels(chunk: bytes, channels: int) -> bytes: Returns: 单通道音频数据块 """ + if channels == 1: return chunk # (length * channels,) chunk_np = np.frombuffer(chunk, dtype=np.int16) # (length, channels) @@ -37,13 +38,17 @@ def resample_chunk_mono(chunk: bytes, channels: int, orig_sr: int, target_sr: in Return: 单通道音频数据块 """ - # (length * channels,) - chunk_np = np.frombuffer(chunk, dtype=np.int16) - # (length, channels) - chunk_np = chunk_np.reshape(-1, channels) - # (length,) - chunk_mono_f = np.mean(chunk_np.astype(np.float32), axis=1) - chunk_mono = chunk_mono_f.astype(np.int16) + if channels == 1: + chunk_mono = chunk + else: + # (length * channels,) + chunk_np = np.frombuffer(chunk, dtype=np.int16) + # (length, channels) + chunk_np = chunk_np.reshape(-1, channels) + # (length,) + chunk_mono_f = np.mean(chunk_np.astype(np.float32), axis=1) + chunk_mono = chunk_mono_f.astype(np.int16) + ratio = target_sr / orig_sr chunk_mono_r = samplerate.resample(chunk_mono, ratio, converter_type=mode) chunk_mono_r = np.round(chunk_mono_r).astype(np.int16) diff --git a/engine/utils/server.py b/engine/utils/server.py new file mode 100644 index 0000000..48fe3ce --- /dev/null +++ b/engine/utils/server.py @@ -0,0 +1,37 @@ +import socket +import threading +import json +from utils import thread_data, stdout_cmd, stderr + + +def handle_client(client_socket): + global thread_data + while True: + try: + data = client_socket.recv(4096).decode('utf-8') + if not data: + break + data = json.loads(data) + + if data['command'] == 'stop': + if thread_data.status == 'running': + thread_data.status = 'stop' + break + except Exception as e: + stderr(f'Communication error: {e}') + break + + thread_data.status = 'stop' + client_socket.close() + + +def start_server(port: int): + server = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + server.bind(('localhost', port)) + server.listen(1) + stdout_cmd('ready') 
+ + client, addr = server.accept() + client_handler = threading.Thread(target=handle_client, args=(client,)) + client_handler.daemon = True + client_handler.start() diff --git a/engine/utils/thdata.py b/engine/utils/thdata.py new file mode 100644 index 0000000..656880f --- /dev/null +++ b/engine/utils/thdata.py @@ -0,0 +1,5 @@ +class ThreadData: + def __init__(self): + self.status = "running" + +thread_data = ThreadData() \ No newline at end of file diff --git a/src/main/utils/CaptionEngine.ts b/src/main/utils/CaptionEngine.ts index 8f102b5..6bd3408 100644 --- a/src/main/utils/CaptionEngine.ts +++ b/src/main/utils/CaptionEngine.ts @@ -1,7 +1,8 @@ -import { spawn, exec } from 'child_process' +import { spawn } from 'child_process' import { app } from 'electron' import { is } from '@electron-toolkit/utils' import path from 'path' +import net from 'net' import { controlWindow } from '../ControlWindow' import { allConfig } from './AllConfig' import { i18n } from '../i18n' @@ -11,91 +12,87 @@ export class CaptionEngine { appPath: string = '' command: string[] = [] process: any | undefined - processStatus: 'running' | 'stopping' | 'stopped' = 'stopped' + client: net.Socket | undefined + status: 'running' | 'stopping' | 'stopped' = 'stopped' private getApp(): boolean { - if (allConfig.controls.customized && allConfig.controls.customizedApp) { + if (allConfig.controls.customized) { Log.info('Using customized engine') this.appPath = allConfig.controls.customizedApp this.command = allConfig.controls.customizedCommand.split(' ') } - else if (allConfig.controls.engine === 'gummy') { - allConfig.controls.customized = false - if(!allConfig.controls.API_KEY && !process.env.DASHSCOPE_API_KEY) { + else { + if(allConfig.controls.engine === 'gummy' && + !allConfig.controls.API_KEY && !process.env.DASHSCOPE_API_KEY + ) { controlWindow.sendErrorMessage(i18n('gummy.key.missing')) return false } - let gummyName = 'main-gummy' - if (process.platform === 'win32') { gummyName += '.exe' } 
this.command = [] if (is.dev) { - this.appPath = path.join( - app.getAppPath(), 'engine', - 'subenv', 'Scripts', 'python.exe' - ) - this.command.push(path.join( - app.getAppPath(), 'engine', 'main-gummy.py' - )) + // this.appPath = path.join( + // app.getAppPath(), 'engine', + // 'subenv', 'Scripts', 'python.exe' + // ) + // this.command.push(path.join( + // app.getAppPath(), 'engine', 'main.py' + // )) + this.appPath = path.join(app.getAppPath(), 'engine', 'dist', 'main.exe') } else { - this.appPath = path.join( - process.resourcesPath, 'engine', gummyName + this.appPath = path.join(process.resourcesPath, 'engine', 'main.exe') + } + + if(allConfig.controls.engine === 'gummy') { + this.command.push('-e', 'gummy') + this.command.push('-s', allConfig.controls.sourceLang) + this.command.push( + '-t', allConfig.controls.translation ? + allConfig.controls.targetLang : 'none' ) + this.command.push('-a', allConfig.controls.audio ? '1' : '0') + if(allConfig.controls.API_KEY) { + this.command.push('-k', allConfig.controls.API_KEY) + } } - this.command.push('-s', allConfig.controls.sourceLang) - this.command.push( - '-t', allConfig.controls.translation ? - allConfig.controls.targetLang : 'none' - ) - this.command.push('-a', allConfig.controls.audio ? '1' : '0') - if(allConfig.controls.API_KEY) { - this.command.push('-k', allConfig.controls.API_KEY) + else if(allConfig.controls.engine === 'vosk'){ + this.command.push('-e', 'vosk') + this.command.push('-a', allConfig.controls.audio ? 
'1' : '0') + this.command.push('-m', `"${allConfig.controls.modelPath}"`) } } - else if(allConfig.controls.engine === 'vosk'){ - allConfig.controls.customized = false - let voskName = 'main-vosk' - if (process.platform === 'win32') { voskName += '.exe' } - this.command = [] - if (is.dev) { - this.appPath = path.join( - app.getAppPath(), 'engine', - 'subenv', 'Scripts', 'python.exe' - ) - this.command.push(path.join( - app.getAppPath(), 'engine', 'main-vosk.py' - )) - } - else { - this.appPath = path.join( - process.resourcesPath, 'engine', voskName - ) - } - this.command.push('-a', allConfig.controls.audio ? '1' : '0') - this.command.push('-m', `"${allConfig.controls.modelPath}"`) - } Log.info('Engine Path:', this.appPath) Log.info('Engine Command:', this.command) return true } + public connect() { + if(this.client) { Log.warn('Client already exists, ignoring...') } + Log.info('Connecting to caption engine server...'); + this.client = net.createConnection({ port: 7070 }, () => { + Log.info('Connected to caption engine server'); + }); + this.status = 'running' + } + + public sendCommand(command: string, content: string = "") { + if(this.client === undefined) { + Log.error('Client not initialized yet') + return + } + const data = JSON.stringify({command, content}) + this.client.write(data); + Log.info(`Send data to python server: ${data}`); + } + public start() { - if (this.processStatus !== 'stopped') { - Log.warn('Caption engine status is not stopped, cannot start') + if (this.status !== 'stopped') { + Log.warn('Casption engine is not stopped, current status:', this.status) return } if(!this.getApp()){ return } - try { - this.process = spawn(this.appPath, this.command) - } - catch (e) { - controlWindow.sendErrorMessage(i18n('engine.start.error') + e) - Log.error('Error starting engine:', e) - return - } - - this.processStatus = 'running' + this.process = spawn(this.appPath, this.command) Log.info('Caption Engine Started, PID:', this.process.pid) 
allConfig.controls.engineEnabled = true @@ -123,7 +120,7 @@ export class CaptionEngine { }); this.process.stderr.on('data', (data: any) => { - if(this.processStatus === 'stopping') return + if(this.status === 'stopping') return controlWindow.sendErrorMessage(i18n('engine.error') + data) Log.error(`Engine Error: ${data}`); }); @@ -135,54 +132,43 @@ export class CaptionEngine { allConfig.sendControls(controlWindow.window) controlWindow.window.webContents.send('control.engine.stopped') } - this.processStatus = 'stopped' + this.status = 'stopped' Log.info(`Engine exited with code ${code}`) }); } public stop() { - if(this.processStatus !== 'running') return - if (this.process.pid) { - Log.info('Trying to stop process, PID:', this.process.pid) - let cmd = `kill ${this.process.pid}`; - if (process.platform === "win32") { - cmd = `taskkill /pid ${this.process.pid} /t /f` - } - exec(cmd, (error) => { - if (error) { - controlWindow.sendErrorMessage(i18n('engine.shutdown.error') + error) - Log.error(`Failed to kill process: ${error}`) - } - }) - } - else { - this.process = undefined; - allConfig.controls.engineEnabled = false - if(controlWindow.window){ - allConfig.sendControls(controlWindow.window) - controlWindow.window.webContents.send('control.engine.stopped') - } - this.processStatus = 'stopped' - Log.info('Process PID undefined, caption engine process stopped') + if(this.status !== 'running'){ + Log.warn('Engine is not running, current status:', this.status) return } - this.processStatus = 'stopping' - Log.info('Caption engine process stopping') + this.sendCommand('stop') + if(this.client){ + this.client.destroy() + this.client = undefined + } + this.status = 'stopping' + Log.info('Caption engine process stopping...') } } function handleEngineData(data: any) { - if(data.command === 'caption') { + if(data.command === 'ready'){ + captionEngine.connect() + } + else if(data.command === 'caption') { allConfig.updateCaptionLog(data); } else if(data.command === 'print') { - 
Log.info('Engine print:', data.content) + console.log(data.content) + // Log.info('Engine Print:', data.content) } else if(data.command === 'info') { - Log.info('Engine info:', data.content) + Log.info('Engine Info:', data.content) } else if(data.command === 'usage') { - Log.info('Caption engine usage: ', data.content) + console.error(data.content) + // Log.info('Gummy Engine Usage: ', data.content) } } diff --git a/src/main/utils/Log.ts b/src/main/utils/Log.ts index f2568ce..93f1022 100644 --- a/src/main/utils/Log.ts +++ b/src/main/utils/Log.ts @@ -12,10 +12,10 @@ export class Log { } static warn(...msg: any[]){ - console.log(`[WARN ${getTimeString()}]`, ...msg) + console.warn(`[WARN ${getTimeString()}]`, ...msg) } static error(...msg: any[]){ - console.log(`[ERROR ${getTimeString()}]`, ...msg) + console.error(`[ERROR ${getTimeString()}]`, ...msg) } } diff --git a/src/renderer/src/components/EngineStatus.vue b/src/renderer/src/components/EngineStatus.vue index 607433e..f838d72 100644 --- a/src/renderer/src/components/EngineStatus.vue +++ b/src/renderer/src/components/EngineStatus.vue @@ -4,7 +4,7 @@ @@ -130,7 +130,7 @@ const showAbout = ref(false) const captionLog = useCaptionLogStore() const { captionData } = storeToRefs(captionLog) const engineControl = useEngineControlStore() -const { engineEnabled, engine, customized, customizedApp } = storeToRefs(engineControl) +const { engineEnabled, engine, customized } = storeToRefs(engineControl) const pid = ref(0) const ppid = ref(0) diff --git a/src/renderer/src/stores/engineControl.ts b/src/renderer/src/stores/engineControl.ts index 5a6d13b..64f3fb9 100644 --- a/src/renderer/src/stores/engineControl.ts +++ b/src/renderer/src/stores/engineControl.ts @@ -82,7 +82,7 @@ export const useEngineControlStore = defineStore('engineControl', () => { notification.open({ message: t('noti.started'), description: - ((customized.value && customizedApp.value) ? str1 : str0) + + (customized.value ? 
str1 : str0) + `${t('noti.pidInfo')}${args}` }); }) From e4f937e6b60ed1aba0bd1abdf47c8c2ac90a2e12 Mon Sep 17 00:00:00 2001 From: himeditator Date: Mon, 28 Jul 2025 21:44:49 +0800 Subject: [PATCH 4/7] =?UTF-8?q?feat(engine):=20=E4=BC=98=E5=8C=96=E5=AD=97?= =?UTF-8?q?=E5=B9=95=E5=BC=95=E6=93=8E=E9=80=9A=E4=BF=A1=E5=92=8C=E6=8E=A7?= =?UTF-8?q?=E5=88=B6=E9=80=BB=E8=BE=91=EF=BC=8C=E4=BC=98=E5=8C=96=E7=AA=97?= =?UTF-8?q?=E5=8F=A3=E4=BF=A1=E6=81=AF=E5=B1=95=E7=A4=BA?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - 优化错误处理和引擎重启逻辑 - 添加字幕引擎强制终止功能 - 调整通知和错误提示的显示位置 - 优化日志记录精度到毫秒级 --- docs/CHANGELOG.md | 6 +- docs/api-docs/caption-engine.md | 72 ++++++++++---- engine/audio2text/gummy.py | 7 +- engine/main.py | 28 ++++-- engine/utils/audioprcs.py | 2 +- engine/utils/server.py | 9 +- src/main/utils/CaptionEngine.ts | 94 +++++++++++++------ src/main/utils/Log.ts | 3 +- src/renderer/src/components/CaptionStyle.vue | 3 +- src/renderer/src/components/EngineControl.vue | 1 + src/renderer/src/components/EngineStatus.vue | 14 ++- src/renderer/src/stores/engineControl.ts | 4 +- 12 files changed, 171 insertions(+), 72 deletions(-) diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index a1afb67..0f51d25 100644 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -123,9 +123,13 @@ - 新增字幕记录排序功能,可选择字幕记录正序或倒叙显示 +### 优化体验 + +- 交换窗口界面信息和错误提示弹窗的位置,防止提示信息挡住操作 + ### 项目优化 - 重构字幕引擎,提示字幕引擎代码的可扩展性和可读性 -- 合并 Gummy 和 Vosk 引擎为单个可执行文件 +- 合并 Gummy 和 Vosk 引擎为单个可执行文件,减小软件体积 - 字幕引擎和主程序添加 WebScoket 通信,完全避免字幕引擎成为孤儿进程 diff --git a/docs/api-docs/caption-engine.md b/docs/api-docs/caption-engine.md index d284b5a..52799b5 100644 --- a/docs/api-docs/caption-engine.md +++ b/docs/api-docs/caption-engine.md @@ -1,17 +1,63 @@ # caption engine api-doc -本文档主要 Electron 主进程和字幕引擎进程的通信约定。 +本文档主要介绍字幕引擎和 Electron 主进程的通信约定。 ## 原理说明 -本项目的 Python 进程通过标准输出向 Electron 主进程发送数据。 +本项目的 Python 进程通过标准输出向 Electron 主进程发送数据。Python 进程标准输出 (`sys.stdout`) 的内容一定为一行一行的字符串。且每行字符串均可以解释为一个 JSON 对象。每个 JSON 
对象一定有 `command` 参数。 -Python 进程标准输出 (`sys.stdout`) 的内容一定为一行一行的字符串。且每行字符串均可以解释为一个 JSON 对象。每个 JSON 对象一定有 `command` 参数。 +Electron 主进程通过 WebSocket 向 Python 进程发送数据。发送的数据均是转化为字符串的对象,对象格式一定为: -## 输出约定 +```js +{ + command: string, + content: string +} +``` + +## 标准输出约定 + +> 数据传递方向:字幕引擎进程 => Electron 主进程 当 JSON 对象的 `command` 参数为下列值时,表示的对应的含义: +### `connect` + +```js +{ + command: "connect", + content: "" +} +``` + +字幕引擎 WebSocket 服务已经准备好,命令 Electron 主进程连接字幕引擎 WebSocket 服务 + +### `kill` + +```js +{ + command: "kill", + content: "" +} +``` + +命令 Electron 主进程强制结束字幕引擎进程。 + +### `caption` + +```js +{ + command: "caption", + index: number, + time_s: string, + time_t: string, + text: string, + translation: string +} +``` + +Python 端监听到的音频流转换为的字幕数据。 + ### `print` ```js @@ -45,18 +91,12 @@ Python 端打印的提示信息,比起 `print`,该信息更希望 Electron Gummy 字幕引擎结束时打印计费消耗信息。 +## WebSocket -### `caption` +> 数据传递方向:Electron 主进程 => 字幕引擎进程 -```js -{ - command: "caption", - index: number, - time_s: string, - time_t: string, - text: string, - translation: string -} -``` +当 JSON 对象的 `command` 参数为下列值时,表示的对应的含义: -Python 端监听到的音频流转换为的字幕数据。 \ No newline at end of file +### `stop` + +命令当前字幕引擎停止监听并结束任务。 \ No newline at end of file diff --git a/engine/audio2text/gummy.py b/engine/audio2text/gummy.py index 1f503b8..7d95b9b 100644 --- a/engine/audio2text/gummy.py +++ b/engine/audio2text/gummy.py @@ -6,7 +6,7 @@ from dashscope.audio.asr import ( ) import dashscope from datetime import datetime -from utils import stdout_cmd, stdout_obj +from utils import stdout_cmd, stdout_obj, stderr class Callback(TranslationRecognizerCallback): @@ -96,4 +96,7 @@ class GummyRecognizer: def stop(self): """停止 Gummy 引擎""" - self.translator.stop() + try: + self.translator.stop() + except Exception: + return diff --git a/engine/main.py b/engine/main.py index 09583e5..92ef24d 100644 --- a/engine/main.py +++ b/engine/main.py @@ -1,5 +1,5 @@ import argparse -from utils import stdout_cmd +from utils import stdout_cmd, stderr from utils import 
thread_data, start_server from utils import merge_chunk_channels, resample_chunk_mono from audio2text import InvalidParameter, GummyRecognizer @@ -8,6 +8,7 @@ from sysaudio import AudioStream def main_gummy(s: str, t: str, a: int, c: int, k: str): + global thread_data stream = AudioStream(a, c) if t == 'none': engine = GummyRecognizer(stream.RATE, s, None, k) @@ -17,6 +18,7 @@ def main_gummy(s: str, t: str, a: int, c: int, k: str): stream.open_stream() engine.start() + restart_count = 0 while thread_data.status == "running": try: chunk = stream.read_chunk() @@ -24,18 +26,22 @@ def main_gummy(s: str, t: str, a: int, c: int, k: str): chunk_mono = merge_chunk_channels(chunk, stream.CHANNELS) try: engine.send_audio_frame(chunk_mono) - except InvalidParameter: - stdout_cmd('info', 'Gummy engine stopped, restart engine') - engine.start() - engine.send_audio_frame(chunk_mono) + except InvalidParameter as e: + restart_count += 1 + if restart_count > 8: + stderr(str(e)) + thread_data.status = "kill" + break + else: + stdout_cmd('info', f'Gummy engine stopped, trying to restart #{restart_count}') except KeyboardInterrupt: break stream.close_stream() engine.stop() - def main_vosk(a: int, c: int, m: str): + global thread_data stream = AudioStream(a, c) engine = VoskRecognizer(m) @@ -68,9 +74,8 @@ if __name__ == "__main__": parser.add_argument('-k', '--api_key', default='', help='API KEY for Gummy model') # vosk parser.add_argument('-m', '--model_path', default='', help='The path to the vosk model.') - # for test - args = parser.parse_args() - + + args = parser.parse_args() if int(args.port) == 0: thread_data.status = "running" else: @@ -91,4 +96,7 @@ if __name__ == "__main__": args.model_path ) else: - raise ValueError('Invalid caption engine specified.') \ No newline at end of file + raise ValueError('Invalid caption engine specified.') + + if thread_data.status == "kill": + stdout_cmd('kill') \ No newline at end of file diff --git a/engine/utils/audioprcs.py 
b/engine/utils/audioprcs.py index 7f24563..a4362a2 100644 --- a/engine/utils/audioprcs.py +++ b/engine/utils/audioprcs.py @@ -1,6 +1,6 @@ import samplerate import numpy as np -import numpy.core.multiarray +import numpy.core.multiarray # do not remove def merge_chunk_channels(chunk: bytes, channels: int) -> bytes: """ diff --git a/engine/utils/server.py b/engine/utils/server.py index 48fe3ce..9026e2e 100644 --- a/engine/utils/server.py +++ b/engine/utils/server.py @@ -6,7 +6,7 @@ from utils import thread_data, stdout_cmd, stderr def handle_client(client_socket): global thread_data - while True: + while thread_data.status == 'running': try: data = client_socket.recv(4096).decode('utf-8') if not data: @@ -14,9 +14,8 @@ def handle_client(client_socket): data = json.loads(data) if data['command'] == 'stop': - if thread_data.status == 'running': - thread_data.status = 'stop' - break + thread_data.status = 'stop' + break except Exception as e: stderr(f'Communication error: {e}') break @@ -29,7 +28,7 @@ def start_server(port: int): server = socket.socket(socket.AF_INET, socket.SOCK_STREAM) server.bind(('localhost', port)) server.listen(1) - stdout_cmd('ready') + stdout_cmd('connect') client, addr = server.accept() client_handler = threading.Thread(target=handle_client, args=(client,)) diff --git a/src/main/utils/CaptionEngine.ts b/src/main/utils/CaptionEngine.ts index 6bd3408..80e5ad7 100644 --- a/src/main/utils/CaptionEngine.ts +++ b/src/main/utils/CaptionEngine.ts @@ -1,4 +1,4 @@ -import { spawn } from 'child_process' +import { exec, spawn } from 'child_process' import { app } from 'electron' import { is } from '@electron-toolkit/utils' import path from 'path' @@ -13,11 +13,11 @@ export class CaptionEngine { command: string[] = [] process: any | undefined client: net.Socket | undefined - status: 'running' | 'stopping' | 'stopped' = 'stopped' + status: 'running' | 'starting' | 'stopping' | 'stopped' = 'stopped' private getApp(): boolean { if 
(allConfig.controls.customized) { - Log.info('Using customized engine') + Log.info('Using customized caption engine') this.appPath = allConfig.controls.customizedApp this.command = allConfig.controls.customizedCommand.split(' ') } @@ -30,14 +30,14 @@ export class CaptionEngine { } this.command = [] if (is.dev) { - // this.appPath = path.join( - // app.getAppPath(), 'engine', - // 'subenv', 'Scripts', 'python.exe' - // ) - // this.command.push(path.join( - // app.getAppPath(), 'engine', 'main.py' - // )) - this.appPath = path.join(app.getAppPath(), 'engine', 'dist', 'main.exe') + this.appPath = path.join( + app.getAppPath(), 'engine', + 'subenv', 'Scripts', 'python.exe' + ) + this.command.push(path.join( + app.getAppPath(), 'engine', 'main.py' + )) + // this.appPath = path.join(app.getAppPath(), 'engine', 'dist', 'main.exe') } else { this.appPath = path.join(process.resourcesPath, 'engine', 'main.exe') @@ -73,6 +73,14 @@ export class CaptionEngine { Log.info('Connected to caption engine server'); }); this.status = 'running' + allConfig.controls.engineEnabled = true + if(controlWindow.window){ + allConfig.sendControls(controlWindow.window) + controlWindow.window.webContents.send( + 'control.engine.started', + this.process.pid + ) + } } public sendCommand(command: string, content: string = "") { @@ -93,19 +101,11 @@ export class CaptionEngine { if(!this.getApp()){ return } this.process = spawn(this.appPath, this.command) - Log.info('Caption Engine Started, PID:', this.process.pid) - - allConfig.controls.engineEnabled = true - if(controlWindow.window){ - allConfig.sendControls(controlWindow.window) - controlWindow.window.webContents.send( - 'control.engine.started', - this.process.pid - ) - } - + this.status = 'starting' + Log.info('Caption Engine Starting, PID:', this.process.pid) + this.process.stdout.on('data', (data: any) => { - const lines = data.toString().split('\n'); + const lines = data.toString().split('\n') lines.forEach((line: string) => { if (line.trim()) 
{ try { @@ -120,13 +120,18 @@ export class CaptionEngine { }); this.process.stderr.on('data', (data: any) => { - if(this.status === 'stopping') return - controlWindow.sendErrorMessage(i18n('engine.error') + data) - Log.error(`Engine Error: ${data}`); + const lines = data.toString().split('\n') + lines.forEach((line: string) => { + if(line.trim()){ + controlWindow.sendErrorMessage(/*i18n('engine.error') +*/ line) + console.error(line) + } + }) }); this.process.on('close', (code: any) => { this.process = undefined; + this.client = undefined allConfig.controls.engineEnabled = false if(controlWindow.window){ allConfig.sendControls(controlWindow.window) @@ -150,25 +155,52 @@ export class CaptionEngine { this.status = 'stopping' Log.info('Caption engine process stopping...') } + + public kill(){ + if(this.status !== 'running'){ + Log.warn('Engine is not running, current status:', this.status) + return + } + if (this.process.pid) { + Log.warn('Trying to kill engine process, PID:', this.process.pid) + if(this.client){ + this.client.destroy() + this.client = undefined + } + let cmd = `kill ${this.process.pid}`; + if (process.platform === "win32") { + cmd = `taskkill /pid ${this.process.pid} /t /f` + } + exec(cmd) + } + this.status = 'stopping' + } } function handleEngineData(data: any) { - if(data.command === 'ready'){ + if(data.command === 'connect'){ captionEngine.connect() } + else if(data.command === 'kill') { + if(captionEngine.status !== 'stopped') { + Log.warn('Error occurred, trying to kill Gummy engine...') + captionEngine.kill() + } + } else if(data.command === 'caption') { allConfig.updateCaptionLog(data); } else if(data.command === 'print') { - console.log(data.content) - // Log.info('Engine Print:', data.content) + Log.info('Engine Print:', data.content) } else if(data.command === 'info') { Log.info('Engine Info:', data.content) } else if(data.command === 'usage') { - console.error(data.content) - // Log.info('Gummy Engine Usage: ', data.content) + 
Log.info('Gummy Engine Usage: ', data.content) + } + else { + Log.warn('Unknown command:', data) } } diff --git a/src/main/utils/Log.ts b/src/main/utils/Log.ts index 93f1022..c226c08 100644 --- a/src/main/utils/Log.ts +++ b/src/main/utils/Log.ts @@ -3,7 +3,8 @@ function getTimeString() { const HH = String(now.getHours()).padStart(2, '0') const MM = String(now.getMinutes()).padStart(2, '0') const SS = String(now.getSeconds()).padStart(2, '0') - return `${HH}:${MM}:${SS}` + const MS = String(now.getMilliseconds()).padStart(3, '0') + return `${HH}:${MM}:${SS}.${MS}` } export class Log { diff --git a/src/renderer/src/components/CaptionStyle.vue b/src/renderer/src/components/CaptionStyle.vue index 4331d44..5b90392 100644 --- a/src/renderer/src/components/CaptionStyle.vue +++ b/src/renderer/src/components/CaptionStyle.vue @@ -282,7 +282,8 @@ function applyStyle(){ captionStyle.sendStylesChange(); - notification.open({ + notification.open({ + placement: 'topLeft', message: t('noti.styleChange'), description: t('noti.styleInfo') }); diff --git a/src/renderer/src/components/EngineControl.vue b/src/renderer/src/components/EngineControl.vue index 08ffa5a..ab610fc 100644 --- a/src/renderer/src/components/EngineControl.vue +++ b/src/renderer/src/components/EngineControl.vue @@ -164,6 +164,7 @@ function applyChange(){ engineControl.sendControlsChange() notification.open({ + placement: 'topLeft', message: t('noti.engineChange'), description: t('noti.changeInfo') }); diff --git a/src/renderer/src/components/EngineStatus.vue b/src/renderer/src/components/EngineStatus.vue index f838d72..b5f383d 100644 --- a/src/renderer/src/components/EngineStatus.vue +++ b/src/renderer/src/components/EngineStatus.vue @@ -61,12 +61,14 @@ >{{ $t('status.openCaption') }} {{ $t('status.startEngine') }} {{ $t('status.stopEngine') }} @@ -119,13 +121,14 @@