diff --git a/.gitignore b/.gitignore
index 75d2a08..0dd50b6 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,5 +7,6 @@ out
__pycache__
subenv
caption-engine/build
+caption-engine/models
output.wav
-.venv
\ No newline at end of file
+.venv
diff --git a/README.md b/README.md
index 36c77c2..1ccfe38 100644
--- a/README.md
+++ b/README.md
@@ -2,11 +2,13 @@
Auto Caption 是一个跨平台的实时字幕显示软件。
-
+
+
+
+
+
+
| 简体中文
| English
diff --git a/README_en.md b/README_en.md
index a1e302d..7c3be90 100644
--- a/README_en.md
+++ b/README_en.md
@@ -2,11 +2,13 @@
Auto Caption is a cross-platform real-time caption display software.
-
+
+
+
+
+
+
| 简体中文
| English
diff --git a/README_ja.md b/README_ja.md
index d5ed76a..e0a9431 100644
--- a/README_ja.md
+++ b/README_ja.md
@@ -2,11 +2,13 @@
Auto Caption はクロスプラットフォームのリアルタイム字幕表示ソフトウェアです。
-
+
+
+
+
+
+
 | 简体中文
 | English
diff --git a/caption-engine/audioprcs/process.py b/caption-engine/audioprcs/process.py
index ff9c61a..650081b 100644
--- a/caption-engine/audioprcs/process.py
+++ b/caption-engine/audioprcs/process.py
@@ -47,3 +47,22 @@ def resampleRawChunk(chunk, channels, orig_sr, target_sr, mode="sinc_best"):
     chunk_mono_r = samplerate.resample(chunk_mono, ratio, converter_type=mode)
     chunk_mono_r = np.round(chunk_mono_r).astype(np.int16)
     return chunk_mono_r.tobytes()
+
+def resampleMonoChunk(chunk, orig_sr, target_sr, mode="sinc_best"):
+    """
+    将当前单通道进行重采样
+
+    Args:
+        chunk: (bytes)单通道音频数据块
+        orig_sr: 原始采样率
+        target_sr: 目标采样率
+        mode: 重采样模式,可选:'sinc_best' | 'sinc_medium' | 'sinc_fastest' | 'zero_order_hold' | 'linear'
+
+    Return:
+        (bytes)单通道音频数据块
+    """
+    chunk_np = np.frombuffer(chunk, dtype=np.int16)
+    ratio = target_sr / orig_sr
+    chunk_r = samplerate.resample(chunk_np, ratio, converter_type=mode)
+    chunk_r = np.round(chunk_r).astype(np.int16)
+    return chunk_r.tobytes()
diff --git a/caption-engine/main-vosk.py b/caption-engine/main-vosk.py
new file mode 100644
index 0000000..cf407f7
--- /dev/null
+++ b/caption-engine/main-vosk.py
@@ -0,0 +1,83 @@
+import sys
+import json
+import argparse
+from datetime import datetime
+import numpy.core.multiarray
+
+if sys.platform == 'win32':
+    from sysaudio.win import AudioStream
+elif sys.platform == 'darwin':
+    from sysaudio.darwin import AudioStream
+elif sys.platform == 'linux':
+    from sysaudio.linux import AudioStream
+else:
+    raise NotImplementedError(f"Unsupported platform: {sys.platform}")
+
+from vosk import Model, KaldiRecognizer, SetLogLevel
+from audioprcs import resampleRawChunk
+
+SetLogLevel(-1)
+
+def convert_audio_to_text(audio_type, chunk_rate, model_path):
+    sys.stdout.reconfigure(line_buffering=True) # type: ignore
+
+    if model_path.startswith('"'):
+        model_path = model_path[1:]
+    if model_path.endswith('"'):
+        model_path = model_path[:-1]
+
+    model = Model(model_path)
+    recognizer = KaldiRecognizer(model, 16000)
+
+    stream = AudioStream(audio_type, chunk_rate)
+    stream.openStream()
+
+    time_str = ''
+    cur_id = 0
+    prev_content = ''
+
+    while True:
+        chunk = stream.read_chunk()
+        chunk_mono = resampleRawChunk(chunk, stream.CHANNELS, stream.RATE, 16000)
+
+        caption = {}
+        if recognizer.AcceptWaveform(chunk_mono):
+            content = json.loads(recognizer.Result()).get('text', '')
+            caption['index'] = cur_id
+            caption['text'] = content
+            caption['time_s'] = time_str
+            caption['time_t'] = datetime.now().strftime('%H:%M:%S.%f')[:-3]
+            caption['translation'] = ''
+            prev_content = ''
+            cur_id += 1
+        else:
+            content = json.loads(recognizer.PartialResult()).get('partial', '')
+            if content == '' or content == prev_content:
+                continue
+            if prev_content == '':
+                time_str = datetime.now().strftime('%H:%M:%S.%f')[:-3]
+            caption['index'] = cur_id
+            caption['text'] = content
+            caption['time_s'] = time_str
+            caption['time_t'] = datetime.now().strftime('%H:%M:%S.%f')[:-3]
+            caption['translation'] = ''
+            prev_content = content
+        try:
+            json_str = json.dumps(caption) + '\n'
+            sys.stdout.write(json_str)
+            sys.stdout.flush()
+        except Exception as e:
+            print(e)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description='Convert system audio stream to text')
+    parser.add_argument('-a', '--audio_type', default=0, help='Audio stream source: 0 for output audio stream, 1 for input audio stream')
+    parser.add_argument('-c', '--chunk_rate', default=20, help='The number of audio stream chunks collected per second.')
+    parser.add_argument('-m', '--model_path', default='', help='The path to the vosk model.')
+    args = parser.parse_args()
+    convert_audio_to_text(
+        int(args.audio_type),
+        int(args.chunk_rate),
+        args.model_path
+    )
diff --git a/caption-engine/main-vosk.spec b/caption-engine/main-vosk.spec
new file mode 100644
index 0000000..aab7f83
--- /dev/null
+++ b/caption-engine/main-vosk.spec
@@ -0,0 +1,42 @@
+# -*- mode: python ; coding: utf-8 -*-
+
+from pathlib import Path
+
+vosk_path = str(Path('./subenv/Lib/site-packages/vosk').resolve())
+
+a = Analysis(
+    ['main-vosk.py'],
+    pathex=[],
+    binaries=[],
+    datas=[(vosk_path, 'vosk')],
+    hiddenimports=[],
+    hookspath=[],
+    hooksconfig={},
+    runtime_hooks=[],
+    excludes=[],
+    noarchive=False,
+    optimize=0,
+)
+
+pyz = PYZ(a.pure)
+
+exe = EXE(
+    pyz,
+    a.scripts,
+    a.binaries,
+    a.datas,
+    [],
+    name='main-vosk',
+    debug=False,
+    bootloader_ignore_signals=False,
+    strip=False,
+    upx=True,
+    upx_exclude=[],
+    runtime_tmpdir=None,
+    console=True,
+    disable_windowed_traceback=False,
+    argv_emulation=False,
+    target_arch=None,
+    codesign_identity=None,
+    entitlements_file=None,
+)
diff --git a/caption-engine/requirements.txt b/caption-engine/requirements.txt
index 981c2fb..0f168fb 100644
--- a/caption-engine/requirements.txt
+++ b/caption-engine/requirements.txt
@@ -3,4 +3,5 @@ numpy
 samplerate
 PyAudio
 PyAudioWPatch # Windows only
+vosk
 pyinstaller
diff --git a/caption-engine/sysaudio/win.py b/caption-engine/sysaudio/win.py
index 5db1d9d..c6765ce 100644
--- a/caption-engine/sysaudio/win.py
+++ b/caption-engine/sysaudio/win.py
@@ -57,7 +57,7 @@ class AudioStream:
         self.stream = None
         self.SAMP_WIDTH = pyaudio.get_sample_size(pyaudio.paInt16)
         self.FORMAT = pyaudio.paInt16
-        self.CHANNELS = self.device["maxInputChannels"]
+        self.CHANNELS = int(self.device["maxInputChannels"])
         self.RATE = int(self.device["defaultSampleRate"])
         self.CHUNK = self.RATE // chunk_rate
         self.INDEX = self.device["index"]
diff --git a/electron-builder.yml b/electron-builder.yml
index a80df13..c3efa72 100644
--- a/electron-builder.yml
+++ b/electron-builder.yml
@@ -10,8 +10,15 @@ files:
   - '!{.env,.env.*,.npmrc,pnpm-lock.yaml}'
   - '!{tsconfig.json,tsconfig.node.json,tsconfig.web.json}'
 extraResources:
-  from: ./caption-engine/dist/main-gummy.exe
-  to: ./caption-engine/main-gummy.exe
+  - from: ./caption-engine/dist/main-gummy.exe
+    to: ./caption-engine/main-gummy.exe
+  - from: ./caption-engine/dist/main-vosk.exe
+    to: ./caption-engine/main-vosk.exe
+  # For macOS and Linux
+  # - from: ./caption-engine/dist/main-gummy
+  #   to: ./caption-engine/main-gummy
+  # - from: ./caption-engine/dist/main-vosk
+  #   to: ./caption-engine/main-vosk
 asarUnpack:
   - resources/**
 win:
diff --git a/engine-test/vosk.ipynb b/engine-test/vosk.ipynb
new file mode 100644
index 0000000..6238a68
--- /dev/null
+++ b/engine-test/vosk.ipynb
@@ -0,0 +1,124 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "6fb12704",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "d:\\Projects\\auto-caption\\caption-engine\\subenv\\Lib\\site-packages\\vosk\\__init__.py\n"
+     ]
+    }
+   ],
+   "source": [
+    "import vosk\n",
+    "print(vosk.__file__)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "63a06f5c",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "\n",
+      "    采样设备:\n",
+      "      - 设备类型:音频输入\n",
+      "      - 序号:1\n",
+      "      - 名称:麦克风阵列 (Realtek(R) Audio)\n",
+      "      - 最大输入通道数:2\n",
+      "      - 默认低输入延迟:0.09s\n",
+      "      - 默认高输入延迟:0.18s\n",
+      "      - 默认采样率:44100.0Hz\n",
+      "      - 是否回环设备:False\n",
+      "\n",
+      "    音频样本块大小:2205\n",
+      "    样本位宽:2\n",
+      "    采样格式:8\n",
+      "    音频通道数:2\n",
+      "    音频采样率:44100\n",
+      "    \n"
+     ]
+    }
+   ],
+   "source": [
+    "import sys\n",
+    "import os\n",
+    "import json\n",
+    "from vosk import Model, KaldiRecognizer\n",
+    "\n",
+    "current_dir = os.getcwd() \n",
+    "sys.path.append(os.path.join(current_dir, '../caption-engine'))\n",
+    "\n",
+    "from sysaudio.win import AudioStream\n",
+    "from audioprcs import resampleRawChunk, mergeChunkChannels\n",
+    "\n",
+    "stream = AudioStream(1)\n",
+    "stream.printInfo()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "5d5a0afa",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "model = Model(os.path.join(\n",
+    "    current_dir,\n",
+    "    '../caption-engine/models/vosk-model-small-cn-0.22'\n",
+    "))\n",
+    "recognizer = KaldiRecognizer(model, 16000)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7e9d1530",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "stream.openStream()\n",
+    "\n",
+    "for i in range(200):\n",
+    "    chunk = stream.read_chunk()\n",
+    "    chunk_mono = resampleRawChunk(chunk, stream.CHANNELS, stream.RATE, 16000)\n",
+    "    if recognizer.AcceptWaveform(chunk_mono):\n",
+    "        result = json.loads(recognizer.Result())\n",
+    "        print(\"acc:\", result.get(\"text\", \"\"))\n",
+    "    else:\n",
+    "        partial = json.loads(recognizer.PartialResult())\n",
+    "        print(\"else:\", partial.get(\"partial\", \"\"))"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "subenv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.1"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/src/main/types/index.ts b/src/main/types/index.ts
index 9bdb881..59ca5a4 100644
--- a/src/main/types/index.ts
+++ b/src/main/types/index.ts
@@ -6,10 +6,11 @@ export interface Controls {
   engineEnabled: boolean,
   sourceLang: string,
   targetLang: string,
-  engine: 'gummy',
+  engine: string,
   audio: 0 | 1,
   translation: boolean,
   API_KEY: string,
+  modelPath: string,
   customized: boolean,
   customizedApp: string,
   customizedCommand: string
diff --git a/src/main/utils/AllConfig.ts b/src/main/utils/AllConfig.ts
index 8d16045..d3318ab 100644
--- a/src/main/utils/AllConfig.ts
+++ b/src/main/utils/AllConfig.ts
@@ -34,6 +34,7 @@ const defaultControls: Controls = {
   audio: 0,
   engineEnabled: false,
   API_KEY: '',
+  modelPath: '',
   translation: true,
   customized: false,
   customizedApp: '',
diff --git a/src/main/utils/CaptionEngine.ts b/src/main/utils/CaptionEngine.ts
index 6ca6595..c254de9 100644
--- a/src/main/utils/CaptionEngine.ts
+++ b/src/main/utils/CaptionEngine.ts
@@ -13,26 +13,20 @@ export class CaptionEngine {
   processStatus: 'running' | 'stopping' | 'stopped' = 'stopped'
 
   private getApp(): boolean {
+    allConfig.controls.customized = false
     if (allConfig.controls.customized && allConfig.controls.customizedApp) {
       this.appPath = allConfig.controls.customizedApp
       this.command = [allConfig.controls.customizedCommand]
+      allConfig.controls.customized = true
     }
     else if (allConfig.controls.engine === 'gummy') {
-      allConfig.controls.customized = false
       if(!allConfig.controls.API_KEY && !process.env.DASHSCOPE_API_KEY) {
         controlWindow.sendErrorMessage(i18n('gummy.key.missing'))
         return false
       }
-      let gummyName = ''
+      let gummyName = 'main-gummy'
      if (process.platform === 'win32') {
-        gummyName = 'main-gummy.exe'
-      }
-      else if (process.platform === 'darwin' || process.platform === 'linux') {
-        gummyName = 'main-gummy'
-      }
-      else {
-        controlWindow.sendErrorMessage(i18n('platform.unsupported') + process.platform)
-        throw new Error(i18n('platform.unsupported'))
+        gummyName += '.exe'
       }
       if (is.dev) {
         this.appPath = path.join(
@@ -55,10 +49,29 @@ export class CaptionEngine {
       if(allConfig.controls.API_KEY) {
         this.command.push('-k', allConfig.controls.API_KEY)
       }
-
-      console.log('[INFO] Engine Path:', this.appPath)
-      console.log('[INFO] Engine Command:', this.command)
     }
+    else if(allConfig.controls.engine === 'vosk'){
+      let voskName = 'main-vosk'
+      if (process.platform === 'win32') {
+        voskName += '.exe'
+      }
+      if (is.dev) {
+        this.appPath = path.join(
+          app.getAppPath(),
+          'caption-engine', 'dist', voskName
+        )
+      }
+      else {
+        this.appPath = path.join(
+          process.resourcesPath, 'caption-engine', voskName
+        )
+      }
+      this.command = []
+      this.command.push('-a', allConfig.controls.audio ? '1' : '0')
+      this.command.push('-m', `"${allConfig.controls.modelPath}"`)
+    }
+    console.log('[INFO] Engine Path:', this.appPath)
+    console.log('[INFO] Engine Command:', this.command)
     return true
   }
 
diff --git a/src/renderer/src/components/EngineControl.vue b/src/renderer/src/components/EngineControl.vue
index b459f46..06b85cf 100644
--- a/src/renderer/src/components/EngineControl.vue
+++ b/src/renderer/src/components/EngineControl.vue
@@ -16,6 +16,7 @@