diff --git a/.gitignore b/.gitignore index 4a5feda..f385eb0 100644 --- a/.gitignore +++ b/.gitignore @@ -6,4 +6,4 @@ out *.log* __pycache__ subenv -build \ No newline at end of file +python-subprocess/build \ No newline at end of file diff --git a/README.md b/README.md index 2fee440..cedbaa8 100644 --- a/README.md +++ b/README.md @@ -21,6 +21,9 @@ - 灵活的字幕引擎选择 - 多语言识别与翻译 - 字幕记录展示与导出 +- 生成音频输出和麦克风输入的字幕 + +说明:Windows 平台支持生成音频输出和麦克风输入的字幕,Linux 平台仅支持生成麦克风输入的字幕。 ## 🚀 项目运行 @@ -32,11 +35,42 @@ npm install ### 构建字幕引擎 -字幕引擎原理:所谓的字幕引擎实际上是一个子程序,它会实时获取系统音频输入(录音)或输出(播放声音)的流式数据,并调用音频转文字的模型生成对应音频的字幕。生成的字幕通过 IPC 输出为转换为字符串的 JSON 数据,并返回给主程序。主程序读取字幕数据,处理后显示在窗口上。 +> #### 背景介绍 +> +> 所谓的字幕引擎实际上是一个子程序,它会实时获取系统音频输入(录音)或输出(播放声音)的流式数据,并调用音频转文字的模型生成对应音频的字幕。生成的字幕通过 IPC 输出为转换为字符串的 JSON 数据,并返回给主程序。主程序读取字幕数据,处理后显示在窗口上。 +> +>目前项目默认使用[阿里云 Gummy 模型](https://help.aliyun.com/zh/model-studio/gummy-speech-recognition-translation/),需要获取阿里云百炼平台的 API KEY 并配置到环境变量中才能正常使用该模型,相关介绍:[获取API KEY](https://help.aliyun.com/zh/model-studio/get-api-key)、[将API Key配置到环境变量](https://help.aliyun.com/zh/model-studio/configure-api-key-through-environment-variables)。 +> +> 本项目的 gummy 字幕引擎是一个 python 子程序,通过 pyinstaller 打包为可执行文件。 运行字幕引擎子程序的代码在 `src\main\utils\engine.ts` 文件中。 -目前项目默认使用 [阿里云 Gummy 模型](https://help.aliyun.com/zh/model-studio/gummy-speech-recognition-translation/),需要有阿里云百炼平台的 API KEY 才能正常使用该模型。 首先进入 `python-subprocess` 文件夹,执行如下指令创建虚拟环境: -gummy 字幕引擎是一个 python 子程序,可以选择配置好 python 环境后直接运行该程序,也可以使用 pyinstaller 构建一个可执行文件。 运行字幕引擎子程序的代码在 `src\main\utils\engine.ts` 文件中 +```bash +python -m venv subenv +``` + +然后激活虚拟环境: + +```bash +# Windows +subenv/Scripts/activate +# Linux +source subenv/bin/activate +``` + +然后安装依赖: + +```bash +pip install -r requirements.txt +``` + +然后使用 `pyinstaller` 构建项目: + +```bash +pyinstaller --onefile main-gummy.py +``` + +此时项目构建完成,在进入 `python-subprocess/dist` 文件夹可见对应的可执行文件。即可进行后续操作。 ### 运行项目 diff --git a/electron-builder.yml b/electron-builder.yml index d222608..ef543cc 100644 --- 
a/electron-builder.yml +++ b/electron-builder.yml @@ -1,4 +1,4 @@ -appId: com.electron.app +appId: com.himeditator.autocaption productName: auto-caption directories: buildResources: build @@ -9,10 +9,14 @@ files: - '!{.eslintcache,eslint.config.mjs,.prettierignore,.prettierrc.yaml,dev-app-update.yml,CHANGELOG.md,README.md}' - '!{.env,.env.*,.npmrc,pnpm-lock.yaml}' - '!{tsconfig.json,tsconfig.node.json,tsconfig.web.json}' +extraResources: + from: ./python-subprocess/dist/main-gummy.exe + to: ./python-subprocess/dist/main-gummy.exe asarUnpack: - resources/** win: executableName: auto-caption + icon: resources/icon.png nsis: artifactName: ${name}-${version}-setup.${ext} shortcutName: ${productName} diff --git a/python-subprocess/main-gummy.py b/python-subprocess/main-gummy.py new file mode 100644 index 0000000..a169a34 --- /dev/null +++ b/python-subprocess/main-gummy.py @@ -0,0 +1,48 @@ +import sys + +if sys.platform == 'win32': + from sysaudio.win import AudioStream, mergeStreamChannels +elif sys.platform == 'linux': + from sysaudio.linux import AudioStream, mergeStreamChannels +else: + raise NotImplementedError(f"Unsupported platform: {sys.platform}") + +from audio2text.gummy import GummyTranslator +import sys +import argparse + +def convert_audio_to_text(s_lang, t_lang, audio_type): + sys.stdout.reconfigure(line_buffering=True) + stream = AudioStream(audio_type) + stream.openStream() + + if t_lang == 'none': + gummy = GummyTranslator(stream.RATE, s_lang, None) + else: + gummy = GummyTranslator(stream.RATE, s_lang, t_lang) + gummy.translator.start() + + while True: + try: + if not stream.stream: continue + data = stream.stream.read(stream.CHUNK) + data = mergeStreamChannels(data, stream.CHANNELS) + gummy.translator.send_audio_frame(data) + except KeyboardInterrupt: + stream.closeStream() + gummy.translator.stop() + break + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description='Convert system audio stream to text') + parser.add_argument('-s', 
'--source_language', default='en', help='Source language code') + parser.add_argument('-t', '--target_language', default='zh', help='Target language code') + parser.add_argument('-a', '--audio_type', default='0', help='Audio stream source: 0 for output audio stream, 1 for input audio stream') + args = parser.parse_args() + convert_audio_to_text( + args.source_language, + args.target_language, + 0 if args.audio_type == '0' else 1 + ) + \ No newline at end of file diff --git a/python-subprocess/main.spec b/python-subprocess/main-gummy.spec similarity index 93% rename from python-subprocess/main.spec rename to python-subprocess/main-gummy.spec index 2ba8dd9..bb6ac23 100644 --- a/python-subprocess/main.spec +++ b/python-subprocess/main-gummy.spec @@ -2,7 +2,7 @@ a = Analysis( - ['main.py'], + ['main-gummy.py'], pathex=[], binaries=[], datas=[], @@ -22,7 +22,7 @@ exe = EXE( a.binaries, a.datas, [], - name='main', + name='main-gummy', debug=False, bootloader_ignore_signals=False, strip=False, diff --git a/python-subprocess/main.py b/python-subprocess/main.py deleted file mode 100644 index 6edf167..0000000 --- a/python-subprocess/main.py +++ /dev/null @@ -1,27 +0,0 @@ -from sysaudio.win import LoopbackStream, mergeStreamChannels -from audio2text.gummy import GummyTranslator -import sys -import argparse - -def convert_audio_to_text(s_lang, t_lang, audio_source): - sys.stdout.reconfigure(line_buffering=True) - loopback = LoopbackStream() - loopback.openStream() - - gummy = GummyTranslator(loopback.RATE, s_lang, t_lang) - gummy.translator.start() - - while True: - if not loopback.stream: continue - data = loopback.stream.read(loopback.CHUNK) - data = mergeStreamChannels(data, loopback.CHANNELS) - gummy.translator.send_audio_frame(data) - - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description='Convert system audio stream to text') - parser.add_argument('-s', '--s_lang', default='en', help='Source language code') - parser.add_argument('-t', '--t_lang', 
default='zh', help='Target language code') - parser.add_argument('-a', '--audio', default=0, help='Audio stream source: 0 for output audio stream, 1 for input audio stream') - args = parser.parse_args() - convert_audio_to_text(args.s_lang, args.t_lang, args.audio) \ No newline at end of file diff --git a/python-subprocess/requirements.txt b/python-subprocess/requirements.txt index c70e828..82df817 100644 Binary files a/python-subprocess/requirements.txt and b/python-subprocess/requirements.txt differ diff --git a/python-subprocess/sysaudio/linux.py b/python-subprocess/sysaudio/linux.py new file mode 100644 index 0000000..3473515 --- /dev/null +++ b/python-subprocess/sysaudio/linux.py @@ -0,0 +1,79 @@ +import pyaudio +import numpy as np + +def mergeStreamChannels(data, channels): + """ + 将当前多通道流数据合并为单通道流数据 + + Args: + data: 多通道数据 + channels: 通道数 + + Returns: + mono_data_bytes: 单通道数据 + """ + # (length * channels,) + data_np = np.frombuffer(data, dtype=np.int16) + # (length, channels) + data_np_r = data_np.reshape(-1, channels) + # (length,) + mono_data = np.mean(data_np_r.astype(np.float32), axis=1) + mono_data = mono_data.astype(np.int16) + mono_data_bytes = mono_data.tobytes() + return mono_data_bytes + + +class AudioStream: + def __init__(self, audio_type=1): + self.audio_type = audio_type + self.mic = pyaudio.PyAudio() + self.device = self.mic.get_default_input_device_info() + self.stream = None + self.SAMP_WIDTH = pyaudio.get_sample_size(pyaudio.paInt16) + self.FORMAT = pyaudio.paInt16 + self.CHANNELS = self.device["maxInputChannels"] + self.RATE = int(self.device["defaultSampleRate"]) + self.CHUNK = self.RATE // 20 + self.INDEX = self.device["index"] + + def printInfo(self): + dev_info = f""" + 采样输入设备: + - 设备类型:{ "音频输入(Linux平台目前仅支持该项)" } + - 序号:{self.device['index']} + - 名称:{self.device['name']} + - 最大输入通道数:{self.device['maxInputChannels']} + - 默认低输入延迟:{self.device['defaultLowInputLatency']}s + - 默认高输入延迟:{self.device['defaultHighInputLatency']}s + - 
默认采样率:{self.device['defaultSampleRate']}Hz + + 音频样本块大小:{self.CHUNK} + 样本位宽:{self.SAMP_WIDTH} + 音频数据格式:{self.FORMAT} + 音频通道数:{self.CHANNELS} + 音频采样率:{self.RATE} + """ + print(dev_info) + + def openStream(self): + """ + 打开并返回系统音频输出流 + """ + if self.stream: return self.stream + self.stream = self.mic.open( + format = self.FORMAT, + channels = self.CHANNELS, + rate = self.RATE, + input = True, + input_device_index = self.INDEX + ) + return self.stream + + def closeStream(self): + """ + 关闭系统音频输出流 + """ + if self.stream is None: return + self.stream.stop_stream() + self.stream.close() + self.stream = None \ No newline at end of file diff --git a/python-subprocess/sysaudio/win.py b/python-subprocess/sysaudio/win.py index 6363720..f7c7af6 100644 --- a/python-subprocess/sysaudio/win.py +++ b/python-subprocess/sysaudio/win.py @@ -61,28 +61,39 @@ def mergeStreamChannels(data, channels): mono_data_bytes = mono_data.tobytes() return mono_data_bytes -class LoopbackStream: - def __init__(self): +class AudioStream: + """ + 获取系统音频流 + + 参数: + audio_type: (默认)0-系统音频输出流,1-系统音频输入流 + """ + def __init__(self, audio_type=0): + self.audio_type = audio_type self.mic = pyaudio.PyAudio() - self.loopback = getDefaultLoopbackDevice(self.mic, False) + if self.audio_type == 0: + self.device = getDefaultLoopbackDevice(self.mic, False) + else: + self.device = self.mic.get_default_input_device_info() self.stream = None self.SAMP_WIDTH = pyaudio.get_sample_size(pyaudio.paInt16) self.FORMAT = pyaudio.paInt16 - self.CHANNELS = self.loopback["maxInputChannels"] - self.RATE = int(self.loopback["defaultSampleRate"]) + self.CHANNELS = self.device["maxInputChannels"] + self.RATE = int(self.device["defaultSampleRate"]) self.CHUNK = self.RATE // 20 - self.INDEX = self.loopback["index"] + self.INDEX = self.device["index"] def printInfo(self): dev_info = f""" - 采样输入设备: - - 序号:{self.loopback['index']} - - 名称:{self.loopback['name']} - - 最大输入通道数:{self.loopback['maxInputChannels']} - - 
默认低输入延迟:{self.loopback['defaultLowInputLatency']}s - - 默认高输入延迟:{self.loopback['defaultHighInputLatency']}s - - 默认采样率:{self.loopback['defaultSampleRate']}Hz - - 是否回环设备:{self.loopback['isLoopbackDevice']} + 采样设备: + - 设备类型:{ "音频输出" if self.audio_type == 0 else "音频输入" } + - 序号:{self.device['index']} + - 名称:{self.device['name']} + - 最大输入通道数:{self.device['maxInputChannels']} + - 默认低输入延迟:{self.device['defaultLowInputLatency']}s + - 默认高输入延迟:{self.device['defaultHighInputLatency']}s + - 默认采样率:{self.device['defaultSampleRate']}Hz + - 是否回环设备:{self.device['isLoopbackDevice']} 音频样本块大小:{self.CHUNK} 样本位宽:{self.SAMP_WIDTH} diff --git a/src/main/index.ts b/src/main/index.ts index fc084a9..aeb76db 100644 --- a/src/main/index.ts +++ b/src/main/index.ts @@ -2,6 +2,7 @@ import { app, BrowserWindow } from 'electron' import { electronApp, optimizer } from '@electron-toolkit/utils' import { controlWindow } from './control' import { captionWindow } from './caption' +import { captionEngine } from './utils/config' app.whenReady().then(() => { electronApp.setAppUserModelId('com.himeditator.autocaption') @@ -22,6 +23,10 @@ app.whenReady().then(() => { }) }) +app.on('will-quit', async () => { + captionEngine.stop() +}); + app.on('window-all-closed', () => { if (process.platform !== 'darwin') { app.quit() diff --git a/src/main/types/index.ts b/src/main/types/index.ts index 033a285..90e76e5 100644 --- a/src/main/types/index.ts +++ b/src/main/types/index.ts @@ -23,6 +23,7 @@ export interface Controls { sourceLang: string, targetLang: string, engine: string, + audio: 0 | 1, translation: boolean, customized: boolean, customizedApp: string, diff --git a/src/main/utils/config.ts b/src/main/utils/config.ts index 86d08e3..d1b5212 100644 --- a/src/main/utils/config.ts +++ b/src/main/utils/config.ts @@ -22,6 +22,7 @@ export const controls: Controls = { sourceLang: 'en', targetLang: 'zh', engine: 'gummy', + audio: 0, engineEnabled: false, translation: true, customized: false, @@ -74,6 +75,7 @@ export 
function setControls(args: any) { controls.sourceLang = args.sourceLang controls.targetLang = args.targetLang controls.engine = args.engine + controls.audio = args.audio controls.translation = args.translation controls.customized = args.customized controls.customizedApp = args.customizedApp diff --git a/src/main/utils/engine.ts b/src/main/utils/engine.ts index 2ac9aff..56992c7 100644 --- a/src/main/utils/engine.ts +++ b/src/main/utils/engine.ts @@ -1,5 +1,6 @@ -import { spawn } from 'child_process' +import { spawn, exec } from 'child_process' import { app } from 'electron' +import { is } from '@electron-toolkit/utils' import path from 'path' import { addCaptionLog, controls } from './config' @@ -14,24 +15,29 @@ export class CaptionEngine { this.command = [ controls.customizedCommand ] } else if(controls.engine === 'gummy'){ - this.appPath = path.join( - app.getAppPath(), - 'python-subprocess', 'subenv', 'Scripts', 'python.exe' - ) + if(is.dev){ + this.appPath = path.join( + app.getAppPath(), + 'python-subprocess', 'dist', 'main-gummy.exe' + ) + } + else{ + this.appPath = path.join( + process.resourcesPath, + 'python-subprocess', 'dist', 'main-gummy.exe' + ) + } this.command = [] - this.command.push(path.join( - app.getAppPath(), - 'python-subprocess', 'main.py' - )) this.command.push('-s', controls.sourceLang) this.command.push('-t', controls.translation ? controls.targetLang : 'none') + this.command.push('-a', controls.audio ? 
'1' : '0') - console.log(this.appPath) - console.log(this.command) + console.log('[INFO] engine', this.appPath) + console.log('[INFO] engine command',this.command) } } - public start() { + public start() { if (this.process) { this.stop(); } @@ -70,7 +76,15 @@ export class CaptionEngine { public stop() { if (this.process) { - this.process.kill(); + if (process.platform === "win32" && this.process.pid) { + exec(`taskkill /pid ${this.process.pid} /t /f`, (error) => { + if (error) { + console.error(`Failed to kill process: ${error}`); + } + }); + } else { + this.process.kill('SIGKILL'); + } this.process = undefined; controls.engineEnabled = false; console.log('[INFO] Caption engine process stopped'); diff --git a/src/renderer/src/components/CaptionControl.vue b/src/renderer/src/components/CaptionControl.vue index cc4e1db..da01cef 100644 --- a/src/renderer/src/components/CaptionControl.vue +++ b/src/renderer/src/components/CaptionControl.vue @@ -29,6 +29,14 @@ :options="captionEngine" > +
+ 音频选择 + +
启用翻译 @@ -62,13 +70,15 @@ import { ref, computed, watch } from 'vue' import { storeToRefs } from 'pinia' import { useCaptionControlStore } from '@renderer/stores/captionControl' +import { notification } from 'ant-design-vue' const captionControl = useCaptionControlStore() -const { captionEngine, changeSignal } = storeToRefs(captionControl) +const { captionEngine, audioType, changeSignal } = storeToRefs(captionControl) const currentSourceLang = ref('auto') const currentTargetLang = ref('zh') const currentEngine = ref('gummy') +const currentAudio = ref<0 | 1>(0) const currentTranslation = ref(false) const currentCustomized = ref(false) @@ -88,6 +98,7 @@ function applyChange(){ captionControl.sourceLang = currentSourceLang.value captionControl.targetLang = currentTargetLang.value captionControl.engine = currentEngine.value + captionControl.audio = currentAudio.value captionControl.translation = currentTranslation.value captionControl.customized = currentCustomized.value @@ -95,12 +106,18 @@ function applyChange(){ captionControl.customizedCommand = currentCustomizedCommand.value captionControl.sendControlChange() + + notification.open({ + message: '字幕控制已更改', + description: '如果字幕引擎已经启动,需要关闭后重启才会生效' + }); } function cancelChange(){ currentSourceLang.value = captionControl.sourceLang currentTargetLang.value = captionControl.targetLang currentEngine.value = captionControl.engine + currentAudio.value = captionControl.audio currentTranslation.value = captionControl.translation currentCustomized.value = captionControl.customized diff --git a/src/renderer/src/stores/captionControl.ts b/src/renderer/src/stores/captionControl.ts index 4db7dec..93cfb17 100644 --- a/src/renderer/src/stores/captionControl.ts +++ b/src/renderer/src/stores/captionControl.ts @@ -16,11 +16,23 @@ export const useCaptionControlStore = defineStore('captionControl', () => { ] }, ]) + const audioType = ref([ + { + value: 0, + label: '系统音频输出(扬声器)' + }, + { + value: 1, + label: '系统音频输入(麦克风)' + } + ]) + const 
engineEnabled = ref(false) const sourceLang = ref('en') const targetLang = ref('zh') const engine = ref('gummy') + const audio = ref<0 | 1>(0) const translation = ref(true) const customized = ref(false) const customizedApp = ref('') @@ -34,6 +46,7 @@ export const useCaptionControlStore = defineStore('captionControl', () => { sourceLang: sourceLang.value, targetLang: targetLang.value, engine: engine.value, + audio: audio.value, translation: translation.value, customized: customized.value, customizedApp: customizedApp.value, @@ -54,6 +67,7 @@ export const useCaptionControlStore = defineStore('captionControl', () => { sourceLang.value = controls.sourceLang targetLang.value = controls.targetLang engine.value = controls.engine + audio.value = controls.audio translation.value = controls.translation customized.value = controls.customized customizedApp.value = controls.customizedApp @@ -73,7 +87,8 @@ export const useCaptionControlStore = defineStore('captionControl', () => { engineEnabled.value = true notification.open({ message: '字幕引擎启动', - description: `原语言:${sourceLang.value},是否翻译:${translation.value?'是':'否'}` + + description: `原语言:${sourceLang.value},是否翻译:${translation.value?'是':'否'},` + + `字幕引擎:${engine.value},音频类型:${audio.value ? '输入音频' : '输出音频'}` + (translation.value ? `,翻译语言:${targetLang.value}` : '') }); }) @@ -88,10 +103,12 @@ export const useCaptionControlStore = defineStore('captionControl', () => { return { captionEngine, // 字幕引擎 + audioType, // 音频类型 engineEnabled, // 字幕引擎是否启用 sourceLang, // 源语言 targetLang, // 目标语言 engine, // 字幕引擎 + audio, // 选择音频 translation, // 是否启用翻译 customized, // 是否使用自定义字幕引擎 customizedApp, // 自定义字幕引擎的应用程序