feat: add an audio input option and streamline the caption engine build and run flow.

- Added support for selecting the system audio input (microphone) as a capture source
- Reworked the caption engine build flow: the engine is now packaged into an executable with PyInstaller
- Improved the caption engine start and stop logic
- Updated the UI with a control for selecting the audio source
- Adjusted the related file paths and build configuration
himeditator
2025-06-21 23:22:19 +08:00
parent 7030aaaae3
commit 42237a229c
15 changed files with 268 additions and 63 deletions

.gitignore

@@ -6,4 +6,4 @@ out
*.log*
__pycache__
subenv
build
python-subprocess/build


@@ -21,6 +21,9 @@
- Flexible caption engine selection
- Multi-language recognition and translation
- Caption history display and export
- Captions for both the system audio output and the microphone input
Note: on Windows, captions can be generated from both the system audio output and the microphone input; on Linux, only the microphone input is supported.
## 🚀 Running the Project
@@ -32,11 +35,42 @@ npm install
### Building the Caption Engine
How the caption engine works: the caption engine is a subprocess that captures the system's audio input (recording) or output (playback) as streaming data in real time and feeds it to a speech-to-text model that generates captions for the audio. The captions are serialized to JSON strings, returned to the main process over IPC, and the main process reads, processes, and renders them in the window.
> #### Background
>
> The caption engine is a subprocess that captures the system's audio input (recording) or output (playback) as streaming data in real time and feeds it to a speech-to-text model that generates captions for the audio. The captions are serialized to JSON strings, returned to the main process over IPC, and the main process reads, processes, and renders them in the window.
>
> The project currently uses the [Alibaba Cloud Gummy model](https://help.aliyun.com/zh/model-studio/gummy-speech-recognition-translation/) by default. To use it, you need an API key for the Alibaba Cloud Model Studio (Bailian) platform, configured as an environment variable. See [Obtain an API key](https://help.aliyun.com/zh/model-studio/get-api-key) and [Configure the API key via an environment variable](https://help.aliyun.com/zh/model-studio/configure-api-key-through-environment-variables).
>
> The project's Gummy caption engine is a Python subprocess packaged into an executable with PyInstaller. The code that launches the engine subprocess lives in `src\main\utils\engine.ts`.
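The exact JSON schema of a caption event is not shown in this commit. As a rough sketch of the IPC contract described above (the field names here are assumptions, not the project's actual schema), the engine side amounts to printing one JSON object per line to stdout, which the main process reads line by line:

```python
import json

def emit_caption(text, translation=None):
    """Write one caption event as a single JSON line on stdout.

    Line-buffered, one-object-per-line output is the simplest framing
    for the host to parse. The field names below are illustrative only.
    """
    print(json.dumps({"text": text, "translation": translation},
                     ensure_ascii=False), flush=True)

emit_caption("hello world", "你好,世界")
```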
The project currently uses the [Alibaba Cloud Gummy model](https://help.aliyun.com/zh/model-studio/gummy-speech-recognition-translation/) by default; an Alibaba Cloud Model Studio (Bailian) platform API key is required to use it.
First, enter the `python-subprocess` folder and run the following command to create a virtual environment:
The Gummy caption engine is a Python subprocess: you can either set up a Python environment and run it directly, or build it into an executable with PyInstaller. The code that launches the engine subprocess lives in `src\main\utils\engine.ts`.
```bash
python -m venv subenv
```
Then activate the virtual environment:
```bash
# Windows
subenv/Scripts/activate
# Linux
source subenv/bin/activate
```
Then install the dependencies:
```bash
pip install -r requirements.txt
```
Then build the project with `pyinstaller`:
```bash
pyinstaller --onefile main-gummy.py
```
The build is now complete: you will find the executable in the `python-subprocess/dist` folder and can proceed with the steps below.
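For a quick smoke test outside Electron, the built engine can also be driven directly. A minimal sketch, assuming the default build path above (drop the `.exe` suffix on Linux) and treating each stdout line as one caption event:

```python
import subprocess
import sys

# -s/-t/-a match the argparse flags defined in main-gummy.py:
# source language, target language ('none' disables translation),
# audio source (0 = system output, 1 = microphone input).
engine = "python-subprocess/dist/main-gummy.exe" if sys.platform == "win32" \
    else "python-subprocess/dist/main-gummy"
proc = subprocess.Popen(
    [engine, "-s", "en", "-t", "zh", "-a", "0"],
    stdout=subprocess.PIPE,
    text=True,
)
for line in proc.stdout:
    print("caption event:", line.strip())
```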
### Run the Project


@@ -1,4 +1,4 @@
appId: com.electron.app
appId: com.himeditator.autocaption
productName: auto-caption
directories:
  buildResources: build
@@ -9,10 +9,14 @@ files:
  - '!{.eslintcache,eslint.config.mjs,.prettierignore,.prettierrc.yaml,dev-app-update.yml,CHANGELOG.md,README.md}'
  - '!{.env,.env.*,.npmrc,pnpm-lock.yaml}'
  - '!{tsconfig.json,tsconfig.node.json,tsconfig.web.json}'
extraResources:
  - from: ./python-subprocess/dist/main-gummy.exe
    to: ./python-subprocess/dist/main-gummy.exe
asarUnpack:
  - resources/**
win:
  executableName: auto-caption
  icon: resources/icon.png
nsis:
  artifactName: ${name}-${version}-setup.${ext}
  shortcutName: ${productName}


@@ -0,0 +1,48 @@
import sys
import argparse

if sys.platform == 'win32':
    from sysaudio.win import AudioStream, mergeStreamChannels
elif sys.platform == 'linux':
    from sysaudio.linux import AudioStream, mergeStreamChannels
else:
    raise NotImplementedError(f"Unsupported platform: {sys.platform}")

from audio2text.gummy import GummyTranslator

def convert_audio_to_text(s_lang, t_lang, audio_type):
    sys.stdout.reconfigure(line_buffering=True)
    stream = AudioStream(audio_type)
    stream.openStream()
    if t_lang == 'none':
        gummy = GummyTranslator(stream.RATE, s_lang, None)
    else:
        gummy = GummyTranslator(stream.RATE, s_lang, t_lang)
    gummy.translator.start()
    while True:
        try:
            if not stream.stream: continue
            data = stream.stream.read(stream.CHUNK)
            data = mergeStreamChannels(data, stream.CHANNELS)
            gummy.translator.send_audio_frame(data)
        except KeyboardInterrupt:
            stream.closeStream()
            gummy.translator.stop()
            break

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Convert system audio stream to text')
    parser.add_argument('-s', '--source_language', default='en', help='Source language code')
    parser.add_argument('-t', '--target_language', default='zh', help='Target language code')
    parser.add_argument('-a', '--audio_type', default='0', help='Audio stream source: 0 for output audio stream, 1 for input audio stream')
    args = parser.parse_args()
    convert_audio_to_text(
        args.source_language,
        args.target_language,
        0 if args.audio_type == '0' else 1
    )
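One caveat with the loop above: `except KeyboardInterrupt` only fires on SIGINT, while the host's `stop()` (see `engine.ts` below) force-kills the process, so the cleanup branch mainly helps when the engine is run interactively. A sketch of how graceful shutdown could also be wired up if the host sent SIGTERM instead (an assumption, not the current behavior):

```python
import signal
import sys

def install_sigterm_handler(stream, gummy):
    """Run the same cleanup as the KeyboardInterrupt branch on SIGTERM."""
    def handle(signum, frame):
        stream.closeStream()
        gummy.translator.stop()
        sys.exit(0)
    signal.signal(signal.SIGTERM, handle)
```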


@@ -2,7 +2,7 @@
a = Analysis(
    ['main.py'],
    ['main-gummy.py'],
    pathex=[],
    binaries=[],
    datas=[],
@@ -22,7 +22,7 @@ exe = EXE(
    a.binaries,
    a.datas,
    [],
    name='main',
    name='main-gummy',
    debug=False,
    bootloader_ignore_signals=False,
    strip=False,


@@ -1,27 +0,0 @@
from sysaudio.win import LoopbackStream, mergeStreamChannels
from audio2text.gummy import GummyTranslator
import sys
import argparse

def convert_audio_to_text(s_lang, t_lang, audio_source):
    sys.stdout.reconfigure(line_buffering=True)
    loopback = LoopbackStream()
    loopback.openStream()
    gummy = GummyTranslator(loopback.RATE, s_lang, t_lang)
    gummy.translator.start()
    while True:
        if not loopback.stream: continue
        data = loopback.stream.read(loopback.CHUNK)
        data = mergeStreamChannels(data, loopback.CHANNELS)
        gummy.translator.send_audio_frame(data)

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Convert system audio stream to text')
    parser.add_argument('-s', '--s_lang', default='en', help='Source language code')
    parser.add_argument('-t', '--t_lang', default='zh', help='Target language code')
    parser.add_argument('-a', '--audio', default=0, help='Audio stream source: 0 for output audio stream, 1 for input audio stream')
    args = parser.parse_args()
    convert_audio_to_text(args.s_lang, args.t_lang, args.audio)

Binary file not shown.


@@ -0,0 +1,79 @@
import pyaudio
import numpy as np

def mergeStreamChannels(data, channels):
    """
    Merge interleaved multi-channel stream data into mono.
    Args:
        data: multi-channel audio data (int16 bytes)
        channels: number of channels
    Returns:
        mono_data_bytes: mono audio data (int16 bytes)
    """
    # (length * channels,)
    data_np = np.frombuffer(data, dtype=np.int16)
    # (length, channels)
    data_np_r = data_np.reshape(-1, channels)
    # (length,)
    mono_data = np.mean(data_np_r.astype(np.float32), axis=1)
    mono_data = mono_data.astype(np.int16)
    mono_data_bytes = mono_data.tobytes()
    return mono_data_bytes
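To make the reshape-and-average steps concrete, here is a tiny worked example: two interleaved stereo int16 frames reduce to two mono samples, each the mean of one L/R pair (the averaging is done in float32 to avoid int16 overflow):

```python
import numpy as np

# Interleaved stereo frames: (L0, R0), (L1, R1)
stereo = np.array([100, 200, -100, 300], dtype=np.int16).tobytes()
mono = mergeStreamChannels(stereo, channels=2)
print(np.frombuffer(mono, dtype=np.int16))  # -> [150 100]
```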
class AudioStream:
    def __init__(self, audio_type=1):
        self.audio_type = audio_type
        self.mic = pyaudio.PyAudio()
        self.device = self.mic.get_default_input_device_info()
        self.stream = None
        self.SAMP_WIDTH = pyaudio.get_sample_size(pyaudio.paInt16)
        self.FORMAT = pyaudio.paInt16
        self.CHANNELS = self.device["maxInputChannels"]
        self.RATE = int(self.device["defaultSampleRate"])
        self.CHUNK = self.RATE // 20
        self.INDEX = self.device["index"]

    def printInfo(self):
        dev_info = f"""
采样输入设备:
- 设备类型:{"音频输入(Linux 平台目前仅支持该项)"}
- 序号:{self.device['index']}
- 名称:{self.device['name']}
- 最大输入通道数:{self.device['maxInputChannels']}
- 默认低输入延迟:{self.device['defaultLowInputLatency']}s
- 默认高输入延迟:{self.device['defaultHighInputLatency']}s
- 默认采样率:{self.device['defaultSampleRate']}Hz
音频样本块大小:{self.CHUNK}
样本位宽:{self.SAMP_WIDTH}
音频数据格式:{self.FORMAT}
音频通道数:{self.CHANNELS}
音频采样率:{self.RATE}
"""
        print(dev_info)

    def openStream(self):
        """
        Open and return the system audio input stream.
        """
        if self.stream: return self.stream
        self.stream = self.mic.open(
            format = self.FORMAT,
            channels = self.CHANNELS,
            rate = self.RATE,
            input = True,
            input_device_index = self.INDEX
        )
        return self.stream

    def closeStream(self):
        """
        Close the system audio input stream.
        """
        if self.stream is None: return
        self.stream.stop_stream()
        self.stream.close()
        self.stream = None
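Putting the class together, a minimal capture loop (mirroring what `main-gummy.py` does, minus the Gummy translator):

```python
# audio_type=1 selects the default microphone, the only source on Linux.
stream = AudioStream(audio_type=1)
stream.printInfo()
stream.openStream()
try:
    for _ in range(100):  # ~5 s of audio at CHUNK = RATE // 20
        data = stream.stream.read(stream.CHUNK)
        mono = mergeStreamChannels(data, stream.CHANNELS)
        # ...feed `mono` to a recognizer here...
finally:
    stream.closeStream()
```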


@@ -61,28 +61,39 @@ def mergeStreamChannels(data, channels):
    mono_data_bytes = mono_data.tobytes()
    return mono_data_bytes

class LoopbackStream:
    def __init__(self):
class AudioStream:
    """
    Capture a system audio stream.
    Args:
        audio_type: 0 (default) - system audio output stream; 1 - system audio input stream
    """
    def __init__(self, audio_type=0):
        self.audio_type = audio_type
        self.mic = pyaudio.PyAudio()
        self.loopback = getDefaultLoopbackDevice(self.mic, False)
        if self.audio_type == 0:
            self.device = getDefaultLoopbackDevice(self.mic, False)
        else:
            self.device = self.mic.get_default_input_device_info()
        self.stream = None
        self.SAMP_WIDTH = pyaudio.get_sample_size(pyaudio.paInt16)
        self.FORMAT = pyaudio.paInt16
        self.CHANNELS = self.loopback["maxInputChannels"]
        self.RATE = int(self.loopback["defaultSampleRate"])
        self.CHANNELS = self.device["maxInputChannels"]
        self.RATE = int(self.device["defaultSampleRate"])
        self.CHUNK = self.RATE // 20
        self.INDEX = self.loopback["index"]
        self.INDEX = self.device["index"]

    def printInfo(self):
        dev_info = f"""
采样输入设备:
- 序号:{self.loopback['index']}
- 名称:{self.loopback['name']}
- 最大输入通道数:{self.loopback['maxInputChannels']}
- 默认低输入延迟:{self.loopback['defaultLowInputLatency']}s
- 默认高输入延迟:{self.loopback['defaultHighInputLatency']}s
- 默认采样率:{self.loopback['defaultSampleRate']}Hz
- 是否回环设备:{self.loopback['isLoopbackDevice']}
采样设备:
- 设备类型:{"音频输出" if self.audio_type == 0 else "音频输入"}
- 序号:{self.device['index']}
- 名称:{self.device['name']}
- 最大输入通道数:{self.device['maxInputChannels']}
- 默认低输入延迟:{self.device['defaultLowInputLatency']}s
- 默认高输入延迟:{self.device['defaultHighInputLatency']}s
- 默认采样率:{self.device['defaultSampleRate']}Hz
- 是否回环设备:{self.device['isLoopbackDevice']}
音频样本块大小:{self.CHUNK}
样本位宽:{self.SAMP_WIDTH}


@@ -2,6 +2,7 @@ import { app, BrowserWindow } from 'electron'
import { electronApp, optimizer } from '@electron-toolkit/utils'
import { controlWindow } from './control'
import { captionWindow } from './caption'
import { captionEngine } from './utils/config'
app.whenReady().then(() => {
  electronApp.setAppUserModelId('com.himeditator.autocaption')
@@ -22,6 +23,10 @@ app.whenReady().then(() => {
  })
})

app.on('will-quit', async () => {
  captionEngine.stop()
});

app.on('window-all-closed', () => {
  if (process.platform !== 'darwin') {
    app.quit()


@@ -23,6 +23,7 @@ export interface Controls {
  sourceLang: string,
  targetLang: string,
  engine: string,
  audio: 0 | 1,
  translation: boolean,
  customized: boolean,
  customizedApp: string,


@@ -22,6 +22,7 @@ export const controls: Controls = {
  sourceLang: 'en',
  targetLang: 'zh',
  engine: 'gummy',
  audio: 0,
  engineEnabled: false,
  translation: true,
  customized: false,
@@ -74,6 +75,7 @@ export function setControls(args: any) {
  controls.sourceLang = args.sourceLang
  controls.targetLang = args.targetLang
  controls.engine = args.engine
  controls.audio = args.audio
  controls.translation = args.translation
  controls.customized = args.customized
  controls.customizedApp = args.customizedApp


@@ -1,5 +1,6 @@
import { spawn } from 'child_process'
import { spawn, exec } from 'child_process'
import { app } from 'electron'
import { is } from '@electron-toolkit/utils'
import path from 'path'
import { addCaptionLog, controls } from './config'
@@ -14,24 +15,29 @@ export class CaptionEngine {
      this.command = [ controls.customizedCommand ]
    }
    else if(controls.engine === 'gummy'){
      this.appPath = path.join(
        app.getAppPath(),
        'python-subprocess', 'subenv', 'Scripts', 'python.exe'
      )
      if(is.dev){
        this.appPath = path.join(
          app.getAppPath(),
          'python-subprocess', 'dist', 'main-gummy.exe'
        )
      }
      else{
        this.appPath = path.join(
          process.resourcesPath,
          'python-subprocess', 'dist', 'main-gummy.exe'
        )
      }
      this.command = []
      this.command.push(path.join(
        app.getAppPath(),
        'python-subprocess', 'main.py'
      ))
      this.command.push('-s', controls.sourceLang)
      this.command.push('-t', controls.translation ? controls.targetLang : 'none')
      this.command.push('-a', controls.audio ? '1' : '0')
      console.log(this.appPath)
      console.log(this.command)
      console.log('[INFO] engine', this.appPath)
      console.log('[INFO] engine command', this.command)
    }
  }

  public start() {
    if (this.process) {
      this.stop();
    }
@@ -70,7 +76,15 @@
  public stop() {
    if (this.process) {
      this.process.kill();
      if (process.platform === "win32" && this.process.pid) {
        exec(`taskkill /pid ${this.process.pid} /t /f`, (error) => {
          if (error) {
            console.error(`Failed to kill process: ${error}`);
          }
        });
      } else {
        this.process.kill('SIGKILL');
      }
      this.process = undefined;
      controls.engineEnabled = false;
      console.log('[INFO] Caption engine process stopped');
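The Windows branch above likely exists because a PyInstaller `--onefile` binary re-launches itself from a temporary directory, so killing only the parent PID can leave the actual worker process alive; `taskkill /t /f` takes down the whole process tree, while a plain SIGKILL suffices on Linux. The same logic, sketched in Python for illustration:

```python
import os
import signal
import subprocess
import sys

def kill_engine(pid):
    """Terminate the caption engine and any children it spawned."""
    if sys.platform == "win32":
        # /t kills the child process tree, /f forces termination.
        subprocess.run(["taskkill", "/pid", str(pid), "/t", "/f"], check=False)
    else:
        os.kill(pid, signal.SIGKILL)
```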


@@ -29,6 +29,14 @@
:options="captionEngine"
></a-select>
</div>
<div class="control-item">
<span class="control-label">音频选择</span>
<a-select
class="control-input"
v-model:value="currentAudio"
:options="audioType"
></a-select>
</div>
<div class="control-item">
<span class="control-label">启用翻译</span>
<a-switch v-model:checked="currentTranslation" />
@@ -62,13 +70,15 @@
import { ref, computed, watch } from 'vue'
import { storeToRefs } from 'pinia'
import { useCaptionControlStore } from '@renderer/stores/captionControl'
import { notification } from 'ant-design-vue'
const captionControl = useCaptionControlStore()
const { captionEngine, changeSignal } = storeToRefs(captionControl)
const { captionEngine, audioType, changeSignal } = storeToRefs(captionControl)
const currentSourceLang = ref('auto')
const currentTargetLang = ref('zh')
const currentEngine = ref('gummy')
const currentAudio = ref<0 | 1>(0)
const currentTranslation = ref<boolean>(false)
const currentCustomized = ref<boolean>(false)
@@ -88,6 +98,7 @@ function applyChange(){
  captionControl.sourceLang = currentSourceLang.value
  captionControl.targetLang = currentTargetLang.value
  captionControl.engine = currentEngine.value
  captionControl.audio = currentAudio.value
  captionControl.translation = currentTranslation.value
  captionControl.customized = currentCustomized.value
@@ -95,12 +106,18 @@ function applyChange(){
  captionControl.customizedCommand = currentCustomizedCommand.value
  captionControl.sendControlChange()
  notification.open({
    message: '字幕控制已更改',
    description: '如果字幕引擎已经启动,需要关闭后重启才会生效'
  });
}

function cancelChange(){
  currentSourceLang.value = captionControl.sourceLang
  currentTargetLang.value = captionControl.targetLang
  currentEngine.value = captionControl.engine
  currentAudio.value = captionControl.audio
  currentTranslation.value = captionControl.translation
  currentCustomized.value = captionControl.customized


@@ -16,11 +16,23 @@ export const useCaptionControlStore = defineStore('captionControl', () => {
      ]
    },
  ])

  const audioType = ref([
    {
      value: 0,
      label: '系统音频输出(扬声器)'
    },
    {
      value: 1,
      label: '系统音频输入(麦克风)'
    }
  ])

  const engineEnabled = ref(false)
  const sourceLang = ref<string>('en')
  const targetLang = ref<string>('zh')
  const engine = ref<string>('gummy')
  const audio = ref<0 | 1>(0)
  const translation = ref<boolean>(true)
  const customized = ref<boolean>(false)
  const customizedApp = ref<string>('')
@@ -34,6 +46,7 @@ export const useCaptionControlStore = defineStore('captionControl', () => {
      sourceLang: sourceLang.value,
      targetLang: targetLang.value,
      engine: engine.value,
      audio: audio.value,
      translation: translation.value,
      customized: customized.value,
      customizedApp: customizedApp.value,
@@ -54,6 +67,7 @@ export const useCaptionControlStore = defineStore('captionControl', () => {
    sourceLang.value = controls.sourceLang
    targetLang.value = controls.targetLang
    engine.value = controls.engine
    audio.value = controls.audio
    translation.value = controls.translation
    customized.value = controls.customized
    customizedApp.value = controls.customizedApp
@@ -73,7 +87,8 @@ export const useCaptionControlStore = defineStore('captionControl', () => {
    engineEnabled.value = true
    notification.open({
      message: '字幕引擎启动',
      description: `原语言:${sourceLang.value},是否翻译:${translation.value?'是':'否'}` +
        `字幕引擎:${engine.value},音频类型:${audio.value ? '输入音频' : '输出音频'}` +
        (translation.value ? `,翻译语言:${targetLang.value}` : '')
    });
  })
@@ -88,10 +103,12 @@ export const useCaptionControlStore = defineStore('captionControl', () => {
  return {
    captionEngine, // caption engine options
    audioType, // audio source types
    engineEnabled, // whether the caption engine is running
    sourceLang, // source language
    targetLang, // target language
    engine, // selected caption engine
    audio, // selected audio source
    translation, // whether translation is enabled
    customized, // whether a custom caption engine is used
    customizedApp, // custom caption engine executable