feat(sysaudio): 支持 macOS 系统音频流采集

- 新增 darwin.py 文件实现 macOS 音频流采集功能
- 修改 main-gummy.py 以支持 macOS 平台
- 更新 AllConfig 和 CaptionEngine 以适配新平台
2026-02-04 04:14:42 +08:00 · 2025-07-08 17:04:15 +08:00
parent 65da30f83d
commit 7e953db6bd
14 changed files with 141 additions and 36 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -8,3 +8,4 @@ __pycache__
 subenv
 caption-engine/build
 output.wav
+.venv
--- a/caption-engine/main-gummy.py
+++ b/caption-engine/main-gummy.py
@@ -3,6 +3,8 @@ import argparse

 if sys.platform == 'win32':
    from sysaudio.win import AudioStream
+elif sys.platform == 'darwin':
+    from sysaudio.darwin import AudioStream
 elif sys.platform == 'linux':
    from sysaudio.linux import AudioStream
 else:
@@ -12,9 +14,9 @@ from audioprcs import mergeChunkChannels
 from audio2text import InvalidParameter, GummyTranslator


-def convert_audio_to_text(s_lang, t_lang, audio_type):
+def convert_audio_to_text(s_lang, t_lang, audio_type, chunk_rate):
    sys.stdout.reconfigure(line_buffering=True) # type: ignore
-    stream = AudioStream(audio_type)
+    stream = AudioStream(audio_type, chunk_rate)

    if t_lang == 'none':
        gummy = GummyTranslator(stream.RATE, s_lang, None)
@@ -43,10 +45,12 @@ if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Convert system audio stream to text')
    parser.add_argument('-s', '--source_language', default='en', help='Source language code')
    parser.add_argument('-t', '--target_language', default='zh', help='Target language code')
-    parser.add_argument('-a', '--audio_type', default='0', help='Audio stream source: 0 for output audio stream, 1 for input audio stream')
+    parser.add_argument('-a', '--audio_type', default=0, help='Audio stream source: 0 for output audio stream, 1 for input audio stream')
+    parser.add_argument('-c', '--chunk_rate', default=20, help='The number of audio stream chunks collected per second.')
    args = parser.parse_args()
    convert_audio_to_text(
        args.source_language,
        args.target_language,
-        int(args.audio_type)
+        int(args.audio_type),
+        int(args.chunk_rate)
    )
--- a/caption-engine/requirements.txt
+++ b/caption-engine/requirements.txt
@@ -1,6 +1,6 @@
-dashscope==1.23.5
-numpy==2.2.6
-samplerate==0.2.1
-PyAudio==0.2.14
-PyAudioWPatch==0.2.12.7 # Windows only
-pyinstaller==6.14.1
+dashscope
+numpy
+samplerate
+PyAudio
+PyAudioWPatch # Windows only
+pyinstaller
--- a/caption-engine/sysaudio/darwin.py
+++ b/caption-engine/sysaudio/darwin.py
@@ -0,0 +1,85 @@
+"""获取 MacOS 系统音频输入/输出流"""
+
+import pyaudio
+
+
+class AudioStream:
+    """
+    Capture a system audio stream through PyAudio (BlackHole is used to capture system audio output).
+
+    Constructor parameters:
+        audio_type: 0 - system audio output (requires a BlackHole device); any other value - default input device
+        chunk_rate: number of audio chunks collected per second, 20 by default
+    """
+    def __init__(self, audio_type=0, chunk_rate=20):
+        self.audio_type = audio_type
+        self.mic = pyaudio.PyAudio()
+        if self.audio_type == 0:
+            self.device = self.getOutputDeviceInfo()
+        else:
+            self.device = self.mic.get_default_input_device_info()
+        self.stream = None  # opened lazily by openStream()
+        self.SAMP_WIDTH = pyaudio.get_sample_size(pyaudio.paInt16)
+        self.FORMAT = pyaudio.paInt16
+        self.CHANNELS = self.device["maxInputChannels"]  # record with every input channel the device reports
+        self.RATE = int(self.device["defaultSampleRate"])
+        self.CHUNK = self.RATE // chunk_rate  # size passed to stream.read() per read_chunk() call; ZeroDivisionError if chunk_rate is 0
+        self.INDEX = self.device["index"]
+
+    def getOutputDeviceInfo(self):
+        """Return the info dict of the first device whose name contains 'blackhole'; raise Exception if none exists."""
+        device_count = self.mic.get_device_count()
+        for i in range(device_count):
+            dev_info = self.mic.get_device_info_by_index(i)
+            if 'blackhole' in dev_info["name"].lower():  # case-insensitive substring match on the device name
+                return dev_info
+        raise Exception("The device containing BlackHole was not found.")
+
+    def printInfo(self):  # print the selected device's properties and the stream parameters to stdout
+        dev_info = f"""
+        采样输入设备：
+            - 设备类型：{ "音频输出" if self.audio_type == 0 else "音频输入" }
+            - 序号：{self.device['index']}
+            - 名称：{self.device['name']}
+            - 最大输入通道数：{self.device['maxInputChannels']}
+            - 默认低输入延迟：{self.device['defaultLowInputLatency']}s
+            - 默认高输入延迟：{self.device['defaultHighInputLatency']}s
+            - 默认采样率：{self.device['defaultSampleRate']}Hz
+
+        音频样本块大小：{self.CHUNK}
+        样本位宽：{self.SAMP_WIDTH}
+        采样格式：{self.FORMAT}
+        音频通道数：{self.CHANNELS}
+        音频采样率：{self.RATE}
+        """
+        print(dev_info)
+
+    def openStream(self):
+        """
+        Open the input stream on the selected device and return it; an already-open stream is returned as is.
+        """
+        if self.stream: return self.stream  # already open: reuse the existing stream
+        self.stream = self.mic.open(
+            format = self.FORMAT,
+            channels = int(self.CHANNELS),
+            rate = self.RATE,
+            input = True,
+            input_device_index = int(self.INDEX)
+        )
+        return self.stream
+
+    def read_chunk(self):
+        """
+        Read one chunk (self.CHUNK) from the open stream with overflow errors suppressed; return None if no stream is open.
+        """
+        if not self.stream: return None
+        return self.stream.read(self.CHUNK, exception_on_overflow=False)
+
+    def closeStream(self):
+        """
+        Stop and close the stream if one is open, then reset self.stream to None.
+        """
+        if self.stream is None: return
+        self.stream.stop_stream()
+        self.stream.close()
+        self.stream = None
--- a/caption-engine/sysaudio/linux.py
+++ b/caption-engine/sysaudio/linux.py
@@ -1,3 +1,5 @@
+"""获取 Linux 系统音频输入流"""
+
 import pyaudio


--- a/caption-engine/sysaudio/win.py
+++ b/caption-engine/sysaudio/win.py
@@ -1,4 +1,4 @@
-"""获取 Windows 系统音频输出流"""
+"""获取 Windows 系统音频输入/输出流"""

 import pyaudiowpatch as pyaudio

@@ -101,7 +101,7 @@ class AudioStream:
        读取音频数据
        """
        if not self.stream: return None
-        return self.stream.read(self.CHUNK)
+        return self.stream.read(self.CHUNK, exception_on_overflow=False)

    def closeStream(self):
        """
--- a/engine-test/resample.ipynb
+++ b/engine-test/resample.ipynb
@@ -2,7 +2,7 @@
 "cells": [
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 7,
   "id": "1e12f3ef",
   "metadata": {},
   "outputs": [
@@ -11,15 +11,14 @@
     "output_type": "stream",
     "text": [
      "\n",
-      "        采样设备：\n",
+      "        采样输入设备：\n",
      "            - 设备类型：音频输出\n",
-      "            - 序号：26\n",
-      "            - 名称：耳机 (HUAWEI FreeLace 活力版) [Loopback]\n",
+      "            - 序号：0\n",
+      "            - 名称：BlackHole 2ch\n",
      "            - 最大输入通道数：2\n",
-      "            - 默认低输入延迟：0.003s\n",
-      "            - 默认高输入延迟：0.01s\n",
+      "            - 默认低输入延迟：0.01s\n",
+      "            - 默认高输入延迟：0.1s\n",
      "            - 默认采样率：48000.0Hz\n",
-      "            - 是否回环设备：True\n",
      "\n",
      "        音频样本块大小：2400\n",
      "        样本位宽：2\n",
@@ -38,7 +37,7 @@
    "current_dir = os.getcwd() \n",
    "sys.path.append(os.path.join(current_dir, '../caption-engine'))\n",
    "\n",
-    "from sysaudio.win import AudioStream\n",
+    "from sysaudio.darwin import AudioStream\n",
    "from audioprcs import resampleRawChunk, mergeChunkChannels\n",
    "\n",
    "stream = AudioStream(0)\n",
@@ -47,7 +46,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 2,
+   "execution_count": 5,
   "id": "a72914f4",
   "metadata": {},
   "outputs": [
@@ -84,7 +83,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 8,
   "id": "a6e8a098",
   "metadata": {},
   "outputs": [
@@ -168,7 +167,7 @@
 ],
 "metadata": {
  "kernelspec": {
-   "display_name": "mystd",
+   "display_name": ".venv",
   "language": "python",
   "name": "python3"
  },
@@ -182,7 +181,7 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.10.12"
+   "version": "3.9.6"
  }
 },
 "nbformat": 4,
--- a/src/main/types/index.ts
+++ b/src/main/types/index.ts
@@ -37,6 +37,7 @@ export interface CaptionItem {
 }

 export interface FullConfig {
+  platform: string,
  uiLanguage: UILanguage,
  uiTheme: UITheme,
  leftBarWidth: number,
--- a/src/main/utils/AllConfig.ts
+++ b/src/main/utils/AllConfig.ts
@@ -51,6 +51,7 @@ class AllConfig {
      if(config.uiTheme) this.uiTheme = config.uiTheme
      if(config.leftBarWidth) this.leftBarWidth = config.leftBarWidth
      if(config.styles) this.setStyles(config.styles)
+      if(process.platform !== 'win32' && process.platform !== 'darwin') config.controls.audio = 1
      if(config.controls) this.setControls(config.controls)
      console.log('[INFO] Read Config from:', configPath)
    }
@@ -71,6 +72,7 @@ class AllConfig {

  public getFullConfig(): FullConfig {
    return {
+      platform: process.platform,
      uiLanguage: this.uiLanguage,
      uiTheme: this.uiTheme,
      leftBarWidth: this.leftBarWidth,
--- a/src/main/utils/CaptionEngine.ts
+++ b/src/main/utils/CaptionEngine.ts
@@ -27,7 +27,7 @@ export class CaptionEngine {
      if (process.platform === 'win32') {
        gummyName = 'main-gummy.exe'
      }
-      else if (process.platform === 'linux') {
+      else if (process.platform === 'darwin' || process.platform === 'linux') {
        gummyName = 'main-gummy'
      }
      else {
@@ -124,16 +124,16 @@ export class CaptionEngine {
    if(this.processStatus !== 'running') return
    if (this.process) {
      console.log('[INFO] Trying to stop process, PID:', this.process.pid)
-      if (process.platform === "win32" && this.process.pid) {
-        exec(`taskkill /pid ${this.process.pid} /t /f`, (error) => {
+      let cmd = `kill ${this.process.pid}`;
+      if (process.platform === "win32") {
+        cmd = `taskkill /pid ${this.process.pid} /t /f`
+      }
+      exec(cmd, (error) => {
        if (error) {
          controlWindow.sendErrorMessage(i18n('engine.shutdown.error') + error)
          console.error(`[ERROR] Failed to kill process: ${error}`)
        }
-        });
-      } else {
-        this.process.kill('SIGKILL');
-      }
+      })
    }
    this.processStatus = 'stopping'
    console.log('[INFO] Caption engine process stopping')
--- a/src/renderer/src/App.vue
+++ b/src/renderer/src/App.vue
@@ -16,6 +16,7 @@ onMounted(() => {
    useGeneralSettingStore().uiTheme = data.uiTheme
    useGeneralSettingStore().leftBarWidth = data.leftBarWidth
    useCaptionStyleStore().setStyles(data.styles)
+    useEngineControlStore().platform = data.platform
    useEngineControlStore().setControls(data.controls)
    useCaptionLogStore().captionData = data.captionLog
  })
--- a/src/renderer/src/components/EngineControl.vue
+++ b/src/renderer/src/components/EngineControl.vue
@@ -32,6 +32,7 @@
    <div class="input-item">
      <span class="input-label">{{ $t('engine.audioType') }}</span>
      <a-select
+        :disabled="platform !== 'win32' && platform !== 'darwin'"
        class="input-area"
        v-model:value="currentAudio"
        :options="audioType"
@@ -87,7 +88,7 @@ import { useI18n } from 'vue-i18n'
 const { t } = useI18n()

 const engineControl = useEngineControlStore()
-const { captionEngine, audioType, changeSignal } = storeToRefs(engineControl)
+const { platform, captionEngine, audioType, changeSignal } = storeToRefs(engineControl)

 const currentSourceLang = ref('auto')
 const currentTargetLang = ref('zh')
--- a/src/renderer/src/stores/engineControl.ts
+++ b/src/renderer/src/stores/engineControl.ts
@@ -1,4 +1,4 @@
-import { ref } from 'vue'
+import { ref, watch } from 'vue'
 import { defineStore } from 'pinia'

 import { notification } from 'ant-design-vue'
@@ -12,6 +12,7 @@ import { useGeneralSettingStore } from './generalSetting'

 export const useEngineControlStore = defineStore('engineControl', () => {
  const { t } = useI18n()
+  const platform = ref('unknown')

  const captionEngine = ref(engines[useGeneralSettingStore().uiLanguage])
  const audioType = ref(audioTypes[useGeneralSettingStore().uiLanguage])
@@ -91,7 +92,14 @@ export const useEngineControlStore = defineStore('engineControl', () => {
    });
  })

+  watch(platform, (newValue) => {
+    if(newValue !== 'win32' && newValue !== 'darwin') {
+      audio.value = 1
+    }
+  })
+
  return {
+    platform,           // 系统平台
    captionEngine,      // 字幕引擎
    audioType,          // 音频类型
    engineEnabled,      // 字幕引擎是否启用
--- a/src/renderer/src/types/index.ts
+++ b/src/renderer/src/types/index.ts
@@ -37,6 +37,7 @@ export interface CaptionItem {
 }

 export interface FullConfig {
+  platform: string,
  uiLanguage: UILanguage,
  uiTheme: UITheme,
  leftBarWidth: number,