feat(sysaudio): 支持 macOS 系统音频流采集

- 新增 darwin.py 文件实现 macOS 音频流采集功能
- 修改 main-gummy.py 以支持 macOS 平台
- 更新 AllConfig 和 CaptionEngine 以适配新平台
2026-06-03 23:33:09 +08:00 · 2025-07-08 17:04:15 +08:00
parent 65da30f83d
commit 7e953db6bd
14 changed files with 141 additions and 36 deletions
--- a/caption-engine/main-gummy.py
+++ b/caption-engine/main-gummy.py
@@ -3,6 +3,8 @@ import argparse

 if sys.platform == 'win32':
    from sysaudio.win import AudioStream
+elif sys.platform == 'darwin':
+    from sysaudio.darwin import AudioStream
 elif sys.platform == 'linux':
    from sysaudio.linux import AudioStream
 else:
@@ -12,9 +14,9 @@ from audioprcs import mergeChunkChannels
 from audio2text import InvalidParameter, GummyTranslator


-def convert_audio_to_text(s_lang, t_lang, audio_type):
+def convert_audio_to_text(s_lang, t_lang, audio_type, chunk_rate):
    sys.stdout.reconfigure(line_buffering=True) # type: ignore
-    stream = AudioStream(audio_type)
+    stream = AudioStream(audio_type, chunk_rate)

    if t_lang == 'none':
        gummy = GummyTranslator(stream.RATE, s_lang, None)
@@ -43,10 +45,12 @@ if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Convert system audio stream to text')
    parser.add_argument('-s', '--source_language', default='en', help='Source language code')
    parser.add_argument('-t', '--target_language', default='zh', help='Target language code')
-    parser.add_argument('-a', '--audio_type', default='0', help='Audio stream source: 0 for output audio stream, 1 for input audio stream')
+    parser.add_argument('-a', '--audio_type', default=0, help='Audio stream source: 0 for output audio stream, 1 for input audio stream')
+    parser.add_argument('-c', '--chunk_rate', default=20, help='The number of audio stream chunks collected per second.')
    args = parser.parse_args()
    convert_audio_to_text(
        args.source_language,
        args.target_language,
-        int(args.audio_type)
+        int(args.audio_type),
+        int(args.chunk_rate)
    )
--- a/caption-engine/requirements.txt
+++ b/caption-engine/requirements.txt
@@ -1,6 +1,6 @@
-dashscope==1.23.5
-numpy==2.2.6
-samplerate==0.2.1
-PyAudio==0.2.14
-PyAudioWPatch==0.2.12.7 # Windows only
-pyinstaller==6.14.1
+dashscope
+numpy
+samplerate
+PyAudio
+PyAudioWPatch # Windows only
+pyinstaller
--- a/caption-engine/sysaudio/darwin.py
+++ b/caption-engine/sysaudio/darwin.py
@@ -0,0 +1,85 @@
+"""Capture macOS system audio input/output streams."""
+
+import pyaudio
+
+
+class AudioStream:
+    """
+    Capture a system audio stream (system audio output is captured through a BlackHole device).
+
+    Constructor arguments:
+        audio_type: 0 - system audio output stream (requires BlackHole); 1 - system audio input stream
+        chunk_rate: number of audio chunks collected per second, defaults to 20
+    """
+    def __init__(self, audio_type=0, chunk_rate=20):
+        self.audio_type = audio_type
+        self.mic = pyaudio.PyAudio()  # NOTE(review): never terminated in this class - confirm whether cleanup is needed
+        if self.audio_type == 0:
+            self.device = self.getOutputDeviceInfo()  # raises Exception when no device name contains 'blackhole'
+        else:
+            self.device = self.mic.get_default_input_device_info()
+        self.stream = None  # opened lazily by openStream()
+        self.SAMP_WIDTH = pyaudio.get_sample_size(pyaudio.paInt16)
+        self.FORMAT = pyaudio.paInt16
+        self.CHANNELS = self.device["maxInputChannels"]
+        self.RATE = int(self.device["defaultSampleRate"])
+        self.CHUNK = self.RATE // chunk_rate  # frames read per read_chunk() call; chunk_rate == 0 raises ZeroDivisionError
+        self.INDEX = self.device["index"]
+
+    def getOutputDeviceInfo(self):
+        """Return the info of the first device whose name contains 'blackhole'; raise Exception if there is none."""
+        device_count = self.mic.get_device_count()
+        for i in range(device_count):
+            dev_info = self.mic.get_device_info_by_index(i)
+            if 'blackhole' in dev_info["name"].lower():  # case-insensitive substring match on the device name
+                return dev_info
+        raise Exception("The device containing BlackHole was not found.")
+
+    def printInfo(self):  # prints the selected device's details and the stream parameters to stdout
+        dev_info = f"""
+        采样输入设备：
+            - 设备类型：{ "音频输出" if self.audio_type == 0 else "音频输入" }
+            - 序号：{self.device['index']}
+            - 名称：{self.device['name']}
+            - 最大输入通道数：{self.device['maxInputChannels']}
+            - 默认低输入延迟：{self.device['defaultLowInputLatency']}s
+            - 默认高输入延迟：{self.device['defaultHighInputLatency']}s
+            - 默认采样率：{self.device['defaultSampleRate']}Hz
+
+        音频样本块大小：{self.CHUNK}
+        样本位宽：{self.SAMP_WIDTH}
+        采样格式：{self.FORMAT}
+        音频通道数：{self.CHANNELS}
+        音频采样率：{self.RATE}
+        """
+        print(dev_info)
+
+    def openStream(self):
+        """
+        Open an input stream on the selected device and return it; an already-open stream is returned as is.
+        """
+        if self.stream: return self.stream  # already open: reuse the existing stream
+        self.stream = self.mic.open(  # NOTE(review): frames_per_buffer is not passed, so it is not tied to self.CHUNK
+            format = self.FORMAT,
+            channels = int(self.CHANNELS),
+            rate = self.RATE,
+            input = True,
+            input_device_index = int(self.INDEX)
+        )
+        return self.stream
+
+    def read_chunk(self):
+        """
+        Read one chunk (self.CHUNK frames) from the open stream; return None when no stream is open.
+        """
+        if not self.stream: return None
+        return self.stream.read(self.CHUNK, exception_on_overflow=False)  # input overflow does not raise
+
+    def closeStream(self):
+        """
+        Stop and close the stream if one is open, then reset self.stream to None.
+        """
+        if self.stream is None: return
+        self.stream.stop_stream()
+        self.stream.close()
+        self.stream = None
--- a/caption-engine/sysaudio/linux.py
+++ b/caption-engine/sysaudio/linux.py
@@ -1,3 +1,5 @@
+"""获取 Linux 系统音频输入流"""
+
 import pyaudio


--- a/caption-engine/sysaudio/win.py
+++ b/caption-engine/sysaudio/win.py
@@ -1,4 +1,4 @@
-"""获取 Windows 系统音频输出流"""
+"""获取 Windows 系统音频输入/输出流"""

 import pyaudiowpatch as pyaudio

@@ -101,7 +101,7 @@ class AudioStream:
        读取音频数据
        """
        if not self.stream: return None
-        return self.stream.read(self.CHUNK)
+        return self.stream.read(self.CHUNK, exception_on_overflow=False)

    def closeStream(self):
        """