refactor(caption-engine): 重构字幕引擎代码结构

- 重构 GummyTranslator 类，增加启动和停止方法
- 优化 AudioStream 类，添加读取音频数据方法
- 更新 main-gummy.py，使用新的 GummyTranslator 和 AudioStream 接口
- 更新文档和 TODO 列表
2026-02-04 12:24:42 +08:00 · 2025-07-06 22:46:46 +08:00
parent 213426dace
commit f2aa075e65
16 changed files with 517 additions and 174 deletions
--- a/caption-engine/sysaudio/linux.py
+++ b/caption-engine/sysaudio/linux.py
@@ -1,30 +1,15 @@
 import pyaudio
-import numpy as np
-
-def mergeStreamChannels(data, channels):
-    """
-    将当前多通道流数据合并为单通道流数据
-
-    Args:
-        data: 多通道数据
-        channels: 通道数
-
-    Returns:
-        mono_data_bytes: 单通道数据
-    """
-    # (length * channels,)
-    data_np = np.frombuffer(data, dtype=np.int16)
-    # (length, channels)
-    data_np_r = data_np.reshape(-1, channels)
-    # (length,)
-    mono_data = np.mean(data_np_r.astype(np.float32), axis=1)
-    mono_data = mono_data.astype(np.int16)
-    mono_data_bytes = mono_data.tobytes()
-    return mono_data_bytes


 class AudioStream:
-    def __init__(self, audio_type=1):
+    """
+    获取系统音频流
+
+    初始化参数：
+        audio_type: 0-系统音频输出流（不支持，不会生效），1-系统音频输入流（默认）
+        chunk_rate: 每秒采集音频块的数量，默认为20
+    """
+    def __init__(self, audio_type=1,  chunk_rate=20):
        self.audio_type = audio_type
        self.mic = pyaudio.PyAudio()
        self.device = self.mic.get_default_input_device_info()
@@ -33,7 +18,7 @@ class AudioStream:
        self.FORMAT = pyaudio.paInt16
        self.CHANNELS = self.device["maxInputChannels"]
        self.RATE = int(self.device["defaultSampleRate"])
-        self.CHUNK = self.RATE // 20
+        self.CHUNK = self.RATE // chunk_rate
        self.INDEX = self.device["index"]

    def printInfo(self):
@@ -62,13 +47,20 @@ class AudioStream:
        if self.stream: return self.stream
        self.stream = self.mic.open(
            format = self.FORMAT,
-            channels = self.CHANNELS,
+            channels = int(self.CHANNELS),
            rate = self.RATE,
            input = True,
-            input_device_index = self.INDEX
+            input_device_index = int(self.INDEX)
        )
        return self.stream
-    
+
+    def read_chunk(self):
+        """
+        读取音频数据
+        """
+        if not self.stream: return None
+        return self.stream.read(self.CHUNK)
+
    def closeStream(self):
        """
        关闭系统音频输出流
@@ -76,4 +68,4 @@ class AudioStream:
        if self.stream is None: return
        self.stream.stop_stream()
        self.stream.close()
-        self.stream = None
+        self.stream = None