refactor(caption-engine): 重构字幕引擎代码结构

- 重构 GummyTranslator 类,增加启动和停止方法
- 优化 AudioStream 类,添加读取音频数据方法
- 更新 main-gummy.py,使用新的 GummyTranslator 和 AudioStream 接口
- 更新文档和 TODO 列表
This commit is contained in:
himeditator
2025-07-06 22:46:46 +08:00
parent 213426dace
commit f2aa075e65
16 changed files with 517 additions and 174 deletions

View File

@@ -1,30 +1,15 @@
import pyaudio
import numpy as np
def mergeStreamChannels(data, channels):
"""
将当前多通道流数据合并为单通道流数据
Args:
data: 多通道数据
channels: 通道数
Returns:
mono_data_bytes: 单通道数据
"""
# (length * channels,)
data_np = np.frombuffer(data, dtype=np.int16)
# (length, channels)
data_np_r = data_np.reshape(-1, channels)
# (length,)
mono_data = np.mean(data_np_r.astype(np.float32), axis=1)
mono_data = mono_data.astype(np.int16)
mono_data_bytes = mono_data.tobytes()
return mono_data_bytes
class AudioStream:
def __init__(self, audio_type=1):
"""
获取系统音频流
初始化参数:
audio_type: 0-系统音频输出流不支持不会生效1-系统音频输入流(默认)
chunk_rate: 每秒采集音频块的数量默认为20
"""
def __init__(self, audio_type=1, chunk_rate=20):
self.audio_type = audio_type
self.mic = pyaudio.PyAudio()
self.device = self.mic.get_default_input_device_info()
@@ -33,7 +18,7 @@ class AudioStream:
self.FORMAT = pyaudio.paInt16
self.CHANNELS = self.device["maxInputChannels"]
self.RATE = int(self.device["defaultSampleRate"])
self.CHUNK = self.RATE // 20
self.CHUNK = self.RATE // chunk_rate
self.INDEX = self.device["index"]
def printInfo(self):
@@ -62,13 +47,20 @@ class AudioStream:
if self.stream: return self.stream
self.stream = self.mic.open(
format = self.FORMAT,
channels = self.CHANNELS,
channels = int(self.CHANNELS),
rate = self.RATE,
input = True,
input_device_index = self.INDEX
input_device_index = int(self.INDEX)
)
return self.stream
def read_chunk(self):
"""
读取音频数据
"""
if not self.stream: return None
return self.stream.read(self.CHUNK)
def closeStream(self):
"""
关闭系统音频输出流
@@ -76,4 +68,4 @@ class AudioStream:
if self.stream is None: return
self.stream.stop_stream()
self.stream.close()
self.stream = None
self.stream = None