refactor(engine): 重构字幕引擎

- 更新 GummyTranslator 类，优化字幕生成逻辑
- 移除 audioprcs 模块，音频处理功能转移到 utils 模块
- 重构 sysaudio 模块，提高音频流管理的灵活性和稳定性
- 修改 TODO.md，完成按时间降序排列字幕记录的功能
- 更新文档，说明因资源限制将不再维护英文和日文文档
2026-02-16 05:01:07 +08:00 · 2025-07-26 23:37:24 +08:00
parent 8e575a9ba3
commit 3792eb88b6
18 changed files with 245 additions and 136 deletions
--- a/engine/audio2text/gummy.py
+++ b/engine/audio2text/gummy.py
@@ -6,8 +6,8 @@ from dashscope.audio.asr import (
 )
 import dashscope
 from datetime import datetime
-import json
-import sys
+from utils import stdout_cmd, stdout_obj
+

 class Callback(TranslationRecognizerCallback):
    """
@@ -17,15 +17,16 @@ class Callback(TranslationRecognizerCallback):
        super().__init__()
        self.usage = 0
        self.cur_id = -1
+        self.index = 0
        self.time_str = ''

    def on_open(self) -> None:
-        # print("on_open")
-        pass
+        self.cur_id = -1
+        self.time_str = ''
+        stdout_cmd('info', 'Gummy translator started.')

    def on_close(self) -> None:
-        # print("on_close")
-        pass
+        stdout_cmd('info', 'Gummy translator closed.')

    def on_event(
        self,
@@ -35,17 +36,18 @@ class Callback(TranslationRecognizerCallback):
        usage
    ) -> None:
        caption = {}
+
        if transcription_result is not None:
-            caption['index'] = transcription_result.sentence_id
-            caption['text'] = transcription_result.text
-            if caption['index'] != self.cur_id:
-                self.cur_id = caption['index']
-                cur_time = datetime.now().strftime('%H:%M:%S.%f')[:-3]
-                caption['time_s'] = cur_time
-                self.time_str = cur_time
-            else:
-                caption['time_s'] = self.time_str
+            if self.cur_id != transcription_result.sentence_id:
+                self.time_str = datetime.now().strftime('%H:%M:%S.%f')[:-3]
+                self.cur_id = transcription_result.sentence_id
+                self.index += 1  
+            caption['command'] = 'caption'
+            caption['index'] = self.index
+            caption['time_s'] = self.time_str
            caption['time_t'] = datetime.now().strftime('%H:%M:%S.%f')[:-3]
+            caption['end'] = transcription_result.is_sentence_end
+            caption['text'] = transcription_result.text
            caption['translation'] = ""

        if translation_result is not None:
@@ -55,19 +57,8 @@ class Callback(TranslationRecognizerCallback):
        if usage:
            self.usage += usage['duration']

-        # print(caption)
-        self.send_to_node(caption)
+        stdout_obj(caption)

-    def send_to_node(self, data):
-        """
-        将数据发送到 Node.js 进程
-        """
-        try:
-            json_data = json.dumps(data) + '\n'
-            sys.stdout.write(json_data)
-            sys.stdout.flush()
-        except Exception as e:
-            print(f"Error sending data to Node.js: {e}", file=sys.stderr)

 class GummyTranslator:
    """
@@ -78,7 +69,7 @@ class GummyTranslator:
        source: 源语言代码字符串（zh, en, ja 等）
        target: 目标语言代码字符串（zh, en, ja 等）
    """
-    def __init__(self, rate, source, target, api_key):
+    def __init__(self, rate: int, source: str, target: str | None, api_key: str | None):
        if api_key:
            dashscope.api_key = api_key
        self.translator = TranslationRecognizerRealtime(
--- a/engine/audioprcs/__init__.py
+++ b/engine/audioprcs/__init__.py
@@ -1 +0,0 @@
-from .process import mergeChunkChannels, resampleRawChunk, resampleMonoChunk
--- a/engine/main-gummy.py
+++ b/engine/main-gummy.py
@@ -1,21 +1,11 @@
 import sys
 import argparse
-
-if sys.platform == 'win32':
-    from sysaudio.win import AudioStream
-elif sys.platform == 'darwin':
-    from sysaudio.darwin import AudioStream
-elif sys.platform == 'linux':
-    from sysaudio.linux import AudioStream
-else:
-    raise NotImplementedError(f"Unsupported platform: {sys.platform}")
-
-from audioprcs import mergeChunkChannels
+from sysaudio import AudioStream
+from utils import merge_chunk_channels
 from audio2text import InvalidParameter, GummyTranslator


 def convert_audio_to_text(s_lang, t_lang, audio_type, chunk_rate, api_key):
-    sys.stdout.reconfigure(line_buffering=True) # type: ignore
    stream = AudioStream(audio_type, chunk_rate)

    if t_lang == 'none':
@@ -23,20 +13,21 @@ def convert_audio_to_text(s_lang, t_lang, audio_type, chunk_rate, api_key):
    else:
        gummy = GummyTranslator(stream.RATE, s_lang, t_lang, api_key)

-    stream.openStream()
+    stream.open_stream()
    gummy.start()

    while True:
        try:
            chunk = stream.read_chunk()
-            chunk_mono = mergeChunkChannels(chunk, stream.CHANNELS)
+            if chunk is None: continue
+            chunk_mono = merge_chunk_channels(chunk, stream.CHANNELS)
            try:
                gummy.send_audio_frame(chunk_mono)
            except InvalidParameter:
                gummy.start()
                gummy.send_audio_frame(chunk_mono)
        except KeyboardInterrupt:
-            stream.closeStream()
+            stream.close_stream()
            gummy.stop()
            break

--- a/engine/main-vosk.py
+++ b/engine/main-vosk.py
@@ -4,17 +4,9 @@ import argparse
 from datetime import datetime
 import numpy.core.multiarray

-if sys.platform == 'win32':
-    from sysaudio.win import AudioStream
-elif sys.platform == 'darwin':
-    from sysaudio.darwin import AudioStream
-elif sys.platform == 'linux':
-    from sysaudio.linux import AudioStream
-else:
-    raise NotImplementedError(f"Unsupported platform: {sys.platform}")
-
+from sysaudio import AudioStream
 from vosk import Model, KaldiRecognizer, SetLogLevel
-from audioprcs import resampleRawChunk
+from utils import resample_chunk_mono

 SetLogLevel(-1)

@@ -30,7 +22,7 @@ def convert_audio_to_text(audio_type, chunk_rate, model_path):
    recognizer = KaldiRecognizer(model, 16000)

    stream = AudioStream(audio_type, chunk_rate)
-    stream.openStream()
+    stream.open_stream()

    time_str = ''
    cur_id = 0
@@ -38,7 +30,8 @@ def convert_audio_to_text(audio_type, chunk_rate, model_path):

    while True:
        chunk = stream.read_chunk()
-        chunk_mono = resampleRawChunk(chunk, stream.CHANNELS, stream.RATE, 16000)
+        if chunk is None: continue
+        chunk_mono = resample_chunk_mono(chunk, stream.CHANNELS, stream.RATE, 16000)

        caption = {}
        if recognizer.AcceptWaveform(chunk_mono):
--- a/engine/sysaudio/__init__.py
+++ b/engine/sysaudio/__init__.py
@@ -0,0 +1,10 @@
+import sys
+
+if sys.platform == "win32":
+    from .win import AudioStream
+elif sys.platform == "darwin":
+    from .darwin import AudioStream
+elif sys.platform == "linux":
+    from .linux import AudioStream
+else:
+    raise NotImplementedError(f"Unsupported platform: {sys.platform}")
--- a/engine/sysaudio/darwin.py
+++ b/engine/sysaudio/darwin.py
@@ -1,11 +1,24 @@
 """获取 MacOS 系统音频输入/输出流"""

 import pyaudio
+from textwrap import dedent
+
+
+def get_blackhole_device(mic: pyaudio.PyAudio):
+    """
+    获取 BlackHole 设备
+    """
+    device_count = mic.get_device_count()
+    for i in range(device_count):
+        dev_info = mic.get_device_info_by_index(i)
+        if 'blackhole' in str(dev_info["name"]).lower():    
+            return dev_info
+    raise Exception("The device containing BlackHole was not found.")


 class AudioStream:
    """
-    获取系统音频流（支持 BlackHole 作为系统音频输出捕获）
+    获取系统音频流（如果要捕获输出音频，仅支持 BlackHole 作为系统音频输出捕获）

    初始化参数：
        audio_type: 0-系统音频输出流（需配合 BlackHole），1-系统音频输入流
@@ -15,46 +28,40 @@ class AudioStream:
        self.audio_type = audio_type
        self.mic = pyaudio.PyAudio()
        if self.audio_type == 0:
-            self.device = self.getOutputDeviceInfo()
+            self.device = get_blackhole_device(self.mic)
        else:
            self.device = self.mic.get_default_input_device_info()
+        self.stop_signal = False
        self.stream = None
-        self.SAMP_WIDTH = pyaudio.get_sample_size(pyaudio.paInt16)
+        self.INDEX = self.device["index"]
        self.FORMAT = pyaudio.paInt16
-        self.CHANNELS = self.device["maxInputChannels"]
+        self.SAMP_WIDTH = pyaudio.get_sample_size(self.FORMAT)
+        self.CHANNELS = int(self.device["maxInputChannels"])
        self.RATE = int(self.device["defaultSampleRate"])
        self.CHUNK = self.RATE // chunk_rate
-        self.INDEX = self.device["index"]

-    def getOutputDeviceInfo(self):
-        """查找指定关键词的输入设备"""
-        device_count = self.mic.get_device_count()
-        for i in range(device_count):
-            dev_info = self.mic.get_device_info_by_index(i)
-            if 'blackhole' in dev_info["name"].lower():    
-                return dev_info
-        raise Exception("The device containing BlackHole was not found.")
-
-    def printInfo(self):
+    def get_info(self):
        dev_info = f"""
-        采样输入设备：
+        采样设备：
            - 设备类型：{ "音频输出" if self.audio_type == 0 else "音频输入" }
-            - 序号：{self.device['index']}
-            - 名称：{self.device['name']}
+            - 设备序号：{self.device['index']}
+            - 设备名称：{self.device['name']}
            - 最大输入通道数：{self.device['maxInputChannels']}
            - 默认低输入延迟：{self.device['defaultLowInputLatency']}s
            - 默认高输入延迟：{self.device['defaultHighInputLatency']}s
            - 默认采样率：{self.device['defaultSampleRate']}Hz
+            - 是否回环设备：{self.device['isLoopbackDevice']}

-        音频样本块大小：{self.CHUNK}
+        设备序号：{self.INDEX}
+        样本格式：{self.FORMAT}
        样本位宽：{self.SAMP_WIDTH}
-        采样格式：{self.FORMAT}
-        音频通道数：{self.CHANNELS}
-        音频采样率：{self.RATE}
+        样本通道数：{self.CHANNELS}
+        样本采样率：{self.RATE}
+        样本块大小：{self.CHUNK}
        """
-        print(dev_info)
+        return dedent(dev_info).strip()

-    def openStream(self):
+    def open_stream(self):
        """
        打开并返回系统音频输出流
        """
@@ -72,14 +79,24 @@ class AudioStream:
        """
        读取音频数据
        """
+        if self.stop_signal:
+            self.close_stream()
+            return None
        if not self.stream: return None
        return self.stream.read(self.CHUNK, exception_on_overflow=False)

-    def closeStream(self):
+    def close_stream_signal(self):
        """
-        关闭系统音频输出流
+        线程安全的关闭系统音频输入流，不一定会立即关闭
        """
-        if self.stream is None: return
-        self.stream.stop_stream()
-        self.stream.close()
-        self.stream = None
+        self.stop_signal = True
+
+    def close_stream(self):
+        """
+        立即关闭系统音频输入流
+        """
+        if self.stream is not None:
+            self.stream.stop_stream()
+            self.stream.close()
+            self.stream = None
+        self.stop_signal = False
--- a/engine/sysaudio/linux.py
+++ b/engine/sysaudio/linux.py
@@ -1,8 +1,10 @@
 """获取 Linux 系统音频输入流"""

 import subprocess
+from textwrap import dedent

-def findMonitorSource():
+
+def find_monitor_source():
    result = subprocess.run(
        ["pactl", "list", "short", "sources"],
        stdout=subprocess.PIPE, text=True
@@ -16,7 +18,8 @@ def findMonitorSource():

    raise RuntimeError("System output monitor device not found")

-def findInputSource():
+
+def find_input_source():
    result = subprocess.run(
        ["pactl", "list", "short", "sources"],
        stdout=subprocess.PIPE, text=True
@@ -28,8 +31,10 @@ def findInputSource():
        name = parts[1]
        if ".monitor" not in name:
            return name
+
    raise RuntimeError("Microphone input device not found")

+
 class AudioStream:
    """
    获取系统音频流
@@ -42,34 +47,33 @@ class AudioStream:
        self.audio_type = audio_type

        if self.audio_type == 0:
-            self.source = findMonitorSource()
+            self.source = find_monitor_source()
        else:
-            self.source = findInputSource()
-
+            self.source = find_input_source()
+        self.stop_signal = False
        self.process = None
-
-        self.SAMP_WIDTH = 2
        self.FORMAT = 16
+        self.SAMP_WIDTH = 2
        self.CHANNELS = 2
        self.RATE = 48000
        self.CHUNK = self.RATE // chunk_rate

-    def printInfo(self):
+    def get_info(self):
        dev_info = f"""
        音频捕获进程：
            - 捕获类型：{"音频输出" if self.audio_type == 0 else "音频输入"}
            - 设备源：{self.source}
-            - 捕获进程PID：{self.process.pid if self.process else "None"}
+            - 捕获进程 PID：{self.process.pid if self.process else "None"}

-        音频样本块大小：{self.CHUNK}
+        样本格式：{self.FORMAT}
        样本位宽：{self.SAMP_WIDTH}
-        采样格式：{self.FORMAT}
-        音频通道数：{self.CHANNELS}
-        音频采样率：{self.RATE}
+        样本通道数：{self.CHANNELS}
+        样本采样率：{self.RATE}
+        样本块大小：{self.CHUNK}
        """
        print(dev_info)

-    def openStream(self):
+    def open_stream(self):
        """
        启动音频捕获进程
        """
@@ -82,13 +86,23 @@ class AudioStream:
        """
        读取音频数据
        """
-        if self.process:
+        if self.stop_signal:
+            self.close_stream()
+            return None
+        if self.process and self.process.stdout:
            return self.process.stdout.read(self.CHUNK)
        return None

-    def closeStream(self):
+    def close_stream_signal(self):
+        """
+        线程安全的关闭系统音频输入流，不一定会立即关闭
+        """
+        self.stop_signal = True
+
+    def close_stream(self):
        """
        关闭系统音频捕获进程
        """
        if self.process:
            self.process.terminate()
+        self.stop_signal = False
--- a/engine/sysaudio/win.py
+++ b/engine/sysaudio/win.py
@@ -1,14 +1,15 @@
 """获取 Windows 系统音频输入/输出流"""

 import pyaudiowpatch as pyaudio
+from textwrap import dedent


-def getDefaultLoopbackDevice(mic: pyaudio.PyAudio, info = True)->dict:
+def get_default_loopback_device(mic: pyaudio.PyAudio, info = True)->dict:
    """
    获取默认的系统音频输出的回环设备
    Args:
-        mic (pyaudio.PyAudio): pyaudio对象
-        info (bool, optional): 是否打印设备信息
+        mic: pyaudio对象
+        info: 是否打印设备信息

    Returns:
        dict: 系统音频输出的回环设备
@@ -51,38 +52,40 @@ class AudioStream:
        self.audio_type = audio_type
        self.mic = pyaudio.PyAudio()
        if self.audio_type == 0:
-            self.device = getDefaultLoopbackDevice(self.mic, False)
+            self.device = get_default_loopback_device(self.mic, False)
        else:
            self.device = self.mic.get_default_input_device_info()
+        self.stop_signal = False
        self.stream = None
-        self.SAMP_WIDTH = pyaudio.get_sample_size(pyaudio.paInt16)
+        self.INDEX = self.device["index"]
        self.FORMAT = pyaudio.paInt16
+        self.SAMP_WIDTH = pyaudio.get_sample_size(self.FORMAT)
        self.CHANNELS = int(self.device["maxInputChannels"])
        self.RATE = int(self.device["defaultSampleRate"])
        self.CHUNK = self.RATE // chunk_rate
-        self.INDEX = self.device["index"]

-    def printInfo(self):
+    def get_info(self):
        dev_info = f"""
        采样设备：
            - 设备类型：{ "音频输出" if self.audio_type == 0 else "音频输入" }
-            - 序号：{self.device['index']}
-            - 名称：{self.device['name']}
+            - 设备序号：{self.device['index']}
+            - 设备名称：{self.device['name']}
            - 最大输入通道数：{self.device['maxInputChannels']}
            - 默认低输入延迟：{self.device['defaultLowInputLatency']}s
            - 默认高输入延迟：{self.device['defaultHighInputLatency']}s
            - 默认采样率：{self.device['defaultSampleRate']}Hz
            - 是否回环设备：{self.device['isLoopbackDevice']}

-        音频样本块大小：{self.CHUNK}
+        设备序号：{self.INDEX}
+        样本格式：{self.FORMAT}
        样本位宽：{self.SAMP_WIDTH}
-        采样格式：{self.FORMAT}
-        音频通道数：{self.CHANNELS}
-        音频采样率：{self.RATE}
+        样本通道数：{self.CHANNELS}
+        样本采样率：{self.RATE}
+        样本块大小：{self.CHUNK}
        """
-        print(dev_info)
+        return dedent(dev_info).strip()

-    def openStream(self):
+    def open_stream(self):
        """
        打开并返回系统音频输出流
        """
@@ -96,18 +99,28 @@ class AudioStream:
        )
        return self.stream

-    def read_chunk(self):
+    def read_chunk(self) -> bytes | None:
        """
        读取音频数据
        """
+        if self.stop_signal:
+            self.close_stream()
+            return None
        if not self.stream: return None
        return self.stream.read(self.CHUNK, exception_on_overflow=False)

-    def closeStream(self):
+    def close_stream_signal(self):
        """
-        关闭系统音频输出流
+        线程安全的关闭系统音频输入流，不一定会立即关闭
        """
-        if self.stream is None: return
-        self.stream.stop_stream()
-        self.stream.close()
-        self.stream = None
+        self.stop_signal = True
+
+    def close_stream(self):
+        """
+        关闭系统音频输入流
+        """
+        if self.stream is not None:
+            self.stream.stop_stream()
+            self.stream.close()
+            self.stream = None
+        self.stop_signal = False
--- a/engine/utils/__init__.py
+++ b/engine/utils/__init__.py
@@ -0,0 +1,2 @@
+from .process import merge_chunk_channels, resample_chunk_mono, resample_mono_chunk
+from .sysout import stdout, stdout_cmd, stdout_obj, stderr
--- a/engine/audioprcs/process.py
+++ b/engine/utils/process.py
@@ -1,16 +1,17 @@
 import samplerate
 import numpy as np

-def mergeChunkChannels(chunk, channels):
+
+def merge_chunk_channels(chunk: bytes, channels: int) -> bytes:
    """
    将当前多通道音频数据块转换为单通道音频数据块

    Args:
-        chunk: (bytes)多通道音频数据块
+        chunk: 多通道音频数据块
        channels: 通道数

    Returns:
-        (bytes)单通道音频数据块
+        单通道音频数据块
    """
    # (length * channels,)
    chunk_np = np.frombuffer(chunk, dtype=np.int16)
@@ -22,19 +23,19 @@ def mergeChunkChannels(chunk, channels):
    return chunk_mono.tobytes()


-def resampleRawChunk(chunk, channels, orig_sr, target_sr, mode="sinc_best"):
+def resample_chunk_mono(chunk: bytes, channels: int, orig_sr: int, target_sr: int, mode="sinc_best") -> bytes:
    """
    将当前多通道音频数据块转换成单通道音频数据块，然后进行重采样

    Args:
-        chunk: (bytes)多通道音频数据块
+        chunk: 多通道音频数据块
        channels: 通道数
        orig_sr: 原始采样率
        target_sr: 目标采样率
        mode: 重采样模式，可选：'sinc_best' | 'sinc_medium' | 'sinc_fastest' | 'zero_order_hold' | 'linear'

    Return:
-        (bytes)单通道音频数据块
+        单通道音频数据块
    """
    # (length * channels,)
    chunk_np = np.frombuffer(chunk, dtype=np.int16)
@@ -44,22 +45,23 @@ def resampleRawChunk(chunk, channels, orig_sr, target_sr, mode="sinc_best"):
    chunk_mono_f = np.mean(chunk_np.astype(np.float32), axis=1)
    chunk_mono = chunk_mono_f.astype(np.int16)
    ratio = target_sr / orig_sr
-    chunk_mono_r =  samplerate.resample(chunk_mono, ratio, converter_type=mode)
+    chunk_mono_r = samplerate.resample(chunk_mono, ratio, converter_type=mode)
    chunk_mono_r = np.round(chunk_mono_r).astype(np.int16)
    return chunk_mono_r.tobytes()

-def resampleMonoChunk(chunk, orig_sr, target_sr, mode="sinc_best"):
+
+def resample_mono_chunk(chunk: bytes, orig_sr: int, target_sr: int, mode="sinc_best") -> bytes:
    """
    将当前单通道音频块进行重采样

    Args:
-        chunk: (bytes)单通道音频数据块
+        chunk: 单通道音频数据块
        orig_sr: 原始采样率
        target_sr: 目标采样率
        mode: 重采样模式，可选：'sinc_best' | 'sinc_medium' | 'sinc_fastest' | 'zero_order_hold' | 'linear'

    Return:
-        (bytes)单通道音频数据块
+        单通道音频数据块
    """
    chunk_np = np.frombuffer(chunk, dtype=np.int16)
    ratio = target_sr / orig_sr
--- a/engine/utils/sysout.py
+++ b/engine/utils/sysout.py
@@ -0,0 +1,18 @@
+import sys
+import json
+
+def stdout(text: str):
+    stdout_cmd("print", text)
+
+def stdout_cmd(command: str, content = ""):
+    msg = { "command": command, "content": content }
+    sys.stdout.write(json.dumps(msg) + "\n")
+    sys.stdout.flush()
+
+def stdout_obj(obj):
+    sys.stdout.write(json.dumps(obj) + "\n")
+    sys.stdout.flush()
+
+def stderr(text: str):
+    sys.stderr.write(text + "\n")
+    sys.stderr.flush()
				`@@ -1 +0,0 @@`
				`from .process import mergeChunkChannels, resampleRawChunk, resampleMonoChunk`