feat(engine): 重构字幕引擎并实现 WebSocket 通信

- 重构了 Gummy 和 Vosk 字幕引擎的代码，提高了可扩展性和可读性
- 合并 Gummy 和 Vosk 引擎为单个可执行文件
- 实现了字幕引擎和主程序之间的 WebSocket 通信，避免了孤儿进程问题
2026-03-02 00:24:40 +08:00 · 2025-07-28 15:49:52 +08:00
parent b658ef5440
commit cd9f3a847d
19 changed files with 242 additions and 293 deletions
--- a/engine/utils/audioprcs.py
+++ b/engine/utils/audioprcs.py
@@ -0,0 +1,75 @@
+import samplerate
+import numpy as np
+import numpy.core.multiarray
+
+def merge_chunk_channels(chunk: bytes, channels: int) -> bytes:
+    """
+    Downmix an interleaved multi-channel audio chunk to a single channel.
+
+    The chunk is read as interleaved int16 samples. The channels of each
+    frame are averaged in float32 and the average is rounded back to int16.
+
+    Args:
+        chunk: Multi-channel audio data chunk (raw bytes).
+        channels: Number of interleaved channels in ``chunk``.
+
+    Returns:
+        Single-channel audio data chunk. When ``channels`` is 1 the input
+        is returned unchanged.
+    """
+    if channels == 1:
+        return chunk
+
+    # (length * channels,) -> (length, channels): one row per frame
+    frames = np.frombuffer(chunk, dtype=np.int16).reshape(-1, channels)
+    # (length,): per-frame average across channels
+    averaged = frames.astype(np.float32).mean(axis=1)
+    return np.round(averaged).astype(np.int16).tobytes()
+
+
+def resample_chunk_mono(chunk: bytes, channels: int, orig_sr: int, target_sr: int, mode: str = "sinc_best") -> bytes:
+    """
+    Downmix an interleaved multi-channel audio chunk to mono, then resample it.
+
+    The chunk is read as interleaved int16 samples. Multi-channel input is
+    averaged per frame in float32, resampled, and converted back to int16
+    once at the end.
+
+    Args:
+        chunk: Multi-channel audio data chunk (raw bytes).
+        channels: Number of interleaved channels in ``chunk``.
+        orig_sr: Original sample rate.
+        target_sr: Target sample rate.
+        mode: Resampling mode passed to ``samplerate.resample`` as
+            ``converter_type``. One of 'sinc_best' | 'sinc_medium' |
+            'sinc_fastest' | 'zero_order_hold' | 'linear'.
+
+    Returns:
+        Single-channel int16 audio data chunk; ``b""`` for an empty chunk.
+    """
+    if not chunk:
+        # Nothing to convert; do not hand an empty buffer to the resampler.
+        return b""
+
+    # (length * channels,)
+    samples = np.frombuffer(chunk, dtype=np.int16)
+    if channels == 1:
+        # Mono input must still be decoded: the resampler needs a sample
+        # array, not the raw byte string.
+        mono = samples.astype(np.float32)
+    else:
+        # (length, channels) -> (length,)
+        # Keep the float32 average so the signal is quantised only once,
+        # after resampling, instead of being truncated here as well.
+        mono = samples.reshape(-1, channels).astype(np.float32).mean(axis=1)
+
+    ratio = target_sr / orig_sr
+    resampled = samplerate.resample(mono, ratio, converter_type=mode)
+
+    # Interpolation can overshoot the input peaks; clip to the int16 range
+    # so the cast below cannot wrap around.
+    limits = np.iinfo(np.int16)
+    resampled = np.clip(np.round(resampled), limits.min, limits.max)
+    return resampled.astype(np.int16).tobytes()
+
+
+def resample_mono_chunk(chunk: bytes, orig_sr: int, target_sr: int, mode: str = "sinc_best") -> bytes:
+    """
+    Resample a single-channel audio chunk.
+
+    The chunk is read as int16 samples, resampled by the ratio
+    ``target_sr / orig_sr``, and converted back to int16.
+
+    Args:
+        chunk: Single-channel audio data chunk (raw bytes).
+        orig_sr: Original sample rate.
+        target_sr: Target sample rate.
+        mode: Resampling mode passed to ``samplerate.resample`` as
+            ``converter_type``. One of 'sinc_best' | 'sinc_medium' |
+            'sinc_fastest' | 'zero_order_hold' | 'linear'.
+
+    Returns:
+        Single-channel int16 audio data chunk; ``b""`` for an empty chunk.
+    """
+    if not chunk:
+        # Nothing to convert; do not hand an empty buffer to the resampler.
+        return b""
+
+    chunk_np = np.frombuffer(chunk, dtype=np.int16)
+    ratio = target_sr / orig_sr
+    chunk_r = samplerate.resample(chunk_np, ratio, converter_type=mode)
+
+    # Interpolation can overshoot the input peaks; clip to the int16 range
+    # so the cast below cannot wrap around.
+    limits = np.iinfo(np.int16)
+    chunk_r = np.clip(np.round(chunk_r), limits.min, limits.max)
+    return chunk_r.astype(np.int16).tobytes()