feat(engine): 重构字幕引擎，新增 Sherpa-ONNX SenseVoice 语音识别模型

- 重构字幕引擎，将音频采集改为在新线程上进行
- 重构 audio2text 中的类，调整运行逻辑
- 更新 main 函数，添加对 Sosv 模型的支持
- 修改 AudioStream 类，默认使用 16000Hz 采样率
2026-02-17 14:14:53 +08:00 · 2025-09-06 20:49:46 +08:00
parent 2b7ce06f04
commit eba2c5ca45
14 changed files with 377 additions and 112 deletions
--- a/engine/utils/audioprcs.py
+++ b/engine/utils/audioprcs.py
@@ -49,9 +49,18 @@ def resample_chunk_mono(chunk: bytes, channels: int, orig_sr: int, target_sr: in
        # (length,)
        chunk_mono = np.mean(chunk_np.astype(np.float32), axis=1)

+    if orig_sr == target_sr:
+        return chunk_mono.astype(np.int16).tobytes()
+    
    ratio = target_sr / orig_sr
    chunk_mono_r = samplerate.resample(chunk_mono, ratio, converter_type=mode)
    chunk_mono_r = np.round(chunk_mono_r).astype(np.int16)
+    real_len = round(chunk_mono.shape[0] * ratio)
+    if(chunk_mono_r.shape[0] > real_len):
+        chunk_mono_r = chunk_mono_r[:real_len]
+    else:
+        while chunk_mono_r.shape[0] < real_len:
+            chunk_mono_r = np.append(chunk_mono_r, chunk_mono_r[-1])
    return chunk_mono_r.tobytes()


@@ -81,9 +90,18 @@ def resample_chunk_mono_np(chunk: bytes, channels: int, orig_sr: int, target_sr:
        # (length,)
        chunk_mono = np.mean(chunk_np.astype(np.float32), axis=1)

+    if orig_sr == target_sr:
+        return chunk_mono.astype(dtype)
+
    ratio = target_sr / orig_sr
    chunk_mono_r = samplerate.resample(chunk_mono, ratio, converter_type=mode)
    chunk_mono_r = chunk_mono_r.astype(dtype)
+    real_len = round(chunk_mono.shape[0] * ratio)
+    if(chunk_mono_r.shape[0] > real_len):
+        chunk_mono_r = chunk_mono_r[:real_len]
+    else:
+        while chunk_mono_r.shape[0] < real_len:
+            chunk_mono_r = np.append(chunk_mono_r, chunk_mono_r[-1])
    return chunk_mono_r


@@ -100,9 +118,16 @@ def resample_mono_chunk(chunk: bytes, orig_sr: int, target_sr: int, mode="sinc_b
    Return:
        单通道音频数据块
    """
+    if orig_sr == target_sr: return chunk
    chunk_np = np.frombuffer(chunk, dtype=np.int16)
    chunk_np = chunk_np.astype(np.float32)
    ratio = target_sr / orig_sr
    chunk_r =  samplerate.resample(chunk_np, ratio, converter_type=mode)
    chunk_r = np.round(chunk_r).astype(np.int16)
+    real_len = round(chunk_np.shape[0] * ratio)
+    if(chunk_r.shape[0] > real_len):
+        chunk_r = chunk_r[:real_len]
+    else:
+        while chunk_r.shape[0] < real_len:
+            chunk_r = np.append(chunk_r, chunk_r[-1])
    return chunk_r.tobytes()