feat(engine): 替换重采样模型、SOSV 添加标点恢复模型

- 将 samplerate 库替换为 resampy 库，提高重采样质量 - Shepra-ONNX SenseVoice 添加中文和英语标点恢复模型
2026-02-22 09:24:41 +08:00 · 2025-09-06 23:15:33 +08:00
parent eba2c5ca45
commit 6bff978b88
7 changed files with 85 additions and 113 deletions
--- a/engine/audio2text/sosv.py
+++ b/engine/audio2text/sosv.py
@@ -9,6 +9,7 @@ https://github.com/k2-fsa/sherpa-onnx/blob/master/python-api-examples/simulate-s
 import time
 from datetime import datetime
 import sherpa_onnx
+import threading
 import numpy as np

 from utils import shared_data
@@ -23,23 +24,27 @@ class SosvRecognizer:
    初始化参数：
        model_path: Shepra ONNX Sense Voice 识别模型路径
        vad_model: Silero VAD 模型路径
+        source: 识别源语言(auto, zh, en, ja, ko, yue)
        target: 翻译目标语言
        trans_model: 翻译模型名称
        ollama_name: Ollama 模型名称
    """
-    def __init__(self, model_path: str, target: str | None, trans_model: str, ollama_name: str):
+    def __init__(self, model_path: str, source: str, target: str | None, trans_model: str, ollama_name: str):
        if model_path.startswith('"'):
            model_path = model_path[1:]
        if model_path.endswith('"'):
            model_path = model_path[:-1]
        self.model_path = model_path
+        self.ext = ""
+        if self.model_path[-4:] == "int8":
+            self.ext = ".int8"
+        self.source = source
        self.target = target
        if trans_model == 'google':
            self.trans_func = google_translate
        else:
            self.trans_func = ollama_translate
        self.ollama_name = ollama_name
-
        self.time_str = ''
        self.cur_id = 0
        self.prev_content = ''
@@ -47,19 +52,39 @@ class SosvRecognizer:
    def start(self):
        """启动 Sense Voice 模型"""
        self.recognizer = sherpa_onnx.OfflineRecognizer.from_sense_voice(
-            model=f"{self.model_path}/model.onnx",
-            tokens=f"{self.model_path}/tokens.txt",
+            model=f"{self.model_path}/sensevoice/model{self.ext}.onnx",
+            tokens=f"{self.model_path}/sensevoice/tokens.txt",
+            language=self.source,
            num_threads = 2,
        )
-        config = sherpa_onnx.VadModelConfig()
-        config.silero_vad.model = f"{self.model_path}/silero_vad.onnx"
-        config.silero_vad.threshold = 0.5
-        config.silero_vad.min_silence_duration = 0.1
-        config.silero_vad.min_speech_duration = 0.25
-        config.silero_vad.max_speech_duration = 8
-        config.sample_rate = 16000
-        self.window_size = config.silero_vad.window_size
-        self.vad = sherpa_onnx.VoiceActivityDetector(config, buffer_size_in_seconds=100)
+        
+        vad_config = sherpa_onnx.VadModelConfig()
+        vad_config.silero_vad.model = f"{self.model_path}/silero_vad.onnx"
+        vad_config.silero_vad.threshold = 0.5
+        vad_config.silero_vad.min_silence_duration = 0.1
+        vad_config.silero_vad.min_speech_duration = 0.25
+        vad_config.silero_vad.max_speech_duration = 8
+        vad_config.sample_rate = 16000
+        self.window_size = vad_config.silero_vad.window_size
+        self.vad = sherpa_onnx.VoiceActivityDetector(vad_config, buffer_size_in_seconds=100)
+
+        if self.source == 'en':
+            model_config = sherpa_onnx.OnlinePunctuationModelConfig(
+                cnn_bilstm=f"{self.model_path}/punct-en/model{self.ext}.onnx",
+                bpe_vocab=f"{self.model_path}/punct-en/bpe.vocab"
+            )
+            punct_config = sherpa_onnx.OnlinePunctuationConfig(
+                model_config=model_config,
+            )
+            self.punct = sherpa_onnx.OnlinePunctuation(punct_config)
+        else:
+            punct_config = sherpa_onnx.OfflinePunctuationConfig(
+                model=sherpa_onnx.OfflinePunctuationModelConfig(
+                    ct_transformer=f"{self.model_path}/punct/model{self.ext}.onnx"
+                ),
+            )
+            self.punct = sherpa_onnx.OfflinePunctuation(punct_config)
+
        self.buffer = []
        self.offset = 0
        self.started = False
@@ -112,15 +137,27 @@ class SosvRecognizer:
            self.vad.pop()
            self.recognizer.decode_stream(stream)
            text = stream.result.text.strip()
-            
+
+            if self.source == 'en':
+                text_with_punct = self.punct.add_punctuation_with_case(text)
+            else:
+                text_with_punct = self.punct.add_punctuation(text)
+
            caption['index'] = self.cur_id
-            caption['text'] = text
+            caption['text'] = text_with_punct
            caption['time_s'] = self.time_str
            caption['time_t'] = datetime.now().strftime('%H:%M:%S.%f')[:-3]
+            if text:
+                stdout_obj(caption)
+                if self.target:
+                    th = threading.Thread(
+                        target=self.trans_func,
+                        args=(self.ollama_name, self.target, caption['text'], self.time_str),
+                        daemon=True
+                    )
+                    th.start()    
+                self.cur_id += 1
            self.prev_content = ''
-            stdout_obj(caption)
-            
-            self.cur_id += 1
            self.time_str = datetime.now().strftime('%H:%M:%S.%f')[:-3]
            self.buffer = []
            self.offset = 0
--- a/engine/audio2text/vosk.py
+++ b/engine/audio2text/vosk.py
@@ -62,11 +62,12 @@ class VoskRecognizer:
            self.prev_content = ''
            if content == '': return
            self.cur_id += 1
+            
            if self.target:
-                self.trans_time = time.time()
                th = threading.Thread(
                    target=self.trans_func,
-                    args=(self.ollama_name, self.target, caption['text'], self.time_str)
+                    args=(self.ollama_name, self.target, caption['text'], self.time_str),
+                    daemon=True
                )
                th.start()
        else: