From 6bff978b88b48346060458d419483f1706424bdf Mon Sep 17 00:00:00 2001
From: himeditator
Date: Sat, 6 Sep 2025 23:15:33 +0800
Subject: [PATCH] feat(engine): replace resampling library, add punctuation
 restoration models for SOSV
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Replace the samplerate library with resampy for higher resampling quality
- Add Chinese and English punctuation restoration models to Sherpa-ONNX SenseVoice
---
 engine/audio2text/sosv.py   | 73 ++++++++++++++++++++++++--------
 engine/audio2text/vosk.py   |  5 ++-
 engine/main.py              | 26 +++++++-----
 engine/requirements.txt    |  3 +-
 engine/utils/__init__.py    |  7 +---
 engine/utils/audioprcs.py   | 81 ++++------------------------------
 engine/utils/translation.py |  1 +
 7 files changed, 83 insertions(+), 113 deletions(-)

diff --git a/engine/audio2text/sosv.py b/engine/audio2text/sosv.py
index 4c59ce4..28e38e3 100644
--- a/engine/audio2text/sosv.py
+++ b/engine/audio2text/sosv.py
@@ -9,6 +9,7 @@ https://github.com/k2-fsa/sherpa-onnx/blob/master/python-api-examples/simulate-s
 import time
 from datetime import datetime
 import sherpa_onnx
+import threading
 import numpy as np
 
 from utils import shared_data
@@ -23,23 +24,27 @@ class SosvRecognizer:
     Initialization parameters:
         model_path: Sherpa-ONNX SenseVoice recognition model path
         vad_model: Silero VAD model path
+        source: source language for recognition (auto, zh, en, ja, ko, yue)
         target: target language for translation
         trans_model: translation model name
         ollama_name: Ollama model name
     """
-    def __init__(self, model_path: str, target: str | None, trans_model: str, ollama_name: str):
+    def __init__(self, model_path: str, source: str, target: str | None, trans_model: str, ollama_name: str):
         if model_path.startswith('"'):
             model_path = model_path[1:]
         if model_path.endswith('"'):
             model_path = model_path[:-1]
         self.model_path = model_path
+        self.ext = ""
+        if self.model_path.endswith("int8"):
+            self.ext = ".int8"
+        self.source = source
         self.target = target
         if trans_model == 'google':
             self.trans_func = google_translate
         else:
             self.trans_func = ollama_translate
         self.ollama_name = ollama_name
-
         self.time_str = ''
         self.cur_id = 0
         self.prev_content = ''
@@ -47,19 +52,39 @@ class SosvRecognizer:
     def start(self):
         """Start the SenseVoice model"""
         self.recognizer = sherpa_onnx.OfflineRecognizer.from_sense_voice(
-            model=f"{self.model_path}/model.onnx",
-            tokens=f"{self.model_path}/tokens.txt",
+            model=f"{self.model_path}/sensevoice/model{self.ext}.onnx",
+            tokens=f"{self.model_path}/sensevoice/tokens.txt",
+            language=self.source,
             num_threads = 2,
         )
-        config = sherpa_onnx.VadModelConfig()
-        config.silero_vad.model = f"{self.model_path}/silero_vad.onnx"
-        config.silero_vad.threshold = 0.5
-        config.silero_vad.min_silence_duration = 0.1
-        config.silero_vad.min_speech_duration = 0.25
-        config.silero_vad.max_speech_duration = 8
-        config.sample_rate = 16000
-        self.window_size = config.silero_vad.window_size
-        self.vad = sherpa_onnx.VoiceActivityDetector(config, buffer_size_in_seconds=100)
+
+        vad_config = sherpa_onnx.VadModelConfig()
+        vad_config.silero_vad.model = f"{self.model_path}/silero_vad.onnx"
+        vad_config.silero_vad.threshold = 0.5
+        vad_config.silero_vad.min_silence_duration = 0.1
+        vad_config.silero_vad.min_speech_duration = 0.25
+        vad_config.silero_vad.max_speech_duration = 8
+        vad_config.sample_rate = 16000
+        self.window_size = vad_config.silero_vad.window_size
+        self.vad = sherpa_onnx.VoiceActivityDetector(vad_config, buffer_size_in_seconds=100)
+
+        if self.source == 'en':
+            model_config = sherpa_onnx.OnlinePunctuationModelConfig(
+                cnn_bilstm=f"{self.model_path}/punct-en/model{self.ext}.onnx",
+                bpe_vocab=f"{self.model_path}/punct-en/bpe.vocab"
+            )
+            punct_config = sherpa_onnx.OnlinePunctuationConfig(
+                model_config=model_config,
+            )
+            self.punct = sherpa_onnx.OnlinePunctuation(punct_config)
+        else:
+            punct_config = sherpa_onnx.OfflinePunctuationConfig(
+                model=sherpa_onnx.OfflinePunctuationModelConfig(
+                    ct_transformer=f"{self.model_path}/punct/model{self.ext}.onnx"
+                ),
+            )
+            self.punct = sherpa_onnx.OfflinePunctuation(punct_config)
+
         self.buffer = []
         self.offset = 0
         self.started = False
@@ -112,15 +137,27 @@
             self.vad.pop()
             self.recognizer.decode_stream(stream)
             text = stream.result.text.strip()
-
+
+            if self.source == 'en':
+                text_with_punct = self.punct.add_punctuation_with_case(text)
+            else:
+                text_with_punct = self.punct.add_punctuation(text)
+
             caption['index'] = self.cur_id
-            caption['text'] = text
+            caption['text'] = text_with_punct
             caption['time_s'] = self.time_str
             caption['time_t'] = datetime.now().strftime('%H:%M:%S.%f')[:-3]
+            if text:
+                stdout_obj(caption)
+                if self.target:
+                    th = threading.Thread(
+                        target=self.trans_func,
+                        args=(self.ollama_name, self.target, caption['text'], self.time_str),
+                        daemon=True
+                    )
+                    th.start()
+                self.cur_id += 1
             self.prev_content = ''
-            stdout_obj(caption)
-
-            self.cur_id += 1
             self.time_str = datetime.now().strftime('%H:%M:%S.%f')[:-3]
             self.buffer = []
             self.offset = 0
diff --git a/engine/audio2text/vosk.py b/engine/audio2text/vosk.py
index 600f866..86b0991 100644
--- a/engine/audio2text/vosk.py
+++ b/engine/audio2text/vosk.py
@@ -62,11 +62,12 @@
             self.prev_content = ''
             if content == '': return
             self.cur_id += 1
+
             if self.target:
-                self.trans_time = time.time()
                 th = threading.Thread(
                     target=self.trans_func,
-                    args=(self.ollama_name, self.target, caption['text'], self.time_str)
+                    args=(self.ollama_name, self.target, caption['text'], self.time_str),
+                    daemon=True
                 )
                 th.start()
             else:
diff --git a/engine/main.py b/engine/main.py
index d1d6177..225c369 100644
--- a/engine/main.py
+++ b/engine/main.py
@@ -10,23 +10,26 @@ from audio2text import SosvRecognizer
 from sysaudio import AudioStream
 
 
-def audio_recording(stream: AudioStream, resample: bool, save = False):
+def audio_recording(stream: AudioStream, resample: bool, save = False, path = ''):
     global shared_data
     stream.open_stream()
+    wf = None
     if save:
-        wf = wave.open(f'record.wav', 'wb')
-        wf.setnchannels(1)
+        if path != '':
+            path += '/'
+        wf = wave.open(f'{path}record.wav', 'wb')
+        wf.setnchannels(stream.CHANNELS)
         wf.setsampwidth(stream.SAMP_WIDTH)
-        wf.setframerate(16000)
+        wf.setframerate(stream.RATE)
     while shared_data.status == 'running':
         raw_chunk = stream.read_chunk()
+        if save and raw_chunk is not None: wf.writeframes(raw_chunk) # type: ignore
         if raw_chunk is None: continue
         if resample:
             chunk = resample_chunk_mono(raw_chunk, stream.CHANNELS, stream.RATE, 16000)
         else:
             chunk = merge_chunk_channels(raw_chunk, stream.CHANNELS)
         shared_data.chunk_queue.put(chunk)
-        if save: wf.writeframes(chunk) # type: ignore
     if save: wf.close() # type: ignore
     stream.close_stream_signal()
 
@@ -88,21 +91,22 @@ def main_vosk(a: int, c: int, vosk: str, t: str, tm: str, omn: str):
     engine.stop()
 
 
-def main_sosv(a: int, c: int, sosv: str, t: str, tm: str, omn: str):
+def main_sosv(a: int, c: int, sosv: str, s: str, t: str, tm: str, omn: str):
     """
     Parameters:
         a: Audio source: 0 for output, 1 for input
         c: Chunk number in 1 second
         sosv: Sherpa-ONNX SenseVoice model path
+        s: Source language
         t: Target language
         tm: Translation model type, ollama or google
         omn: Ollama model name
     """
     stream = AudioStream(a, c)
     if t == 'none':
-        engine = SosvRecognizer(sosv, None, tm, omn)
+        engine = SosvRecognizer(sosv, s, None, tm, omn)
     else:
-        engine = SosvRecognizer(sosv, t, tm, omn)
+        engine = SosvRecognizer(sosv, s, t, tm, omn)
     engine.start()
 
     stream_thread = threading.Thread(
@@ -120,14 +124,15 @@
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description='Convert system audio stream to text')
-    # both
+    # all
     parser.add_argument('-e', '--caption_engine', default='gummy', help='Caption engine: gummy or vosk')
     parser.add_argument('-a', '--audio_type', default=0, help='Audio stream source: 0 for output, 1 for input')
     parser.add_argument('-c', '--chunk_rate', default=10, help='Number of audio stream chunks collected per second')
     parser.add_argument('-p', '--port', default=0, help='The port to run the server on, 0 for no server')
     parser.add_argument('-t', '--target_language', default='zh', help='Target language code, "none" for no translation')
+    # gummy and sosv
+    parser.add_argument('-s', '--source_language', default='auto', help='Source language code')
     # gummy only
-    parser.add_argument('-s', '--source_language', default='en', help='Source language code')
    parser.add_argument('-k', '--api_key', default='', help='API KEY for Gummy model')
     # vosk and sosv
     parser.add_argument('-tm', '--translation_model', default='ollama', help='Model for translation: ollama or google')
@@ -165,6 +170,7 @@
             int(args.audio_type),
             int(args.chunk_rate),
             args.sosv_model,
+            args.source_language,
             args.target_language,
             args.translation_model,
             args.ollama_name
diff --git a/engine/requirements.txt b/engine/requirements.txt
index bbd3925..3d33f8f 100644
--- a/engine/requirements.txt
+++ b/engine/requirements.txt
@@ -1,9 +1,10 @@
 dashscope
 numpy
-samplerate
+resampy
 vosk
 pyinstaller
 pyaudio; sys_platform == 'darwin'
 pyaudiowpatch; sys_platform == 'win32'
 googletrans
 ollama
+sherpa_onnx
\ No newline at end of file
diff --git a/engine/utils/__init__.py b/engine/utils/__init__.py
index 6950ff2..2acdad8 100644
--- a/engine/utils/__init__.py
+++ b/engine/utils/__init__.py
@@ -1,9 +1,4 @@
-from .audioprcs import (
-    merge_chunk_channels,
-    resample_chunk_mono,
-    resample_chunk_mono_np,
-    resample_mono_chunk
-)
+from .audioprcs import merge_chunk_channels, resample_chunk_mono
 from .sysout import stdout, stdout_err, stdout_cmd, stdout_obj, stderr
 from .shared import shared_data
 from .server import start_server
diff --git a/engine/utils/audioprcs.py b/engine/utils/audioprcs.py
index 5422a30..2f11ee1 100644
--- a/engine/utils/audioprcs.py
+++ b/engine/utils/audioprcs.py
@@ -1,4 +1,4 @@
-import samplerate
+import resampy
 import numpy as np
 import numpy.core.multiarray # do not remove
 
@@ -24,16 +24,15 @@ def merge_chunk_channels(chunk: bytes, channels: int) -> bytes:
     return chunk_mono.tobytes()
 
 
-def resample_chunk_mono(chunk: bytes, channels: int, orig_sr: int, target_sr: int, mode="sinc_best") -> bytes:
+def resample_chunk_mono(chunk: bytes, channels: int, orig_sr: int, target_sr: int) -> bytes:
     """
-    Convert the current multi-channel audio chunk to mono, then resample it
+    Convert the current multi-channel audio chunk to mono and resample it
 
     Args:
         chunk: multi-channel audio chunk
         channels: number of channels
         orig_sr: original sample rate
         target_sr: target sample rate
-        mode: resampling mode, one of 'sinc_best' | 'sinc_medium' | 'sinc_fastest' | 'zero_order_hold' | 'linear'
 
     Return:
         mono audio chunk
@@ -52,82 +51,12 @@ def resample_chunk_mono(chunk: bytes, channels: int, orig_sr: int, target_sr: in
 
     if orig_sr == target_sr:
         return chunk_mono.astype(np.int16).tobytes()
-    ratio = target_sr / orig_sr
-    chunk_mono_r = samplerate.resample(chunk_mono, ratio, converter_type=mode)
+    chunk_mono_r = resampy.resample(chunk_mono, orig_sr, target_sr)
     chunk_mono_r = np.round(chunk_mono_r).astype(np.int16)
-    real_len = round(chunk_mono.shape[0] * ratio)
+    real_len = round(chunk_mono.shape[0] * target_sr / orig_sr)
     if(chunk_mono_r.shape[0] > real_len):
         chunk_mono_r = chunk_mono_r[:real_len]
     else:
         while chunk_mono_r.shape[0] < real_len:
             chunk_mono_r = np.append(chunk_mono_r, chunk_mono_r[-1])
     return chunk_mono_r.tobytes()
-
-
-def resample_chunk_mono_np(chunk: bytes, channels: int, orig_sr: int, target_sr: int, mode="sinc_best", dtype=np.float32) -> np.ndarray:
-    """
-    Convert the current multi-channel audio chunk to mono, resample it, and return a NumPy array
-
-    Args:
-        chunk: multi-channel audio chunk
-        channels: number of channels
-        orig_sr: original sample rate
-        target_sr: target sample rate
-        mode: resampling mode, one of 'sinc_best' | 'sinc_medium' | 'sinc_fastest' | 'zero_order_hold' | 'linear'
-        dtype: data type of the returned NumPy array
-
-    Return:
-        mono audio chunk
-    """
-    if channels == 1:
-        chunk_mono = np.frombuffer(chunk, dtype=np.int16)
-        chunk_mono = chunk_mono.astype(np.float32)
-    else:
-        # (length * channels,)
-        chunk_np = np.frombuffer(chunk, dtype=np.int16)
-        # (length, channels)
-        chunk_np = chunk_np.reshape(-1, channels)
-        # (length,)
-        chunk_mono = np.mean(chunk_np.astype(np.float32), axis=1)
-
-    if orig_sr == target_sr:
-        return chunk_mono.astype(dtype)
-
-    ratio = target_sr / orig_sr
-    chunk_mono_r = samplerate.resample(chunk_mono, ratio, converter_type=mode)
-    chunk_mono_r = chunk_mono_r.astype(dtype)
-    real_len = round(chunk_mono.shape[0] * ratio)
-    if(chunk_mono_r.shape[0] > real_len):
-        chunk_mono_r = chunk_mono_r[:real_len]
-    else:
-        while chunk_mono_r.shape[0] < real_len:
-            chunk_mono_r = np.append(chunk_mono_r, chunk_mono_r[-1])
-    return chunk_mono_r
-
-
-def resample_mono_chunk(chunk: bytes, orig_sr: int, target_sr: int, mode="sinc_best") -> bytes:
-    """
-    Resample the current mono audio chunk
-
-    Args:
-        chunk: mono audio chunk
-        orig_sr: original sample rate
-        target_sr: target sample rate
-        mode: resampling mode, one of 'sinc_best' | 'sinc_medium' | 'sinc_fastest' | 'zero_order_hold' | 'linear'
-
-    Return:
-        mono audio chunk
-    """
-    if orig_sr == target_sr: return chunk
-    chunk_np = np.frombuffer(chunk, dtype=np.int16)
-    chunk_np = chunk_np.astype(np.float32)
-    ratio = target_sr / orig_sr
-    chunk_r = samplerate.resample(chunk_np, ratio, converter_type=mode)
-    chunk_r = np.round(chunk_r).astype(np.int16)
-    real_len = round(chunk_np.shape[0] * ratio)
-    if(chunk_r.shape[0] > real_len):
-        chunk_r = chunk_r[:real_len]
-    else:
-        while chunk_r.shape[0] < real_len:
-            chunk_r = np.append(chunk_r, chunk_r[-1])
-    return chunk_r.tobytes()
diff --git a/engine/utils/translation.py b/engine/utils/translation.py
index 33d0c6f..ed3215d 100644
--- a/engine/utils/translation.py
+++ b/engine/utils/translation.py
@@ -13,6 +13,7 @@ lang_map = {
     'ru': 'Russian',
     'ja': 'Japanese',
     'ko': 'Korean',
+    'zh': 'Chinese',
     'zh-cn': 'Chinese'
 }
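
Note for reviewers: the new resampling path can be sanity-checked in isolation with the minimal sketch below. It mirrors the steps `resample_chunk_mono` now performs; the 48 kHz source rate and 100 ms chunk size are hypothetical example values, and `np.pad(..., mode='edge')` stands in for the patch's append loop (same effect: repeat the last sample until the expected length is reached).

```python
# Minimal sketch of the resampy-based path, outside the engine.
# Assumed example values: 48 kHz capture rate, 100 ms chunks.
import numpy as np
import resampy

orig_sr, target_sr = 48000, 16000

# Fabricate one 100 ms mono int16 chunk (a 440 Hz tone) as raw bytes.
t = np.arange(4800) / orig_sr
chunk = (np.sin(2 * np.pi * 440 * t) * 32767).astype(np.int16).tobytes()

# Same steps as resample_chunk_mono: bytes -> float32 -> resample -> int16.
mono = np.frombuffer(chunk, dtype=np.int16).astype(np.float32)
resampled = np.round(resampy.resample(mono, orig_sr, target_sr)).astype(np.int16)

# Trim or pad to the exact expected length, as the patched function does,
# so downstream consumers always receive fixed-size 16 kHz chunks.
expected = round(mono.shape[0] * target_sr / orig_sr)
if resampled.shape[0] > expected:
    resampled = resampled[:expected]
elif resampled.shape[0] < expected:
    resampled = np.pad(resampled, (0, expected - resampled.shape[0]), mode='edge')

print(resampled.shape[0])  # 1600 samples = 100 ms at 16 kHz
```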
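The punctuation setup added to `SosvRecognizer.start` can likewise be exercised on its own. The sketch below uses the same sherpa-onnx classes the patch wires up and assumes the same directory layout under the model path (`punct-en/` for the English CNN-BiLSTM model, `punct/` for the CT-Transformer model); `MODEL_DIR` is a placeholder.

```python
# Standalone sketch of the two punctuation models wired up in sosv.py.
# MODEL_DIR is a placeholder for wherever the models are unpacked.
import sherpa_onnx

MODEL_DIR = "./models"

# English: online CNN-BiLSTM model, restores punctuation and casing.
punct_en = sherpa_onnx.OnlinePunctuation(
    sherpa_onnx.OnlinePunctuationConfig(
        model_config=sherpa_onnx.OnlinePunctuationModelConfig(
            cnn_bilstm=f"{MODEL_DIR}/punct-en/model.onnx",
            bpe_vocab=f"{MODEL_DIR}/punct-en/bpe.vocab",
        ),
    )
)
print(punct_en.add_punctuation_with_case("how are you i am fine thank you"))

# Other languages: offline CT-Transformer model (handles zh and mixed zh/en).
punct_zh = sherpa_onnx.OfflinePunctuation(
    sherpa_onnx.OfflinePunctuationConfig(
        model=sherpa_onnx.OfflinePunctuationModelConfig(
            ct_transformer=f"{MODEL_DIR}/punct/model.onnx",
        ),
    )
)
print(punct_zh.add_punctuation("这是一个测试你好吗"))
```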