feat(engine): refactor the subtitle engine and add the Sherpa-ONNX SenseVoice speech recognition model

- Refactor the subtitle engine so that audio capture runs on a separate thread (see the sketch below, before the file diffs)
- Refactor the classes in audio2text and adjust their runtime logic
- Update the main function to add support for the Sosv model
- Change the AudioStream class to default to a 16000 Hz sample rate
himeditator
2025-09-06 20:49:46 +08:00
parent 2b7ce06f04
commit eba2c5ca45
14 changed files with 377 additions and 112 deletions
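
The first bullet above is the heart of the change: audio capture becomes a producer on its own thread and hands raw chunks to the recognition loop through the new shared_data object (engine/utils/shared.py, added below). A minimal sketch of that pattern, assuming the repo's "from utils import shared_data" import; capture_chunk, the chunk size, and the loop limit are hypothetical placeholders rather than code from this commit:

import threading
import queue

from utils import shared_data  # new SharedData instance: status flag + chunk_queue

def capture_chunk(frames: int = 1600) -> bytes:
    # Hypothetical stand-in for one read from the audio device;
    # returns 16-bit silence so the sketch runs on its own.
    return b"\x00\x00" * frames

def capture_loop():
    # Producer: runs on the new capture thread.
    while shared_data.status == "running":
        shared_data.chunk_queue.put(capture_chunk())

def recognize_loop(max_chunks: int = 10):
    # Consumer: a real loop would resample each chunk and feed it to the
    # recognizer instead of printing its size.
    handled = 0
    while shared_data.status == "running" and handled < max_chunks:
        try:
            chunk = shared_data.chunk_queue.get(timeout=0.1)
        except queue.Empty:
            continue
        print(len(chunk))
        handled += 1
    shared_data.status = "stop"  # tell the capture thread to exit

threading.Thread(target=capture_loop, daemon=True).start()
recognize_loop()

queue.Queue does its own locking, so the capture thread never blocks on recognition and no explicit synchronization is needed.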

View File

@@ -5,6 +5,6 @@ from .audioprcs import (
    resample_mono_chunk
)
from .sysout import stdout, stdout_err, stdout_cmd, stdout_obj, stderr
from .thdata import thread_data
from .shared import shared_data
from .server import start_server
from .translation import ollama_translate, google_translate

View File

@@ -49,9 +49,18 @@ def resample_chunk_mono(chunk: bytes, channels: int, orig_sr: int, target_sr: in
    # (length,)
    chunk_mono = np.mean(chunk_np.astype(np.float32), axis=1)
    if orig_sr == target_sr:
        return chunk_mono.astype(np.int16).tobytes()
    ratio = target_sr / orig_sr
    chunk_mono_r = samplerate.resample(chunk_mono, ratio, converter_type=mode)
    chunk_mono_r = np.round(chunk_mono_r).astype(np.int16)
    real_len = round(chunk_mono.shape[0] * ratio)
    if(chunk_mono_r.shape[0] > real_len):
        chunk_mono_r = chunk_mono_r[:real_len]
    else:
        while chunk_mono_r.shape[0] < real_len:
            chunk_mono_r = np.append(chunk_mono_r, chunk_mono_r[-1])
    return chunk_mono_r.tobytes()
@@ -81,9 +90,18 @@ def resample_chunk_mono_np(chunk: bytes, channels: int, orig_sr: int, target_sr:
    # (length,)
    chunk_mono = np.mean(chunk_np.astype(np.float32), axis=1)
    if orig_sr == target_sr:
        return chunk_mono.astype(dtype)
    ratio = target_sr / orig_sr
    chunk_mono_r = samplerate.resample(chunk_mono, ratio, converter_type=mode)
    chunk_mono_r = chunk_mono_r.astype(dtype)
    real_len = round(chunk_mono.shape[0] * ratio)
    if(chunk_mono_r.shape[0] > real_len):
        chunk_mono_r = chunk_mono_r[:real_len]
    else:
        while chunk_mono_r.shape[0] < real_len:
            chunk_mono_r = np.append(chunk_mono_r, chunk_mono_r[-1])
    return chunk_mono_r
@@ -100,9 +118,16 @@ def resample_mono_chunk(chunk: bytes, orig_sr: int, target_sr: int, mode="sinc_b
    Return:
        Mono (single-channel) audio chunk
    """
    if orig_sr == target_sr: return chunk
    chunk_np = np.frombuffer(chunk, dtype=np.int16)
    chunk_np = chunk_np.astype(np.float32)
    ratio = target_sr / orig_sr
    chunk_r = samplerate.resample(chunk_np, ratio, converter_type=mode)
    chunk_r = np.round(chunk_r).astype(np.int16)
    real_len = round(chunk_np.shape[0] * ratio)
    if(chunk_r.shape[0] > real_len):
        chunk_r = chunk_r[:real_len]
    else:
        while chunk_r.shape[0] < real_len:
            chunk_r = np.append(chunk_r, chunk_r[-1])
    return chunk_r.tobytes()
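
All three resample helpers gain the same fix-up: samplerate.resample can hand back a buffer that is a sample or two off from the nominal round(length * ratio), so the chunk is clipped, or right-padded by repeating its last sample, to the exact expected length. A numpy-only sketch of just that step; fix_length and the fake 161-sample buffer are illustrative assumptions, not code from this commit:

import numpy as np

def fix_length(chunk_r: np.ndarray, orig_len: int, ratio: float) -> np.ndarray:
    # Mirror of the new length correction: clip, or repeat the last
    # sample, until the chunk matches the expected resampled length.
    real_len = round(orig_len * ratio)
    if chunk_r.shape[0] > real_len:
        return chunk_r[:real_len]
    while chunk_r.shape[0] < real_len:
        chunk_r = np.append(chunk_r, chunk_r[-1])
    return chunk_r

# 480 samples at 48 kHz should always become exactly 160 samples at 16 kHz,
# even if the resampler returns 159 or 161 samples for a given block.
resampled = np.zeros(161, dtype=np.int16)  # pretend output of samplerate.resample
assert fix_length(resampled, 480, 16000 / 48000).shape[0] == 160

Keeping every chunk at its exact nominal length presumably avoids slow length drift when many resampled blocks are concatenated downstream.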

View File

@@ -1,13 +1,12 @@
import socket
import threading
import json
# import time
from utils import thread_data, stdout_cmd, stderr
from utils import shared_data, stdout_cmd, stderr
def handle_client(client_socket):
    global thread_data
    while thread_data.status == 'running':
    global shared_data
    while shared_data.status == 'running':
        try:
            data = client_socket.recv(4096).decode('utf-8')
            if not data:
@@ -15,13 +14,13 @@ def handle_client(client_socket):
            data = json.loads(data)
            if data['command'] == 'stop':
                thread_data.status = 'stop'
                shared_data.status = 'stop'
                break
        except Exception as e:
            stderr(f'Communication error: {e}')
            break
    thread_data.status = 'stop'
    shared_data.status = 'stop'
    client_socket.close()
@@ -34,7 +33,6 @@ def start_server(port: int):
        stderr(str(e))
        stdout_cmd('kill')
        return
    # time.sleep(20)
    stdout_cmd('connect')
    client, addr = server.accept()
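
For context, the control channel that handle_client reads is a single JSON object sent over the TCP socket, and 'stop' is the only command it acts on, flipping shared_data.status so every loop winds down. A minimal client-side sketch; the host and port here are placeholders, and the real port is whatever was passed to start_server(port):

import json
import socket

def send_stop(host: str = "127.0.0.1", port: int = 8080):
    # Sends the one command handle_client currently understands; the engine
    # sets shared_data.status to 'stop' when it receives this.
    with socket.create_connection((host, port)) as sock:
        sock.sendall(json.dumps({"command": "stop"}).encode("utf-8"))

# Assumes the engine's server is already accepting on that port.
send_stop()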

engine/utils/shared.py Normal file
View File

@@ -0,0 +1,8 @@
import queue

class SharedData:
    def __init__(self):
        self.status = "running"
        self.chunk_queue = queue.Queue()

shared_data = SharedData()
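
Because shared_data is created once at import time, every thread that does "from utils import shared_data" sees the same object, so the socket server, the capture thread, and the recognition loop all share one status flag and one chunk queue. A small usage sketch; the payload bytes are just a placeholder:

from utils import shared_data

# Any thread can enqueue audio...
shared_data.chunk_queue.put(b"\x00\x00" * 160)  # placeholder 16-bit chunk

# ...and any other thread can pick it up; queue.Queue handles the locking.
chunk = shared_data.chunk_queue.get()
assert len(chunk) == 320

# Flipping the flag from one thread (e.g. the socket server on 'stop')
# is how the other loops are told to exit.
shared_data.status = "stop"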

View File

@@ -1,5 +0,0 @@
class ThreadData:
    def __init__(self):
        self.status = "running"

thread_data = ThreadData()