Files
himeditator eba2c5ca45 feat(engine): 重构字幕引擎,新增 Sherpa-ONNX SenseVoice 语音识别模型
- 重构字幕引擎,将音频采集改为在新线程上进行
- 重构 audio2text 中的类,调整运行逻辑
- 更新 main 函数,添加对 Sosv 模型的支持
- 修改 AudioStream 类,默认使用 16000Hz 采样率
2025-09-06 20:49:46 +08:00

118 lines
3.8 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
from dashscope.audio.asr import (
TranslationRecognizerCallback,
TranscriptionResult,
TranslationResult,
TranslationRecognizerRealtime
)
import dashscope
from dashscope.common.error import InvalidParameter
from datetime import datetime
from utils import stdout_cmd, stdout_obj, stdout_err
from utils import shared_data
class Callback(TranslationRecognizerCallback):
"""
语音大模型流式传输回调对象
"""
def __init__(self):
super().__init__()
self.index = 0
self.usage = 0
self.cur_id = -1
self.time_str = ''
def on_open(self) -> None:
self.usage = 0
self.cur_id = -1
self.time_str = ''
stdout_cmd('info', 'Gummy translator started.')
def on_close(self) -> None:
stdout_cmd('info', 'Gummy translator closed.')
stdout_cmd('usage', str(self.usage))
def on_event(
self,
request_id,
transcription_result: TranscriptionResult,
translation_result: TranslationResult,
usage
) -> None:
caption = {}
if transcription_result is not None:
if self.cur_id != transcription_result.sentence_id:
self.time_str = datetime.now().strftime('%H:%M:%S.%f')[:-3]
self.cur_id = transcription_result.sentence_id
self.index += 1
caption['command'] = 'caption'
caption['index'] = self.index
caption['time_s'] = self.time_str
caption['time_t'] = datetime.now().strftime('%H:%M:%S.%f')[:-3]
caption['text'] = transcription_result.text
caption['translation'] = ""
if translation_result is not None:
lang = translation_result.get_language_list()[0]
caption['translation'] = translation_result.get_translation(lang).text
if usage:
self.usage += usage['duration']
if 'text' in caption:
stdout_obj(caption)
class GummyRecognizer:
"""
使用 Gummy 引擎流式处理的音频数据,并在标准输出中输出与 Auto Caption 软件可读取的 JSON 字符串数据
初始化参数:
rate: 音频采样率
source: 源语言代码字符串zh, en, ja 等)
target: 目标语言代码字符串zh, en, ja 等)
api_key: 阿里云百炼平台 API KEY
"""
def __init__(self, rate: int, source: str, target: str | None, api_key: str | None):
if api_key:
dashscope.api_key = api_key
self.translator = TranslationRecognizerRealtime(
model = "gummy-realtime-v1",
format = "pcm",
sample_rate = rate,
transcription_enabled = True,
translation_enabled = (target is not None),
source_language = source,
translation_target_languages = [target],
callback = Callback()
)
def start(self):
"""启动 Gummy 引擎"""
self.translator.start()
def translate(self):
"""持续读取共享数据中的音频帧,并进行语音识别,将识别结果输出到标准输出中"""
global shared_data
restart_count = 0
while shared_data.status == 'running':
chunk = shared_data.chunk_queue.get()
try:
self.translator.send_audio_frame(chunk)
except InvalidParameter as e:
restart_count += 1
if restart_count > 5:
stdout_err(str(e))
shared_data.status = "kill"
stdout_cmd('kill')
break
else:
stdout_cmd('info', f'Gummy engine stopped, restart attempt: {restart_count}...')
def stop(self):
"""停止 Gummy 引擎"""
try:
self.translator.stop()
except Exception:
return