feat(engine): refactor the caption engine; add the Sherpa-ONNX SenseVoice speech recognition model

- Refactor the caption engine so audio capture runs on a separate thread
- Restructure the classes in audio2text and adjust the run logic
- Update the main function to support the Sosv model
- Change the AudioStream classes to default to a 16000 Hz sample rate
himeditator
2025-09-06 20:49:46 +08:00
parent 2b7ce06f04
commit eba2c5ca45
14 changed files with 377 additions and 112 deletions
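
The heart of the refactor is a producer/consumer split: a capture thread pushes audio chunks into a shared queue.Queue while each recognizer's translate() loop consumes them. Below is a minimal, self-contained sketch of that pattern; the names mirror the new shared.py and main.py, but this is an illustration, not the engine code itself:

import queue
import threading

class SharedData:
    """Mirrors the new engine/utils/shared.py: a status flag plus a chunk queue."""
    def __init__(self):
        self.status = "running"
        self.chunk_queue = queue.Queue()

shared_data = SharedData()

def audio_recording():
    """Producer: stands in for the capture thread that reads AudioStream."""
    for _ in range(50):
        shared_data.chunk_queue.put(b"\x00" * 3200)  # 0.1 s of 16 kHz s16le mono silence
    shared_data.status = "stop"
    shared_data.chunk_queue.put(b"")  # sentinel so a blocked get() wakes up

def translate():
    """Consumer: stands in for a recognizer's translate() loop."""
    while shared_data.status == "running":
        chunk = shared_data.chunk_queue.get()
        if chunk:
            pass  # a real engine would call send_audio_frame(chunk) here

threading.Thread(target=audio_recording, daemon=True).start()
translate()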

engine/audio2text/__init__.py

@@ -1,3 +1,3 @@
-from dashscope.common.error import InvalidParameter
 from .gummy import GummyRecognizer
 from .vosk import VoskRecognizer
+from .sosv import SosvRecognizer

engine/audio2text/gummy.py

@@ -5,9 +5,10 @@ from dashscope.audio.asr import (
     TranslationRecognizerRealtime
 )
 import dashscope
+from dashscope.common.error import InvalidParameter
 from datetime import datetime
-from utils import stdout_cmd, stdout_obj, stderr
+from utils import stdout_cmd, stdout_obj, stdout_err
+from utils import shared_data
 
 class Callback(TranslationRecognizerCallback):
     """
@@ -90,9 +91,23 @@ class GummyRecognizer:
         """Start the Gummy engine"""
         self.translator.start()
 
-    def send_audio_frame(self, data):
-        """Send an audio frame; the engine recognizes it and prints the result to standard output"""
-        self.translator.send_audio_frame(data)
+    def translate(self):
+        """Keep reading audio chunks from shared data, run speech recognition, and print results to standard output"""
+        global shared_data
+        restart_count = 0
+        while shared_data.status == 'running':
+            chunk = shared_data.chunk_queue.get()
+            try:
+                self.translator.send_audio_frame(chunk)
+            except InvalidParameter as e:
+                restart_count += 1
+                if restart_count > 5:
+                    stdout_err(str(e))
+                    shared_data.status = "kill"
+                    stdout_cmd('kill')
+                    break
+                else:
+                    stdout_cmd('info', f'Gummy engine stopped, restart attempt: {restart_count}...')
 
     def stop(self):
         """Stop the Gummy engine"""

engine/audio2text/sosv.py (new file, 139 lines)

@@ -0,0 +1,139 @@
"""
Shepra-ONNX SenseVoice Model
This code file references the following:
https://github.com/k2-fsa/sherpa-onnx/blob/master/python-api-examples/simulate-streaming-sense-voice-microphone.py
"""
import time
from datetime import datetime
import sherpa_onnx
import numpy as np
from utils import shared_data
from utils import stdout_cmd, stdout_obj
from utils import google_translate, ollama_translate
class SosvRecognizer:
"""
使用 Sense Voice 非流式模型处理流式音频数据,并在标准输出中输出 Auto Caption 软件可读取的 JSON 字符串数据
初始化参数:
model_path: Shepra ONNX Sense Voice 识别模型路径
vad_model: Silero VAD 模型路径
target: 翻译目标语言
trans_model: 翻译模型名称
ollama_name: Ollama 模型名称
"""
def __init__(self, model_path: str, target: str | None, trans_model: str, ollama_name: str):
if model_path.startswith('"'):
model_path = model_path[1:]
if model_path.endswith('"'):
model_path = model_path[:-1]
self.model_path = model_path
self.target = target
if trans_model == 'google':
self.trans_func = google_translate
else:
self.trans_func = ollama_translate
self.ollama_name = ollama_name
self.time_str = ''
self.cur_id = 0
self.prev_content = ''
def start(self):
"""启动 Sense Voice 模型"""
self.recognizer = sherpa_onnx.OfflineRecognizer.from_sense_voice(
model=f"{self.model_path}/model.onnx",
tokens=f"{self.model_path}/tokens.txt",
num_threads = 2,
)
config = sherpa_onnx.VadModelConfig()
config.silero_vad.model = f"{self.model_path}/silero_vad.onnx"
config.silero_vad.threshold = 0.5
config.silero_vad.min_silence_duration = 0.1
config.silero_vad.min_speech_duration = 0.25
config.silero_vad.max_speech_duration = 8
config.sample_rate = 16000
self.window_size = config.silero_vad.window_size
self.vad = sherpa_onnx.VoiceActivityDetector(config, buffer_size_in_seconds=100)
self.buffer = []
self.offset = 0
self.started = False
self.started_time = .0
self.time_str = datetime.now().strftime('%H:%M:%S.%f')[:-3]
stdout_cmd('info', 'Shepra ONNX Sense Voice recognizer started.')
def send_audio_frame(self, data: bytes):
"""
发送音频帧给 SOSV 引擎,引擎将自动识别并将识别结果输出到标准输出中
Args:
data: 音频帧数据,采样率必须为 16000Hz
"""
caption = {}
caption['command'] = 'caption'
caption['translation'] = ''
data_np = np.frombuffer(data, dtype=np.int16).astype(np.float32)
self.buffer = np.concatenate([self.buffer, data_np])
while self.offset + self.window_size < len(self.buffer):
self.vad.accept_waveform(self.buffer[self.offset: self.offset + self.window_size])
if not self.started and self.vad.is_speech_detected():
self.started = True
self.started_time = time.time()
self.offset += self.window_size
if not self.started:
if len(self.buffer) > 10 * self.window_size:
self.offset -= len(self.buffer) - 10 * self.window_size
self.buffer = self.buffer[-10 * self.window_size:]
if self.started and time.time() - self.started_time > 0.2:
stream = self.recognizer.create_stream()
stream.accept_waveform(16000, self.buffer)
self.recognizer.decode_stream(stream)
text = stream.result.text.strip()
if text and self.prev_content != text:
caption['index'] = self.cur_id
caption['text'] = text
caption['time_s'] = self.time_str
caption['time_t'] = datetime.now().strftime('%H:%M:%S.%f')[:-3]
self.prev_content = text
stdout_obj(caption)
self.started_time = time.time()
while not self.vad.empty():
stream = self.recognizer.create_stream()
stream.accept_waveform(16000, self.vad.front.samples)
self.vad.pop()
self.recognizer.decode_stream(stream)
text = stream.result.text.strip()
caption['index'] = self.cur_id
caption['text'] = text
caption['time_s'] = self.time_str
caption['time_t'] = datetime.now().strftime('%H:%M:%S.%f')[:-3]
self.prev_content = ''
stdout_obj(caption)
self.cur_id += 1
self.time_str = datetime.now().strftime('%H:%M:%S.%f')[:-3]
self.buffer = []
self.offset = 0
self.started = False
self.started_time = .0
def translate(self):
"""持续读取共享数据中的音频帧,并进行语音识别,将识别结果输出到标准输出中"""
global shared_data
while shared_data.status == 'running':
chunk = shared_data.chunk_queue.get()
self.send_audio_frame(chunk)
def stop(self):
"""停止 Sense Voice 模型"""
stdout_cmd('info', 'Shepra ONNX Sense Voice recognizer closed.')
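
For reference, driving this recognizer outside the capture thread might look like the sketch below; the model directory and WAV file are placeholders, and start() expects model.onnx, tokens.txt, and silero_vad.onnx inside the directory:

import wave
from audio2text import SosvRecognizer  # as re-exported in engine/audio2text/__init__.py

engine = SosvRecognizer("/models/sense-voice", None, "google", "")  # placeholder path; target=None disables translation
engine.start()
with wave.open("sample_16k_mono.wav", "rb") as wf:  # placeholder file: 16 kHz mono s16le
    while True:
        data = wf.readframes(1600)  # 0.1 s per chunk, as the capture thread produces
        if not data:
            break
        engine.send_audio_frame(data)
engine.stop()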

engine/audio2text/vosk.py

@@ -4,6 +4,7 @@ import time
 from datetime import datetime
 from vosk import Model, KaldiRecognizer, SetLogLevel
+from utils import shared_data
 from utils import stdout_cmd, stdout_obj, google_translate, ollama_translate
@@ -82,6 +83,13 @@ class VoskRecognizer:
         stdout_obj(caption)
 
+    def translate(self):
+        """Keep reading audio chunks from shared data, run speech recognition, and print results to standard output"""
+        global shared_data
+        while shared_data.status == 'running':
+            chunk = shared_data.chunk_queue.get()
+            self.send_audio_frame(chunk)
+
     def stop(self):
         """Stop the Vosk engine"""
         stdout_cmd('info', 'Vosk recognizer closed.')


@@ -1,70 +1,120 @@
+import wave
 import argparse
-from utils import stdout_cmd, stdout_err
-from utils import thread_data, start_server
+import threading
+from utils import stdout, stdout_cmd
+from utils import shared_data, start_server
 from utils import merge_chunk_channels, resample_chunk_mono
-from audio2text import InvalidParameter, GummyRecognizer
+from audio2text import GummyRecognizer
 from audio2text import VoskRecognizer
+from audio2text import SosvRecognizer
 from sysaudio import AudioStream
 
+def audio_recording(stream: AudioStream, resample: bool, save = False):
+    global shared_data
+    stream.open_stream()
+    if save:
+        wf = wave.open(f'record.wav', 'wb')
+        wf.setnchannels(1)
+        wf.setsampwidth(stream.SAMP_WIDTH)
+        wf.setframerate(16000)
+    while shared_data.status == 'running':
+        raw_chunk = stream.read_chunk()
+        if raw_chunk is None: continue
+        if resample:
+            chunk = resample_chunk_mono(raw_chunk, stream.CHANNELS, stream.RATE, 16000)
+        else:
+            chunk = merge_chunk_channels(raw_chunk, stream.CHANNELS)
+        shared_data.chunk_queue.put(chunk)
+        if save: wf.writeframes(chunk) # type: ignore
+    if save: wf.close() # type: ignore
+    stream.close_stream_signal()
+
 def main_gummy(s: str, t: str, a: int, c: int, k: str):
-    global thread_data
+    """
+    Parameters:
+        s: Source language
+        t: Target language
+        k: Aliyun Bailian API key
+    """
     stream = AudioStream(a, c)
     if t == 'none':
         engine = GummyRecognizer(stream.RATE, s, None, k)
     else:
         engine = GummyRecognizer(stream.RATE, s, t, k)
-    stream.open_stream()
     engine.start()
-    chunk_mono = bytes()
-    restart_count = 0
-    while thread_data.status == "running":
-        try:
-            chunk = stream.read_chunk()
-            if chunk is None: continue
-            chunk_mono = merge_chunk_channels(chunk, stream.CHANNELS)
-            try:
-                engine.send_audio_frame(chunk_mono)
-            except InvalidParameter as e:
-                restart_count += 1
-                if restart_count > 5:
-                    stdout_err(str(e))
-                    thread_data.status = "kill"
-                    stdout_cmd('kill')
-                    break
-                else:
-                    stdout_cmd('info', f'Gummy engine stopped, restart attempt: {restart_count}...')
-        except KeyboardInterrupt:
-            break
-    engine.send_audio_frame(chunk_mono)
-    stream.close_stream()
+    stream_thread = threading.Thread(
+        target=audio_recording,
+        args=(stream, False),
+        daemon=True
+    )
+    stream_thread.start()
+    try:
+        engine.translate()
+    except KeyboardInterrupt:
+        stdout("Keyboard interrupt detected. Exiting...")
     engine.stop()
 
-def main_vosk(a: int, c: int, m: str, t: str, tm: str, on: str):
-    global thread_data
+def main_vosk(a: int, c: int, vosk: str, t: str, tm: str, omn: str):
+    """
+    Parameters:
+        a: Audio source: 0 for output, 1 for input
+        c: Chunk number in 1 second
+        vosk: Vosk model path
+        t: Target language
+        tm: Translation model type, ollama or google
+        omn: Ollama model name
+    """
     stream = AudioStream(a, c)
-    engine = VoskRecognizer(
-        m, None if t == 'none' else t,
-        tm, on
-    )
-    stream.open_stream()
+    if t == 'none':
+        engine = VoskRecognizer(vosk, None, tm, omn)
+    else:
+        engine = VoskRecognizer(vosk, t, tm, omn)
     engine.start()
-    while thread_data.status == "running":
-        try:
-            chunk = stream.read_chunk()
-            if chunk is None: continue
-            chunk_mono = resample_chunk_mono(chunk, stream.CHANNELS, stream.RATE, 16000)
-            engine.send_audio_frame(chunk_mono)
-        except KeyboardInterrupt:
-            break
-    stream.close_stream()
+    stream_thread = threading.Thread(
+        target=audio_recording,
+        args=(stream, True),
+        daemon=True
+    )
+    stream_thread.start()
+    try:
+        engine.translate()
+    except KeyboardInterrupt:
+        stdout("Keyboard interrupt detected. Exiting...")
     engine.stop()
 
+def main_sosv(a: int, c: int, sosv: str, t: str, tm: str, omn: str):
+    """
+    Parameters:
+        a: Audio source: 0 for output, 1 for input
+        c: Chunk number in 1 second
+        sosv: Sherpa-ONNX SenseVoice model path
+        t: Target language
+        tm: Translation model type, ollama or google
+        omn: Ollama model name
+    """
+    stream = AudioStream(a, c)
+    if t == 'none':
+        engine = SosvRecognizer(sosv, None, tm, omn)
+    else:
+        engine = SosvRecognizer(sosv, t, tm, omn)
+    engine.start()
+    stream_thread = threading.Thread(
+        target=audio_recording,
+        args=(stream, True),
+        daemon=True
+    )
+    stream_thread.start()
+    try:
+        engine.translate()
+    except KeyboardInterrupt:
+        stdout("Keyboard interrupt detected. Exiting...")
+    engine.stop()
 
@@ -74,19 +124,22 @@ if __name__ == "__main__":
     parser.add_argument('-e', '--caption_engine', default='gummy', help='Caption engine: gummy or vosk')
     parser.add_argument('-a', '--audio_type', default=0, help='Audio stream source: 0 for output, 1 for input')
     parser.add_argument('-c', '--chunk_rate', default=10, help='Number of audio stream chunks collected per second')
-    parser.add_argument('-p', '--port', default=8080, help='The port to run the server on, 0 for no server')
+    parser.add_argument('-p', '--port', default=0, help='The port to run the server on, 0 for no server')
     parser.add_argument('-t', '--target_language', default='zh', help='Target language code, "none" for no translation')
     # gummy only
     parser.add_argument('-s', '--source_language', default='en', help='Source language code')
     parser.add_argument('-k', '--api_key', default='', help='API KEY for Gummy model')
+    # vosk and sosv
+    parser.add_argument('-tm', '--translation_model', default='ollama', help='Model for translation: ollama or google')
+    parser.add_argument('-omn', '--ollama_name', default='', help='Ollama model name for translation')
     # vosk only
-    parser.add_argument('-m', '--model_path', default='', help='The path to the vosk model.')
-    parser.add_argument('-tm', '--translation_model', default='', help='Google translate API KEY')
-    parser.add_argument('-on', '--ollama_name', default='', help='Ollama model name for translation')
+    parser.add_argument('-vosk', '--vosk_model', default='', help='The path to the vosk model.')
+    # sosv only
+    parser.add_argument('-sosv', '--sosv_model', default=None, help='The SenseVoice model path')
     args = parser.parse_args()
 
     if int(args.port) == 0:
-        thread_data.status = "running"
+        shared_data.status = "running"
     else:
         start_server(int(args.port))
 
@@ -102,7 +155,16 @@ if __name__ == "__main__":
         main_vosk(
             int(args.audio_type),
             int(args.chunk_rate),
-            args.model_path,
+            args.vosk_model,
+            args.target_language,
+            args.translation_model,
+            args.ollama_name
+        )
+    elif args.caption_engine == 'sosv':
+        main_sosv(
+            int(args.audio_type),
+            int(args.chunk_rate),
+            args.sosv_model,
             args.target_language,
             args.translation_model,
             args.ollama_name
 
@@ -110,5 +172,5 @@ if __name__ == "__main__":
     else:
         raise ValueError('Invalid caption engine specified.')
 
-    if thread_data.status == "kill":
+    if shared_data.status == "kill":
         stdout_cmd('kill')
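
With the renamed flags, a launch of the new backend might look like the following sketch; the entry-point name and model path are placeholders (the Electron host builds the same argument list):

import subprocess

subprocess.run([
    "python", "main.py",              # placeholder entry-point name
    "-e", "sosv",                     # caption engine: gummy, vosk, or sosv
    "-sosv", "/models/sense-voice",   # placeholder SenseVoice model directory
    "-t", "zh",                       # target language; "none" disables translation
    "-tm", "google",                  # translation backend: ollama or google
    "-p", "0",                        # 0 = run without the control server
])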


@@ -37,14 +37,13 @@ class AudioStream:
         self.FORMAT = pyaudio.paInt16
         self.SAMP_WIDTH = pyaudio.get_sample_size(self.FORMAT)
         self.CHANNELS = int(self.device["maxInputChannels"])
-        self.RATE = int(self.device["defaultSampleRate"])
-        self.CHUNK = self.RATE // chunk_rate
-
-    def reset_chunk_size(self, chunk_size: int):
-        """
-        Reset the audio chunk size
-        """
-        self.CHUNK = chunk_size
+        self.DEFAULT_RATE = int(self.device["defaultSampleRate"])
+        self.CHUNK_RATE = chunk_rate
+        self.RATE = 16000
+        self.CHUNK = self.RATE // self.CHUNK_RATE
+        self.open_stream()
+        self.close_stream()
 
     def get_info(self):
         dev_info = f"""
@@ -72,16 +71,27 @@ class AudioStream:
         Open and return the system audio output stream
         """
         if self.stream: return self.stream
-        self.stream = self.mic.open(
-            format = self.FORMAT,
-            channels = int(self.CHANNELS),
-            rate = self.RATE,
-            input = True,
-            input_device_index = int(self.INDEX)
-        )
+        try:
+            self.stream = self.mic.open(
+                format = self.FORMAT,
+                channels = int(self.CHANNELS),
+                rate = self.RATE,
+                input = True,
+                input_device_index = int(self.INDEX)
+            )
+        except OSError:
+            self.RATE = self.DEFAULT_RATE
+            self.CHUNK = self.RATE // self.CHUNK_RATE
+            self.stream = self.mic.open(
+                format = self.FORMAT,
+                channels = int(self.CHANNELS),
+                rate = self.RATE,
+                input = True,
+                input_device_index = int(self.INDEX)
+            )
         return self.stream
 
-    def read_chunk(self):
+    def read_chunk(self) -> bytes | None:
         """
         Read audio data
         """


@@ -55,15 +55,10 @@ class AudioStream:
         self.FORMAT = 16
         self.SAMP_WIDTH = 2
         self.CHANNELS = 2
-        self.RATE = 48000
+        self.RATE = 16000
+        self.CHUNK_RATE = chunk_rate
         self.CHUNK = self.RATE // chunk_rate
-
-    def reset_chunk_size(self, chunk_size: int):
-        """
-        Reset the audio chunk size
-        """
-        self.CHUNK = chunk_size
 
     def get_info(self):
         dev_info = f"""
         Audio capture process:
@@ -84,7 +79,7 @@ class AudioStream:
         Start the audio capture process
         """
         self.process = subprocess.Popen(
-            ["parec", "-d", self.source, "--format=s16le", "--rate=48000", "--channels=2"],
+            ["parec", "-d", self.source, "--format=s16le", "--rate=16000", "--channels=2"],
             stdout=subprocess.PIPE
         )
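
A standalone equivalent of the capture command above, assuming a PulseAudio system; the source name here is a placeholder:

import subprocess

# 16000 Hz * 2 channels * 2 bytes per sample / 10 chunks per second = 6400 bytes per chunk
proc = subprocess.Popen(
    ["parec", "-d", "@DEFAULT_SOURCE@", "--format=s16le", "--rate=16000", "--channels=2"],
    stdout=subprocess.PIPE,
)
chunk = proc.stdout.read(6400)  # one 0.1 s chunk, as read_chunk() would return
proc.terminate()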


@@ -61,14 +61,13 @@ class AudioStream:
         self.FORMAT = pyaudio.paInt16
         self.SAMP_WIDTH = pyaudio.get_sample_size(self.FORMAT)
         self.CHANNELS = int(self.device["maxInputChannels"])
-        self.RATE = int(self.device["defaultSampleRate"])
-        self.CHUNK = self.RATE // chunk_rate
-
-    def reset_chunk_size(self, chunk_size: int):
-        """
-        Reset the audio chunk size
-        """
-        self.CHUNK = chunk_size
+        self.DEFAULT_RATE = int(self.device["defaultSampleRate"])
+        self.CHUNK_RATE = chunk_rate
+        self.RATE = 16000
+        self.CHUNK = self.RATE // self.CHUNK_RATE
+        self.open_stream()
+        self.close_stream()
 
     def get_info(self):
         dev_info = f"""
@@ -96,13 +95,24 @@ class AudioStream:
         Open and return the system audio output stream
         """
         if self.stream: return self.stream
-        self.stream = self.mic.open(
-            format = self.FORMAT,
-            channels = self.CHANNELS,
-            rate = self.RATE,
-            input = True,
-            input_device_index = self.INDEX
-        )
+        try:
+            self.stream = self.mic.open(
+                format = self.FORMAT,
+                channels = self.CHANNELS,
+                rate = self.RATE,
+                input = True,
+                input_device_index = self.INDEX
+            )
+        except OSError:
+            self.RATE = self.DEFAULT_RATE
+            self.CHUNK = self.RATE // self.CHUNK_RATE
+            self.stream = self.mic.open(
+                format = self.FORMAT,
+                channels = self.CHANNELS,
+                rate = self.RATE,
+                input = True,
+                input_device_index = self.INDEX
+            )
         return self.stream
 
     def read_chunk(self) -> bytes | None:

engine/utils/__init__.py

@@ -5,6 +5,6 @@ from .audioprcs import (
     resample_mono_chunk
 )
 from .sysout import stdout, stdout_err, stdout_cmd, stdout_obj, stderr
-from .thdata import thread_data
+from .shared import shared_data
 from .server import start_server
 from .translation import ollama_translate, google_translate

engine/utils/audioprcs.py

@@ -49,9 +49,18 @@ def resample_chunk_mono(chunk: bytes, channels: int, orig_sr: int, target_sr: int
     # (length,)
     chunk_mono = np.mean(chunk_np.astype(np.float32), axis=1)
+    if orig_sr == target_sr:
+        return chunk_mono.astype(np.int16).tobytes()
     ratio = target_sr / orig_sr
     chunk_mono_r = samplerate.resample(chunk_mono, ratio, converter_type=mode)
     chunk_mono_r = np.round(chunk_mono_r).astype(np.int16)
+    real_len = round(chunk_mono.shape[0] * ratio)
+    if(chunk_mono_r.shape[0] > real_len):
+        chunk_mono_r = chunk_mono_r[:real_len]
+    else:
+        while chunk_mono_r.shape[0] < real_len:
+            chunk_mono_r = np.append(chunk_mono_r, chunk_mono_r[-1])
     return chunk_mono_r.tobytes()
@@ -81,9 +90,18 @@ def resample_chunk_mono_np(chunk: bytes, channels: int, orig_sr: int, target_sr: int
     # (length,)
     chunk_mono = np.mean(chunk_np.astype(np.float32), axis=1)
+    if orig_sr == target_sr:
+        return chunk_mono.astype(dtype)
     ratio = target_sr / orig_sr
     chunk_mono_r = samplerate.resample(chunk_mono, ratio, converter_type=mode)
     chunk_mono_r = chunk_mono_r.astype(dtype)
+    real_len = round(chunk_mono.shape[0] * ratio)
+    if(chunk_mono_r.shape[0] > real_len):
+        chunk_mono_r = chunk_mono_r[:real_len]
+    else:
+        while chunk_mono_r.shape[0] < real_len:
+            chunk_mono_r = np.append(chunk_mono_r, chunk_mono_r[-1])
     return chunk_mono_r
@@ -100,9 +118,16 @@ def resample_mono_chunk(chunk: bytes, orig_sr: int, target_sr: int, mode="sinc_best"):
     Return:
         Mono audio data chunk
     """
+    if orig_sr == target_sr: return chunk
     chunk_np = np.frombuffer(chunk, dtype=np.int16)
     chunk_np = chunk_np.astype(np.float32)
     ratio = target_sr / orig_sr
     chunk_r = samplerate.resample(chunk_np, ratio, converter_type=mode)
     chunk_r = np.round(chunk_r).astype(np.int16)
+    real_len = round(chunk_np.shape[0] * ratio)
+    if(chunk_r.shape[0] > real_len):
+        chunk_r = chunk_r[:real_len]
+    else:
+        while chunk_r.shape[0] < real_len:
+            chunk_r = np.append(chunk_r, chunk_r[-1])
     return chunk_r.tobytes()
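
The padding and truncation added above pin every resampled chunk to exactly round(len * ratio) samples, because libsamplerate may return one sample more or fewer per call. A standalone check of that invariant, assuming the samplerate package is installed:

import numpy as np
import samplerate

orig_sr, target_sr = 48000, 16000
ratio = target_sr / orig_sr
chunk = np.zeros(4800, dtype=np.float32)  # one 0.1 s chunk at 48 kHz

out = samplerate.resample(chunk, ratio, converter_type="sinc_best")
real_len = round(chunk.shape[0] * ratio)  # 1600 samples expected at 16 kHz

# Trim or edge-pad to the exact length, as resample_chunk_mono now does.
if out.shape[0] > real_len:
    out = out[:real_len]
elif out.shape[0] < real_len:
    out = np.pad(out, (0, real_len - out.shape[0]), mode="edge")
assert out.shape[0] == real_len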

engine/utils/server.py

@@ -1,13 +1,12 @@
 import socket
 import threading
 import json
-# import time
-from utils import thread_data, stdout_cmd, stderr
+from utils import shared_data, stdout_cmd, stderr
 
 def handle_client(client_socket):
-    global thread_data
-    while thread_data.status == 'running':
+    global shared_data
+    while shared_data.status == 'running':
         try:
             data = client_socket.recv(4096).decode('utf-8')
             if not data:
@@ -15,13 +14,13 @@ def handle_client(client_socket):
             data = json.loads(data)
             if data['command'] == 'stop':
-                thread_data.status = 'stop'
+                shared_data.status = 'stop'
                 break
         except Exception as e:
             stderr(f'Communication error: {e}')
             break
-    thread_data.status = 'stop'
+    shared_data.status = 'stop'
     client_socket.close()
@@ -34,7 +33,6 @@ def start_server(port: int):
         stderr(str(e))
         stdout_cmd('kill')
         return
-    # time.sleep(20)
     stdout_cmd('connect')
     client, addr = server.accept()

engine/utils/shared.py (new file, 8 lines)

@@ -0,0 +1,8 @@
import queue

class SharedData:
    def __init__(self):
        self.status = "running"
        self.chunk_queue = queue.Queue()

shared_data = SharedData()

engine/utils/thdata.py (deleted)

@@ -1,5 +0,0 @@
-class ThreadData:
-    def __init__(self):
-        self.status = "running"
-
-thread_data = ThreadData()


@@ -81,9 +81,9 @@ export class CaptionEngine {
         }
         else if(allConfig.controls.engine === 'vosk'){
             this.command.push('-e', 'vosk')
-            this.command.push('-m', `"${allConfig.controls.modelPath}"`)
+            this.command.push('-vosk', `"${allConfig.controls.modelPath}"`)
             this.command.push('-tm', allConfig.controls.transModel)
-            this.command.push('-on', allConfig.controls.ollamaName)
+            this.command.push('-omn', allConfig.controls.ollamaName)
         }
     }
     Log.info('Engine Path:', this.appPath)